diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md
deleted file mode 100644
index eb39321b173..00000000000
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Run
-
-## Run FP32 model
-``` python
-python run_llm.py --model [model_name_or_path] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-## Run BF16/FP16 model
-``` python
-python run_llm.py --model [model_name_or_path] --approach cast --precision [bf16|fp16] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-## Run FP8 model
-``` python
-python run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-# Multi-card Inference
-With deepspeed we can leverage multi-cards inference with a prefix in command, below it's a demonstration of 4 card inference.
-
-```python
-deepspeed --num_gpus=4 run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-deepspeed --num_gpus=4 run_llm.py --model facebook/opt-125m --approach static --precision fp8_e4m3 --to_graph --accuracy --tasks lambada_openai --batch_size 8
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py
deleted file mode 100644
index 35600185f5a..00000000000
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from transformers import PretrainedConfig
-
-
-class ChatGLMConfig(PretrainedConfig):
-    model_type = "chatglm"
-    def __init__(
-        self,
-        num_layers=28,
-        padded_vocab_size=65024,
-        hidden_size=4096,
-        ffn_hidden_size=13696,
-        kv_channels=128,
-        num_attention_heads=32,
-        seq_length=2048,
-        hidden_dropout=0.0,
-        classifier_dropout=None,
-        attention_dropout=0.0,
-        layernorm_epsilon=1e-5,
-        rmsnorm=True,
-        apply_residual_connection_post_layernorm=False,
-        post_layer_norm=True,
-        add_bias_linear=False,
-        add_qkv_bias=False,
-        bias_dropout_fusion=True,
-        multi_query_attention=False,
-        multi_query_group_num=1,
-        apply_query_key_layer_scaling=True,
-        attention_softmax_in_fp32=True,
-        fp32_residual_connection=False,
-        quantization_bit=0,
-        pre_seq_len=None,
-        prefix_projection=False,
-        **kwargs
-    ):
-        self.num_layers = num_layers
-        self.vocab_size = padded_vocab_size
-        self.padded_vocab_size = padded_vocab_size
-        self.hidden_size = hidden_size
-        self.ffn_hidden_size = ffn_hidden_size
-        self.kv_channels = kv_channels
-        self.num_attention_heads = num_attention_heads
-        self.seq_length = seq_length
-        self.hidden_dropout = hidden_dropout
-        self.classifier_dropout = classifier_dropout
-        self.attention_dropout = attention_dropout
-        self.layernorm_epsilon = layernorm_epsilon
-        self.rmsnorm = rmsnorm
-        self.apply_residual_connection_post_layernorm = 
apply_residual_connection_post_layernorm - self.post_layer_norm = post_layer_norm - self.add_bias_linear = add_bias_linear - self.add_qkv_bias = add_qkv_bias - self.bias_dropout_fusion = bias_dropout_fusion - self.multi_query_attention = multi_query_attention - self.multi_query_group_num = multi_query_group_num - self.apply_query_key_layer_scaling = apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = attention_softmax_in_fp32 - self.fp32_residual_connection = fp32_residual_connection - self.quantization_bit = quantization_bit - self.pre_seq_len = pre_seq_len - self.prefix_projection = prefix_projection - super().__init__(**kwargs) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py deleted file mode 100644 index be1cd520af5..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py +++ /dev/null @@ -1,1294 +0,0 @@ -""" PyTorch ChatGLM model. """ - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any -from copy import deepcopy - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" -_CONFIG_FOR_DOC = "ChatGLMConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm3-6b", - # See all ChatGLM models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config: ChatGLMConfig): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 - self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(kv_size, config.hidden_size), - 
torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, kv_size) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, - config.num_layers * config.kv_channels * config.multi_query_group_num * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
- """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - -### INC change ### -# @torch.jit.script - -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. 
- # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - if use_cache: - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. - self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. 
- def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GLMTransformer): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit transposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.multi_query_group_num, - self.kv_channels - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - return past_key_values - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), - attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary 
positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. - hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - if past_key_values is not None: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True, - "use_cache": use_cache - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[-1:] - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
- """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, output, history): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content, history - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = tokenizer.build_chat_input(query, history=history, role=role) - inputs = inputs.to(self.device) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - history.append({"role": role, "content": query}) - response, history = self.process_response(response, history) - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, - logits_processor=None, return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None: - inputs = tokenizer.build_chat_input(query, history=history, role=role) - else: - inputs = tokenizer.build_chat_input(query, role=role) - inputs = inputs.to(self.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[0] - if self.transformer.pre_seq_len is not None: - past_length -= self.transformer.pre_seq_len - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - history.append({"role": role, "content": query}) - for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - if response and response[-1] != "�": - response, new_history = self.process_response(response, history) - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - model_kwargs["use_cache"] = generation_config.use_cache - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self - - -class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.num_labels = config.num_labels - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - - self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) - if config.classifier_dropout is not None: - self.dropout = nn.Dropout(config.classifier_dropout) - else: - self.dropout = None - self.config = config - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - full_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: 
Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - full_attention_mask=full_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - pooled_hidden_states = hidden_states[-1] - if self.dropout is not None: - pooled_hidden_states = self.dropout(pooled_hidden_states) - logits = self.classifier_head(pooled_hidden_states) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze().float(), labels.squeeze()) - else: - loss = loss_fct(logits.float(), labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) - - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py deleted file mode 100644 index 4cd1b6e18e8..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py +++ /dev/null @@ -1,1263 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - logging, - replace_return_docstrings, -) -from transformers.utils.import_utils import is_torch_fx_available -from transformers.models.llama.configuration_llama import LlamaConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -### INC code ### -from neural_compressor.torch.quantization.modules import Matmul, BatchMatmul, Autocast - -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. -# It means that the function will not be traced through and simply appear as a node in the graph. -if is_torch_fx_available(): - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" - ) - return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) - - -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" - ) - return AttentionMaskConverter._make_causal_mask( - input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length - ) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) - self._init_rope() - ### INC code ### - self.matmul1 = Matmul() - self.matmul2 = Matmul() - self.cast1 = Autocast() - self.cast2 = Autocast() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - ### INC code ### - key_states = self.cast1(key_states) - value_states = self.cast2(value_states) - # import habana_frameworks.torch.core as htcore - # htcore.mark_step() - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - ### INC code ### - attn_weights = self.matmul1(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - - ### INC code ### - attn_output = self.matmul2(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = 
attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaFlashAttention2(LlamaAttention): - """ - Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # LlamaFlashAttention2 attention does not support output_attentions - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. 
(LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) - """ - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=self.is_causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = ( - LlamaAttention(config=config) - if not getattr(config, "_flash_attn_2_enabled", False) - else LlamaFlashAttention2(config=config) - ) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = 
inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if getattr(self.config, "_flash_attn_2_enabled", False): - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # embed positions - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - 
return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - 
past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( - logits.device - ) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + 
transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py deleted file mode 100644 index 5b7054d3227..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2023 Baichuan Inc. All Rights Reserved. - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm - -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="", - bos_token="", - eos_token="", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - ### INC code ### - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - #self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - #self.sp_model.Load(vocab_file) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. 
- - Returns: - `Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
- """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt deleted file mode 100644 index d3655acd742..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -datasets -accelerate -SentencePiece -lm_eval==0.3.0 -openpyxl -einops diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py deleted file mode 100644 index 5cd0f046aba..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "False" - -### USE_GAUDI2_SCALE requires PT_USE_FP8_AMAX for torch.mm/bmm, or got failure -# os.environ["USE_GAUDI2_SCALE"] = "True" -# os.environ["PT_USE_FP8_AMAX"] = "True" - -### graphs will dump to .graph_dumps folder -# os.environ["GRAPH_VISUALIZATION"] = "True" -# import shutil -# shutil.rmtree(".graph_dumps", ignore_errors=True) - -import argparse -import time -import json -import re -import torch -import habana_frameworks.torch.hpex -import torch.nn.functional as F -import deepspeed -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -import habana_frameworks.torch.core as htcore - -from utils import show_msg, eval_func, init_empty_model, init_model, init_tokenizer - - -torch.set_grad_enabled(False) -htcore.hpu_set_env() -torch.device('hpu') - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model", nargs="?", default="facebook/opt-125m" -) -parser.add_argument( - "--trust_remote_code", default=True, - help="Transformers parameter: use the external repo") -parser.add_argument( - "--revision", default=None, - help="Transformers parameter: set the model hub commit number") -parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") -parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--to_graph", action="store_true") -parser.add_argument("--approach", type=str, default=None, - help="Select from ['dynamic', 'static' 'cast']") -parser.add_argument("--precision", type=str, default='fp32', - help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16', 'fp32'], \ - ['bf16', 'fp16'] only work with cast approach") -parser.add_argument("--autotune", action="store_true") -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--performance", action="store_true") -parser.add_argument("--generate", action="store_true") -parser.add_argument("--skip_fp8_mm", action="store_true") -parser.add_argument("--dump_to_excel", action="store_true") -parser.add_argument("--save", action="store_true") -parser.add_argument("--load", action="store_true") -parser.add_argument("--batch_size", default=1, type=int, - help="For accuracy measurement only.") 
-parser.add_argument("--pad_max_length", default=512, type=int, - help="Pad input ids to max length.") -parser.add_argument("--calib_iters", default=100, type=int, - help="calibration iters.") -parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \ - type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa", - "rte", "openbookqa", "lambada_standard", "wikitext"], - help="tasks list for accuracy validation") -parser.add_argument("--limit", default=None, type=int, - help="the sample num of evaluation.") -parser.add_argument("--max_new_tokens", default=100, type=int, - help="calibration iters.") -parser.add_argument('--buckets', type=int, nargs='+', \ - help="Input length buckets to use with static_shapes", default=[256, 512]) -parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") -parser.add_argument("--skip_lm_head", action="store_true") -args = parser.parse_args() - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -if args.load: - user_model = init_empty_model(args.model) -else: - user_model = init_model(args) -user_model.eval() - - -tokenizer = init_tokenizer(args) - - -### dynamic & static quantization ### -if args.approach in ["dynamic", "static"] and not args.load: - print("device:", next(user_model.parameters()).device) - from neural_compressor.torch.quantization import ( - quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set - ) - dtype = args.precision - if args.approach == "dynamic": - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - user_model = quantize_dynamic(user_model, dtype, inplace=True) - elif args.approach == "static": - qconfig = FP8Config(w_dtype=dtype, act_dtype=dtype, approach="static") - if args.skip_lm_head: - fp32_config = FP8Config(w_dtype="fp32", act_dtype="fp32") - qconfig.set_local("lm_head", fp32_config) - # dataset - from datasets import load_dataset - calib_dataset = load_dataset(args.dataset, split="train").select(range(100)) - calib_dataset = calib_dataset.shuffle(seed=42) - calib_data = [] - for examples in calib_dataset: - calib_data.append( - tokenizer( - examples["text"], - return_tensors="pt", - max_length=64, - padding="max_length", - truncation=True - ) - ) - - def calib_func(model): - for i, calib_input in enumerate(calib_data): - if i >= args.calib_iters: - break - model( - input_ids=calib_input["input_ids"].to('hpu'), - attention_mask=calib_input["attention_mask"].to('hpu'), - ) - - user_model = quantize(user_model, qconfig, calib_func, inplace=True) - # saving - print(user_model) - if args.save and local_rank in [-1, 0]: - user_model.save("saved_results") - - -if args.load: - from neural_compressor.torch.quantization import load - user_model = load("saved_results", user_model) - - -if args.approach in ["dynamic", "static"] or args.load: - # It enables weights constant folding - from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - _mark_params_as_const(user_model) # can reduce memory allocated and speed up - _check_params_as_const(user_model) - - - -# If torch.matmul and torch.bmm are not replaced by INC module, -# Below codes can make torch.matmul and torch.bmm run on fp8 by injection. 
-if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']: - def replace_torch_mm_bmm(): - from neural_compressor.torch.amp.fp8.functions import fp8_matmul - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - - replace_torch_mm_bmm() - - -# inference optimization -if args.to_graph: - import habana_frameworks.torch.hpu.graphs as htgraphs - user_model = htgraphs.wrap_in_hpu_graph(user_model) - - -# dump message of HPU after quantization or reloading -show_msg() - - -### generation, performance and accuracy validation ### -if args.generate: - input_prompt = "Here is my prompt" - print("Prompt sentence:", input_prompt) - generation_config = { - "min_new_tokens": args.max_new_tokens, "max_new_tokens": args.max_new_tokens, - # "do_sample": False, "temperature": 0.9, "num_beams": 4, - } - input_tokens = tokenizer(input_prompt, return_tensors="pt").to('hpu') - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - outputs = user_model.generate(**input_tokens, **generation_config) - else: - outputs = user_model.generate(**input_tokens, **generation_config) - - output_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - eval_end = time.perf_counter() - print("Generated sentence:", output_sentence) - print("Duration:", eval_end - eval_start) - - -if args.performance: - eval_start = time.perf_counter() - input_prompt = "Intel is a company which" - input_tokens = torch.ones((1, 128), dtype=torch.long).to('hpu') - generation_config = {"min_new_tokens": 100, "max_new_tokens": 100} - outputs = user_model.generate(input_tokens, **generation_config) - print("Duration of generating 100 tokens :", time.perf_counter() - eval_start) - - -if args.accuracy: - eval_func(user_model, tokenizer=tokenizer, args=args) - -# dump final message of HPU -show_msg() diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py deleted file mode 100644 index 843287cddfa..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py +++ /dev/null @@ -1,255 +0,0 @@ -import os -import re -import torch -from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -def init_model(args): - import deepspeed - model_dtype = torch.float32 - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - if world_size > 1: - config = AutoConfig.from_pretrained(args.model) - model_dtype = torch.bfloat16 # RuntimeErrorCastToFp8V2 input must be of float or bfloat16 dtype - deepspeed.init_distributed(dist_backend="hccl") - with deepspeed.OnDevice(dtype=model_dtype, device="meta"): - user_model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - import tempfile - checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") - from optimum.habana.checkpoint_utils import write_checkpoints_json # in optimum-habana - write_checkpoints_json( - args.model, - local_rank, - checkpoints_json, - token=None, - ) 
- else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - device_map='hpu', - torch_dtype=model_dtype, - ) - elif re.search("chatglm", args.model.lower()): - from models.modeling_chatglm import ChatGLMForConditionalGeneration - user_model = ChatGLMForConditionalGeneration.from_pretrained( - args.model, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # print(user_model.transformer.output_layer.weight.dtype) # always fp16 - user_model.float() # static fp8 need float32 for graph compiler - else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # load weight for multi-cards - if world_size > 1: - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - ds_inference_kwargs = {"dtype": model_dtype} - ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} - ds_inference_kwargs["enable_cuda_graph"] = False - from transformers.models.llama.modeling_llama import LlamaDecoderLayer - ds_inference_kwargs["injection_policy"] = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} - ds_inference_kwargs["checkpoint"] = checkpoints_json.name - ds_model = deepspeed.init_inference(user_model, **ds_inference_kwargs) - else: - ds_model = deepspeed.init_inference(user_model, - mp_size=world_size, - replace_with_kernel_inject=False) - user_model = ds_model.module - return user_model - - -def init_empty_model(model_name): - from accelerate import init_empty_weights - model_dtype = torch.float32 - config = AutoConfig.from_pretrained(model_name) - with init_empty_weights(): - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - return model - - -def init_tokenizer(args): - # tokenizer - if re.search("baichuan", args.model.lower()): - from models.tokenization_baichuan import BaichuanTokenizer - tokenizer = BaichuanTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def show_msg(): - import numpy as np - import glob - from habana_frameworks.torch.hpu import memory_stats - print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*"))) - mem_stats = memory_stats() - mem_dict = { - "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2), - "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2), - "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2), - } - for k, v in mem_dict.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - - -def itrex_bootstrap_stderr(f, xs, iters): - from lm_eval.metrics import _bootstrap_internal, sample_stddev - res = [] - chunk_size = min(1000, iters) - it = _bootstrap_internal(f, chunk_size) - for i in range(iters // chunk_size): - bootstrap = it((i, xs)) - res.extend(bootstrap) - return sample_stddev(res) - - -def save_to_excel(dict): - import pandas as pd - df_new = pd.DataFrame(dict) - try: - df_existing = pd.read_excel('output.xlsx') - except FileNotFoundError: - df_existing = pd.DataFrame() - df_combined = pd.concat([df_existing, df_new], axis=0, ignore_index=True) - df_combined.to_excel('output.xlsx', index=False, engine='openpyxl', header=True) - - -def eval_func(user_model, tokenizer, args): - import os - import re - import 
time - import json - import torch - import habana_frameworks.torch.hpex - import torch.nn.functional as F - import lm_eval - import lm_eval.tasks - import lm_eval.evaluator - - # to avoid out-of-memory caused by Popen for large language models. - lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr - - class HabanaModelAdapter(lm_eval.base.BaseLM): - def __init__(self, tokenizer, model, args, options): - super().__init__() - self.tokenizer = tokenizer - self.model = model.eval() - self._batch_size = args.batch_size - self.buckets = list(sorted(args.buckets)) - self.options = options - self._device = "hpu" - torch.set_grad_enabled(False) - - @property - def eot_token_id(self): - return self.model.config.eos_token_id - - @property - def max_length(self): - return self.buckets[-1] - - @property - def max_gen_toks(self): - raise NotImplementedError() - - @property - def batch_size(self): - return self._batch_size - - @property - def device(self): - # We need to do padding ourselves, otherwise we'll end up with recompilations - # Returning 'cpu' to keep tensors on CPU in lm_eval code - return 'cpu' # 'hpu' - - def tok_encode(self, string): - if ( - re.search("chatglm3", args.model.lower()) or - re.search("llama", args.model.lower()) or - re.search("mistral", args.model.lower()) - ): - string = string.lstrip() - return self.tokenizer.encode(string, add_special_tokens=False) - - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens, skip_special_tokens=True) - - def _model_generate(self, context, max_length, eos_token_id): - raise NotImplementedError() - - def find_bucket(self, length): - return [b for b in self.buckets if b >= length][0] - - def _model_call(self, inputs): - seq_length = inputs.shape[-1] - padding_length = 0 - bucket_length = self.find_bucket(seq_length) - padding_length = bucket_length - seq_length - inputs = F.pad(inputs, (0, padding_length), value=self.model.config.pad_token_id) - logits = self.model(inputs.to(self._device))["logits"].cpu() - - if padding_length > 0: - logits = logits[:, :-padding_length, :] - logits = logits.to(torch.float32) - return logits - - lm_tasks = lm_eval.tasks.get_task_dict(args.tasks) - options = None - lm = HabanaModelAdapter(tokenizer, user_model, args, options) - - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - else: - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - print(lm_eval.evaluator.make_table(results)) - eval_end = time.perf_counter() - print("Duration:", eval_end - eval_start) - results['args'] = vars(args) - results['duration'] = eval_end - eval_start - - # make sure that result is dumped only once during multi-cards evaluation - local_rank = int(os.getenv('LOCAL_RANK', '-1')) - if local_rank in [-1, 0]: - dumped = json.dumps(results, indent=2) - accu_dict = {} - case_name = str(args.approach) + "-" + args.precision - for task_name in args.tasks: - if task_name == "wikitext": - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True) - accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]] 
- else: - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True) - accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]] - accu_dict["duration"] = [args.model, case_name, results["duration"]] - if args.dump_to_excel: - save_to_excel(accu_dict) - return results["results"][task_name]["acc"] diff --git a/examples/fp8_sample/README.md b/examples/fp8_sample/README.md new file mode 100644 index 00000000000..b758768ef0f --- /dev/null +++ b/examples/fp8_sample/README.md @@ -0,0 +1,96 @@ +### Usage demo: + +#### two steps to get quantized model + +```diff +import torch ++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration +import habana_frameworks.torch.core as htcore + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + +model = M().eval() + ++ config = FP8Config.from_json_file(args.quant_config) # args.quant_config is the path of json file + ++ if config.measure: ++ model = prepare(model, config) + ++ if config.quantize: ++ htcore.hpu_initialize() ++ model = convert(model, config) + +# user code run +with torch.no_grad(): + model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + print(output) + ++ if config.measure: ++ finalize_calibration(model) +``` + + +Whole script and config refer to [sample_two_steps.py](./sample_two_steps.py), [maxabs_measure.json](./maxabs_measure.json) and [maxabs_quant.json](./maxabs_quant.json). + +First, measure the tensor quantization statistic: +```shell +python sample_two_steps.py --quant_config=maxabs_measure.json +``` + +Then quantize the model based on previous measurements: +```shell +python sample_two_steps.py --quant_config=maxabs_quant.json +``` + +#### one step to get quantized model + +```diff +import torch ++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration +import habana_frameworks.torch.core as htcore + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + +model = M().to("hpu") + ++ config = FP8Config.from_json_file(args.quant_config) # args.quant_config is the path of json file ++ model = prepare(model, config) + +# user code run to do calibration +with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) + ++ finalize_calibration(model) ++ model = convert(model) + +# user code to run benchmark for quantized model +with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) +``` + +Whole script and config refer to [sample_one_step.py](./sample_one_step.py). 
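
The toy module above keeps the sample minimal. As a purely illustrative extension (not part of the shipped sample), the same prepare → calibrate → finalize → convert sequence can be applied to a Hugging Face causal LM; the model name, calibration prompts, and config path below are placeholder assumptions taken from elsewhere in this PR:

```python
# Illustrative sketch only: one-step FP8 flow on a Hugging Face causal LM.
import torch
import habana_frameworks.torch.core as htcore
htcore.hpu_set_env()

from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration

model_name = "facebook/opt-125m"  # assumption: any HPU-supported causal LM
model = AutoModelForCausalLM.from_pretrained(model_name).eval().to("hpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
htcore.hpu_initialize()

config = FP8Config.from_json_file("quant_config.json")  # AUTO-mode config from this folder
model = prepare(model, config)

# calibration: a few short prompts stand in for a real calibration set
calib_prompts = ["Intel is a company which", "Here is my prompt"]
with torch.no_grad():
    for prompt in calib_prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to("hpu")
        model(**inputs)

finalize_calibration(model)
model = convert(model)

# quick sanity check of the quantized model
with torch.no_grad():
    inputs = tokenizer("Intel is a company which", return_tensors="pt").to("hpu")
    out = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.batch_decode(out, skip_special_tokens=True))
```

To run the shipped sample itself: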
+ +```shell +python sample_one_step.py --quant_config=quant_config.json +``` diff --git a/examples/fp8_sample/maxabs_measure.json b/examples/fp8_sample/maxabs_measure.json new file mode 100644 index 00000000000..8d55f33e57a --- /dev/null +++ b/examples/fp8_sample/maxabs_measure.json @@ -0,0 +1,7 @@ +{ + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/maxabs_quant.json b/examples/fp8_sample/maxabs_quant.json new file mode 100644 index 00000000000..d1f76f8f630 --- /dev/null +++ b/examples/fp8_sample/maxabs_quant.json @@ -0,0 +1,8 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/quant_config.json b/examples/fp8_sample/quant_config.json new file mode 100644 index 00000000000..c139d13bbea --- /dev/null +++ b/examples/fp8_sample/quant_config.json @@ -0,0 +1,8 @@ +{ + "mode": "AUTO", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/sample_one_step.py b/examples/fp8_sample/sample_one_step.py new file mode 100644 index 00000000000..18eb7bfba4c --- /dev/null +++ b/examples/fp8_sample/sample_one_step.py @@ -0,0 +1,57 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + + +# 1. python sample_one_step.py --quant_config=quant_config.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +def eval_func(model): + # user's eval func + input = torch.randn(1, 10) + model(input.to("hpu")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval().to("hpu") + htcore.hpu_initialize() + + config = FP8Config.from_json_file(args.quant_config) + model = prepare(model, config) + + # for calibration + with torch.no_grad(): + # model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + + finalize_calibration(model) + model = convert(model) + print(model) + + # for benchmark + with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) diff --git a/examples/fp8_sample/sample_two_steps.py b/examples/fp8_sample/sample_two_steps.py new file mode 100644 index 00000000000..9e17748b9b0 --- /dev/null +++ b/examples/fp8_sample/sample_two_steps.py @@ -0,0 +1,50 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + +# 1. python sample_two_steps.py --quant_config=maxabs_measure.json +# 2. 
python sample_two_steps.py --quant_config=maxabs_quant.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval() + config = FP8Config.from_json_file(args.quant_config) + + if config.measure: + model = prepare(model, config) + + if config.quantize: + htcore.hpu_initialize() + model = convert(model, config) + print(model) + + with torch.no_grad(): + model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + print(output) + + if config.measure: + finalize_calibration(model) diff --git a/neural_compressor/torch/algorithms/habana_fp8/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/__init__.py similarity index 70% rename from neural_compressor/torch/algorithms/habana_fp8/__init__.py rename to neural_compressor/torch/algorithms/fp8_quant/__init__.py index fe3a05d7d0b..d16760b5e81 100644 --- a/neural_compressor/torch/algorithms/habana_fp8/__init__.py +++ b/neural_compressor/torch/algorithms/fp8_quant/__init__.py @@ -12,5 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .fp8_quant import quantize_dynamic, quantize, white_list -from .save_load import save, load +from neural_compressor.torch.algorithms.fp8_quant.common import ( + update_mode, + save_calib_result, + restore_patched_module, + with_patched_module, +) +from neural_compressor.torch.algorithms.fp8_quant.fp8_quant import FP8Quantizer diff --git a/neural_compressor/torch/algorithms/fp8_quant/common.py b/neural_compressor/torch/algorithms/fp8_quant/common.py new file mode 100644 index 00000000000..b038a367a78 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/common.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
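
Note that the rename from `habana_fp8` to `fp8_quant` also changes the public import surface. Based on the `__init__.py` above, downstream code now pulls the helpers and the quantizer from the new package path (the old `quantize_dynamic`/`quantize`/`save`/`load` entry points are removed):

```python
# Import surface of the renamed package, per the __init__.py shown above.
from neural_compressor.torch.algorithms.fp8_quant import (
    FP8Quantizer,
    restore_patched_module,
    save_calib_result,
    update_mode,
    with_patched_module,
)
```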
+ +import json +import os +import tempfile +from collections import namedtuple +from pathlib import Path +from typing import Union + +import torch + + +def save_calib_result(model): + import habana_quantization_toolkit as hqt + hqt.finish_measurements(model) + + +def update_mode(config_path, measure_step=False, quant_step=False): + with open(config_path, 'r') as file: + config = json.load(file) + + if (measure_step and config.get("mode") == "MEASURE") or (quant_step and config.get("mode") == "QUANTIZE"): + return config_path + else: + if measure_step: + config["mode"] = "MEASURE" + if quant_step: + config["mode"] = "QUANTIZE" + + temp_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False) + temp_file_path = temp_file.name + + with open(temp_file_path, 'w') as temp_file: + json.dump(config, temp_file) + + return temp_file_path + + +def generate_model_info(model): + mod_inst_info = namedtuple("ModInstInfo", ["name", "parent"]) + parent_child_mod_dict = {} + + def create_mod_info_recursion(parent): + for name, mod in parent.named_children(): + parent_child_mod_dict[mod] = mod_inst_info(name=name, parent=parent) + create_mod_info_recursion(mod) + + create_mod_info_recursion(model) + return parent_child_mod_dict + + +def get_patched_mod_list(): + from habana_quantization_toolkit._core.common import mod_default_dict + + patched_mod_list = [] + for patched_mod in mod_default_dict.values(): + patched_mod_list.append(patched_mod.patched_module.__name__) + return patched_mod_list + + +def restore_patched_module(patched_model): + from neural_compressor.torch.algorithms.fp8_quant.helper_modules import helper_mods + patched_mod_list = get_patched_mod_list() + + parent_child_mod_dict = generate_model_info(patched_model) + with torch.no_grad(): + for name, patched_mod in patched_model.named_modules(): + patched_mod_type_str = patched_mod.__class__.__name__ + if patched_mod_type_str in patched_mod_list: + parent = parent_child_mod_dict[patched_mod].parent + name = parent_child_mod_dict[patched_mod].name + class_name_org = getattr(patched_mod, "class_name_org", None) or \ + patched_mod.__class__.__name__.split("Patched")[-1] + origin_mod = helper_mods[class_name_org](patched_mod) + origin_mod.forward = patched_mod.forward_orig + setattr(parent, name, origin_mod) + + +def with_patched_module(model): + patched_mod_list = get_patched_mod_list() + + for name, mod in model.named_modules(): + mod_type = mod.__class__.__name__ + if mod_type in patched_mod_list: + return True + return False diff --git a/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py new file mode 100644 index 00000000000..f9ce9145569 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
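
The `FP8Quantizer` defined next is driven by these helpers. As a quick, hedged illustration of `update_mode`: given one of the JSON configs from the example folder above (here `maxabs_quant.json`, which stores `"mode": "QUANTIZE"`), requesting the measurement step yields a temporary copy with the mode rewritten, while a matching request returns the original path unchanged:

```python
# Sketch of update_mode behavior; assumes maxabs_quant.json from the example folder is on disk.
import json
from neural_compressor.torch.algorithms.fp8_quant import update_mode

# Stored mode is QUANTIZE, so asking for the measurement step rewrites it into a temp file.
measure_path = update_mode("maxabs_quant.json", measure_step=True)
with open(measure_path) as f:
    print(json.load(f)["mode"])  # -> "MEASURE"

# Stored mode already matches the quantize step, so the original path comes back untouched.
quant_path = update_mode("maxabs_quant.json", quant_step=True)
print(quant_path)  # -> "maxabs_quant.json"
```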
+ +import os + +from neural_compressor.common.utils import FP8_QUANT +from neural_compressor.torch.algorithms import Quantizer +from neural_compressor.torch.algorithms.fp8_quant import ( + restore_patched_module, + update_mode, + with_patched_module, +) + + +class FP8Quantizer(Quantizer): + def __init__(self, quant_config): + super().__init__(quant_config) + if isinstance(quant_config, dict): + json_file = [cfg.json_file for cfg in quant_config.values()] + assert len(json_file) > 0, "Cannot get json file from config." + self.quant_config = json_file[0] + + def prepare(self, model): + _prepare(model, self.quant_config) + return model + + def convert(self, model): + if with_patched_module(model): + # for INC flow, it calls `prepare` and then `convert` user-facing API in one run + restore_patched_module(model) + _convert(model, self.quant_config) + return model + + +def _convert(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to QUANTIZE + config_path = update_mode(config_path, quant_step=True) + + return hqt.prep_model(model, config_path) + + +def _prepare(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to MEASURE + config_path = update_mode(config_path, measure_step=True) + + return hqt.prep_model(model, config_path) diff --git a/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py new file mode 100644 index 00000000000..6c7154328d7 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
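
For completeness, a minimal sketch of exercising `FP8Quantizer` directly rather than through the `prepare`/`convert` entry points shown in the README. It assumes an HPU runtime, the `maxabs_measure.json` config from the example folder, and that the base `Quantizer` stores a non-dict config as-is; `save_calib_result` stands in here for the `finalize_calibration` call used in the README flow:

```python
import torch
from neural_compressor.torch.algorithms.fp8_quant import FP8Quantizer, save_calib_result

# toy model; any HPU-supported module works
model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.Linear(5, 10)).eval().to("hpu")

quantizer = FP8Quantizer("maxabs_measure.json")  # plain path: the dict branch in __init__ is skipped
model = quantizer.prepare(model)                 # patches modules in MEASURE mode

with torch.no_grad():                            # calibration pass
    model(torch.randn(1, 10).to("hpu"))

save_calib_result(model)                         # dump measurements (the README flow does this via finalize_calibration)
model = quantizer.convert(model)                 # restores patched modules, then re-patches in QUANTIZE mode
```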
+ +import torch + +# For mapping revert patched module to origin module + +helper_mods = {} + +def helper_mod_register(name): + def decorator(mod): + helper_mods[name] = mod + return mod + return decorator + +@helper_mod_register(name="Matmul") +class Matmul(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Linear") +class Linear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="FalconLinear") +class FalconLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="KVCache") +class KVCache(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.allocate = patched_mod.org_allocate + self.get_shape = patched_mod.get_shape + self.forward = patched_mod.forward + self.update = patched_mod.update + +@helper_mod_register(name="Conv2d") +class Conv2d(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleLinear") +class LoRACompatibleLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleConv") +class LoRACompatibleConv(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Softmax") +class Softmax(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearLayer") +class LinearLayer(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearAllreduce") +class LinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ScopedLinearAllReduce") +class ScopedLinearAllReduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LmHeadLinearAllreduce") +class LmHeadLinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ModuleFusedSDPA") +class ModuleFusedSDPA(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py b/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py deleted file mode 100644 index 0330bd475ad..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import copy -import os - -import habana_frameworks.torch.core as htcore -import torch -from deepspeed.module_inject import LinearAllreduce, LinearLayer -from deepspeed.module_inject.layers import LmHeadLinearAllreduce -from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - -from neural_compressor.torch.utils import fetch_module, logger, set_module - -from .modules import ( # fp32; dynamic modules; static modules; dtype amax - Autocast, - BatchMatmul, - FP8BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8LinearAllreduce, - FP8LinearLayer, - FP8LmHeadLinearAllreduce, - FP8Matmul, - Matmul, -) -from .observer import observer_mapping - -quantization_mapping = { - LinearAllreduce: FP8LinearAllreduce, - LinearLayer: FP8LinearLayer, - LmHeadLinearAllreduce: FP8LmHeadLinearAllreduce, - torch.nn.Linear: FP8Linear, - BatchMatmul: FP8BatchMatmul, - Matmul: FP8Matmul, - Autocast: FP8Cast, - # torch.matmul: fp8_matmul -} -white_list = tuple(quantization_mapping.keys()) - - -FP8_DTYPE = [torch.float8_e5m2, torch.float8_e4m3fn, "fp8_e5m2", "fp8_e4m3"] -dtype_mapping = {"fp8_e5m2": torch.float8_e5m2, "fp8_e4m3": torch.float8_e4m3fn} -# enable inference optimizations -htcore.hpu_initialize() - - -def _replace_module(module, qconfig): - assert qconfig.w_dtype == qconfig.act_dtype, "weight and activation should be the same dtype." 
- dtype = dtype_mapping[qconfig.w_dtype] - # only modules that have weight should use this observer - if hasattr(module, "weight"): - observer_cls = observer_mapping[qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - if qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - elif qconfig.approach == "dynamic": - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - return qmodule - - -def quantize_dynamic(model, dtype=torch.float8_e4m3fn, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - if isinstance(dtype, str): - dtype = dtype_mapping[dtype] - for n, m in q_model.named_modules(): - if isinstance(m, torch.nn.Linear): - observer_cls = observer_mapping["minmax_per_channel"] - observer_obj = observer_cls(dtype=dtype) - new_m = FP8DynamicLinear(m, dtype) # need m for init - new_m.from_float(m, observer_obj) - set_module(q_model, n, new_m) - elif isinstance(m, Matmul): - new_m = FP8DynamicMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, BatchMatmul): - new_m = FP8DynamicBatchMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, Autocast): - new_m = FP8Cast(dtype=dtype) - set_module(q_model, n, new_m) - htcore.mark_step() - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model - - -def _add_observer(module, qconfig): - act_observer = qconfig.act_observer - - def input_observer_forward_pre_hook(self, input): - try: - if isinstance(input[0], torch.Tensor): - self.input_activation_post_process(input[0]) - if hasattr(self, "input_activation_post_process1") and isinstance(input[1], torch.Tensor): - self.input_activation_post_process1(input[1]) - return input - except Exception as e: - # The KL act_observer may encounter a overflow error on EltwiseAdd. 
- pass - - ### Insert input observer into model, only for fp8_e4m3 static quantization ### - observer_cls = observer_mapping[act_observer] - # import pdb;pdb.set_trace() - - if isinstance(module, white_list): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process", observer_obj) - if isinstance(module, (BatchMatmul, Matmul)): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process1", observer_obj) - module.register_forward_pre_hook(input_observer_forward_pre_hook) - - -def _remove_observer(module): - import deepspeed.comm as dist - from torch.distributed import ReduceOp - - if hasattr(module, "input_activation_post_process"): - scale = module.input_activation_post_process.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - if hasattr(module, "input_activation_post_process1"): - module.register_parameter("scale1", torch.nn.Parameter(scale)) - else: - module.register_parameter("scale", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process") - if hasattr(module, "input_activation_post_process1"): - scale = module.input_activation_post_process1.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - module.register_parameter("scale2", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process1") - - # remove observer hooks - hook_map = module._forward_pre_hooks - handle_ids_to_remove = set() - for handle_id, hook_fn in hook_map.items(): - if hasattr(hook_fn, "__name__") and hook_fn.__name__ == "input_observer_forward_pre_hook": - handle_ids_to_remove.add(handle_id) - for handle_id in handle_ids_to_remove: - hook_map.pop(handle_id) - - -def prepare(model, qconfig_mapping): - model.qconfig = qconfig_mapping - for (op_name, op_type), qconfig in qconfig_mapping.items(): - if qconfig.approach == "dynamic": - continue - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - _add_observer(module, qconfig) - set_module(model, op_name, module) - return model - - -def convert(model): - for (op_name, op_type), qconfig in model.qconfig.items(): - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - if qconfig.approach != "dynamic": - _remove_observer(module) - module = _replace_module(module, qconfig) - set_module(model, op_name, module) - htcore.mark_step() - return model - - -def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - q_model = prepare(q_model, qconfig_mapping) - if run_fn is not None: - if run_args is not None: - run_fn(q_model, *run_args) - else: - run_fn(q_model) - q_model = convert(q_model) - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model diff --git a/neural_compressor/torch/algorithms/habana_fp8/modules.py b/neural_compressor/torch/algorithms/habana_fp8/modules.py deleted file mode 100644 index 99b9faf1f72..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/modules.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -import torch.nn as nn -from torch.nn import functional as F - -from neural_compressor.common import logger - -from .observer import calculate_qparams - - -##################### FP32 modules ####################### -class Matmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class BatchMatmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.bmm(x, y) - - -class Autocast(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x - - -##################### FP8 modules ####################### -class FP8DynamicLinear(torch.nn.Module): - def __init__(self, org_module, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - # attributes - self.use_amax = True - self.dtype = dtype - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - # register weight, bias - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - if self.use_amax: - input_scale = calculate_qparams(inp.min(), inp.max(), self.dtype) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - inp = torch.ops.hpu.cast_to_fp8_v2(inp, input_scale_inv, False, False, self.dtype)[0] - else: - input_scale, input_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - input_scale, # inv is used for recover scale - 
self.weight_scale, - self.bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.dtype, - ) - - -class FP8DynamicMatmul(torch.nn.Module): - def __init__(self, dtype) -> None: - super().__init__() - self.dtype = dtype - self.use_amax = True - self.out_dtype = torch.float32 - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - if self.use_amax: - input1_scale = calculate_qparams(input1.min(), input1.max(), self.dtype) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - if self.use_amax: - input2_scale = calculate_qparams(input2.min(), input2.max(), self.dtype) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - input1_scale, # inv is used for recover scale - input2_scale, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "format={}".format(self.dtype) - - -class FP8DynamicBatchMatmul(FP8DynamicMatmul): - pass - - -class FP8Linear(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - # attributes - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.dtype = dtype - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - # register input scale - input_scale = org_module.scale if hasattr(org_module, "scale") else torch.tensor([1.0]) - 
self.register_buffer( - "input_scale", - torch.tensor( - input_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "input_scale_inv", - torch.tensor( - torch.reciprocal(input_scale), - device="hpu", - dtype=torch.float32, - ), - ) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, # inv is used for recover scale - self.weight_scale, - self.bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, scale={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.input_scale.tolist() if hasattr(self, "input_scale") else None, - self.dtype, - ) - - -class FP8Matmul(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - org_module.to("hpu") - self.dtype = dtype - self.out_dtype = torch.float32 - scale1 = org_module.scale1 if hasattr(org_module, "scale1") else 1.0 - scale2 = org_module.scale2 if hasattr(org_module, "scale2") else 1.0 - self.register_buffer( - "scale1", - torch.tensor( - scale1, - device="hpu", - dtype=self.out_dtype, - ), - ) - self.register_buffer( - "scale2", - torch.tensor( - scale2, - device="hpu", - dtype=self.out_dtype, - ), - ) - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - self.scale1_inv = torch.reciprocal(self.scale1) - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, self.scale1_inv, False, False, self.dtype)[0] - else: - self.scale1_inv = None - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - self.scale2_inv = torch.reciprocal(self.scale2) - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, self.scale2_inv, False, False, self.dtype)[0] - else: - self.scale2_inv = None - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - self.scale1, # inv is used for recover scale - self.scale2, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - (self.scale1.tolist(), self.scale2.tolist()), - self.dtype, - ) - - -class FP8BatchMatmul(FP8Matmul): - pass - - -class FP8Cast(torch.nn.Module): - def __init__(self, org_module=None, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - self.dtype = dtype - if org_module is not None: - org_module.to("hpu") - scale = org_module.scale if hasattr(org_module, "scale") else 1.0 - self.register_buffer( - "scale", - torch.tensor( - scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.scale, self.scale_inv = None, None # due to next matmul doesn't know this scale - else: - self.scale, self.scale_inv = None, None - - def forward(self, input): - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out = torch.ops.hpu.cast_to_fp8_v2(input, self.scale_inv, False, False, self.dtype)[0] - else: - out = input - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - self.scale, - self.dtype, - ) - - -FP8LinearLayer = FP8Linear - - 
-class FP8LinearAllreduce(FP8Linear): - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - inputmat = inp.view(-1, self.in_features) - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) - - -class FP8LmHeadLinearAllreduce(FP8Linear): - def forward(self, inp): - # from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list - # input_shard_size = get_shard_size(inp.shape[-1], self.world_size) - # input_shard_offset = sum(get_shard_size_list(inp.shape[-1], self.world_size)[0:self.rank]) - - # inputmat = inp[:, :, input_shard_offset:input_shard_offset + input_shard_size] - assert ( - inp.shape[-1] % self.world_size == 0 - ), "Please ensure that self.world_size is divisible by input.shape[-1]" - input_shard = inp.shape[-1] // self.world_size - inp_part = inp[:, :, self.rank * input_shard : (self.rank + 1) * input_shard] - inputmat = inp_part.view(-1, input_shard) # dim=2 will help kernel speed - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) diff --git a/neural_compressor/torch/algorithms/habana_fp8/observer.py b/neural_compressor/torch/algorithms/habana_fp8/observer.py deleted file mode 100644 index fd29892ddb7..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/observer.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# pylint:disable=import-error - -import os -from typing import Tuple - -import habana_frameworks.torch.core as htcore -import torch -from torch.ao.quantization.observer import * - -E4M3_AMAX = torch.tensor(240, dtype=torch.float).to("cpu") -E5M2_AMAX = torch.tensor(57344, dtype=torch.float).to("cpu") -USE_HW_SCALE = bool(os.getenv("USE_HW_SCALE", False)) -USE_POW2_SCALE = bool(os.getenv("USE_POW2_SCALE", False)) -observer_mapping = {} - - -def observer_registry(name): - def new_observer(observer_cls): - global observer_mapping - observer_mapping[name] = observer_cls - return observer_cls - - return new_observer - - -def _map_gaudi_scale(scale): - if USE_HW_SCALE: - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - elif USE_POW2_SCALE: - return 2 ** torch.ceil(torch.log2(scale)) - else: - return scale - - -def calculate_qparams(min_val, max_val, dtype): - amax = torch.max(torch.abs(min_val), torch.abs(max_val)) - dtype_amax = E4M3_AMAX if dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = amax / dtype_amax - scale = scale.reshape(-1) - return _map_gaudi_scale(scale) - - -@observer_registry(name="minmax") -class FP8MinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - - def forward(self, x_orig): - r"""Records the running minimum and maximum of ``x``.""" - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - x = x.to(self.min_val.dtype) - min_val_cur, max_val_cur = torch.aminmax(x) - min_val = torch.min(min_val_cur, self.min_val) - max_val = torch.max(max_val_cur, self.max_val) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="minmax_per_channel") -class FP8PerChannelMinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ch_axis=0, # weight_shape = (out_features, in_features) - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." 
- self.ch_axis = ch_axis - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) - self.register_buffer("max_val", torch.tensor([], **factory_kwargs)) - - def forward(self, x_orig): - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - min_val = self.min_val - max_val = self.max_val - x_dim = x.size() - - new_axis_list = [i for i in range(len(x_dim))] - new_axis_list[self.ch_axis] = 0 - new_axis_list[0] = self.ch_axis - y = x.permute(new_axis_list) - # Need to match dtype of min/max because the updates to buffers - # are done in place and types need to match for comparisons - y = y.to(self.min_val.dtype) - y = torch.flatten(y, start_dim=1) - if min_val.numel() == 0 or max_val.numel() == 0: - min_val, max_val = torch.aminmax(y, dim=1) - else: - min_val_cur, max_val_cur = torch.aminmax(y, dim=1) - min_val = torch.min(min_val_cur, min_val) - max_val = torch.max(max_val_cur, max_val) - self.min_val.resize_(min_val.shape) - self.max_val.resize_(max_val.shape) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="kl") -class FP8HistogramObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - bins: int = 2048, - upsample_rate: int = 128, - qscheme=torch.per_tensor_affine, - eps=torch.finfo(torch.float32).eps, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - self.bins = bins - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs)) - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - self.dst_nbins = 2 ** torch.finfo(self.dtype).bits - self.upsample_rate = upsample_rate - - def calculate_qparams(self, **kwargs): - new_min, new_max = self._non_linear_param_search() - scale = calculate_qparams(new_min, new_max, self.dtype) - return scale - - def _get_norm(self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor) -> torch.Tensor: - r"""Compute the norm of the values uniformaly distributed between - delta_begin and delta_end. - Currently only L2 norm is supported. 
- - norm = density * (integral_{begin, end} x^2) - = density * (end^3 - begin^3) / 3 - """ - norm = (delta_end * delta_end * delta_end - delta_begin * delta_begin * delta_begin) / 3 - return density * norm - - def _get_dst_bin(self, src_bin_begin, src_bin_end, dst_bin_max): - # get dst bin value - FP8_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = FP8_amax / dst_bin_max - if torch.isinf(torch.tensor(scale)): - scale = torch.tensor(3.4e38) - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_begin.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_begin = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_end.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_end = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - # get bin width of dst bin value, dst_bin_begin must contain 0 and the max qvalue. - dst_bin = list(set(dst_bin_begin.detach().cpu().numpy())) - dst_bin.sort() - width_dict = {} - bin_of_dst_dict = {} - for i, bin in enumerate(dst_bin): - bin_of_dst_dict[bin] = i - if bin == 0: - width_dict[bin] = {"left": 0, "right": dst_bin[i + 1]} - elif i == len(dst_bin) - 1: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i] - dst_bin[i - 1]} - else: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i + 1] - dst_bin[i]} - dst_bin_of_begin = [bin_of_dst_dict[float(i)] for i in dst_bin_begin] - dst_bin_of_end = [bin_of_dst_dict[float(i)] for i in dst_bin_end] - left_dst_bin_end_width = [width_dict[float(i)]["left"] for i in dst_bin_end] - right_dst_bin_begin_width = [width_dict[float(i)]["right"] for i in dst_bin_begin] - return ( - dst_bin_begin, - dst_bin_end, - torch.tensor(dst_bin_of_begin), - torch.tensor(dst_bin_of_end), - torch.tensor(left_dst_bin_end_width), - torch.tensor(right_dst_bin_begin_width), - ) - - def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int): - r"""Compute the quantization error if we use start_bin to end_bin as the - min and max to do the quantization.""" - bin_width = (self.max_val.item() - self.min_val.item()) / self.bins - dst_bin_max = bin_width * (next_end_bin - next_start_bin + 1) - - src_bin = torch.arange(self.bins, device=self.histogram.device) - src_bin_begin = src_bin * bin_width - src_bin_end = src_bin_begin + bin_width - ( - dst_bin_begin, - dst_bin_end, - dst_bin_of_begin, - dst_bin_of_end, - left_dst_bin_end_width, - right_dst_bin_begin_width, - ) = self._get_dst_bin(src_bin_begin, src_bin_end, dst_bin_max) - - dst_bin_of_begin_center = dst_bin_begin + right_dst_bin_begin_width - dst_bin_of_end_center = dst_bin_end + left_dst_bin_end_width - - density = self.histogram / bin_width - - norm = torch.zeros(self.bins, device=self.histogram.device) - - delta_begin = src_bin_begin - dst_bin_of_begin_center - delta_end = right_dst_bin_begin_width - - norm += self._get_norm(delta_begin, delta_end, density) - - norm += (dst_bin_of_end - dst_bin_of_begin - 1) * self._get_norm( - torch.tensor(-left_dst_bin_end_width), torch.tensor(right_dst_bin_begin_width), density - ) - - delta_begin = -left_dst_bin_end_width - delta_end = src_bin_end - dst_bin_of_end_center - norm += self._get_norm(delta_begin, delta_end, density) - - return norm.sum().item() - - def _non_linear_param_search(self) -> Tuple[torch.Tensor, torch.Tensor]: - r"""Non-linear parameter search. - - An approximation for L2 error minimization for selecting min/max. 
- By selecting new min/max, we filter out outliers in input distribution. - This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in - caffe2/quantization/server/norm_minimization.cc - """ - assert self.histogram.size()[0] == self.bins, "bins mismatch" - bin_width = (self.max_val - self.min_val) / self.bins - - # cumulative sum - total = torch.sum(self.histogram).item() - cSum = torch.cumsum(self.histogram, dim=0) - - stepsize = 1e-5 # granularity - alpha = 0.0 # lower bound - beta = 1.0 # upper bound - start_bin = 0 - end_bin = self.bins - 1 - norm_min = float("inf") - - while alpha < beta: - # Find the next step - next_alpha = alpha - next_beta = beta - stepsize - - # find the right bins between the quantile bounds - # keep the left bins at zero due to fp8 symmetry - l = 0 - r = end_bin - while r > start_bin and cSum[r] > next_beta * total: - r = r - 1 - - # decide the next move - next_start_bin = start_bin - next_end_bin = end_bin - if (l - start_bin) <= (end_bin - r): - # move the end bin - next_end_bin = r - beta = next_beta - - if next_start_bin == start_bin and next_end_bin == end_bin: - continue - - # calculate the quantization error using next_start_bin and next_end_bin - norm = self._compute_quantization_error(next_start_bin, next_end_bin) - - if norm > norm_min: - break - norm_min = norm - start_bin = next_start_bin - end_bin = next_end_bin - - new_min = self.min_val + bin_width * start_bin - new_max = self.min_val + bin_width * (end_bin + 1) - return new_min, new_max - - def _adjust_min_max( - self, combined_min: torch.Tensor, combined_max: torch.Tensor, upsample_rate: int - ) -> Tuple[torch.Tensor, torch.Tensor, int, int]: - # We ensure that: - # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins) - # This allows us to have a common grid of resolution s, where we can align - # the input histogram - # start_idx maps min_val to the histogram bin index. - - # Compute the width of histogram bins is a straightforward solution, where - # hist_bin_width = (self.max_val - self.min_val) / (self.bins * upsample_rate) - # Underflow happens if the numerator is close to the smallest positive subnormal number of FP32 - # Therefore, we avoid such division operation. - downsample_rate = int( - torch.ceil((combined_max - combined_min) * upsample_rate / (self.max_val - self.min_val)).item() - ) - e = downsample_rate * (self.max_val - self.min_val) / upsample_rate - (combined_max - combined_min) - start_idx = int( - torch.round( - (self.min_val - combined_min) * self.bins * upsample_rate / (self.max_val - self.min_val) - ).item() - ) - combined_max = combined_max + e - combined_min = combined_min - return combined_min, combined_max, downsample_rate, start_idx - - def _combine_histograms( - self, - orig_hist: torch.Tensor, - new_hist: torch.Tensor, - upsample_rate: int, - downsample_rate: int, - start_idx: int, - Nbins: int, - ) -> torch.Tensor: - # First up-sample the histogram with new data by a factor of L - # This creates an approximate probability density that's piecewise constant - upsampled_histogram = new_hist.repeat_interleave(upsample_rate) - # Now insert the upsampled histogram into the output - # histogram, which is initialized with zeros. 
- # The offset at which the histogram is introduced is determined - # by the start index as the output histogram can cover a wider range - histogram_with_output_range = torch.zeros((Nbins * downsample_rate), device=orig_hist.device) - histogram_with_output_range[start_idx : Nbins * upsample_rate + start_idx] = upsampled_histogram - # Compute integral histogram, double precision is needed to ensure - # that there are no overflows - integral_histogram = torch.cumsum(histogram_with_output_range, 0, dtype=torch.double)[ - downsample_rate - 1 :: downsample_rate - ] - # Finally perform interpolation - shifted_integral_histogram = torch.zeros((Nbins), device=orig_hist.device) - shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1] - interpolated_histogram = (integral_histogram - shifted_integral_histogram) / upsample_rate - orig_hist = orig_hist + interpolated_histogram.to(torch.float) - return orig_hist - - def forward(self, x_orig: torch.Tensor) -> torch.Tensor: - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - # use abs due to fp8 symmetry - x = torch.abs(x) - min_val = self.min_val - max_val = self.max_val - same_values = min_val.item() == max_val.item() - is_uninitialized = min_val == float("inf") and max_val == float("-inf") - if is_uninitialized or same_values: - min_val, max_val = torch.aminmax(x) - self.min_val.resize_(min_val.shape) - self.min_val.copy_(min_val) - self.max_val.resize_(max_val.shape) - self.max_val.copy_(max_val) - assert min_val.numel() == 1 and max_val.numel() == 1, "histogram min/max values must be scalar." - torch.histc(x, self.bins, min=int(min_val), max=int(max_val), out=self.histogram) - else: - new_min, new_max = torch.aminmax(x) - combined_min = torch.min(new_min, min_val) - combined_max = torch.max(new_max, max_val) - # combine the existing histogram and new histogram into 1 histogram - # We do this by first upsampling the histogram to a dense grid - # and then downsampling the histogram efficiently - ( - combined_min, - combined_max, - downsample_rate, - start_idx, - ) = self._adjust_min_max(combined_min, combined_max, self.upsample_rate) - assert combined_min.numel() == 1 and combined_max.numel() == 1, "histogram min/max values must be scalar." - combined_histogram = torch.histc(x, self.bins, min=int(combined_min), max=int(combined_max)) - if combined_min == min_val and combined_max == max_val: - combined_histogram += self.histogram - else: - combined_histogram = self._combine_histograms( - combined_histogram, - self.histogram, - self.upsample_rate, - downsample_rate, - start_idx, - self.bins, - ) - - self.histogram.detach_().resize_(combined_histogram.shape) - self.histogram.copy_(combined_histogram) - self.min_val.detach_().resize_(combined_min.shape) - self.min_val.copy_(combined_min) - self.max_val.detach_().resize_(combined_max.shape) - self.max_val.copy_(combined_max) - return x_orig - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" diff --git a/neural_compressor/torch/algorithms/habana_fp8/save_load.py b/neural_compressor/torch/algorithms/habana_fp8/save_load.py deleted file mode 100644 index 8079a130625..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/save_load.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
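The deleted `FP8HistogramObserver` above picks its clipping range with a norm-minimization search over histogram bins, measuring the error of each candidate range with the HPU `cast_to_fp8_v2` kernels. Below is a minimal, device-free sketch of the same idea under a simplifying assumption: uniform-quantization noise (`step**2 / 12`) stands in for the real FP8 cast, and `n_levels` is only a uniform-grid proxy for the FP8 format, so treat the numbers as illustrative.

```python
import numpy as np

def search_clip_amax(hist, bin_edges, n_levels=256):
    """Simplified stand-in for the deleted _non_linear_param_search.

    Walks candidate clipping points over a histogram of |x| and keeps the one
    with the smallest L2 error proxy: uniform-quantization noise inside the
    clip range plus saturation error for everything beyond it. The real
    observer measures the error with torch.ops.hpu.cast_to_fp8_v2; n_levels
    here only mirrors its dst_nbins = 2**8 as a uniform-grid approximation.
    """
    centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    best_err, best_amax = float("inf"), bin_edges[-1]
    for i in range(len(hist) // 2, len(hist)):
        clip = bin_edges[i + 1]
        step = clip / n_levels
        inside = hist[: i + 1].sum() * step ** 2 / 12.0      # rounding noise
        outside = float((hist[i + 1:] * (centers[i + 1:] - clip) ** 2).sum())  # saturation
        err = inside + outside
        if err < best_err:
            best_err, best_amax = err, clip
    return best_amax

x = np.random.randn(100_000) * 0.5
x[:10] *= 50                                    # inject a few large outliers
hist, edges = np.histogram(np.abs(x), bins=2048)
print("clipped amax:", search_clip_amax(hist, edges), "raw amax:", np.abs(x).max())
```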
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import json -import os - -import habana_frameworks.torch.core as htcore -import torch - -from neural_compressor.common.utils import load_config_mapping, save_config_mapping -from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger - -from .fp8_quant import FP8_DTYPE, dtype_mapping -from .modules import ( # fp32; dynamic modules - Autocast, - BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - Matmul, -) -from .observer import observer_mapping - - -def save(model, output_dir="./saved_results"): - if not os.path.exists(output_dir): - os.mkdir(output_dir) - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME) - # saving process - save_config_mapping(model.qconfig, qconfig_file_path) - - import fp8_convert - - stat_dict = {} - for k, v in model.state_dict().items(): - if v.dtype in FP8_DTYPE: - v = fp8_convert.to_u8(v.to("cpu")) - stat_dict[k] = v.to("cpu") - torch.save(stat_dict, qmodel_file_path) - - logger.info("Save state_dict of quantized model to {}.".format(qmodel_file_path)) - logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path)) - - -def load(model, output_dir="./saved_results"): - from neural_compressor.torch.utils import fetch_module, set_module - - from .fp8_quant import quantization_mapping, white_list - - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - stat_dict = torch.load(qmodel_file_path) - import fp8_convert - - for (op_name, op_type), op_qconfig in model.qconfig.items(): - dtype = dtype_mapping[op_qconfig.w_dtype] - # only modules that have weight should use this observer - observer_cls = observer_mapping[op_qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - choice = 1 if dtype == torch.float8_e4m3fn else 0 - if op_name + ".weight" in stat_dict: - stat_dict[op_name + ".weight"] = fp8_convert.from_u8(stat_dict[op_name + ".weight"], choice) - if dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - # replace module - if op_qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - else: - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - # replace module with qmodule - set_module(model, op_name, qmodule) - htcore.mark_step() - model.load_state_dict(stat_dict, assign=True) - model.to("hpu") - htcore.mark_step() - logger.info("Quantized model loading successful.") - return model diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/scale.py b/neural_compressor/torch/algorithms/habana_fp8/scale.py deleted file mode 100644 index 1dfaee24502..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/scale.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import habana_frameworks.torch.core as htcore -import torch - -scale_method_mapping = {} - - -def scale_method_registry(name): - def new_scale_method(scale_method_cls): - global scale_method_mapping - scale_method_mapping[name] = scale_method_cls - return scale_method_cls - - return new_scale_method - - -@scale_method_registry("hw") -def hardware_scale_method(scale): - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - - -@scale_method_registry("pow2") -def pow2_scale_method(scale): - return 2 ** torch.ceil(torch.log2(scale)) - - -@scale_method_registry("unit") -def unit_scale_method(scale): - return torch.tensor(1.0) - - -@scale_method_registry("self") -def self_scale_method(scale): - return scale - - -def map_gaudi_scale(scale, method): - scale_method = scale_method_mapping[method] - return scale_method(scale) diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py b/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp b/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp deleted file mode 100644 index f22c5c82c89..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
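The deleted `scale.py` registers four scale-mapping methods; the `hw` variant snaps the scale's exponent to a multiple of 4 and clips it to the hardware-supported range [1/256, 16], while `pow2` rounds the scale up to the next power of two. A small stand-alone sketch of those two mappings, using plain CPU tensors (no HPU required):

```python
import torch

def pow2_scale(scale: torch.Tensor) -> torch.Tensor:
    # round the scale up to the next power of two (the deleted "pow2" method)
    return 2 ** torch.ceil(torch.log2(scale))

def hw_scale(scale: torch.Tensor) -> torch.Tensor:
    # snap the exponent to a multiple of 4 and clip to [1/256, 16],
    # mirroring the scale list allowed by the deleted "hw" method
    return torch.clip(2 ** (torch.ceil(torch.log2(scale) / 4) * 4), 1 / 256, 16.0)

for raw in (0.003, 0.7, 5.0, 900.0):
    s = torch.tensor(raw)
    print(f"{raw}: pow2={pow2_scale(s).item()}  hw={hw_scale(s).item()}")
```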
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Temporary implementation of fp8 tensor saving and loading -// Will remove after Habana torch applies below patch: -// https://github.com/pytorch/pytorch/pull/114662 - - -#include - - -// function prototype declaration -torch::Tensor to_u8(torch::Tensor tensor); -torch::Tensor from_u8(torch::Tensor tensor, int choice=1); - - -torch::Tensor to_u8(torch::Tensor tensor) { - auto p = tensor.data_ptr(); - // RuntimeError: HPU device type not enabled. - auto options = torch::TensorOptions().device(torch::kCPU).dtype(torch::kUInt8); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_uint8 = torch::empty_like(tensor, torch::kUInt8).copy_(tmp); - return tensor_uint8; -}; - - -/* -choice=1 means torch.float8_e4m3fn; -others means torch.float8_e5m2; -*/ -torch::Tensor from_u8(torch::Tensor tensor, int choice) { - auto p = tensor.data_ptr(); - torch::ScalarType dtype; - if (choice == 1) { - dtype = torch::kFloat8_e4m3fn; - } - else { - dtype = torch::kFloat8_e5m2; - } - auto options = torch::TensorOptions().device(torch::kCPU).dtype(dtype); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_fp8 = torch::empty_like(tensor, dtype).copy_(tmp); - return tensor_fp8; -}; - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("to_u8", &to_u8, "Convert tensor to u8 for saving."); - m.def("from_u8", &from_u8, "Recover tensor from u8 for loading."); -}; diff --git a/neural_compressor/torch/amp/__init__.py b/neural_compressor/torch/amp/__init__.py deleted file mode 100644 index 87a0c8287d0..00000000000 --- a/neural_compressor/torch/amp/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .autocast import autocast diff --git a/neural_compressor/torch/amp/autocast.py b/neural_compressor/torch/amp/autocast.py deleted file mode 100644 index 7375b80c0f5..00000000000 --- a/neural_compressor/torch/amp/autocast.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
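The deleted `convert.cpp` extension works around missing FP8 serialization by bit-casting tensors to `uint8` before `torch.save` and back on load. On a PyTorch build that ships the float8 dtypes, the same round trip can be sketched in pure Python with `Tensor.view`; this is an illustration of the idea, not the extension's actual code path:

```python
import torch

# Bit-cast the FP8 payload to uint8 for saving, then bit-cast back on load.
# Assumes a PyTorch build with the float8 dtypes; no HPU is needed.
t = torch.randn(8).to(torch.float8_e4m3fn)

torch.save(t.view(torch.uint8), "fp8_as_u8.pt")           # store raw bytes
restored = torch.load("fp8_as_u8.pt").view(torch.float8_e4m3fn)

assert torch.equal(restored.view(torch.uint8), t.view(torch.uint8))
print(restored.to(torch.float32))
```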
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Any, Optional - -import torch -from torch.types import _dtype - - -class autocast: - r"""Instances of :class:`autocast` serve as context managers or decorators that - allow regions of your script to run in mixed precision. - - In these regions, ops run in an op-specific dtype chosen by autocast - to improve performance while maintaining accuracy. - - When entering an autocast-enabled region, Tensors may be any type. - You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. - - :class:`autocast` should wrap only the forward pass(es) of your network, including the loss - computation(s). Backward passes under autocast are not recommended. - Backward ops run in the same type that autocast used for corresponding forward ops. - - # Enables autocasting for the inference pass - with torch.autocast(device_type="hpu", dtype=torch.float8_e4m3fn): - output = model(input) - - :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: - - class AutocastModel(nn.Module): - ... - @torch.autocast(device_type="cuda") - def forward(self, input): - ... - - The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator - must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and - :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process - (see :ref:`Working with Multiple GPUs`). - - Args: - device_type(str, required): Device type to use. Possible values are: 'cuda', 'cpu', 'xpu' and 'hpu'. - The type is the same as the `type` attribute of a :class:`torch.device`. - Thus, you may obtain the device type of a tensor using `Tensor.device.type`. - enabled(bool, optional): Whether autocasting should be enabled in the region. - Default: ``True`` - dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. - cache_enabled(bool, optional): Whether the weight cache inside autocast should be enabled. 
- Default: ``True`` - """ - - def __init__( - self, - device_type: str, - dtype: Optional[_dtype] = None, - enabled: bool = True, - cache_enabled: Optional[bool] = None, - ): - self.device = device_type - if dtype is not None: - self.fast_dtype = dtype - if cache_enabled is not None: - self._cache_enabled = cache_enabled - if not (device_type == "hpu" and dtype in [torch.float8_e4m3fn, torch.float8_e5m2]): - self._autocast = torch.autocast(device_type, dtype, enabled, cache_enabled) - - def __enter__(self) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import replace_func - - # This function will replace F.linear and torch.matmul with the fp8 one - replace_func(self.fast_dtype) - else: - self._autocast.__enter__() - - def __exit__(self, exc_type, exc_value, traceback) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import recover_func - - # This function will recover F.linear and torch.matmul with the original one - recover_func() - else: - self._autocast.__exit__(exc_type, exc_value, traceback) diff --git a/neural_compressor/torch/amp/fp8/__init__.py b/neural_compressor/torch/amp/fp8/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/amp/fp8/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/amp/fp8/functions.py b/neural_compressor/torch/amp/fp8/functions.py deleted file mode 100644 index f8f19a64b17..00000000000 --- a/neural_compressor/torch/amp/fp8/functions.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
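For reference, the wrapper removed above defers to `torch.autocast` in every case except `hpu` with an FP8 dtype, where it patches `F.linear`/`torch.matmul` instead. A usage sketch follows (it needs a Gaudi/HPU stack with hpex, so it is illustrative only). Note that the `with autocast(...) and torch.no_grad():` form used in the deleted tests further below only enters the right-hand context manager; the comma form enters both:

```python
import torch
# Usage sketch for the context manager removed in this diff (HPU required).
from neural_compressor.torch.amp import autocast

model = torch.nn.Linear(10, 5).to("hpu")
inp = torch.randn(1, 10).to("hpu")

# chain the context managers with a comma so both are actually entered
with autocast("hpu", dtype=torch.float8_e4m3fn), torch.no_grad():
    out = model(inp)   # F.linear / torch.matmul are routed to the FP8 kernels
```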
-# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -from torch.nn import functional as F - -from neural_compressor.torch.algorithms.habana_fp8.observer import calculate_qparams -from neural_compressor.torch.utils import logger - -_F_linear = F.linear -_torch_matmul = torch.matmul -_torch_bmm = torch.bmm - - -DATA_TYPE = torch.float8_e4m3fn -USE_AMAX = bool(os.getenv("PT_USE_FP8_AMAX", False)) - - -def fp8_linear_forward(input, weight, bias=None): - out_dtype = torch.float32 - org_middle_shape = input.shape[1:-1] - input = input.view((-1, weight.shape[-1])) - # process input - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input.dtype - if USE_AMAX: - input_scale = calculate_qparams(input.min(), input.max(), DATA_TYPE) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - input = torch.ops.hpu.cast_to_fp8_v2(input, input_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input - input_scale, input_scale_inv = None, None - # process weight - if weight.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = weight.dtype - if USE_AMAX: - weight_scale = calculate_qparams(weight.min(), weight.max(), DATA_TYPE) - weight_scale_inv = torch.reciprocal(weight_scale) - else: - weight_scale, weight_scale_inv = None, None - weight = torch.ops.hpu.cast_to_fp8_v2(weight, weight_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for weight - weight_scale, weight_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - input, - False, - weight, - True, - None, - out_dtype, - input_scale, - weight_scale, - bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - -def fp8_matmul(input1, input2): - out_dtype = torch.float32 - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input1.dtype - if USE_AMAX: - input1_scale = calculate_qparams(input1.min(), input1.max(), DATA_TYPE) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input2.dtype - if USE_AMAX: - input2_scale = calculate_qparams(input2.min(), input2.max(), DATA_TYPE) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - out_dtype, - input1_scale, - input2_scale, - None, - False, - ) - return out - - -def replace_func(dtype): - global DATA_TYPE - DATA_TYPE = dtype - F.linear = fp8_linear_forward - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - logger.debug("F.linear and torch.matmul are replaced with the fp8 one") - - -def recover_func(): - F.linear = _F_linear - torch.matmul = _torch_matmul - torch.bmm = _torch_bmm - logger.debug("F.linear and torch.matmul are recovered") diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index 3bc12580848..9f459bbd67f 
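The deleted `fp8/functions.py` relies on plain module-level monkey-patching: keep handles to the original ops, swap in FP8 replacements, and restore them on exit. A minimal CPU-only sketch of that pattern, with a hypothetical tracing wrapper standing in for the `torch.ops.hpu.fp8_gemm_v2`-based kernels:

```python
import torch
from torch.nn import functional as F

# Keep a handle to the original op so it can be restored, mirroring the
# replace_func / recover_func pair in the deleted module.
_orig_linear = F.linear

def traced_linear(input, weight, bias=None):
    print("F.linear:", tuple(input.shape), "x", tuple(weight.shape))
    return _orig_linear(input, weight, bias)

def replace_linear():
    F.linear = traced_linear

def recover_linear():
    F.linear = _orig_linear

replace_linear()
torch.nn.Linear(4, 3)(torch.randn(2, 4))   # routed through traced_linear
recover_linear()
```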
100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from neural_compressor.torch.quantization.quantize import quantize, prepare, convert +from neural_compressor.torch.quantization.quantize import quantize, prepare, convert, finalize_calibration from neural_compressor.torch.quantization.config import ( RTNConfig, get_default_rtn_config, diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 3e107718e51..ba9f69001c4 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -519,20 +519,22 @@ def hqq_entry( ###################### Habana FP8 Algo Entry ################################## -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize, save - - @register_algo(FP8_QUANT) - def fp8_quant_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str], FP8Config], *args, **kwargs - ) -> torch.nn.Module: - kwargs.pop("example_inputs") - model = quantize(model, configs_mapping, *args, **kwargs) - model.qconfig = configs_mapping - model.save = MethodType(save, model) - return model +@register_algo(FP8_QUANT) +@torch.no_grad() +def fp8_entry( + model: torch.nn.Module, + configs_mapping: Dict[Tuple[str], FP8Config], + mode: Mode = Mode.QUANTIZE, + *args, + **kwargs, +) -> torch.nn.Module: + """The main entry to apply fp8 quantization.""" + from neural_compressor.torch.algorithms.fp8_quant import FP8Quantizer + + quantizer = get_quantizer(model, quantizer_cls=FP8Quantizer, quant_config=configs_mapping) + model = quantizer.execute(model, mode=mode) + postprocess_model(model, mode, quantizer) + return model ###################### MX Quant Algo Entry ################################## diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 05a2a956b56..6a20f2bba42 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -16,6 +16,8 @@ # limitations under the License. 
# pylint:disable=import-error +import json +import importlib from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType @@ -1230,81 +1232,142 @@ def get_default_hqq_config() -> HQQConfig: return HQQConfig() -######################## FP8 Config ############################### +######################## FP8 Quant Config ############################### +# refer to habana_quantization_toolkit/_core/common.py +FP8_WHITE_LIST = ( + "Matmul", "Linear", "FalconLinear", "KVCache", "Conv2d", + "LoRACompatibleLinear", "LoRACompatibleConv", "Softmax", "ModuleFusedSDPA") +if importlib.util.find_spec("deepspeed"): + FP8_WHITE_LIST.append( + "LinearLayer", "LinearAllreduce","ScopedLinearAllReduce", "LmHeadLinearAllreduce") + @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT) class FP8Config(BaseConfig): """Config class for FP8 quantization.""" name = FP8_QUANT - supported_configs: List[OperatorConfig] = [] + + # tunable params params_list = [ - "w_dtype", - "w_observer", - "act_dtype", - "act_observer", - "approach", - "device", + "fp8_config", + "scale_method", + "observer", + "measure_exclude", ] def __init__( self, - w_dtype: str = "fp8_e4m3", - w_observer: Union[str, List[str]] = "minmax_per_channel", - act_dtype: str = "fp8_e4m3", - act_observer: Union[str, List[str]] = "minmax", - approach: Union[str, List[str]] = "static", - device: Union[str, List[str]] = "hpu", - white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, + dump_stats_path: str = "./hqt_output/measure", + fp8_config: str = "E4M3", + hp_dtype: torch.dtype = torch.bfloat16, + blocklist: dict = {'names': [], 'types': ()}, + allowlist: dict = {'names': [], 'types': FP8_WHITE_LIST}, + mode: str = "AUTO", + scale_method: str = "maxabs_hw", + scale_params: dict = {}, + observer: str = "maxabs", + mod_dict: dict = {}, + measure_exclude: str = "OUTPUT", + **kwargs, ): - """Init FP8 config. 
+ """Init FP8 config.""" + super().__init__() + self.dump_stats_path =dump_stats_path + self.fp8_config = fp8_config + self.hp_dtype = hp_dtype + self.blocklist = blocklist + self.allowlist = allowlist + self.mode = mode + self.scale_method = scale_method + self.scale_params = scale_params + self.observer = observer + self.mod_dict = mod_dict + self._json_file = None + + @property + def measure(self): + return self.mode == "MEASURE" + + @property + def quantize(self): + return self.mode == "QUANTIZE" + + @property + def json_file(self): + if self._json_file is None: + import tempfile + from pathlib import Path + + json_file_tmp = tempfile.NamedTemporaryFile(suffix=".json") + self.to_json_file(json_file_tmp.name) + self.json_file(json_file_tmp.name) + return self._json_file + + @json_file.setter + def json_file(self, json_file): + self._json_file = json_file - Args: - """ - super().__init__(white_list=white_list) - self.w_dtype = w_dtype - self.w_observer = w_observer - self.act_dtype = act_dtype - self.act_observer = act_observer - self.approach = approach - self.device = device - self._post_init() + @classmethod + def from_json_file(cls, filename): + with open(filename, "r", encoding="utf-8") as file: + config_dict = json.load(file) + config = cls.from_dict(config_dict) + config.json_file = filename + return config @classmethod - def register_supported_configs(cls) -> List[OperatorConfig]: + def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: + # just a simple example here + # usually write parameter combinations that are more suitable to tune based on experience. + return FP8Config( + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "maxabs_hw"], + measure_exclude=["NONE", "OUTPUT"]) + + @classmethod + def register_supported_configs(cls): + """Add all supported configs.""" supported_configs = [] - fp8_config = FP8Config( - w_dtype=["fp8_e5m2", "fp8_e4m3"], - w_observer=["minmax", "minmax_per_channel"], - act_dtype=["fp8_e5m2", "fp8_e4m3"], - act_observer=["minmax", "kl"], - approach=["static", "dynamic"], - device=["hpu"], + linear_rtn_config = FP8Config( + mode=["AUTO", "MEASURE", "QUANTIZE"], + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "unit_scale", "max", "maxabs_hw", + "maxabs_pow2", "maxabs_hw_opt_weight", "maxabs_pow2_opt_weight", + "smoothquant_weights_output_channel_maxabs_pow2", + "weaksmoothquant_weights_output_channel_maxabs_pow2", + "act_maxabs_hw_weights_pcs_maxabs_pow2", + "act_maxabs_hw_weights_pcs_opt_pow2", + "act_maxabs_pow2_weights_pcs_maxabs_pow2", + "act_maxabs_pow2_weights_pcs_opt_pow2", + "smoothquant_opt"], + observer=["shape", "maxabs", "maxabs_per_channel", "save"], + measure_exclude=["NONE", "OUTPUT", "INPUT", "ALL"], ) - if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import white_list - - operators = white_list - else: - operators = () - supported_configs.append(OperatorConfig(config=fp8_config, operators=operators)) + operators = list(FP8_WHITE_LIST) + supported_configs.append(OperatorConfig(config=linear_rtn_config, operators=operators)) cls.supported_configs = supported_configs @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: - from neural_compressor.torch.algorithms.habana_fp8 import white_list - filter_result = [] for op_name, module in model.named_modules(): - if isinstance(module, white_list): - pair = (op_name, type(module).__name__) + if module.__class__.__name__ in FP8_WHITE_LIST or \ + 
module.__class__.__name__.split("Patched")[-1] in FP8_WHITE_LIST: + pair = (op_name, module.__class__.__name__) filter_result.append(pair) logger.debug(f"Get model info: {filter_result}") return filter_result - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: - # TODO fwk owner needs to update it. - return FP8Config(act_observer=["minmax", "kl"]) + def to_config_mapping( + self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None + ): + config_mapping = OrderedDict() + if config_list is None: + config_list = [self] + for config in config_list: + for op_name, op_type in model_info: + config_mapping[(op_name, op_type)] = self + return config_mapping def get_default_fp8_config() -> FP8Config: diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py index fb870a92e77..cb2f8b4010f 100644 --- a/neural_compressor/torch/quantization/load_entry.py +++ b/neural_compressor/torch/quantization/load_entry.py @@ -61,7 +61,6 @@ def load(output_dir="./saved_results", model=None): return load(output_dir) model.qconfig = config_mapping - if isinstance(config_object, FP8Config): # FP8 - from neural_compressor.torch.algorithms.habana_fp8 import load - - return load(model, output_dir) # pylint: disable=E1121 + if isinstance(config_object, FP8Config): + # TODO: support loading FP8 model + raise NotImplementedError("`load` function for FP8 model is not supported yet.") diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 57197a91972..0c2fdcd94d8 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -19,7 +19,7 @@ from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry from neural_compressor.common.utils import Mode, log_process -from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig +from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig, FP8Config from neural_compressor.torch.utils import is_ipex_available, logger from neural_compressor.torch.utils.utility import WHITE_MODULE_LIST, algos_mapping, get_model_info @@ -60,8 +60,8 @@ def quantize( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Quantize model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Quantize model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -129,8 +129,8 @@ def prepare( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Prepare model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Prepare model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -176,8 +176,9 @@ def convert( """ q_model = model if inplace else copy.deepcopy(model) - # TODO: Optimize the check for prepared flag after adding HQT FP8 Quant - assert getattr(model, "prepared", False), "Please run prepare function before convert." + assert ( + getattr(model, "prepared", False) or quant_config is not None + ), "Please pass quant_config to convert function." 
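With this change, FP8 quantization flows through the common `prepare`/`convert` entry points, and the new `finalize_calibration` helper persists the measurement files. A hedged sketch of how the pieces appear intended to compose, assuming a Gaudi/HPU environment with the `fp8_quant` backend available (the switch between measurement and quantization modes is handled inside the backend):

```python
import torch
# Illustrative only: requires HPU and the fp8_quant backend; not runnable on CPU.
from neural_compressor.torch.quantization import FP8Config, prepare, convert, finalize_calibration

model = torch.nn.Linear(10, 5).to("hpu")
calib_data = [torch.randn(1, 10).to("hpu") for _ in range(4)]

config = FP8Config(fp8_config="E4M3", scale_method="maxabs_hw")
model = prepare(model, config)        # measurement pass
for sample in calib_data:
    model(sample)
finalize_calibration(model)           # writes the stats under dump_stats_path

model = convert(model)                # quantize using the collected measurements
```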
if getattr(model, "prepared", False): if quant_config is None: @@ -192,8 +193,8 @@ def convert( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Convert model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Convert model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -216,3 +217,12 @@ def convert( mode=Mode.CONVERT, ) return q_model + + +def finalize_calibration(model): + if hasattr(model, "quant_config") and isinstance(model.quant_config, FP8Config): # FP8 + from neural_compressor.torch.algorithms.fp8_quant import save_calib_result + + save_calib_result(model) + else: + raise NotImplementedError("`finalize_calibration` only supports FP8 measurement now.") diff --git a/setup.py b/setup.py index cccb6c9feea..a1d4b8de02c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,8 @@ def get_build_version(): return __version__ try: result = subprocess.run(["git", "describe", "--tags"], capture_output=True, text=True, check=True) - _, distance, commit = result.stdout.strip().split("-") + distance = result.stdout.strip().split("-")[-2] + commit = result.stdout.strip().split("-")[-1] return f"{__version__}.dev{distance}+{commit}" except subprocess.CalledProcessError: return __version__ diff --git a/test/3x/torch/amp/test_fp8_amp.py b/test/3x/torch/amp/test_fp8_amp.py deleted file mode 100644 index a5212467723..00000000000 --- a/test/3x/torch/amp/test_fp8_amp.py +++ /dev/null @@ -1,75 +0,0 @@ -import copy -import os -import shutil -import unittest - -import torch - -from neural_compressor.torch.amp import autocast -from neural_compressor.torch.utils import is_hpex_available - -# if not is_hpex_available(): -# exit() - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = torch.matmul(inp.T, x2) - x3 = x3.unsqueeze(0) - x3 = torch.bmm(x3, x3) - return x3 - - -@unittest.skipIf(not is_hpex_available(), "HPEX is required for HPU inference") -class TestPytorchFP8Adaptor(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_autocast(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.bfloat16) and torch.no_grad(): - bf16_out = m(inp) - print("BF16 MSE:", (bf16_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - print("FP8_E4M3 MSE:", (e4m3_out - fp32_out).pow(2).sum()) - - def test_autocast_use_amax(self): - os.environ["PT_USE_FP8_AMAX"] = str(1) - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 using amax MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - 
print("FP8_E4M3 using amax MSE:", (e4m3_out - fp32_out).pow(2).sum()) - os.environ.pop("PT_USE_FP8_AMAX", None) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/torch/quantization/habana_fp8/test_fp8.py b/test/3x/torch/quantization/habana_fp8/test_fp8.py deleted file mode 100644 index 8fafc302f65..00000000000 --- a/test/3x/torch/quantization/habana_fp8/test_fp8.py +++ /dev/null @@ -1,189 +0,0 @@ -import copy -import shutil - -import pytest -import torch - -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - from neural_compressor.torch.algorithms.habana_fp8.modules import ( - BatchMatmul, - FP8BatchMatmul, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8Matmul, - Matmul, - ) - from neural_compressor.torch.quantization import ( - FP8Config, - TuningConfig, - autotune, - get_default_fp8_config, - get_default_fp8_config_set, - quantize, - ) - - torch.set_grad_enabled(False) - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - self.mm = Matmul() - self.bmm = BatchMatmul() - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = self.mm(inp.T, x2) - x3 = x3.unsqueeze(0) - x4 = self.mm(inp.T, x2) - x4 = x4.unsqueeze(0) + 1 ## SW-178838 - x5 = self.bmm(x3, x4) - x6 = self.bmm(x3, x4) - out = x5 + x6 - return out - - -@pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.") -class TestPytorchFP8Adaptor: - def setup_class(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - self.fp32_out = self.model(self.inp) - - def teardown_class(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_dynamic_accu(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e5m2", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E5M2 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e4m3", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - qconfig = FP8Config(approach="dynamic") - m = quantize(m, qconfig, inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." 
- print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - @pytest.mark.parametrize("dtype", ["fp8_e5m2", "fp8_e4m3"]) - @pytest.mark.parametrize("w_observer", ["minmax", "minmax_per_channel"]) - @pytest.mark.parametrize("act_observer", ["minmax", "kl"]) - def test_static_accu(self, dtype, w_observer, act_observer): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = FP8Config( - w_dtype=dtype, w_observer=w_observer, act_dtype=dtype, act_observer=act_observer, approach="static" - ) - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - fp8_out = m(inp) - print("Static quantization config:", dtype, w_observer, act_observer) - print("Static quantization MSE:", (self.fp32_out - fp8_out).pow(2).sum()) - - def test_convert(self): - # Temporary implementation of fp8 tensor saving and loading - # Will remove after Habana torch applies below patch: - # https://github.com/pytorch/pytorch/pull/114662 - # e4m3 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e4m3fn)[0].to("cpu") - import fp8_convert - - int8_inp = fp8_convert.to_u8(fp8_inp) - torch.save(int8_inp, "tmp.pt") - saved_int8_inp = torch.load("tmp.pt") - recovered_inp = fp8_convert.from_u8(saved_int8_inp, 1) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - # e5m2 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e5m2)[0].to("cpu") - int8_inp = fp8_convert.to_u8(fp8_inp) - recovered_inp = fp8_convert.from_u8(int8_inp, 0) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - - def test_save_load(self): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = get_default_fp8_config() - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - fp8_out = m(inp) - m.save("saved_results") - - from neural_compressor.torch.quantization import load - - m = copy.deepcopy(self.model) - m = load("saved_results", m) - recovered_out = m(inp) - assert (recovered_out == fp8_out).all(), "Unexpected result. Please double check." - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - - def test_autotune(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - - def calib_func(model): - model(inp) - - accu_list = [1.0, 0.9, 0.99] - - def eval_func(model): - nonlocal accu_list - return accu_list.pop() - - tune_config = TuningConfig( - config_set=get_default_fp8_config_set(), - tolerable_loss=0.01, - ) - best_model = autotune( - model=m, - tune_config=tune_config, - run_fn=calib_func, - eval_fns=eval_func, - ) - assert isinstance(best_model.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(best_model.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(best_model.bmm, FP8BatchMatmul), "Unexpected result. Please double check."
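Circling back to the `setup.py` hunk above: `git describe --tags` embeds the tag verbatim, and tags may themselves contain hyphens, so the old fixed three-way unpack could raise `ValueError`. Taking the last two `-`-separated fields is robust; the describe strings below are made up purely for illustration:

```python
# Why setup.py now takes the *last* two "-"-separated fields of `git describe --tags`.
for describe in ("v2.5-123-g1a2b3c4", "v2.5.0-rc1-123-g1a2b3c4"):
    parts = describe.split("-")
    distance, commit = parts[-2], parts[-1]
    print(describe, "->", f"dev{distance}+{commit}")
```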