diff --git a/docs/api_doc/optimization/optimizer.rst b/docs/api_doc/optimization/optimizer.rst
deleted file mode 100644
index f4b31c471b9..00000000000
--- a/docs/api_doc/optimization/optimizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-PyTorch Optimizer
-==============
-
-.. autoapisummary::
-
-   intel_extension_for_transformers.transformers.optimizer.NoTrainerOptimizer
-   intel_extension_for_transformers.transformers.optimizer.Orchestrate_optimizer
diff --git a/docs/devcatalog.md b/docs/devcatalog.md
index ab826d913d1..30f371489b2 100644
--- a/docs/devcatalog.md
+++ b/docs/devcatalog.md
@@ -99,7 +99,8 @@ raw_datasets = raw_datasets.map(lambda e: tokenizer(e['sentence'], truncation=Tr
 Documentation for API usage can be found [here](https://github.com/intel/intel-extension-for-transformers/tree/main/docs)
 ```python
-from intel_extension_for_transformers.transformers import QuantizationConfig, metrics, objectives
+from intel_extension_for_transformers.transformers import metrics, objectives
+from neural_compressor.config import PostTrainingQuantConfig
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 # load config, model and metric
 config = AutoConfig.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english",num_labels=2)
@@ -120,7 +121,9 @@ trainer = NLPTrainer(model=model,
     tokenizer=tokenizer
 )
 # model quantization using trainer
-q_config = QuantizationConfig(metrics=[metrics.Metric(name="eval_accuracy")])
+tune_metric = metrics.Metric(name="eval_accuracy")
+trainer.metrics = tune_metric
+q_config = PostTrainingQuantConfig()
 model = trainer.quantize(quant_config=q_config)
 
 # test sentiment analysis with quantization
diff --git a/docs/get_started.md b/docs/get_started.md
index 62a603097a9..ea807226f03 100644
--- a/docs/get_started.md
+++ b/docs/get_started.md
@@ -75,17 +75,17 @@ model = trainer.distill(distillation_config=d_conf, teacher_model=teacher_model)
 ## Quantized Length Adaptive Transformer
 Quantized Length Adaptive Transformer leverages sequence-length reduction and low-bit representation techniques to further enhance model inference performance, enabling adaptive sequence-length sizes to accommodate different computational budget requirements with an optimal accuracy-efficiency tradeoff.
 ```python
-from intel_extension_for_transformers.transformers import QuantizationConfig, DynamicLengthConfig, metric, objectives
+from intel_extension_for_transformers.transformers import DynamicLengthConfig, metrics, objectives
+from neural_compressor.config import PostTrainingQuantConfig
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 
 # Replace transformers.Trainer with NLPTrainer
 # trainer = transformers.Trainer(...)
 trainer = NLPTrainer(...)
 metric = metrics.Metric(name="eval_f1", is_relative=True, criterion=0.01)
-q_config = QuantizationConfig(
-    approach="static",
-    metrics=[metric],
-    objectives=[objectives.performance]
+trainer.metrics = metric
+q_config = PostTrainingQuantConfig(
+    approach="static"
 )
 # Apply the length config
 dynamic_length_config = DynamicLengthConfig(length_config=length_config)
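For readers migrating existing scripts, the documentation hunks above all describe the same change: the tuning metric now attaches to the trainer, and `QuantizationConfig` is replaced by `neural_compressor.config.PostTrainingQuantConfig`. The following is a minimal sketch of the new flow, assuming a fine-tuned `model`, a `tokenizer`, and tokenized `train_dataset`/`eval_dataset` are already in scope; the `TuningCriterion` usage is our reading of the Neural Compressor 2.x API, standing in for the removed `timeout`/`max_trials` arguments, and is not part of this patch itself.

```python
from intel_extension_for_transformers.transformers import metrics
from intel_extension_for_transformers.transformers.trainer import NLPTrainer
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion

# model, tokenizer, train_dataset and eval_dataset are assumed to be
# prepared as in the devcatalog example above.
trainer = NLPTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# The tuning metric is set on the trainer, not passed to the config.
trainer.metrics = metrics.Metric(name="eval_accuracy")

# timeout and max_trials, formerly QuantizationConfig arguments, live in
# Neural Compressor's TuningCriterion in the new API (assumption).
tuning_criterion = TuningCriterion(timeout=0, max_trials=100)
q_config = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion)

# Quantize exactly as the hunks above show.
model = trainer.quantize(quant_config=q_config)
```

For quantization-aware training, `QuantizationAwareTrainingConfig` from the same `neural_compressor.config` module plays the analogous role, as the quantization.md hunk below notes.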
diff --git a/docs/quantization.md b/docs/quantization.md
index 1c9cda3250d..93e621a0db9 100644
--- a/docs/quantization.md
+++ b/docs/quantization.md
@@ -157,16 +157,8 @@ In terms of evaluating the status of a specific model during tuning, we should h
 Please refer to [objective document](objectives.md) for the details.
 
 ### Create an Instance of QuantizationConfig
-The QuantizationConfig contains all the information related to the model quantization behavior. If you have created Metric and Objective instance(default Objective is "performance"), then you can create an instance of QuantizationConfig.
+The quantization config contains all the information related to the model quantization behavior. If you have created Metric and Objective instances (the default Objective is "performance"), you can create an instance of PostTrainingQuantConfig or QuantizationAwareTrainingConfig.
-
-- arguments:
-
-|Argument   |Type       |Description                                      |Default value    |
-|:----------|:----------|:-----------------------------------------------|:----------------|
-|approach   |string     |Which quantization approach you used             |"static"|
-|timeout    |integer    |Tuning timeout(seconds), 0 means early stop; combine with max_trials field to decide when to exit|0    |
-|max_trials |integer    |Max tune times                                   |100  |
-|objective  |list of Objective|Objective with accuracy constraint guaranteed|performance|
-
 example:
 ```python
diff --git a/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb b/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb
index 53c79e94b89..3914a427e34 100644
--- a/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb
+++ b/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb
@@ -1000,10 +1000,9 @@
     "    is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n",
     "    criterion=0.25, # Performance tolerance when optimizing the model.\n",
     ")\n",
-    "quantization_config = QuantizationConfig(\n",
-    "    approach=\"static\",\n",
-    "    max_trials=200,\n",
-    "    metrics=[tune_metric],\n",
+    "trainer_static.metrics = tune_metric\n",
+    "quantization_config = PostTrainingQuantConfig(\n",
+    "    approach=\"static\"\n",
     ")\n",
     "\n",
     "# run quantization\n",
diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh
index b693be23b0c..18a709a59e7 100644
--- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh
+++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh
@@ -84,7 +84,7 @@ function run_tuning {
             --evaluation_strategy steps \
             --save_strategy steps \
             --save_total_limit 1 \
-            --safe_serialization False"
+            --save_safetensors False"
         fi
     elif [ "${topology}" = "gpt_j" ]; then
         if [ "${task}" = "clm" ]; then
@@ -121,7 +121,7 @@ function run_tuning {
             --save_strategy steps \
             --metric_for_best_model accuracy \
             --save_total_limit 1 \
-            --safe_serialization False"
+            --save_safetensors False"
        fi
    elif [ "${topology}" = "xlnet" ]; then
        if [ "${task}" = "plm" ]; then
@@ -146,7 +146,7 @@ function run_tuning {
             --save_strategy steps \
             --metric_for_best_model accuracy \
             --save_total_limit 1 \
-            --safe_serialization False"
+            --save_safetensors False"
        fi
    elif [ "${topology}" = "gpt_neox" ]; then
        if [ "${task}" = "clm" ]; then
diff --git a/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh b/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh
index 430d403c1d4..b8123802f38 100644
--- a/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh
+++ b/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh
@@ -57,7 +57,7 @@ function run_tuning {
            --evaluation_strategy steps \
            --save_strategy steps \
            --save_total_limit 1 \
-           --safe_serialization False"
+           --save_safetensors False"
    fi

    python -u ./run_swag.py \
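One editorial note on the flag rename in the `run_tuning.sh` hunks above: `safe_serialization` is a `save_pretrained()` keyword rather than a training argument, so `--safe_serialization` is not a flag that `HfArgumentParser` recognizes. The supported field is `TrainingArguments.save_safetensors` (available in recent `transformers` releases), which is what the scripts now pass as `--save_safetensors False` to keep checkpoints in the classic torch `.bin` format. A minimal sketch of the equivalent programmatic setting, with the output directory as an illustrative placeholder:

```python
from transformers import TrainingArguments

# save_safetensors=False keeps torch .bin checkpoints instead of
# safetensors files; "./saved_results" is an illustrative path.
args = TrainingArguments(
    output_dir="./saved_results",
    save_safetensors=False,
)
```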
diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md
deleted file mode 100644
index f7054bf95fa..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md
+++ /dev/null
@@ -1,68 +0,0 @@
-Step-by-Step
-============
-
-This document is used to list steps of reproducing PyTorch BERT pruning result.
-
-# Prerequisite
-
-## 1. Environment
-
-Recommend python 3.7 or higher version.
-
-### Install [intel-extension-for-transformers]()
-```
-pip install intel-extension-for-transformers
-```
-
-### Install PyTorch
-
-Install pytorch-gpu, visit [pytorch.org](https://pytorch.org/).
-```bash
-# Install pytorch
-pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
-```
-
-### Install BERT dependency
-
-```bash
-cd examples/pytorch/huggingface/question-answering/pruning/group_lasso
-pip3 install -r requirements.txt --ignore-installed PyYAML
-```
-```bash
-git clone https://github.com/NVIDIA/apex
-cd apex
-pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
-```
-> **Note**
->
-> If no CUDA runtime is found, please export CUDA_HOME='/usr/local/cuda'.
-
-## 2. Prepare Dataset
-
-* For SQuAD task, you should download SQuAD dataset from [SQuAD dataset link](https://rajpurkar.github.io/SQuAD-explorer/).
-## 3. Prepare Model
-* Please download BERT large pretrained model from [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/bert_pyt_ckpt_large_pretraining_amp_lamb/files?version=20.03.0).
-```bash
-# wget cmd
-wget https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_pretraining_amp_lamb/versions/20.03.0/files/bert_large_pretrained_amp.pt
-
-# curl cmd
-curl -LO https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_pretraining_amp_lamb/versions/20.03.0/files/bert_large_pretrained_amp.pt
-```
-# Run
-Enter your created conda env, then run the script.
-```bash -bash scripts/run_squad_sparse.sh /path/to/model.pt 2.0 16 5e-5 tf32 /path/to/data /path/to/outdir prune_bert.yaml -``` -The default parameters are as follows: -```shell -init_checkpoint=${1:-"/path/to/ckpt_8601.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"4"} -learning_rate=${4:-"3e-5"} -precision=${5:-"tf32"} -BERT_PREP_WORKING_DIR=${6:-'/path/to/bert_data'} -OUT_DIR=${7:-"./results/SQuAD"} -prune_config=${8:-"prune_bert.yaml"} -``` - >**Note**: For original BERT readme, please refer [BERT README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/README.md) diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json deleted file mode 100644 index a7efa973d74..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py deleted file mode 100644 index dd206f52221..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Extract pre-computed feature vectors from a PyTorch BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import collections -import logging -import json -import re - -import torch -from torch.utils.data import TensorDataset, DataLoader, SequentialSampler -from torch.utils.data.distributed import DistributedSampler - -from tokenization import BertTokenizer -from modeling import BertModel - -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) -logger = logging.getLogger(__name__) - - -class InputExample(object): - - def __init__(self, unique_id, text_a, text_b): - self.unique_id = unique_id - self.text_a = text_a - self.text_b = text_b - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): - self.unique_id = unique_id - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.input_type_ids = input_type_ids - - -def convert_examples_to_features(examples, seq_length, tokenizer): - """Loads a data file into a list of `InputBatch`s.""" - - features = [] - for (ex_index, example) in enumerate(examples): - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > seq_length - 2: - tokens_a = tokens_a[0:(seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - input_type_ids = [] - tokens.append("[CLS]") - input_type_ids.append(0) - for token in tokens_a: - tokens.append(token) - input_type_ids.append(0) - tokens.append("[SEP]") - input_type_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - input_type_ids.append(1) - tokens.append("[SEP]") - input_type_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < seq_length: - input_ids.append(0) - input_mask.append(0) - input_type_ids.append(0) - - assert len(input_ids) == seq_length - assert len(input_mask) == seq_length - assert len(input_type_ids) == seq_length - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (example.unique_id)) - logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) - - features.append( - InputFeatures( - unique_id=example.unique_id, - tokens=tokens, - input_ids=input_ids, - input_mask=input_mask, - input_type_ids=input_type_ids)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def read_examples(input_file): - """Read a list of `InputExample`s from an input file.""" - examples = [] - unique_id = 0 - with open(input_file, "r", encoding='utf-8') as reader: - while True: - line = reader.readline() - if not line: - break - line = line.strip() - text_a = None - text_b = None - m = re.match(r"^(.*) \|\|\| (.*)$", line) - if m is None: - text_a = line - else: - text_a = m.group(1) - text_b = m.group(2) - examples.append( - InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) - unique_id += 1 - return examples - - -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--input_file", default=None, type=str, required=True) - parser.add_argument("--output_file", default=None, type=str, required=True) - parser.add_argument("--bert_model", default=None, type=str, required=True, - help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") - - ## Other parameters - parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") - parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. 
Sequences longer " - "than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") - parser.add_argument("--local_rank", - type=int, - default=-1, - help = "local_rank for distributed training on gpus") - parser.add_argument("--no_cuda", - action='store_true', - help="Whether not to use CUDA when available") - - args = parser.parse_args() - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - device = torch.device("cuda", args.local_rank) - n_gpu = 1 - # Initializes the distributed backend which will take care of synchronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl') - logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1))) - - layer_indexes = [int(x) for x in args.layers.split(",")] - - tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - examples = read_examples(args.input_file) - - features = convert_examples_to_features( - examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) - - unique_id_to_feature = {} - for feature in features: - unique_id_to_feature[feature.unique_id] = feature - - model = BertModel.from_pretrained(args.bert_model) - model.to(device) - - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) - if args.local_rank == -1: - eval_sampler = SequentialSampler(eval_data) - else: - eval_sampler = DistributedSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - - model.eval() - with open(args.output_file, "w", encoding='utf-8') as writer: - for input_ids, input_mask, example_indices in eval_dataloader: - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - - all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) - all_encoder_layers = all_encoder_layers - - for b, example_index in enumerate(example_indices): - feature = features[example_index.item()] - unique_id = int(feature.unique_id) - # feature = unique_id_to_feature[unique_id] - output_json = collections.OrderedDict() - output_json["linex_index"] = unique_id - all_out_features = [] - for (i, token) in enumerate(feature.tokens): - all_layers = [] - for (j, layer_index) in enumerate(layer_indexes): - layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() - layer_output = layer_output[b] - layers = collections.OrderedDict() - layers["index"] = layer_index - layers["values"] = [ - round(x.item(), 6) for x in layer_output[i] - ] - all_layers.append(layers) - out_features = collections.OrderedDict() - out_features["token"] = token - out_features["layers"] = all_layers - all_out_features.append(out_features) - output_json["features"] = all_out_features - writer.write(json.dumps(output_json) + "\n") - - -if __name__ == "__main__": - main() diff --git 
a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py deleted file mode 100644 index cdefb125839..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Utilities for working with the local dataset cache. -This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp -Copyright by the AllenNLP authors. -""" - -from __future__ import (absolute_import, division, print_function, unicode_literals) - -import json -import logging -import os -import shutil -import tempfile -from functools import wraps -from hashlib import sha256 -import sys -from io import open - -import boto3 -import requests -from botocore.exceptions import ClientError -from tqdm import tqdm - -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse - -try: - from pathlib import Path - PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - Path.home() / '.pytorch_pretrained_bert')) -except AttributeError: - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - - -def url_to_filename(url, etag=None): - """ - Convert `url` into a hashed filename in a repeatable way. - If `etag` is specified, append its hash to the url's, delimited - by a period. - """ - url_bytes = url.encode('utf-8') - url_hash = sha256(url_bytes) - filename = url_hash.hexdigest() - - if etag: - etag_bytes = etag.encode('utf-8') - etag_hash = sha256(etag_bytes) - filename += '.' + etag_hash.hexdigest() - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + '.json' - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata['url'] - etag = metadata['etag'] - - return url, etag - - -def cached_path(url_or_filename, cache_dir=None): - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. 
If it's already a local path, - make sure the file exists and then return the path. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - parsed = urlparse(url_or_filename) - - if parsed.scheme in ('http', 'https', 's3'): - # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir) - elif os.path.exists(url_or_filename): - # File, and it exists. - return url_or_filename - elif parsed.scheme == '': - # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. - """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url): - """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file): - """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def http_get(url, temp_file): - req = requests.get(url, stream=True) - content_length = req.headers.get('Content-Length') - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache(url, cache_dir=None): - """ - Given a URL, look for the corresponding dataset in the local cache. - If it's not there, download it. Then return the path to the cached file. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - # Get eTag to add to filename, if it exists. 
- if url.startswith("s3://"): - etag = s3_etag(url) - else: - response = requests.head(url, allow_redirects=True) - if response.status_code != 200: - raise IOError("HEAD request failed for url {} with status code {}" - .format(url, response.status_code)) - etag = response.headers.get("ETag") - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - if not os.path.exists(cache_path): - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with tempfile.NamedTemporaryFile() as temp_file: - logger.info("%s not found in cache, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - s3_get(url, temp_file) - else: - http_get(url, temp_file) - - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() - # shutil.copyfileobj() starts at the current position, so go to the start - temp_file.seek(0) - - logger.info("copying %s to cache at %s", temp_file.name, cache_path) - with open(cache_path, 'wb') as cache_file: - shutil.copyfileobj(temp_file, cache_file) - - logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w', encoding="utf-8") as meta_file: - json.dump(meta, meta_file) - - logger.info("removing temp file %s", temp_file.name) - - return cache_path - - -def read_set_from_file(filename): - ''' - Extract a de-duped collection (set) of text from a file. - Expected file format is one item per line. - ''' - collection = set() - with open(filename, 'r', encoding='utf-8') as file_: - for line in file_: - collection.add(line.rstrip()) - return collection - - -def get_file_extension(path, dot=True, lower=True): - ext = os.path.splitext(path)[1] - ext = ext if dot else ext[1:] - return ext.lower() if lower else ext diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py deleted file mode 100644 index cebd2b17f75..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py +++ /dev/null @@ -1,1285 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""PyTorch BERT model.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import copy -import json -import logging -import math -import os -import shutil -import tarfile -import tempfile -import sys -from io import open - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from torch.utils import checkpoint - -sys.path.append('/workspace/bert/') -from file_utils import cached_path - -from torch.nn import Module -from torch.nn.parameter import Parameter -import torch.nn.functional as F -import torch.nn.init as init - -logger = logging.getLogger(__name__) - -PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", -} -CONFIG_NAME = 'bert_config.json' -WEIGHTS_NAME = 'pytorch_model.bin' -TF_WEIGHTS_NAME = 'model.ckpt' - -def load_tf_weights_in_bert(model, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model - """ - try: - import re - import numpy as np - import tensorflow as tf - except ImportError: - print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise - tf_path = os.path.abspath(tf_checkpoint_path) - print("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - print("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split('/') - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any(n in ["adam_v", "adam_m"] for n in name): - print("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) - else: - l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - else: - pointer = getattr(pointer, l[0]) - if len(l) >= 2: - num = int(l[1]) - pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': - array = np.ascontiguousarray(np.transpose(array)) - try: - assert pointer.shape == array.shape - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - print("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - -def gelu(x): - return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) - -#used only for triton inference -def bias_gelu(bias, y): - x = bias + y - return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) - -# used specifically for training since torch.nn.functional.gelu breaks ONNX export -def bias_gelu_training(bias, y): - x = bias + y - return torch.nn.functional.gelu(x) # Breaks ONNX export - -def bias_tanh(bias, y): - x = bias + y - return torch.tanh(x) - -def swish(x): - return x * torch.sigmoid(x) - -#torch.nn.functional.gelu(x) # Breaks ONNX export -ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish} - -class LinearActivation(Module): - r"""Fused Linear and activation Module. 
- """ - __constants__ = ['bias'] - - def __init__(self, in_features, out_features, act='gelu', bias=True): - super(LinearActivation, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.act_fn = nn.Identity() # - self.biased_act_fn = None # - self.bias = None # - if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)): # For TorchScript - if bias and not 'bias' in act: # compatibility - act = 'bias_' + act # - self.biased_act_fn = ACT2FN[act] # - - else: - self.act_fn = ACT2FN[act] - else: - self.act_fn = act - self.weight = Parameter(torch.Tensor(out_features, in_features)) - if bias: - self.bias = Parameter(torch.Tensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - init.kaiming_uniform_(self.weight, a=math.sqrt(5)) - if self.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) - bound = 1 / math.sqrt(fan_in) - init.uniform_(self.bias, -bound, bound) - - def forward(self, input): - if not self.bias is None: - return self.biased_act_fn(self.bias, F.linear(input, self.weight, None)) - else: - return self.act_fn(F.linear(input, self.weight, self.bias)) - - def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, self.out_features, self.bias is not None - ) - - -class BertConfig(object): - """Configuration class to store the configuration of a `BertModel`. - """ - def __init__(self, - vocab_size_or_config_json_file, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - output_all_encoded_layers=False): - """Constructs BertConfig. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. 
- """ - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.output_all_encoded_layers = output_all_encoded_layers - else: - raise ValueError("First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)") - - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - -class BertNonFusedLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). 
- """ - super(BertNonFusedLayerNorm, self).__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.variance_epsilon = eps - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u) - s = s * s - s = s.mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias - -try: - import apex - #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm') - import apex.normalization - from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction - #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') - #BertLayerNorm = apex.normalization.FusedLayerNorm - APEX_IS_AVAILABLE = True -except ImportError: - print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") - #BertLayerNorm = BertNonFusedLayerNorm - APEX_IS_AVAILABLE = False -class BertLayerNorm(Module): - def __init__(self, hidden_size, eps=1e-12): - super(BertLayerNorm, self).__init__() - self.shape = torch.Size((hidden_size,)) - self.eps = eps - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.apex_enabled = APEX_IS_AVAILABLE - - @torch.jit.unused - def fused_layer_norm(self, x): - return FusedLayerNormAffineFunction.apply( - x, self.weight, self.bias, self.shape, self.eps) - - - def forward(self, x): - if self.apex_enabled and not torch.jit.is_scripting(): - x = self.fused_layer_norm(x) - else: - u = x.mean(-1, keepdim=True) - s = (x - u) - s = s * s - s = s.mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight * x + self.bias - return x - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings. 
- """ - def __init__(self, config): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids): - seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - def __init__(self, config): - super(BertSelfAttention, self).__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = torch.reshape(x, new_x_shape) - return x.permute(0, 2, 1, 3) - - def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = torch.reshape(x, new_x_shape) - return x.permute(0, 2, 3, 1) - - def forward(self, hidden_states, attention_mask): - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_key_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = F.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - return context_layer - - -class BertSelfOutput(nn.Module): - def __init__(self, config): - super(BertSelfOutput, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - def __init__(self, config): - super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def forward(self, input_tensor, attention_mask): - self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output - - -class BertIntermediate(nn.Module): - def __init__(self, config): - super(BertIntermediate, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act) - - def forward(self, hidden_states): - hidden_states = self.dense_act(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - def __init__(self, config): - super(BertOutput, self).__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - def __init__(self, config): - super(BertLayer, self).__init__() - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward(self, hidden_states, attention_mask): - attention_output = self.attention(hidden_states, attention_mask) - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - -class BertEncoder(nn.Module): - def __init__(self, config): - super(BertEncoder, self).__init__() - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - self.output_all_encoded_layers = config.output_all_encoded_layers - self._checkpoint_activations = False - - @torch.jit.unused - def checkpointed_forward(self, hidden_states, attention_mask): - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - return custom_forward - - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) - l += chunk_length - - return hidden_states - - def forward(self, hidden_states, attention_mask): - all_encoder_layers = [] - - if self._checkpoint_activations: - 
hidden_states = self.checkpointed_forward(hidden_states, attention_mask) - else: - for i,layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - - if self.output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not self.output_all_encoded_layers or self._checkpoint_activations: - all_encoder_layers.append(hidden_states) - return all_encoder_layers - -class BertPooler(nn.Module): - def __init__(self, config): - super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense_act(first_token_tensor) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - def __init__(self, config): - super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - - def forward(self, hidden_states): - hidden_states = self.dense_act(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertLMPredictionHead, self).__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(bert_model_embedding_weights.size(1), - bert_model_embedding_weights.size(0), - bias=False) - self.decoder.weight = bert_model_embedding_weights - self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) + self.bias - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertOnlyMLMHead, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - def __init__(self, config): - super(BertOnlyNSPHead, self).__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertPreTrainingHeads, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(nn.Module): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - def __init__(self, config, *inputs, **kwargs): - super(BertPreTrainedModel, self).__init__() - if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__ - )) - self.config = config - - def init_bert_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - def checkpoint_activations(self, val): - def _apply_flag(module): - if hasattr(module, "_checkpoint_activations"): - module._checkpoint_activations=val - self.apply(_apply_flag) - def enable_apex(self, val): - def _apply_flag(module): - if hasattr(module, "apex_enabled"): - module.apex_enabled=val - self.apply(_apply_flag) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): - """ - Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. - Download and cache the pre-trained model file if needed. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) - """ - if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: - archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] - else: - archive_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - archive_file)) - return None - if resolved_archive_file == archive_file: - logger.info("loading archive file {}".format(archive_file)) - else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, resolved_archive_file)) - tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file - else: - # Extract archive to temp dir - tempdir = tempfile.mkdtemp() - logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, tempdir)) - if os.path.isfile(resolved_archive_file) and tarfile.is_tarfile(resolved_archive_file): - with tarfile.open(resolved_archive_file, 'r:gz') as archive: - archive.extractall(tempdir) - else: - logger.error("Invalid tar file {}".format(resolved_archive_file)) - serialization_dir = tempdir - # Load config - config_file = os.path.join(serialization_dir, CONFIG_NAME) - config = BertConfig.from_json_file(config_file) - logger.info("Model config {}".format(config)) - # Instantiate model. - model = cls(config, *inputs, **kwargs) - if state_dict is None and not from_tf: - weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None) - if tempdir: - # Clean up temp dir - shutil.rmtree(tempdir) - if from_tf: - # Directly load from a TensorFlow checkpoint - weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) - return load_tf_weights_in_bert(model, weights_path) - # Load from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=''): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - start_prefix = '' - if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): - start_prefix = 'bert.' - load(model, prefix=start_prefix) - if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) - return model - - -class BertModel(BertPreTrainedModel): - """BERT model ("Bidirectional Embedding Representations from a Transformer"). 
- - Params: - config: a BertConfig class instance with the configuration to build a new model - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controlled by `output_all_encoded_layers` argument: - - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end - of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each - encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block of shape [batch_size, sequence_length, hidden_size], - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a - classifier pretrained on top of the hidden state associated to the first character of the - input (`CLS`) to train on the Next-Sentence task (see BERT's paper). - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = modeling.BertModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertModel, self).__init__(config) - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - self.apply(self.init_bert_weights) - self.output_all_encoded_layers = config.output_all_encoded_layers - - def forward(self, input_ids, token_type_ids, attention_mask): - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
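The comment block above describes the 2-D to 4-D mask broadcast that the next statements implement. A minimal standalone sketch of the same transform, with illustrative tensor values:

```python
import torch

# [batch_size, seq_len] padding mask: 1 = real token, 0 = padding.
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])

# Insert broadcast dims -> [batch_size, 1, 1, seq_len] so the mask can be
# added to attention scores of shape [batch, heads, from_seq, to_seq].
extended = attention_mask.unsqueeze(1).unsqueeze(2).float()

# 1.0 -> 0.0 (keep), 0.0 -> -10000.0 (effectively removed by the softmax).
extended = (1.0 - extended) * -10000.0
print(extended.shape)     # torch.Size([2, 1, 1, 3])
print(extended[1, 0, 0])  # 0.0 for real tokens, -10000.0 for the padded slot
```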
- extended_attention_mask = extended_attention_mask.to(dtype=self.embeddings.word_embeddings.weight.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder(embedding_output, extended_attention_mask) - sequence_output = encoded_layers[-1] - pooled_output = self.pooler(sequence_output) - if not self.output_all_encoded_layers: - encoded_layers = encoded_layers[-1:] - return encoded_layers, pooled_output - - -class BertForPreTraining(BertPreTrainedModel): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: - - the masked language modeling head, and - - the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `masked_lm_labels` and `next_sentence_label` are not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `masked_lm_labels` or `next_sentence_label` is `None`: - Outputs a tuple comprising - - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and - - the next sentence classification logits of shape [batch_size, 2]. 
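Note that the deleted `forward` just below takes no label arguments and returns the two logits tensors, so the losses described above would be computed by the caller. A sketch of that computation, with illustrative tensors, matching the `ignore_index=-1` convention the docstring describes:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch, seq = 30522, 2, 8
prediction_scores = torch.randn(batch, seq, vocab_size)       # MLM logits
seq_relationship_score = torch.randn(batch, 2)                # NSP logits
masked_lm_labels = torch.full((batch, seq), -1, dtype=torch.long)  # -1 = ignored
masked_lm_labels[:, 3] = 42                                   # one masked position
next_sentence_label = torch.tensor([0, 1])

loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size),
                          masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                              next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
```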
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) - masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForPreTraining, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids, attention_mask): - encoded_layers, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - return prediction_scores, seq_relationship_score - - -class BertForMaskedLM(BertPreTrainedModel): - """BERT model with the masked language modeling head. - This module comprises the BERT model followed by the masked language modeling head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - - Outputs: - if `masked_lm_labels` is not `None`: - Outputs the masked language modeling loss. - if `masked_lm_labels` is `None`: - Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForMaskedLM(config) - masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForMaskedLM, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - prediction_scores = self.cls(sequence_output) - - if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - return masked_lm_loss - else: - return prediction_scores - - -class BertForNextSentencePrediction(BertPreTrainedModel): - """BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `next_sentence_label` is not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `next_sentence_label` is `None`: - Outputs the next sentence classification logits of shape [batch_size, 2]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForNextSentencePrediction(config) - seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForNextSentencePrediction, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - seq_relationship_score = self.cls( pooled_output) - - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - return next_sentence_loss - else: - return seq_relationship_score - - -class BertForSequenceClassification(BertPreTrainedModel): - """BERT model for classification. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_labels]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_labels = 2 - - model = BertForSequenceClassification(config, num_labels) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_labels): - super(BertForSequenceClassification, self).__init__(config) - self.num_labels = num_labels - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - pooled_output = self.dropout(pooled_output) - return self.classifier(pooled_output) - - -class BertForMultipleChoice(BertPreTrainedModel): - """BERT model for multiple choice tasks. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_choices`: the number of classes for the classifier. Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` - and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_choices]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. 
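As with BertForPreTraining, the deleted `BertForSequenceClassification.forward` above takes no `labels` argument and always returns logits, even though its docstring documents a loss path. A sketch of the documented behaviour, with illustrative tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels, batch = 2, 4
logits = torch.randn(batch, num_labels)   # classifier(pooled_output)
labels = torch.tensor([0, 1, 1, 0])

# The loss the docstring promises when labels are provided.
loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
```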
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_choices = 2 - - model = BertForMultipleChoice(config, num_choices) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_choices): - super(BertForMultipleChoice, self).__init__(config) - self.num_choices = num_choices - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, self.num_choices) - - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: - return reshaped_logits - - -class BertForTokenClassification(BertPreTrainedModel): - """BERT model for token-level classification. - This module is composed of the BERT model with a linear layer on top of - the full hidden state of the last layer. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [0, ..., num_labels]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
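In `BertForMultipleChoice.forward` above, the choice dimension is folded into the batch before BERT runs and unfolded afterwards. A shape-only sketch of that reshaping, with illustrative sizes:

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

# Fold choices into the batch: BERT sees [batch * choices, seq_len].
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
print(flat_input_ids.shape)    # torch.Size([8, 16])

# The classifier maps each pooled output to a single score ...
logits = torch.randn(batch_size * num_choices, 1)
# ... and the scores are unfolded back to one row per example.
reshaped_logits = logits.view(-1, num_choices)
print(reshaped_logits.shape)   # torch.Size([2, 4])
```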
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_labels = 2 - - model = BertForTokenClassification(config, num_labels) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_labels): - super(BertForTokenClassification, self).__init__(config) - self.num_labels = num_labels - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss - else: - return logits - - -class BertForQuestionAnswering(BertPreTrainedModel): - """BERT model for Question Answering (span extraction). - This module is composed of the BERT model with a linear layer on top of - the sequence output that computes start_logits and end_logits - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: - Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end - position tokens of shape [batch_size, sequence_length]. 
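The attention-mask trick in `BertForTokenClassification.forward` above restricts the loss to real tokens and skips padding positions. A minimal sketch with illustrative tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels = 2
logits = torch.randn(2, 3, num_labels)                 # [batch, seq, labels]
labels = torch.tensor([[0, 1, 0], [1, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])  # last token is padding

active = attention_mask.view(-1) == 1                  # keep real tokens only
loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active],
                          labels.view(-1)[active])
```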
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForQuestionAnswering(config) - start_logits, end_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids, attention_mask): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - return start_logits, end_logits diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py deleted file mode 100644 index 5881a5b5156..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PyTorch optimization for BERT model.""" - -import math -import torch -from torch.optim import Optimizer -from torch.optim.optimizer import required -from torch.nn.utils import clip_grad_norm_ -#from fused_adam_local import FusedAdam -from apex.optimizers import FusedAdam -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C -from utils import is_main_process - -multi_tensor_l2norm = amp_C.multi_tensor_l2norm -lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda -lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda -scale = amp_C.multi_tensor_scale - - -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) - -def warmup_constant(x, warmup=0.002): - if x < warmup: - return x/warmup - return 1.0 - -def warmup_linear(x, warmup=0.002): - if x < warmup: - return x/warmup - return max((x - 1. )/ (warmup - 1.), 0.) 
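These schedule functions take the training progress `x = step / t_total` and return a multiplier for the base learning rate; `warmup_linear`, for instance, ramps up over the warmup fraction and then decays linearly to zero. A small sketch restating that helper so the numbers can be inspected:

```python
def warmup_linear(x, warmup=0.002):
    # Same shape as the deleted helper above: linear ramp, then linear decay.
    if x < warmup:
        return x / warmup
    return max((x - 1.0) / (warmup - 1.0), 0.0)

base_lr, t_total, warmup = 3e-5, 1000, 0.1
for step in (0, 50, 100, 550, 1000):
    print(step, base_lr * warmup_linear(step / t_total, warmup))
# 0 -> 0.0, 50 -> 1.5e-05 (half-way up), 100 -> 3e-05 (peak),
# 550 -> 1.5e-05 (half-way down), 1000 -> 0.0
```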
- -def warmup_poly(x, warmup=0.002, degree=0.5): - if x < warmup: - return x/warmup - return (1.0 - x)**degree - - -SCHEDULES = { - 'warmup_cosine':warmup_cosine, - 'warmup_constant':warmup_constant, - 'warmup_linear':warmup_linear, - 'warmup_poly':warmup_poly, -} - -class BertAdam(Optimizer): - """Implements BERT version of Adam algorithm with weight decay fix. - Params: - lr: learning rate - warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 - t_total: total number of training steps for the learning - rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' - b1: Adams b1. Default: 0.9 - b2: Adams b2. Default: 0.999 - e: Adams epsilon. Default: 1e-6 - weight_decay: Weight decay. Default: 0.01 - max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 - """ - def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, - max_grad_norm=1.0): - if lr is not required and lr < 0.0: - raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: - raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) - if not 0.0 <= b1 < 1.0: - raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) - if not 0.0 <= b2 < 1.0: - raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) - if not e >= 0.0: - raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, - b1=b1, b2=b2, e=e, weight_decay=weight_decay, - max_grad_norm=max_grad_norm) - super(BertAdam, self).__init__(params, defaults) - - def get_lr(self): - lr = [] - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - if len(state) == 0: - return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] - lr.append(lr_scheduled) - return lr - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
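BertAdam applies decoupled weight decay per parameter group, and the usual BERT recipe excludes biases and LayerNorm parameters from decay. A sketch of that grouping, using `torch.optim.AdamW` as a stand-in for the deleted BertAdam and a toy module for the parameter names:

```python
import torch
from torch import nn
from torch.optim import AdamW  # stand-in for the deleted BertAdam

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)

model = Toy()
no_decay = ("bias", "LayerNorm")  # matched by substring, as the BERT scripts do
groups = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(groups, lr=3e-5)
```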
- """ - loss = None - if closure is not None: - loss = closure() - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['next_m'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['next_v'] = torch.zeros_like(p.data) - - next_m, next_v = state['next_m'], state['next_v'] - beta1, beta2 = group['b1'], group['b2'] - - # Add grad clipping - if group['max_grad_norm'] > 0: - clip_grad_norm_(p, group['max_grad_norm']) - - # Decay the first and second moment running average coefficient - # In-place operations to update the averages at the same time - next_m.mul_(beta1).add_(1 - beta1, grad) - next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) - update = next_m / (next_v.sqrt() + group['e']) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] - - update_with_lr = lr_scheduled * update - p.data.add_(-update_with_lr) - - state['step'] += 1 - - return loss diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt deleted file mode 100644 index 9741bff445c..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt +++ /dev/null @@ -1,18 +0,0 @@ -# progress bars in model download and training scripts -tqdm -# Accessing files from S3 directly. -boto3 -# Used for downloading models over HTTP -requests -six -ipdb -#Data processing -h5py -nltk -progressbar -#Others -numpy -onnxruntime -requests -urllib3 -git+https://github.com/NVIDIA/dllogger diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py deleted file mode 100644 index 864c3cb8666..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py +++ /dev/null @@ -1,1285 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Run BERT on SQuAD.""" - -from __future__ import absolute_import, division, print_function - -import argparse -import collections -import dllogger, time -import json -import logging -import math -import modeling -import numpy as np -import os -import random -import sys -import torch -from apex import amp -from file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from io import open -from optimization import BertAdam, warmup_linear -from schedulers import LinearWarmUpScheduler -from torch.utils.data import( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset -) -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm -from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize) -from utils import is_main_process, format_step -import builtins -import io - -safe_builtins = { - 'range', - 'complex', - 'set', - 'frozenset', - 'slice', -} - -torch._C._jit_set_profiling_mode(False) -torch._C._jit_set_profiling_executor(False) - -if sys.version_info[0] == 2: - import cPickle as pickle -else: - import pickle - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO) -logger = logging.getLogger(__name__) - - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. - """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - 
for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence 
length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if is_training and example.is_impossible: - start_position = 0 - end_position = 0 - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". 
However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electronics? - # Context: The Japanese electronics industry is the largest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context.
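The "minimum of left and right context" rule just described reduces to one line of arithmetic in the function body that follows. A worked sketch with three illustrative overlapping spans:

```python
import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans = [DocSpan(0, 5), DocSpan(3, 5), DocSpan(6, 5)]
position = 7  # token index covered by the last two spans

for span_index, doc_span in enumerate(doc_spans):
    end = doc_span.start + doc_span.length - 1
    if not (doc_span.start <= position <= end):
        continue
    num_left = position - doc_span.start
    num_right = end - position
    score = min(num_left, num_right) + 0.01 * doc_span.length
    print(span_index, score)
# span 1: min(4, 0) + 0.05 = 0.05; span 2: min(1, 3) + 0.05 = 1.05 -> span 2 wins
```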
- best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def get_answers(examples, features, results, args): - predictions = collections.defaultdict(list) #it is possible that one example corresponds to multiple features - Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit']) - - if args.version_2_with_negative: - null_vals = collections.defaultdict(lambda: (float("inf"),0,0)) - for ex, feat, result in match_results(examples, features, results): - start_indices = _get_best_indices(result.start_logits, args.n_best_size) - end_indices = _get_best_indices(result.end_logits, args.n_best_size) - prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - if args.version_2_with_negative: - score = result.start_logits[0] + result.end_logits[0] - if score < null_vals[ex.qas_id][0]: - null_vals[ex.qas_id] = (score, result.start_logits[0], result.end_logits[0]) - - curr_predictions = [] - seen_predictions = [] - for pred in prelim_predictions: - if len(curr_predictions) == args.n_best_size: - break - if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant - final_text = get_answer_text(ex, feat, pred, args) - if final_text in seen_predictions: - continue - else: - final_text = "" - - seen_predictions.append(final_text) - curr_predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit)) - predictions[ex.qas_id] += curr_predictions - - #Add empty prediction - if args.version_2_with_negative: - for qas_id in predictions.keys(): - predictions[qas_id].append(Prediction('', - null_vals[ex.qas_id][1], - null_vals[ex.qas_id][2])) - - - nbest_answers = collections.defaultdict(list) - answers = {} - for qas_id, preds in predictions.items(): - nbest = sorted( - preds, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True)[:args.n_best_size] - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. 
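`get_answers` ranks candidates by `start_logit + end_logit` and later converts those sums into probabilities with the max-shifted softmax defined further down as `_compute_softmax`. A minimal equivalent:

```python
import math

def stable_softmax(scores):
    # Shift by the max score so exp() cannot overflow, as _compute_softmax does.
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

print(stable_softmax([12.3, 10.1, 2.0]))  # roughly [0.90, 0.10, 0.00]
```

One quirk worth flagging: the empty-prediction loop above indexes `null_vals[ex.qas_id]` inside `for qas_id in predictions.keys()`, reusing the `ex` variable left over from the earlier loop; `null_vals[qas_id]` appears to be what was intended.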
- if not nbest: - nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry and entry.text: - best_non_null_entry = entry - probs = _compute_softmax(total_scores) - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_answers[qas_id].append(output) - if args.version_2_with_negative: - score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit - if score_diff > args.null_score_diff_threshold: - answers[qas_id] = "" - else: - answers[qas_id] = best_non_null_entry.text - else: - answers[qas_id] = nbest_answers[qas_id][0]['text'] - - return answers, nbest_answers - -def get_answer_text(example, feature, pred, args): - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging) - return final_text - -def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args): - - _PrelimPrediction = collections.namedtuple( - "PrelimPrediction", - ["start_index", "end_index", "start_logit", "end_logit"]) - prelim_predictions = [] - for start_index in start_indices: - for end_index in end_indices: - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > args.max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - return prelim_predictions - -def match_results(examples, features, results): - unique_f_ids = set([f.unique_id for f in features]) - unique_r_ids = set([r.unique_id for r in results]) - matching_ids = unique_f_ids & unique_r_ids - features = [f for f in features if f.unique_id in matching_ids] - results = [r for r in results if r.unique_id in matching_ids] - features.sort(key=lambda x: x.unique_id) - results.sort(key=lambda x: x.unique_id) - - for f, r in zip(features, results): #original code assumes strict ordering of examples. TODO: rewrite this - yield examples[f.example_index], f, r - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. 
So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment.
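The alignment works on whitespace-stripped strings plus an index map back into the original text. A sketch of the `_strip_spaces` bookkeeping, restated so it runs standalone, on the docstring's own example:

```python
import collections

def _strip_spaces(text):
    # Restating the helper above for a runnable demo: drop spaces and remember
    # where each surviving character sat in the original string.
    ns_chars, ns_to_s_map = [], collections.OrderedDict()
    for i, c in enumerate(text):
        if c == " ":
            continue
        ns_to_s_map[len(ns_chars)] = i
        ns_chars.append(c)
    return "".join(ns_chars), ns_to_s_map

orig_text = "Steve Smith's"
ns_text, ns_map = _strip_spaces(orig_text)
print(ns_text)    # SteveSmith's
print(ns_map[5])  # 6: stripped index 5 ('S' of Smith) maps to original index 6
```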
- tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indices(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indices = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indices.append(index_and_score[i][0]) - return best_indices - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - - -from apex.multi_tensor_apply import multi_tensor_applier -class GradientClipper: - """ - Clips gradient norm of an iterable of parameters. - """ - def __init__(self, max_grad_norm): - self.max_norm = max_grad_norm - if multi_tensor_applier.available: - import amp_C - self._overflow_buf = torch.cuda.IntTensor([0]) - self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm - self.multi_tensor_scale = amp_C.multi_tensor_scale - else: - raise RuntimeError('Gradient clipping requires cuda extensions') - - def step(self, parameters): - l = [p.grad for p in parameters if p.grad is not None] - total_norm, _ = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [l], False) - total_norm = total_norm.item() - if (total_norm == float('inf')): return - clip_coef = self.max_norm / (total_norm + 1e-6) - if clip_coef < 1: - multi_tensor_applier(self.multi_tensor_scale, self._overflow_buf, [l, l], clip_coef) - -class RestrictedUnpickler(pickle.Unpickler): - - def find_class(self, module, name): - # Only allow safe classes from builtins. - if module == "builtins" and name in safe_builtins: - return getattr(builtins, name) - # Forbid everything else. 
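The RestrictedUnpickler being defined here whitelists a handful of builtins and rejects any other global, so a poisoned feature cache cannot execute arbitrary code on load. A self-contained demo, restating the class so it runs standalone:

```python
import builtins, collections, io, pickle

safe_builtins = {"range", "complex", "set", "frozenset", "slice"}

class RestrictedUnpickler(pickle.Unpickler):
    # Same idea as the class above, restated for a runnable demo.
    def find_class(self, module, name):
        if module == "builtins" and name in safe_builtins:
            return getattr(builtins, name)
        raise pickle.UnpicklingError(f"global '{module}.{name}' is forbidden")

def restricted_loads(data: bytes):
    return RestrictedUnpickler(io.BytesIO(data)).load()

print(restricted_loads(pickle.dumps(range(3))))  # range(0, 3): whitelisted
try:
    restricted_loads(pickle.dumps(collections.OrderedDict()))
except pickle.UnpicklingError as err:
    print(err)                                   # any other global is rejected
```

Also note that `train_func` further down calls `restricted_loads(reader)` with the open file object, while the helper expects bytes (`reader.read()`), so the bare `except:` silently falls back to rebuilding the features on every run.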
-            raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
-                                         (module, name))
-
-def restricted_loads(s):
-    """Helper function analogous to pickle.loads(), but using the allow-list unpickler above."""
-    return RestrictedUnpickler(io.BytesIO(s)).load()
-
-
-def train_func(model, agent, args, dllogger, global_step, train_examples, num_train_optimization_steps, n_gpu, device, optimizer, scheduler, tokenizer):
-    model = agent.model.model
-
-    if args.cache_dir is None:
-        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
-            str(args.max_query_length))
-    else:
-        cached_train_features_file = args.cache_dir.strip('/') + '/' + args.train_file.split('/')[-1] + '_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
-            str(args.max_query_length))
-
-    train_features = None
-    try:
-        with open(cached_train_features_file, "rb") as reader:
-            # restricted_loads() expects raw bytes, not a file object
-            train_features = restricted_loads(reader.read())
-    except Exception:
-        # cache missing or unreadable: recompute the features
-        train_features = convert_examples_to_features(
-            examples=train_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=True)
-
-        if not args.skip_cache and is_main_process():
-            dllogger.log(step="PARAMETER", data={"Cached_train features_file": cached_train_features_file})
-            with open(cached_train_features_file, "wb") as writer:
-                pickle.dump(train_features, writer)
-
-    dllogger.log(step="PARAMETER", data={"train_start": True})
-    dllogger.log(step="PARAMETER", data={"training_samples": len(train_examples)})
-    dllogger.log(step="PARAMETER", data={"training_features": len(train_features)})
-    dllogger.log(step="PARAMETER", data={"train_batch_size": args.train_batch_size})
-    dllogger.log(step="PARAMETER", data={"steps": num_train_optimization_steps})
-    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                               all_start_positions, all_end_positions)
-    if args.local_rank == -1:
-        train_sampler = RandomSampler(train_data)
-    else:
-        train_sampler = DistributedSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu)
-
-    args.train_features = train_features
-    model.train()
-    gradClipper = GradientClipper(max_grad_norm=1.0)
-    final_loss = None
-    train_start = time.time()
-
-    # pruning
-    agent.pre_epoch_begin()
-
-    for epoch in range(int(args.num_train_epochs)):
-        train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
-        agent.on_epoch_begin(epoch)
-        for step, batch in enumerate(train_iter):
-            # Terminate early for benchmarking
-
-            agent.on_batch_begin(step)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                break
-
-            if n_gpu == 1:
-                batch = tuple(t.to(device) for t in batch)  # multi-GPU does the scattering itself
-            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-            start_logits, end_logits = model(input_ids, segment_ids, input_mask)
-            # If we are on multi-GPU, the split adds a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            loss = (start_loss + end_loss) / 2
-            if n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            # gradient clipping
-            gradClipper.step(amp.master_params(optimizer))
-
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    # modify learning rate with special warm up for BERT which FusedAdam doesn't do
-                    scheduler.step()
-
-                optimizer.step()
-                agent.on_post_grad()
-                optimizer.zero_grad()
-
-                global_step += 1
-
-            final_loss = loss.item()
-            if step % args.log_freq == 0:
-                dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
-                                                               "learning_rate": optimizer.param_groups[0]['lr']})
-
-            agent.on_batch_end()
-
-        agent.on_epoch_end()
-    args.time_to_train = time.time() - train_start
-    args.final_loss = final_loss
-
-def eval_func(model, args, dllogger, tokenizer, device):
-    if not args.do_train and args.fp16:
-        model.half()
-
-    eval_examples = read_squad_examples(
-        input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-    eval_features = convert_examples_to_features(
-        examples=eval_examples,
-        tokenizer=tokenizer,
-        max_seq_length=args.max_seq_length,
-        doc_stride=args.doc_stride,
-        max_query_length=args.max_query_length,
-        is_training=False)
-
-    dllogger.log(step="PARAMETER", data={"infer_start": True})
-    dllogger.log(step="PARAMETER", data={"eval_samples": len(eval_examples)})
-    dllogger.log(step="PARAMETER", data={"eval_features": len(eval_features)})
-    dllogger.log(step="PARAMETER", data={"predict_batch_size": args.predict_batch_size})
-
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-    # Run prediction for full data
-    eval_sampler = SequentialSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
-
-    args.eval_features = eval_features
-    infer_start = time.time()
-    model.eval()
-    all_results = []
-    dllogger.log(step="PARAMETER", data={"eval_start": True})
-    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.disable_progress_bar):
-        if len(all_results) % 1000 == 0:
-            dllogger.log(step="PARAMETER", data={"sample_number": len(all_results)})
-        input_ids = input_ids.to(device)
-        input_mask = input_mask.to(device)
-        segment_ids = segment_ids.to(device)
-        with torch.no_grad():
-            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
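-        # move per-example logits to CPU lists so GPU memory stays flat across batches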
-        for i, example_index in enumerate(example_indices):
-            start_logits = batch_start_logits[i].detach().cpu().tolist()
-            end_logits = batch_end_logits[i].detach().cpu().tolist()
-            eval_feature = eval_features[example_index.item()]
-            unique_id = int(eval_feature.unique_id)
-            all_results.append(RawResult(unique_id=unique_id,
-                                         start_logits=start_logits,
-                                         end_logits=end_logits))
-
-    time_to_infer = time.time() - infer_start
-    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-
-    answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
-    with open(output_prediction_file, "w") as f:
-        f.write(json.dumps(answers, indent=4) + "\n")
-    with open(output_nbest_file, "w") as f:
-        f.write(json.dumps(nbest_answers, indent=4) + "\n")
-
-    # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-    # write_predictions(eval_examples, eval_features, all_results,
-    #                   args.n_best_size, args.max_answer_length,
-    #                   args.do_lower_case, output_prediction_file,
-    #                   output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-    #                   args.version_2_with_negative, args.null_score_diff_threshold)
-
-    if args.do_eval and is_main_process():
-        import sys
-        import subprocess
-        # arguments are passed as a list, so no shell is involved and no quoting is needed
-        eval_out = subprocess.check_output([sys.executable, args.eval_script,
-                                            args.predict_file, output_prediction_file])
-        # decode the raw bytes before parsing the {"exact_match": ..., "f1": ...} output
-        scores = eval_out.decode("utf-8").strip()
-        exact_match = float(scores.split(":")[1].split(",")[0])
-        f1 = float(scores.split(":")[2].split("}")[0])
-        args.exact_match = exact_match
-        args.f1 = f1
-    args.time_to_infer = time_to_infer
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                             "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
-    parser.add_argument("--init_checkpoint",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The checkpoint file from pretraining")
-
-    ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1.0, type=float,
-                        help="Total number of training steps to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether to avoid using CUDA even when it is available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of update steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    # int() is applied because argparse does not run `type` on default values,
-    # and os.getenv() returns a string when LOCAL_RANK is set in the environment
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=int(os.getenv('LOCAL_RANK', -1)),
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        default=False,
-                        action='store_true',
-                        help="Mixed precision training")
-    parser.add_argument('--amp',
-                        default=False,
-                        action='store_true',
-                        help="Alias for --fp16 (mixed precision training)")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold, predict null.")
-    parser.add_argument('--vocab_file',
-                        type=str, default=None, required=True,
-                        help="Vocabulary mapping/file BERT was pretrained on")
-    parser.add_argument("--config_file",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The BERT model config")
-    parser.add_argument('--log_freq',
-                        type=int, default=50,
-                        help='frequency of logging loss.')
-    parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
-                        help='If provided, the json summary will be written to '
-                             'the specified file.')
-    parser.add_argument("--eval_script",
-                        help="Script to evaluate squad predictions",
-                        default="evaluate.py",
-                        type=str)
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to evaluate the accuracy of predictions")
-    parser.add_argument("--use_env",
-                        action='store_true',
-                        help="Whether to read local rank from ENVVAR")
-    parser.add_argument('--skip_checkpoint',
-                        default=False,
-                        action='store_true',
-                        help="Whether to skip saving checkpoints")
-    parser.add_argument('--disable-progress-bar',
-                        default=False,
-                        action='store_true',
-                        help='Disable tqdm progress bar')
-    parser.add_argument("--skip_cache",
-                        default=False,
-                        action='store_true',
-                        help="Whether to skip caching train features")
-    parser.add_argument("--cache_dir",
-                        default=None,
-                        type=str,
-                        help="Location to cache train features. 
Will default to the dataset directory") - parser.add_argument("--prune_config", - default='prune_bert.yaml', - help="pruning config") - parser.add_argument('--do_prune', - action='store_true', - help="prune model") - - args = parser.parse_args() - args.fp16 = args.fp16 or args.amp - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - # Initializes the distributed backend which will take care of synchronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl', init_method='env://') - n_gpu = 1 - - if is_main_process(): - dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, - filename=args.json_summary), - dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) - else: - dllogger.init(backends=[]) - - print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) - - dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) - - if args.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - dllogger.log(step="PARAMETER", data={"SEED": args.seed}) - - if n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - if not args.do_train and not args.do_predict: - raise ValueError("At least one of `do_train` or `do_predict` must be True.") - - if args.do_train: - if not args.train_file: - raise ValueError( - "If `do_train` is True, then `train_file` must be specified.") - if args.do_predict: - if not args.predict_file: - raise ValueError( - "If `do_predict` is True, then `predict_file` must be specified.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and os.listdir(args.output_dir)!=['logfile.txt']: - print("WARNING: Output directory {} already exists and is not empty.".format(args.output_dir), os.listdir(args.output_dir)) - if not os.path.exists(args.output_dir) and is_main_process(): - os.makedirs(args.output_dir) - - tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large - # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - train_examples = None - num_train_optimization_steps = None - if args.do_train: - train_examples = read_squad_examples( - input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - - # Prepare model - config = modeling.BertConfig.from_json_file(args.config_file) - # Padding for divisibility by 8 - if config.vocab_size % 8 != 0: - config.vocab_size += 8 - (config.vocab_size % 8) - - modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training - model = modeling.BertForQuestionAnswering(config) - # model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model, - # 
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) - dllogger.log(step="PARAMETER", data={"loading_checkpoint": True}) - model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False) - #model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False) - dllogger.log(step="PARAMETER", data={"loaded_checkpoint": True}) - model.to(device) - num_weights = sum([p.numel() for p in model.parameters() if p.requires_grad]) - dllogger.log(step="PARAMETER", data={"model_weights_num":num_weights}) - - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.do_train: - if args.fp16: - try: - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False) - - if args.loss_scale == 0: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, - loss_scale="dynamic") - else: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) - if args.do_train: - scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - global_step = 0 - - if args.do_prune: - # Pruning! 
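-        # The pruning agent drives sparsity through callbacks that train_func
-        # invokes at fixed points in the loop:
-        #   pre_epoch_begin -> on_epoch_begin -> [on_batch_begin ->
-        #   on_post_grad -> on_batch_end] per step -> on_epoch_end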
-        from intel_extension_for_transformers.transformers import NoTrainerOptimizer, PrunerConfig, PruningConfig
-        pruner_config = PrunerConfig(
-            prune_type="GroupLasso",
-            target_sparsity_ratio=0.7,
-            names=['bert.encoder.layer.0.attention.output.dense.weight'],
-            parameters={"alpha": 0.006, "pattern": "tile_pattern_1x2"},
-        )
-        pruning_conf = PruningConfig(pruner_config=pruner_config)
-        no_trainer_optimizer = NoTrainerOptimizer(model, output_dir=args.output_dir)
-        agent = no_trainer_optimizer.init_pruner(pruning_config=pruning_conf)
-
-        def train_func_nc(model):
-            # scheduler only exists when training with fp16; tokenizer is created in main()
-            return train_func(model, agent, args, dllogger, global_step, train_examples,
-                              num_train_optimization_steps, n_gpu, device, optimizer,
-                              scheduler if (args.do_train and args.fp16) else None, tokenizer)
-
-        def eval_func_nc(model):
-            return eval_func(model, args, dllogger, tokenizer, device)
-
-    if args.do_train:
-        # train_func(args, dllogger, global_step)
-        no_trainer_optimizer.train_func = train_func_nc
-
-
-    if args.do_train and is_main_process() and not args.skip_checkpoint:
-        # Save a trained model and the associated configuration
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
-        output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME)
-        torch.save({"model": model_to_save.state_dict()}, output_model_file)
-        output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME)
-        with open(output_config_file, 'w') as f:
-            f.write(model_to_save.config.to_json_string())
-
-    if args.do_predict and (args.local_rank == -1 or is_main_process()):
-        no_trainer_optimizer.eval_func = eval_func_nc
-
-    if args.do_prune:
-        model = no_trainer_optimizer.prune()
-
-    if args.do_train:
-        gpu_count = n_gpu
-        if torch.distributed.is_initialized():
-            gpu_count = torch.distributed.get_world_size()
-
-        if args.max_steps == -1:
-            dllogger.log(step=tuple(), data={"e2e_train_time": args.time_to_train,
-                                             "training_sequences_per_second": len(args.train_features) * args.num_train_epochs / args.time_to_train,
-                                             "final_loss": args.final_loss})
-        else:
-            # train_func stores the timings and loss on args, so read them back from there
-            dllogger.log(step=tuple(), data={"e2e_train_time": args.time_to_train,
-                                             "training_sequences_per_second": args.train_batch_size * args.gradient_accumulation_steps \
-                                             * args.max_steps * gpu_count / args.time_to_train,
-                                             "final_loss": args.final_loss})
-    if args.do_predict and is_main_process():
-        dllogger.log(step=tuple(), data={"e2e_inference_time": args.time_to_infer,
-                                         "inference_sequences_per_second": len(args.eval_features) / args.time_to_infer})
-    if args.do_eval and is_main_process():
-        dllogger.log(step=tuple(), data={"exact_match": args.exact_match, "F1": args.f1})
-
-if __name__ == "__main__":
-    main()
-    dllogger.flush()
-
diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py
deleted file mode 100644
index 4dd99b43a15..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from torch.optim.optimizer import Optimizer
-from torch.optim.lr_scheduler import _LRScheduler
-
-
-class LRScheduler(_LRScheduler):
-    def __init__(self, optimizer, last_epoch=-1):
-        # Check if using mixed precision training
-        self.mixed_training = False
-        base_optimizer = optimizer
-
-        # Check that optimizer param is valid
-        if not isinstance(optimizer, Optimizer):
-            raise TypeError('{} is not an Optimizer'.format(
-                type(optimizer).__name__))
-
-        super(LRScheduler, self).__init__(base_optimizer, last_epoch)
-
-    def step(self, epoch=None):
-        # Set the current training step
-        # ('epoch' is used to be consistent with _LRScheduler)
-        if self.mixed_training:
-            # The assumption is that the step will be constant
-            state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]]
-            if 'step' in state_dict:
-                self.last_epoch = state_dict['step'] + 1
-            else:
-                self.last_epoch = 1
-        else:
-            self.last_epoch = epoch if epoch is not None else self.last_epoch + 1
-
-        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
-            param_group['lr'] = lr
-
-
-class CosineWarmUpScheduler(LRScheduler):
-    """
-    Applies a linear warm-up period followed by cosine decay of the learning rate.
-    """
-
-    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
-        self.warmup = warmup
-        self.total_steps = total_steps
-        super(CosineWarmUpScheduler, self).__init__(optimizer, last_epoch)
-
-    def get_lr(self):
-        progress = self.last_epoch / self.total_steps
-        if progress < self.warmup:
-            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
-        else:
-            # standard cosine annealing from base_lr towards 0 as progress approaches 1
-            return [base_lr * (0.5 * (1.0 + math.cos(math.pi * progress))) for base_lr in self.base_lrs]
-
-
-class ConstantWarmUpScheduler(LRScheduler):
-    """
-    Applies a linear warm-up period followed by a constant learning rate.
-    """
-
-    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
-        self.warmup = warmup
-        self.total_steps = total_steps
-        super(ConstantWarmUpScheduler, self).__init__(optimizer, last_epoch)
-
-    def get_lr(self):
-        progress = self.last_epoch / self.total_steps
-        if progress < self.warmup:
-            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
-        else:
-            return self.base_lrs
-
-
-class LinearWarmUpScheduler(LRScheduler):
-    """
-    Applies a linear warm-up period followed by linear decay of the learning rate.
-    """
-
-    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
-        self.warmup = warmup
-        self.total_steps = total_steps
-        super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)
-
-    def get_lr(self):
-        progress = self.last_epoch / self.total_steps
-        if progress < self.warmup:
-            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
-        else:
-            return [base_lr * max((progress - 1.0) / (self.warmup - 1.0), 0.) for base_lr in self.base_lrs]
-
-
-class PolyWarmUpScheduler(LRScheduler):
-    """
-    Applies a linear warm-up period followed by polynomial decay of the learning rate.
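-
-    For example, with warmup=0.1 and the default degree=0.5, the learning rate
-    rises linearly from 0 to base_lr over the first 10% of steps and then
-    decays as base_lr * (1.0 - progress) ** 0.5 for the rest of training.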
- """ - - def __init__(self, optimizer, warmup, total_steps, degree=0.5, last_epoch=-1): - self.warmup = warmup - self.total_steps = total_steps - self.degree = degree - super(PolyWarmUpScheduler, self).__init__(optimizer, last_epoch) - - def step(self, epoch=None): - param_group = self.optimizer.param_groups[0] - if 'step' in param_group: - self.last_epoch = param_group['step'] + 1 - else: - self.last_epoch = 1 - - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - - def get_lr(self): - progress = self.last_epoch / self.total_steps - if progress < self.warmup: - return [base_lr * progress / self.warmup for base_lr in self.base_lrs] - else: - return [base_lr * ((1.0 - progress) ** self.degree) for base_lr in self.base_lrs] diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh deleted file mode 100644 index 581dd1db883..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -init_checkpoint=${1:-"/path/to/ckpt_8601.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"4"} -learning_rate=${4:-"3e-5"} -precision=${5:-"tf32"} -num_gpu="1" -seed="1" -BERT_PREP_WORKING_DIR=${6:-'/path/to/bert_data'} -squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" -vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" -OUT_DIR=${7:-"./results/SQuAD/"} -prune_config=${8:-"prune_bert.yaml"} -json_summary=${9:-"$OUT_DIR/dllogger.json"} -echo $init_checkpoint $epochs $batch_size $learning_rate \ -$precision $num_gpu $seed $squad_dir $vocab_file \ -$OUT_DIR $prune_config $json_summary - - -#init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"} -#epochs=${2:-"2.0"} -#batch_size=${3:-"4"} -#learning_rate=${4:-"3e-5"} -#precision=${5:-"fp16"} -#num_gpu=${6:-"8"} -#seed=${7:-"1"} -#squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"} -#vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"} -#OUT_DIR=${10:-"/workspace/bert/results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"} -CONFIG_FILE="$PWD/bert_config.json" -max_steps=${13:-"-1"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" 
- use_fp16=" --fp16 " -fi - -if [ "$num_gpu" = "1" ] ; then - export CUDA_VISIBLE_DEVICES=0 - mpi_command="" -else - unset CUDA_VISIBLE_DEVICES - mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu" -fi - -CMD="python $mpi_command run_squad_sparse.py " -CMD+="--do_prune " -CMD+="--prune_config=$prune_config " -CMD+="--json-summary=$json_summary " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" - -LOGFILE=$OUT_DIR/logfile.txt -echo "$CMD |& tee $LOGFILE" -time $CMD |& tee $LOGFILE diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py deleted file mode 100644 index fb3cffe20ca..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py +++ /dev/null @@ -1,392 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
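-
-# Three tokenizers cooperate below: BasicTokenizer handles text cleaning,
-# punctuation and CJK splitting; WordpieceTokenizer applies greedy
-# longest-match-first subword splitting; BertTokenizer chains the two end-to-end.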
- -"""Tokenization classes.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import collections -import logging -import os -import unicodedata -import six -from io import open - -from file_utils import cached_path - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] - else: - vocab_file = pretrained_model_name_or_path - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer won't index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. 
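-            never_split: Iterable of tokens that are never lower-cased or
-                split on punctuation, e.g. ("[UNK]", "[SEP]", "[CLS]").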
- """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. 
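-    # For example, "$" (36) and "`" (96) are in the ASCII ranges below even
-    # though Unicode classifies them as symbols rather than punctuation.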
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py
deleted file mode 100644
index f4f88e8eff9..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.distributed as dist
-
-from pathlib import Path
-
-
-def get_rank():
-    if not dist.is_available():
-        return 0
-    if not dist.is_initialized():
-        return 0
-    return dist.get_rank()
-
-
-def get_world_size():
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size()
-
-
-def is_main_process():
-    return get_rank() == 0
-
-
-def barrier():
-    if dist.is_available() and dist.is_initialized():
-        dist.barrier()
-
-
-def format_step(step):
-    if isinstance(step, str):
-        return step
-    s = ""
-    if len(step) > 0:
-        s += "Training Epoch: {} ".format(step[0])
-    if len(step) > 1:
-        s += "Training Iteration: {} ".format(step[1])
-    if len(step) > 2:
-        s += "Validation Iteration: {} ".format(step[2])
-    return s
-
-
-def mkdir(path):
-    Path(path).mkdir(parents=True, exist_ok=True)
-
-
-def mkdir_by_main_process(path):
-    if is_main_process():
-        mkdir(path)
-    barrier()
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md
deleted file mode 100644
index 9da0b53c3d2..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-Step-by-Step
-============
-
-This document lists the steps to reproduce the PyTorch longformer-base-4096 pruning results.
-
-
-# Prerequisite
-
-## 1. Environment
-
-```shell
-pip install intel-extension-for-transformers
-pip install -r requirements.txt
-pip install transformers==4.34.1
-```
->**Note**: Please use a transformers version no higher than 4.34.1.
-
-
-## 2. Prepare Dataset
-
-The dataset will be downloaded and converted to SQuAD format automatically with `./scripts/download_data_and_convert.sh`.
-
-```shell
-bash ./scripts/download_data_and_convert.sh
-```
-
-This generates two SQuAD-format files: `squad-wikipedia-train-4096.json` and `squad-wikipedia-dev-4096.json`.
-
-
-# Run Examples
-
-### Pruning longformer-base-4096
-
-Run `./scripts/longformer_base_sparse_global_4x1_pruning.sh` to prune with a global 80% sparsity and a `4x1` pattern; a conceptual sketch of the pattern is shown below, followed by the launch command. In this script, we set `per_device_train_batch_size=1`, the same as [the original longformer codes](https://github.com/allenai/longformer).
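-
-Conceptually, a `4x1` pattern scores contiguous blocks of four weights and zeroes the weakest blocks. The snippet below is only an illustrative sketch, not the library's implementation (the real pruner is configured through the training script, and the block orientation here is simplified):
-
-```python
-import torch
-
-def prune_4x1(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
-    """Zero the lowest-norm 4x1 blocks of a 2-D weight (illustration only)."""
-    out_features, in_features = weight.shape       # in_features must be divisible by 4
-    blocks = weight.reshape(out_features, in_features // 4, 4)
-    scores = blocks.norm(dim=-1)                   # one score per 4-weight block
-    k = max(int(scores.numel() * sparsity), 1)     # number of blocks to drop
-    threshold = scores.flatten().kthvalue(k).values
-    mask = (scores > threshold).unsqueeze(-1).to(weight.dtype)
-    return (blocks * mask).reshape(out_features, in_features)
-
-w = torch.randn(8, 16)
-print((prune_4x1(w, 0.8) == 0).float().mean())     # roughly 0.8 of the weights are zero
-```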
- -```shell -bash ./scripts/longformer_base_sparse_global_4x1_pruning.sh -``` - -Fine-tuning of the dense model is also supported by running the `./scripts/longformer_base_dense_fintune.sh` - - -### Results -The snip-momentum pruning method is used by default and the initial dense model is well fine-tuned. - -| Model | Dataset | Sparsity pattern | sparsity ratio | Dense F1 |Sparse F1 | Relative drop| -| :----: | :----: | :----: | :----: |:----: |:----:| :----: | -| longformer-base-4096 | triviaqa | 4x1 | global 80% | 75.2 (from [the paper](https://arxiv.org/abs/2004.05150))/74.9235 (ours) | 74.48 | -0.96% | - -## References -* [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) - diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py deleted file mode 100644 index 3a08b4aaf96..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py +++ /dev/null @@ -1,2282 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Longformer model.""" - -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN, gelu -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from transformers.models.longformer.configuration_longformer import LongformerConfig - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "allenai/longformer-base-4096" -_CONFIG_FOR_DOC = "LongformerConfig" -_TOKENIZER_FOR_DOC = "LongformerTokenizer" - -LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "allenai/longformer-base-4096", - "allenai/longformer-large-4096", - "allenai/longformer-large-4096-finetuned-triviaqa", - "allenai/longformer-base-4096-extra.pos.embd.only", - "allenai/longformer-large-4096-extra.pos.embd.only", - # See all Longformer models at https://huggingface.co/models?filter=longformer -] - - -@dataclass -class LongformerBaseModelOutput(ModelOutput): - """ - Base class for Longformer's outputs, with potential hidden states, local and global attentions. - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. 
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - last_hidden_state: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerBaseModelOutputWithPooling(ModelOutput): - """ - Base class for Longformer's outputs that also contains a pooling of the last hidden states. - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) further processed by a - Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence - prediction (classification) objective during pretraining. 
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - last_hidden_state: torch.FloatTensor - pooler_output: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerMaskedLMOutput(ModelOutput): - """ - Base class for masked language models outputs. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Masked language modeling (MLM) loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerQuestionAnsweringModelOutput(ModelOutput): - """ - Base class for outputs of question answering Longformer models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Span-start scores (before SoftMax). - end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Span-end scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - start_logits: torch.FloatTensor = None - end_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerSequenceClassifierOutput(ModelOutput): - """ - Base class for outputs of sentence classification models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. 
- Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerMultipleChoiceModelOutput(ModelOutput): - """ - Base class for outputs of multiple choice Longformer models. - Args: - loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided): - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): - *num_choices* is the second dimension of the input tensors. (see *input_ids* above). - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). 
Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerTokenClassifierOutput(ModelOutput): - """ - Base class for outputs of token classification models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. 
-        If the attention window contains a token with global attention, the attention weight at the corresponding
-        index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
-        attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
-        accessed from `global_attentions`.
-    global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
-        where `x` is the number of tokens with global attention mask.
-        Global attention weights after the attention softmax, used to compute the weighted average in the
-        self-attention heads. Those are the attention weights from every token with global attention to every token
-        in the sequence.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-    global_attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
-def _get_question_end_index(input_ids, sep_token_id):
-    """
-    Computes the index of the first occurrence of `sep_token_id`.
-    """
-
-    sep_token_indices = (input_ids == sep_token_id).nonzero()
-    batch_size = input_ids.shape[0]
-
-    assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
-    # this check is revised for our data preprocessing, but matches the original longformer code:
-    # https://github.com/allenai/longformer
-    assert sep_token_indices.shape[0] == 2 * batch_size, (
-        f"There should be exactly two separator tokens: {sep_token_id} in every sample for question answering. You"
-        " might also consider setting `global_attention_mask` manually in the forward function to avoid this error."
-    )
-    return sep_token_indices.view(batch_size, 2, 2)[:, 0, 1]
-
-
-def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
-    """
-    Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is
-    True` else after `sep_token_id`.
-    """
-    question_end_index = _get_question_end_index(input_ids, sep_token_id)
-    question_end_index = question_end_index.unsqueeze(dim=1)  # size: batch_size x 1
-    # bool attention mask with True in locations of global attention
-    attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)
-    if before_sep_token is True:
-        attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.uint8)
-    else:
-        # last token is separation token and should not be counted and in the middle are two separation tokens
-        attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.uint8) * (
-            attention_mask.expand_as(input_ids) < input_ids.shape[-1]
-        ).to(torch.uint8)
-
-    return attention_mask
-
-
-def create_position_ids_from_input_ids(input_ids, padding_idx):
-    """
-    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-    are ignored. This is modified from fairseq's `utils.make_positions`.
-    Args:
-        input_ids: torch.Tensor
-    Returns: torch.Tensor
-    """
-    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
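-    # Worked example (illustrative values, not from the original file): with padding_idx = 1,
-    #     input_ids             = [[5, 7, 9, 1, 1]]
-    #     mask                  = [[1, 1, 1, 0, 0]]
-    #     cumsum(mask) * mask   = [[1, 2, 3, 0, 0]]
-    #     returned position ids = [[2, 3, 4, 1, 1]]
-    # i.e. real tokens are numbered from padding_idx + 1 while pad positions keep padding_idx.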
- mask = input_ids.ne(padding_idx).int() - incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask - return incremental_indices.long() + padding_idx - - -class LongformerEmbeddings(nn.Module): - """ - Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. - """ - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - - self.padding_idx = config.pad_token_id - self.position_embeddings = nn.Embedding( - config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx - ) - - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): - if position_ids is None: - if input_ids is not None: - # Create the position ids from the input token ids. Any padded tokens remain padded. - position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device) - else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
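-        For example (illustrative values): with padding_idx = 1 and sequence_length = 4, this yields position ids [[2, 3, 4, 5]].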
- Args: - inputs_embeds: torch.Tensor inputs_embeds: - Returns: torch.Tensor - """ - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - - position_ids = torch.arange( - self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ) - return position_ids.unsqueeze(0).expand(input_shape) - - -class LongformerSelfAttention(nn.Module): - def __init__(self, config, layer_id): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - self.num_heads = config.num_attention_heads - self.head_dim = int(config.hidden_size / config.num_attention_heads) - self.embed_dim = config.hidden_size - - self.query = nn.Linear(config.hidden_size, self.embed_dim) - self.key = nn.Linear(config.hidden_size, self.embed_dim) - self.value = nn.Linear(config.hidden_size, self.embed_dim) - - # separate projection layers for tokens with global attention - self.query_global = nn.Linear(config.hidden_size, self.embed_dim) - self.key_global = nn.Linear(config.hidden_size, self.embed_dim) - self.value_global = nn.Linear(config.hidden_size, self.embed_dim) - - self.dropout = config.attention_probs_dropout_prob - - self.layer_id = layer_id - attention_window = config.attention_window[self.layer_id] - assert ( - attention_window % 2 == 0 - ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}" - assert ( - attention_window > 0 - ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}" - - self.one_sided_attn_window_size = attention_window // 2 - - def forward( - self, - hidden_states, - attention_mask=None, - layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - """ - [`LongformerSelfAttention`] expects *len(hidden_states)* to be multiple of *attention_window*. Padding to - *attention_window* happens in [`LongformerModel.forward`] to avoid redoing the padding on each layer. 
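-        For example (an illustrative sketch with hypothetical values), the padding applied in
-        [`LongformerModel.forward`] is computed as:
-        ```python
-        seq_len, attention_window = 1000, 512
-        padding_len = (attention_window - seq_len % attention_window) % attention_window
-        assert padding_len == 24  # 1000 + 24 = 1024, a multiple of attention_window
-        ```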
- The *attention_mask* is changed in [`LongformerModel.forward`] from 0, 1, 2 to: - - -10000: no attention - - 0: local attention - - +10000: global attention - """ - hidden_states = hidden_states.transpose(0, 1) - - # project hidden states - query_vectors = self.query(hidden_states) - key_vectors = self.key(hidden_states) - value_vectors = self.value(hidden_states) - - seq_len, batch_size, embed_dim = hidden_states.size() - assert ( - embed_dim == self.embed_dim - ), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}" - - # normalize query - query_vectors /= math.sqrt(self.head_dim) - - query_vectors = query_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - key_vectors = key_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - - attn_scores = self._sliding_chunks_query_key_matmul( - query_vectors, key_vectors, self.one_sided_attn_window_size - ) - - # values to pad for attention probs - remove_from_windowed_attention_mask = (attention_mask != 0)[:, :, None, None] - - # cast to fp32/fp16 then replace 1's with -inf - float_mask = remove_from_windowed_attention_mask.type_as(query_vectors).masked_fill( - remove_from_windowed_attention_mask, torch.finfo(query_vectors.dtype).min - ) - # diagonal mask with zeros everywhere and -inf inplace of padding - diagonal_mask = self._sliding_chunks_query_key_matmul( - float_mask.new_ones(size=float_mask.size()), float_mask, self.one_sided_attn_window_size - ) - - # pad local attention probs - attn_scores += diagonal_mask - - assert list(attn_scores.size()) == [ - batch_size, - seq_len, - self.num_heads, - self.one_sided_attn_window_size * 2 + 1, - ], ( - f"local_attn_probs should be of size ({batch_size}, {seq_len}, {self.num_heads}," - f" {self.one_sided_attn_window_size * 2 + 1}), but is of size {attn_scores.size()}" - ) - - # compute local attention probs from global attention keys and contact over window dim - if is_global_attn: - # compute global attn indices required through out forward fn - ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) = self._get_global_attn_indices(is_index_global_attn) - # calculate global attn probs from global key - - global_key_attn_scores = self._concat_with_global_key_attn_probs( - query_vectors=query_vectors, - key_vectors=key_vectors, - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - ) - # concat to local_attn_probs - # (batch_size, seq_len, num_heads, extra attention count + 2*window+1) - attn_scores = torch.cat((global_key_attn_scores, attn_scores), dim=-1) - - # free memory - del global_key_attn_scores - - attn_probs = nn.functional.softmax( - attn_scores, dim=-1, dtype=torch.float32 - ) # use fp32 for numerical stability - - if layer_head_mask is not None: - assert layer_head_mask.size() == ( - self.num_heads, - ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" - attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs - - # softmax sometimes inserts NaN if all positions are masked, replace them with 0 - attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0) - attn_probs = attn_probs.type_as(attn_scores) - - # free memory - 
del attn_scores - - # apply dropout - attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training) - - value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - - # compute local attention output with global attention value and add - if is_global_attn: - # compute sum of global and local attn - attn_output = self._compute_attn_output_with_global_indices( - value_vectors=value_vectors, - attn_probs=attn_probs, - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - ) - else: - # compute local attn only - attn_output = self._sliding_chunks_matmul_attn_probs_value( - attn_probs, value_vectors, self.one_sided_attn_window_size - ) - - assert attn_output.size() == (batch_size, seq_len, self.num_heads, self.head_dim), "Unexpected size" - attn_output = attn_output.transpose(0, 1).reshape(seq_len, batch_size, embed_dim).contiguous() - - # compute value for global attention and overwrite to attention output - # TODO: remove the redundant computation - if is_global_attn: - global_attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden( - hidden_states=hidden_states, - max_num_global_attn_indices=max_num_global_attn_indices, - layer_head_mask=layer_head_mask, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - is_index_masked=is_index_masked, - ) - - # get only non zero global attn output - nonzero_global_attn_output = global_attn_output[ - is_local_index_global_attn_nonzero[0], :, is_local_index_global_attn_nonzero[1] - ] - - # overwrite values with global attention - attn_output[is_index_global_attn_nonzero[::-1]] = nonzero_global_attn_output.view( - len(is_local_index_global_attn_nonzero[0]), -1 - ) - # The attention weights for tokens with global attention are - # just filler values, they were never used to compute the output. - # Fill with 0 now, the correct values are in 'global_attn_probs'. - attn_probs[is_index_global_attn_nonzero] = 0 - - outputs = (attn_output.transpose(0, 1),) - - if output_attentions: - outputs += (attn_probs,) - - return outputs + (global_attn_probs,) if (is_global_attn and output_attentions) else outputs - - @staticmethod - def _pad_and_transpose_last_two_dims(hidden_states_padded, padding): - """pads rows and then flips rows and columns""" - hidden_states_padded = nn.functional.pad( - hidden_states_padded, padding - ) # padding value is not important because it will be overwritten - hidden_states_padded = hidden_states_padded.view( - *hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2) - ) - return hidden_states_padded - - @staticmethod - def _pad_and_diagonalize(chunked_hidden_states): - """ - shift every row 1 step right, converting columns into diagonals. 
- Example: - ```python - chunked_hidden_states: [ - 0.4983, - 2.6918, - -0.0071, - 1.0492, - -1.8348, - 0.7672, - 0.2986, - 0.0285, - -0.7584, - 0.4206, - -0.0405, - 0.1599, - 2.0514, - -1.1600, - 0.5372, - 0.2629, - ] - window_overlap = num_rows = 4 - ``` - (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000 - 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206, - -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] - """ - total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size() - chunked_hidden_states = nn.functional.pad( - chunked_hidden_states, (0, window_overlap + 1) - ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten - chunked_hidden_states = chunked_hidden_states.view( - total_num_heads, num_chunks, -1 - ) # total_num_heads x num_chunks x window_overlap*window_overlap+window_overlap - chunked_hidden_states = chunked_hidden_states[ - :, :, :-window_overlap - ] # total_num_heads x num_chunks x window_overlap*window_overlap - chunked_hidden_states = chunked_hidden_states.view( - total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim - ) - chunked_hidden_states = chunked_hidden_states[:, :, :, :-1] - return chunked_hidden_states - - @staticmethod - def _chunk(hidden_states, window_overlap): - """convert into overlapping chunks. Chunk size = 2w, overlap size = w""" - - # non-overlapping chunks of size = 2w - hidden_states = hidden_states.view( - hidden_states.size(0), - hidden_states.size(1) // (window_overlap * 2), - window_overlap * 2, - hidden_states.size(2), - ) - - # use `as_strided` to make the chunks overlap with an overlap size = window_overlap - chunk_size = list(hidden_states.size()) - chunk_size[1] = chunk_size[1] * 2 - 1 - - chunk_stride = list(hidden_states.stride()) - chunk_stride[1] = chunk_stride[1] // 2 - return hidden_states.as_strided(size=chunk_size, stride=chunk_stride) - - @staticmethod - def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor: - beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0]) - beginning_mask = beginning_mask_2d[None, :, None, :] - ending_mask = beginning_mask.flip(dims=(1, 3)) - beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] - beginning_mask = beginning_mask.expand(beginning_input.size()) - beginning_input.masked_fill_(beginning_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 - ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] - ending_mask = ending_mask.expand(ending_input.size()) - ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 - - def _sliding_chunks_query_key_matmul(self, query: torch.Tensor, key: torch.Tensor, window_overlap: int): - """ - Matrix multiplication of query and key tensors using with a sliding window attention pattern. This - implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an - overlap of size window_overlap - """ - batch_size, seq_len, num_heads, head_dim = query.size() - assert ( - seq_len % (window_overlap * 2) == 0 - ), f"Sequence length should be multiple of {window_overlap * 2}. 
Given {seq_len}" - assert query.size() == key.size() - - chunks_count = seq_len // window_overlap - 1 - - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2 - query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - - query = self._chunk(query, window_overlap) - key = self._chunk(key, window_overlap) - - # matrix multiplication - # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim - # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim - # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap - diagonal_chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (query, key)) # multiply - - # convert diagonals into columns - diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims( - diagonal_chunked_attention_scores, padding=(0, 0, 0, 1) - ) - - # allocate space for the overall attention matrix where the chunks are combined. The last dimension - # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to - # window_overlap previous words). The following column is attention score from each word to itself, then - # followed by window_overlap columns for the upper triangle. - - diagonal_attention_scores = diagonal_chunked_attention_scores.new_empty( - (batch_size * num_heads, chunks_count + 1, window_overlap, window_overlap * 2 + 1) - ) - - # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions - # - copying the main diagonal and the upper triangle - diagonal_attention_scores[:, :-1, :, window_overlap:] = diagonal_chunked_attention_scores[ - :, :, :window_overlap, : window_overlap + 1 - ] - diagonal_attention_scores[:, -1, :, window_overlap:] = diagonal_chunked_attention_scores[ - :, -1, window_overlap:, : window_overlap + 1 - ] - # - copying the lower triangle - diagonal_attention_scores[:, 1:, :, :window_overlap] = diagonal_chunked_attention_scores[ - :, :, -(window_overlap + 1) : -1, window_overlap + 1 : - ] - - diagonal_attention_scores[:, 0, 1:window_overlap, 1:window_overlap] = diagonal_chunked_attention_scores[ - :, 0, : window_overlap - 1, 1 - window_overlap : - ] - - # separate batch_size and num_heads dimensions again - diagonal_attention_scores = diagonal_attention_scores.view( - batch_size, num_heads, seq_len, 2 * window_overlap + 1 - ).transpose(2, 1) - - self._mask_invalid_locations(diagonal_attention_scores, window_overlap) - return diagonal_attention_scores - - def _sliding_chunks_matmul_attn_probs_value( - self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int - ): - """ - Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. 
Returned tensor will be of the - same shape as `attn_probs` - """ - batch_size, seq_len, num_heads, head_dim = value.size() - - assert seq_len % (window_overlap * 2) == 0 - assert attn_probs.size()[:3] == value.size()[:3] - assert attn_probs.size(3) == 2 * window_overlap + 1 - chunks_count = seq_len // window_overlap - 1 - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size 2 window overlap - - chunked_attn_probs = attn_probs.transpose(1, 2).reshape( - batch_size * num_heads, seq_len // window_overlap, window_overlap, 2 * window_overlap + 1 - ) - - # group batch_size and num_heads dimensions into one - value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - - # pad seq_len with w at the beginning of the sequence and another window overlap at the end - padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1) - - # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap - chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim) - chunked_value_stride = padded_value.stride() - chunked_value_stride = ( - chunked_value_stride[0], - window_overlap * chunked_value_stride[1], - chunked_value_stride[1], - chunked_value_stride[2], - ) - chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride) - - chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs) - - context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value)) - return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2) - - @staticmethod - def _get_global_attn_indices(is_index_global_attn): - """compute global attn indices required throughout forward pass""" - # helper variable - num_global_attn_indices = is_index_global_attn.long().sum(dim=1) - - # max number of global attn indices in batch - max_num_global_attn_indices = num_global_attn_indices.max() - - # indices of global attn - is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True) - - # helper variable - is_local_index_global_attn = torch.arange( - max_num_global_attn_indices, device=is_index_global_attn.device - ) < num_global_attn_indices.unsqueeze(dim=-1) - - # location of the non-padding values within global attention indices - is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True) - - # location of the padding values within global attention indices - is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True) - return ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) - - def _concat_with_global_key_attn_probs( - self, - key_vectors, - query_vectors, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ): - batch_size = key_vectors.shape[0] - - # create only global key vectors - key_vectors_only_global = key_vectors.new_zeros( - batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim - ) - - key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero] - - # (batch_size, seq_len, num_heads, max_num_global_attn_indices) - attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global)) - - attn_probs_from_global_key[ - is_local_index_no_global_attn_nonzero[0], :, 
:, is_local_index_no_global_attn_nonzero[1] - ] = torch.finfo(attn_probs_from_global_key.dtype).min - - return attn_probs_from_global_key - - def _compute_attn_output_with_global_indices( - self, - value_vectors, - attn_probs, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - ): - batch_size = attn_probs.shape[0] - - # cut local attn probs to global only - attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices) - # get value vectors for global only - value_vectors_only_global = value_vectors.new_zeros( - batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim - ) - value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero] - - # use `matmul` because `einsum` crashes sometimes with fp16 - # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) - # compute attn output only global - attn_output_only_global = torch.matmul( - attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone() - ).transpose(1, 2) - - # reshape attn probs - attn_probs_without_global = attn_probs.narrow( - -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices - ).contiguous() - - # compute attn output with global - attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value( - attn_probs_without_global, value_vectors, self.one_sided_attn_window_size - ) - return attn_output_only_global + attn_output_without_global - - def _compute_global_attn_output_from_hidden( - self, - hidden_states, - max_num_global_attn_indices, - layer_head_mask, - is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - is_index_masked, - ): - seq_len, batch_size = hidden_states.shape[:2] - - # prepare global hidden states - global_attn_hidden_states = hidden_states.new_zeros(max_num_global_attn_indices, batch_size, self.embed_dim) - global_attn_hidden_states[is_local_index_global_attn_nonzero[::-1]] = hidden_states[ - is_index_global_attn_nonzero[::-1] - ] - - # global key, query, value - global_query_vectors_only_global = self.query_global(global_attn_hidden_states) - global_key_vectors = self.key_global(hidden_states) - global_value_vectors = self.value_global(hidden_states) - - # normalize - global_query_vectors_only_global /= math.sqrt(self.head_dim) - - # reshape - global_query_vectors_only_global = ( - global_query_vectors_only_global.contiguous() - .view(max_num_global_attn_indices, batch_size * self.num_heads, self.head_dim) - .transpose(0, 1) - ) # (batch_size * self.num_heads, max_num_global_attn_indices, head_dim) - global_key_vectors = ( - global_key_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) - ) # batch_size * self.num_heads, seq_len, head_dim) - global_value_vectors = ( - global_value_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) - ) # batch_size * self.num_heads, seq_len, head_dim) - - # compute attn scores - global_attn_scores = torch.bmm(global_query_vectors_only_global, global_key_vectors.transpose(1, 2)) - - assert list(global_attn_scores.size()) == [ - batch_size * self.num_heads, - max_num_global_attn_indices, - seq_len, - ], ( - "global_attn_scores have the wrong size. Size should be" - f" {(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)}, but is" - f" {global_attn_scores.size()}." 
- ) - - global_attn_scores = global_attn_scores.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len) - - global_attn_scores[ - is_local_index_no_global_attn_nonzero[0], :, is_local_index_no_global_attn_nonzero[1], : - ] = torch.finfo(global_attn_scores.dtype).min - - global_attn_scores = global_attn_scores.masked_fill( - is_index_masked[:, None, None, :], - torch.finfo(global_attn_scores.dtype).min, - ) - - global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len) - - # compute global attn probs - global_attn_probs_float = nn.functional.softmax( - global_attn_scores, dim=-1, dtype=torch.float32 - ) # use fp32 for numerical stability - - # apply layer head masking - if layer_head_mask is not None: - assert layer_head_mask.size() == ( - self.num_heads, - ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" - global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view( - batch_size, self.num_heads, max_num_global_attn_indices, seq_len - ) - global_attn_probs_float = global_attn_probs_float.view( - batch_size * self.num_heads, max_num_global_attn_indices, seq_len - ) - - global_attn_probs = nn.functional.dropout( - global_attn_probs_float.type_as(global_attn_scores), p=self.dropout, training=self.training - ) - - # global attn output - global_attn_output = torch.bmm(global_attn_probs, global_value_vectors) - - assert list(global_attn_output.size()) == [ - batch_size * self.num_heads, - max_num_global_attn_indices, - self.head_dim, - ], ( - "global_attn_output tensor has the wrong size. Size should be" - f" {(batch_size * self.num_heads, max_num_global_attn_indices, self.head_dim)}, but is" - f" {global_attn_output.size()}." 
- ) - - global_attn_probs = global_attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len) - global_attn_output = global_attn_output.view( - batch_size, self.num_heads, max_num_global_attn_indices, self.head_dim - ) - return global_attn_output, global_attn_probs - - -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class LongformerSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class LongformerAttention(nn.Module): - def __init__(self, config, layer_id=0): - super().__init__() - self.self = LongformerSelfAttention(config, layer_id) - self.output = LongformerSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - attn_output = self.output(self_outputs[0], hidden_states) - outputs = (attn_output,) + self_outputs[1:] - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate -class LongformerIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertOutput -class LongformerOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - 
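-        # Feed-forward output block: project the intermediate activation back to hidden_size,
-        # then apply dropout and a residual connection followed by LayerNorm with the block
-        # input (`input_tensor`).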
hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class LongformerLayer(nn.Module): - def __init__(self, config, layer_id=0): - super().__init__() - self.attention = LongformerAttention(config, layer_id) - self.intermediate = LongformerIntermediate(config) - self.output = LongformerOutput(config) - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - - def forward( - self, - hidden_states, - attention_mask=None, - layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - self_attn_outputs = self.attention( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - attn_output = self_attn_outputs[0] - outputs = self_attn_outputs[1:] - - layer_output = apply_chunking_to_forward( - self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attn_output - ) - outputs = (layer_output,) + outputs - return outputs - - def ff_chunk(self, attn_output): - intermediate_output = self.intermediate(attn_output) - layer_output = self.output(intermediate_output, attn_output) - return layer_output - - -class LongformerEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([LongformerLayer(config, layer_id=i) for i in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - padding_len=0, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - - is_index_masked = attention_mask < 0 - is_index_global_attn = attention_mask > 0 - is_global_attn = is_index_global_attn.flatten().any().item() - - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None # All local attentions. - all_global_attentions = () if (output_attentions and is_global_attn) else None - - # check if head_mask has a correct number of layers specified if desired - if head_mask is not None: - assert head_mask.size()[0] == ( - len(self.layer) - ), f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}." 
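-        # Iterate over the stacked `LongformerLayer`s, threading `hidden_states` through each
-        # layer and collecting hidden states plus (optionally) local and global attention
-        # tensors; `head_mask[idx]` selects the per-layer head mask validated above.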
- for idx, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, is_global_attn, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - head_mask[idx] if head_mask is not None else None, - is_index_masked, - is_index_global_attn, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=head_mask[idx] if head_mask is not None else None, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - hidden_states = layer_outputs[0] - - if output_attentions: - # bzs x seq_len x num_attn_heads x (num_global_attn + attention_window_len + 1) => bzs x num_attn_heads x seq_len x (num_global_attn + attention_window_len + 1) - all_attentions = all_attentions + (layer_outputs[1].transpose(1, 2),) - - if is_global_attn: - # bzs x num_attn_heads x num_global_attn x seq_len => bzs x num_attn_heads x seq_len x num_global_attn - all_global_attentions = all_global_attentions + (layer_outputs[2].transpose(2, 3),) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # undo padding - if padding_len > 0: - # unpad `hidden_states` because the calling function is expecting a length == input_ids.size(1) - hidden_states = hidden_states[:, :-padding_len] - if output_hidden_states: - all_hidden_states = tuple([state[:, :-padding_len] for state in all_hidden_states]) - - if output_attentions: - all_attentions = tuple([state[:, :, :-padding_len, :] for state in all_attentions]) - - if not return_dict: - return tuple( - v for v in [hidden_states, all_hidden_states, all_attentions, all_global_attentions] if v is not None - ) - return LongformerBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_attentions, - global_attentions=all_global_attentions, - ) - - -# Copied from transformers.models.bert.modeling_bert.BertPooler -class LongformerPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
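-        # e.g. for `hidden_states` of shape (batch_size, seq_len, hidden_size), the slice below
-        # yields (batch_size, hidden_size), which is then passed through dense + tanh.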
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Longformer -class LongformerLMHead(nn.Module): - """Longformer Head for masked language modeling.""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - self.decoder = nn.Linear(config.hidden_size, config.vocab_size) - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - self.decoder.bias = self.bias - - def forward(self, features, **kwargs): - x = self.dense(features) - x = gelu(x) - x = self.layer_norm(x) - - # project back to size of vocabulary with bias - x = self.decoder(x) - - return x - - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias - - -class LongformerPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = LongformerConfig - base_model_prefix = "longformer" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"position_ids"] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LongformerEncoder): - module.gradient_checkpointing = value - - -LONGFORMER_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - Parameters: - config ([`LongformerConfig`]): Model configuration class with all the parameters of the - model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -LONGFORMER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`LongformerTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - global_attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to decide the attention given on each token, local attention or global attention. Tokens with global - attention attends to all other tokens, and all other tokens attend to them. This is important for - task-specific finetuning because it makes the model more flexible at representing the task. For example, - for classification, the token should be given global attention. For QA, all question tokens should also - have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more - details. Mask values selected in `[0, 1]`: - - 0 for local attention (a sliding window attention), - - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them). - head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, - 1]`: - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
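-    Example (an illustrative sketch; which tokens receive global attention is task-dependent and the
-    values below are hypothetical):
-    ```python
-    import torch
-
-    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])  # 1 = attend, 0 = padding
-    global_attention_mask = torch.zeros_like(attention_mask)  # local attention everywhere by default
-    global_attention_mask[:, 0] = 1  # give the first (<s>) token global attention
-    ```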
-""" - - -@add_start_docstrings( - "The bare Longformer Model outputting raw hidden-states without any specific head on top.", - LONGFORMER_START_DOCSTRING, -) -class LongformerModel(LongformerPreTrainedModel): - """ - This class copied code from [`RobertaModel`] and overwrote standard self-attention with longformer self-attention - to provide the ability to process long sequences following the self-attention approach described in [Longformer: - the Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, and Arman Cohan. - Longformer self-attention combines a local (sliding window) and global attention to extend to long documents - without the O(n^2) increase in memory and compute. - The self-attention module `LongformerSelfAttention` implemented here supports the combination of local and global - attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and dilated - attention are more relevant for autoregressive language modeling than finetuning on downstream tasks. Future - release will add support for autoregressive attention, but the support for dilated attention requires a custom CUDA - kernel to be memory and compute efficient. - """ - - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.config = config - - if isinstance(config.attention_window, int): - assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value" - assert config.attention_window > 0, "`config.attention_window` has to be positive" - config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer - else: - assert len(config.attention_window) == config.num_hidden_layers, ( - "`len(config.attention_window)` should equal `config.num_hidden_layers`. " - f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" - ) - - self.embeddings = LongformerEmbeddings(config) - self.encoder = LongformerEncoder(config) - self.pooler = LongformerPooler(config) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def _pad_to_window_size( - self, - input_ids: torch.Tensor, - attention_mask: torch.Tensor, - token_type_ids: torch.Tensor, - position_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - pad_token_id: int, - ): - """A helper function to pad tokens and mask to work with implementation of Longformer self-attention.""" - # padding - attention_window = ( - self.config.attention_window - if isinstance(self.config.attention_window, int) - else max(self.config.attention_window) - ) - - assert attention_window % 2 == 0, f"`attention_window` should be an even value. 
Given {attention_window}" - input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape - batch_size, seq_len = input_shape[:2] - - padding_len = (attention_window - seq_len % attention_window) % attention_window - if padding_len > 0: - ''' - logger.info( - f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " - f"`config.attention_window`: {attention_window}" - ) - ''' - if input_ids is not None: - input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) - if position_ids is not None: - # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings - position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) - if inputs_embeds is not None: - input_ids_padding = inputs_embeds.new_full( - (batch_size, padding_len), - self.config.pad_token_id, - dtype=torch.long, - ) - inputs_embeds_padding = self.embeddings(input_ids_padding) - inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) - - attention_mask = nn.functional.pad( - attention_mask, (0, padding_len), value=False - ) # no attention on the padding tokens - token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 - - return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds - - def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor): - # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) - # (global_attention_mask + 1) => 1 for local attention, 2 for global attention - # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention - if attention_mask is not None: - attention_mask = attention_mask * (global_attention_mask + 1) - else: - # simply use `global_attention_mask` as `attention_mask` - # if no `attention_mask` is given - attention_mask = global_attention_mask + 1 - return attention_mask - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=LongformerBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerBaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> import torch - >>> from transformers import LongformerModel, LongformerTokenizer - >>> model = LongformerModel.from_pretrained("allenai/longformer-base-4096") - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - >>> SAMPLE_TEXT = " ".join(["Hello world! "] * 1000) # long input document - >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 - >>> attention_mask = torch.ones( - ... input_ids.shape, dtype=torch.long, device=input_ids.device - ... ) # initialize to local attention - >>> global_attention_mask = torch.zeros( - ... input_ids.shape, dtype=torch.long, device=input_ids.device - ... 
) # initialize to global attention to be deactivated for all tokens
- >>> global_attention_mask[
- ... :,
- ... [
- ... 1,
- ... 4,
- ... 21,
- ... ],
- ... ] = 1 # Set global attention to random tokens for the sake of this example
- >>> # Usually, set global attention based on the task. For example,
- >>> # classification: the <s> token
- >>> # QA: question tokens
- >>> # LM: potentially on the beginning of sentences and paragraphs
- >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
- >>> sequence_output = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output
- ```"""
-
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
- elif input_ids is not None:
- input_shape = input_ids.size()
- elif inputs_embeds is not None:
- input_shape = inputs_embeds.size()[:-1]
- else:
- raise ValueError("You have to specify either input_ids or inputs_embeds")
-
- device = input_ids.device if input_ids is not None else inputs_embeds.device
-
- if attention_mask is None:
- attention_mask = torch.ones(input_shape, device=device)
- if token_type_ids is None:
- token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
- # merge `global_attention_mask` and `attention_mask`
- if global_attention_mask is not None:
- attention_mask = self._merge_to_attention_mask(attention_mask, global_attention_mask)
-
- padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self._pad_to_window_size(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- inputs_embeds=inputs_embeds,
- pad_token_id=self.config.pad_token_id,
- )
-
- # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
- # ourselves in which case we just need to make it broadcastable to all heads.
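# Editor's illustration (hedged sketch, not original code): `_merge_to_attention_mask` above
# folds both masks into one tensor, e.g. attention_mask=[1, 1, 1, 0] merged with
# global_attention_mask=[1, 0, 0, 0] yields [2, 1, 1, 0], where 0 = no attention (padding),
# 1 = local sliding-window attention, and 2 = global attention.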
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[
- :, 0, 0, :
- ]
-
- embedding_output = self.embeddings(
- input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
- )
-
- encoder_outputs = self.encoder(
- embedding_output,
- attention_mask=extended_attention_mask,
- head_mask=head_mask,
- padding_len=padding_len,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- sequence_output = encoder_outputs[0]
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
-
- if not return_dict:
- return (sequence_output, pooled_output) + encoder_outputs[1:]
-
- return LongformerBaseModelOutputWithPooling(
- last_hidden_state=sequence_output,
- pooler_output=pooled_output,
- hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions,
- global_attentions=encoder_outputs.global_attentions,
- )
-
-
-@add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING)
-class LongformerForMaskedLM(LongformerPreTrainedModel):
-
- _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
- def __init__(self, config):
- super().__init__(config)
-
- self.longformer = LongformerModel(config, add_pooling_layer=False)
- self.lm_head = LongformerLMHead(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_output_embeddings(self):
- return self.lm_head.decoder
-
- def set_output_embeddings(self, new_embeddings):
- self.lm_head.decoder = new_embeddings
-
- @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @replace_return_docstrings(output_type=LongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- global_attention_mask: Optional[torch.Tensor] = None,
- head_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- inputs_embeds: Optional[torch.Tensor] = None,
- labels: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, LongformerMaskedLMOutput]:
- r"""
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
- config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
- loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
- kwargs (`Dict[str, any]`, optional, defaults to *{}*):
- Used to hide legacy arguments that have been deprecated.
- Returns:
- Mask filling example:
- ```python
- >>> from transformers import LongformerTokenizer, LongformerForMaskedLM
- >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
- >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
- ```
- Let's try a very long input.
- ```python
- >>> TXT = (
- ... "My friends are <mask> but they eat too many carbs."
- ... + " That's why I decide not to eat with them." * 300
- ...
) - >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] - >>> logits = model(input_ids).logits - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - >>> tokenizer.decode(predictions).split() - ['healthy', 'skinny', 'thin', 'good', 'vegetarian'] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.lm_head(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return LongformerMaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -@add_start_docstrings( - """ - Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForSequenceClassification(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.classifier = LongformerClassificationHead(config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="jpelhaw/longformer-base-plagiarism-detection", - output_type=LongformerSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_output="'ORIGINAL'", - expected_loss=5.44, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerSequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
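As a toy illustration of these label conventions (an editor's sketch mirroring the loss selection in `forward` below, not original documentation):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

logits = torch.randn(2, 3)                # (batch_size, num_labels)
int_labels = torch.tensor([0, 2])         # integer labels -> single-label cross-entropy
multi_hot = torch.tensor([[1., 0., 1.],
                          [0., 1., 0.]])  # float multi-hot labels -> BCE-with-logits
ce = CrossEntropyLoss()(logits.view(-1, 3), int_labels.view(-1))
bce = BCEWithLogitsLoss()(logits, multi_hot)
# with num_labels == 1 and float labels, an MSE (regression) loss is used instead
```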
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if global_attention_mask is None: - logger.info("Initializing global attention on CLS token...") - global_attention_mask = torch.zeros_like(input_ids) - # global attention on cls token - global_attention_mask[:, 0] = 1 - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerSequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -class LongformerClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, hidden_states, **kwargs): - hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - output = self.out_proj(hidden_states) - return output - - -@add_start_docstrings( - """ - Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD / - TriviaQA (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForQuestionAnswering(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerQuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - Returns: - Examples: - ```python - >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering - >>> import torch - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - >>> encoding = tokenizer(question, text, return_tensors="pt") - >>> input_ids = encoding["input_ids"] - >>> # default is local attention everywhere - >>> # the forward method will automatically set global attention on question tokens - >>> attention_mask = encoding["attention_mask"] - >>> outputs = model(input_ids, attention_mask=attention_mask) - >>> start_logits = outputs.start_logits - >>> end_logits = outputs.end_logits - >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) - >>> answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1] - >>> answer = tokenizer.decode( - ... tokenizer.convert_tokens_to_ids(answer_tokens) - ... ) # remove space prepending space token - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if global_attention_mask is None: - if input_ids is None: - logger.warning( - "It is not possible to automatically generate the `global_attention_mask` because input_ids is" - " None. Please make sure that it is correctly set." 
- )
- else:
- # set global attention on question tokens automatically
- global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id)
-
- outputs = self.longformer(
- input_ids,
- attention_mask=attention_mask,
- global_attention_mask=global_attention_mask,
- head_mask=head_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
- # batch size is 1 and inputs are not padded to max_seq_length, so strip the attention-window
- # padding (pad token id 1) from the output below.
- padding_len = input_ids[0].eq(1).sum()
- if padding_len > 0:
- sequence_output = sequence_output[:, :-padding_len]
-
- logits = self.qa_outputs(sequence_output)
- start_logits, end_logits = logits.split(1, dim=-1)
- start_logits = start_logits.squeeze(-1).contiguous()
- end_logits = end_logits.squeeze(-1).contiguous()
-
- # align to original longformer loss.
- regular_softmax_loss = False
-
- total_loss = None
- if start_positions is not None and end_positions is not None:
- # If we are on multi-GPU, splitting can add a dimension; squeeze it away
- if len(start_positions.size()) > 1:
- start_positions = start_positions.squeeze(-1)
- if len(end_positions.size()) > 1:
- end_positions = end_positions.squeeze(-1)
-
- if not regular_softmax_loss:
- # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
- # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
- # but batch size is always 1, so this is not a problem
- start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
- end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
- else:
- loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
- start_positions = start_positions[:, 0:1]
- end_positions = end_positions[:, 0:1]
- start_loss = loss_fct(start_logits, start_positions[:, 0])
- end_loss = loss_fct(end_logits, end_positions[:, 0])
-
- total_loss = (start_loss + end_loss) / 2
-
- if not return_dict:
- output = (start_logits, end_logits) + outputs[2:]
- return ((total_loss,) + output) if total_loss is not None else output
-
- return LongformerQuestionAnsweringModelOutput(
- loss=total_loss,
- start_logits=start_logits,
- end_logits=end_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- global_attentions=outputs.global_attentions,
- )
-
- def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1):
- """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
- assert logits.ndim == 2
- assert target.ndim == 2
- assert logits.size(0) == target.size(0)
-
- # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target
- # here, the numerator is the sum of a few potential targets, some of which are the correct answer
-
- # compute a target mask
- target_mask = target == ignore_index
- # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
- masked_target = target * (1 - target_mask.long())
- # gather logits
- gathered_logits = logits.gather(dim=dim, index=masked_target)
- # Apply the mask to gathered_logits.
Use a mask of -inf because exp(-inf) = 0 - gathered_logits[target_mask] = float('-inf') - - # each batch is one example - gathered_logits = gathered_logits.view(1, -1) - logits = logits.view(1, -1) - - # numerator = log(sum(exp(gathered logits))) - log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False) - # denominator = log(sum(exp(logits))) - log_norm = torch.logsumexp(logits, dim=dim, keepdim=False) - - # compute the loss - loss = -(log_score - log_norm) - - # some of the examples might have a loss of `inf` when `target` is all `ignore_index`. - # remove those from the loss before computing the sum. Use sum instead of mean because - # it is easier to compute - return loss[~torch.isinf(loss)].sum() - - -@add_start_docstrings( - """ - Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForTokenClassification(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="brad1141/Longformer-finetuned-norm", - output_type=LongformerTokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_output=( - "['Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence'," - " 'Evidence', 'Evidence', 'Evidence', 'Evidence']" - ), - expected_loss=0.63, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerTokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
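For example (editor's toy sketch of the per-token loss computed in `forward` below, not original documentation):

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels, batch_size, seq_len = 3, 2, 4
logits = torch.randn(batch_size, seq_len, num_labels)         # per-token class scores
labels = torch.randint(0, num_labels, (batch_size, seq_len))  # one label per token
loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
```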
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerTokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -@add_start_docstrings( - """ - Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and - a softmax) e.g. for RocStories/SWAG tasks. - """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForMultipleChoice(LongformerPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.longformer = LongformerModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") - ) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=LongformerMultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerMultipleChoiceModelOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # set global attention on question tokens - if global_attention_mask is None and input_ids is not None: - logger.info("Initializing global attention on multiple choice...") - # put global attention on all tokens after `config.sep_token_id` - global_attention_mask = torch.stack( - [ - _compute_global_attention_mask(input_ids[:, i], self.config.sep_token_id, before_sep_token=False) - for i in range(num_choices) - ], - dim=1, - ) - - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - flat_global_attention_mask = ( - global_attention_mask.view(-1, global_attention_mask.size(-1)) - if global_attention_mask is not None - else None - ) - flat_inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - outputs = self.longformer( - flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - global_attention_mask=flat_global_attention_mask, - head_mask=head_mask, - inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerMultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt deleted file mode 100644 index 84310f9ea50..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -accelerate -datasets -transformers -torch==2.3.0 -neural-compressor==2.0 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py deleted file mode 100644 index a0ff5e1e30a..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py +++ /dev/null @@ -1,1305 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for question answering. -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. -""" -This script is based on HuggingFace/transformers example: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/run_qa.py -Changes made to the script: - 1. Added pruning capabilities - 2. Added model distillation capabilities - 3. Added learning rate rewinding option - 4. Added methods to save all hyper-parameters used - 5. Added quantization capabilities -""" - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional -from collections import defaultdict -from tqdm.auto import tqdm -import math - -import torch -import datasets -from datasets import load_dataset, load_metric - -import transformers -from trainer_qa import QuestionAnsweringTrainer -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - TrainingArguments, - default_data_collator, - set_seed, - get_scheduler, - CONFIG_MAPPING, - MODEL_MAPPING, - SchedulerType -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version -from transformers.file_utils import get_full_repo_name - -from utils_qa import postprocess_qa_predictions - -from huggingface_hub import Repository - -from functools import partial -from accelerate import Accelerator -from torch.utils.data import DataLoader -import argparse -from accelerate.logging import get_logger -import numpy as np -import utils_qa -import json -from neural_compressor.training import Pruning, prepare_compression -from neural_compressor.training import WeightPruningConfig - -os.environ["WANDB_DISABLED"] = "true" -os.environ["HTTP_PROXY"] = "" - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.6.0") - -logger = get_logger(__name__) -# You should update this to your particular problem to have better documentation of `model_type` -MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - -# (['loss', 'start_logits', 'end_logits']) -# batch(['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'] -def get_loss_one_logit(student_logit, teacher_logit): - t = 2.0 - from torch.nn import functional as F - return F.kl_div( - input=F.log_softmax(student_logit / t, dim=-1), - target=F.softmax(teacher_logit / t, dim=-1), - reduction="batchmean" - ) * (t ** 2) - -def save_prefixed_metrics(results, output_dir, file_name: str = "all_results.json", metric_key_prefix: str = "eval"): - """ - Save results while prefixing metric names. - Args: - results: (:obj:`dict`): - A dictionary of results. 
- output_dir: (:obj:`str`):
- An output directory.
- file_name: (:obj:`str`, `optional`, defaults to :obj:`all_results.json`):
- An output file name.
- metric_key_prefix: (:obj:`str`, `optional`, defaults to :obj:`eval`):
- A metric name prefix.
- """
- # Prefix all keys with metric_key_prefix + '_'
- for key in list(results.keys()):
- if not key.startswith(f"{metric_key_prefix}_"):
- results[f"{metric_key_prefix}_{key}"] = results.pop(key)
-
- with open(os.path.join(output_dir, file_name), "w") as f:
- json.dump(results, f, indent=4)
-
-def parse_args():
- parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task")
- parser.add_argument(
- "--dataset_name",
- type=str,
- default=None,
- help="The name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--dataset_config_name",
- type=str,
- default=None,
- help="The configuration name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--train_file",
- type=str,
- default=None,
- help="A csv or a json file containing the training data."
- )
- parser.add_argument(
- "--preprocessing_num_workers",
- type=int, default=10,
- help="The number of processes to use for preprocessing."
- )
-
- parser.add_argument(
- "--do_predict",
- action="store_true",
- help="To do prediction on the question answering model"
- )
- parser.add_argument(
- "--validation_file",
- type=str,
- default=None,
- help="A csv or a json file containing the validation data."
- )
- parser.add_argument(
- "--test_file",
- type=str,
- default=None,
- help="A csv or a json file containing the prediction data."
- )
- parser.add_argument(
- "--max_seq_length",
- type=int,
- default=384,
- help=(
- "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
- " sequences shorter will be padded if `--pad_to_max_length` is passed."
- ),
- )
- parser.add_argument(
- "--pad_to_max_length",
- action="store_true",
- help="If passed, pad all samples to `max_seq_length`. Otherwise, dynamic padding is used.",
- )
- parser.add_argument(
- "--model_name_or_path",
- type=str,
- help="Path to pretrained model or model identifier from huggingface.co/models."
- )
- parser.add_argument(
- "--teacher_model_name_or_path",
- type=str,
- default=None,
- help="Path to pretrained model or model identifier from huggingface.co/models.",
- required=False
- )
- parser.add_argument(
- "--config_name",
- type=str,
- default=None,
- help="Pretrained config name or path if not the same as model_name",
- )
- parser.add_argument(
- "--tokenizer_name",
- type=str,
- default=None,
- help="Pretrained tokenizer name or path if not the same as model_name",
- )
- parser.add_argument(
- "--use_slow_tokenizer",
- action="store_true",
- help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
- )
- parser.add_argument(
- "--per_device_train_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the training dataloader.",
- )
- parser.add_argument(
- "--distill_loss_weight",
- type=float,
- default=0.0,
- help="Distillation loss weight."
- )
- parser.add_argument(
- "--per_device_eval_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the evaluation dataloader.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=5e-5,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--weight_decay",
- type=float,
- default=0.0,
- help="Weight decay to use."
- )
- parser.add_argument(
- "--num_train_epochs",
- type=int,
- default=3,
- help="Total number of training epochs to perform."
- )
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of update steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--lr_scheduler_type",
- type=SchedulerType,
- default="linear",
- help="The scheduler type to use.",
- choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
- )
-
- parser.add_argument(
- "--warm_epochs",
- type=int,
- default=0,
- help="Number of warm-up epochs during which the network is not pruned."
- )
- parser.add_argument(
- "--num_warmup_steps",
- type=int,
- default=0,
- help="Number of steps for the warmup in the lr scheduler."
- )
- parser.add_argument(
- "--output_dir",
- type=str,
- default=None,
- help="Where to store the final model."
- )
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training."
- )
- parser.add_argument(
- "--doc_stride",
- type=int,
- default=128,
- help="When splitting up a long document into chunks how much stride to take between chunks.",
- )
- parser.add_argument(
- "--n_best_size",
- type=int,
- default=20,
- help="The total number of n-best predictions to generate when looking for an answer.",
- )
- parser.add_argument(
- "--null_score_diff_threshold",
- type=float,
- default=0.0,
- help=(
- "The threshold used to select the null answer: if the best answer has a score that is less than "
- "the score of the null answer minus this threshold, the null answer is selected for this example. "
- "Only useful when `version_2_with_negative=True`."
- ),
- )
- parser.add_argument(
- "--version_2_with_negative",
- action="store_true",
- help="If true, some of the examples do not have an answer.",
- )
- parser.add_argument(
- "--max_answer_length",
- type=int,
- default=30,
- help=(
- "The maximum length of an answer that can be generated.
This is needed because the start "
- "and end predictions are not conditioned on one another."
- ),
- )
- parser.add_argument(
- "--max_train_samples",
- type=int,
- default=None,
- help=(
- "For debugging purposes or quicker training, truncate the number of training examples to this "
- "value if set."
- ),
- )
- parser.add_argument(
- "--max_eval_samples",
- type=int,
- default=None,
- help=(
- "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
- "value if set."
- ),
- )
- parser.add_argument(
- "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
- )
- parser.add_argument(
- "--max_predict_samples",
- type=int,
- default=None,
- help="For debugging purposes or quicker training, truncate the number of prediction examples to this value if set.",
- )
- parser.add_argument(
- "--model_type",
- type=str,
- default=None,
- help="Model type to use if training from scratch.",
- choices=MODEL_TYPES,
- )
- parser.add_argument(
- "--cooldown_epochs",
- type=int, default=0,
- help="Cooling epochs after pruning."
- )
- parser.add_argument(
- "--do_prune", action="store_true",
- help="Whether or not to prune the model"
- )
- parser.add_argument(
- "--pruning_scope",
- type=str, default="global",
- help="Pruning scope; we support global and local."
- )
- parser.add_argument(
- "--pruning_pattern",
- type=str, default="4x1",
- help="Pruning pattern type; we support NxM and N:M."
- )
- parser.add_argument(
- "--target_sparsity",
- type=float, default=0.8,
- help="Target sparsity of the model."
- )
- parser.add_argument(
- "--pruning_frequency",
- type=int, default=-1,
- help="Sparse step frequency for iterative pruning, default to a quarter of pruning steps."
- )
-
- parser.add_argument(
- "--keep_conf", action="store_true",
- help="Whether or not to keep the pruning config info"
- )
- parser.add_argument(
- "--pruning_config",
- type=str,
- help="pruning_config"
- )
-
- parser.add_argument(
- "--push_to_hub",
- action="store_true",
- help="Whether or not to push the model to the Hub."
- )
- parser.add_argument(
- "--hub_model_id",
- type=str,
- help="The name of the repository to keep in sync with the local `output_dir`."
- )
- parser.add_argument(
- "--hub_token",
- type=str,
- help="The token to use to push to the Model Hub."
- )
- parser.add_argument(
- "--checkpointing_steps",
- type=str,
- default=None,
- help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
- )
- parser.add_argument(
- "--resume_from_checkpoint",
- type=str,
- default=None,
- help="If the training should continue from a checkpoint folder.",
- )
-
- parser.add_argument(
- "--with_tracking",
- action="store_true",
- help="Whether to enable experiment trackers for logging.",
- )
- parser.add_argument(
- "--report_to",
- type=str,
- default="all",
- help=(
- 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
- ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
- " Only applicable when `--with_tracking` is passed."
- ), - ) - - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="Path to directory to store the pretrained models downloaded from huggingface.co", - ) - - parser.add_argument( - "--model_revision", - type=str, - default="main", - help="The specific model version to use (can be a branch name, tag name or commit id).", - ) - - parser.add_argument( - "--use_auth_token", - type=bool, - default=False, - help="Will use the token generated when running `transformers-cli login` (necessary to use this script with private models).", - ) - - parser.add_argument( - "--do_train", - action="store_true", - help="Whether to run training.", - ) - - parser.add_argument( - "--do_eval", - action="store_true", - help="Whether to run eval on the dev set.", - ) - - args = parser.parse_args() - - # Sanity checks - if ( - args.dataset_name is None - and args.train_file is None - and args.validation_file is None - and args.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation/test file.") - else: - if args.train_file is not None: - extension = args.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if args.validation_file is not None: - extension = args.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if args.test_file is not None: - extension = args.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - if args.push_to_hub: - assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." - - return args - -def main(): - - args = parse_args() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - # send_example_telemetry("run_qa_no_trainer", args) - - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers - # in the environment - - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) - - ''' - accelerator_log_kwargs = {} - if args.with_tracking: - accelerator_log_kwargs["log_with"] = args.report_to - accelerator_log_kwargs["logging_dir"] = args.output_dir - accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) - ''' - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. 
- if args.seed is not None:
- set_seed(args.seed)
-
- # Handle the repository creation
- if accelerator.is_main_process:
- if args.push_to_hub:
- if args.hub_model_id is None:
- # `Path` is not imported at module level, so import it here before use
- from pathlib import Path
- repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
- else:
- repo_name = args.hub_model_id
- repo = Repository(args.output_dir, clone_from=repo_name)
-
- with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
- if "step_*" not in gitignore:
- gitignore.write("step_*\n")
- if "epoch_*" not in gitignore:
- gitignore.write("epoch_*\n")
- elif args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
- accelerator.wait_for_everyone()
-
- script_path = os.path.split(os.path.abspath(__file__))[0]
-
- # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
- # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
- # (the dataset will be downloaded automatically from the datasets Hub).
- #
- # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
- # 'text' is found. You can easily tweak this behavior (see below).
- #
- # In distributed training, the load_dataset function guarantees that only one local process can concurrently
- # download the dataset.
- if args.dataset_name is not None:
- # Downloading and loading a dataset from the hub.
- raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir)
- else:
- data_files = {}
- if args.train_file is not None:
- data_files["train"] = args.train_file
- extension = args.train_file.split(".")[-1]
-
- if args.validation_file is not None:
- data_files["dev"] = args.validation_file
- extension = args.validation_file.split(".")[-1]
- if args.test_file is not None:
- data_files["test"] = args.test_file
- extension = args.test_file.split(".")[-1]
- # datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
- raw_datasets = load_dataset(os.path.join(script_path, "squad.py"), data_files=data_files, cache_dir=args.cache_dir)
-
- # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
- # https://huggingface.co/docs/datasets/loading_datasets.html.
-
- # Load pretrained model and tokenizer
- #
- # Distributed training:
- # The .from_pretrained methods guarantee that only one local process can concurrently
- # download model & vocab.
- config = AutoConfig.from_pretrained( - args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir, - use_fast=True, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - - # local py module - from modeling_longformer import LongformerForQuestionAnswering - model_class = LongformerForQuestionAnswering - - if args.distill_loss_weight > 0: - teacher_path = args.teacher_model_name_or_path - if teacher_path is None: - teacher_path = args.model_name_or_path - teacher_model = model_class.from_pretrained( - teacher_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - ) - - if args.model_name_or_path: - model = model_class.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = model_class.from_config(config) - - # Preprocessing the datasets. - # Preprocessing is slightly different for training and evaluation. - if args.do_train: - column_names = raw_datasets["train"].column_names - elif args.do_eval: - column_names = raw_datasets["validation"].column_names - else: - column_names = raw_datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). 
- # pad_on_right = tokenizer.padding_side == "right"
- # max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
- max_seq_length = args.max_seq_length
-
- # preprocess context and answers
- def preprocess_context(examples):
- new_examples = {}
-
- def is_whitespace(c):
- if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
- return True
- return False
-
- def pre_tokenize(p):
- doc_tokens = []
- char_to_word_offset = []
- prev_is_whitespace = True
- for c in p:
- if is_whitespace(c):
- prev_is_whitespace = True
- else:
- if prev_is_whitespace:
- doc_tokens.append(c)
- else:
- doc_tokens[-1] += c
- prev_is_whitespace = False
- char_to_word_offset.append(len(doc_tokens) - 1)
-
- return ' '.join(doc_tokens), char_to_word_offset
-
- new_examples[context_column_name] = []
- new_examples["answer_spans"] = []
- for i, p in enumerate(examples[context_column_name]):
- tokenized_p, char_to_word_offset = pre_tokenize(p)
- new_examples[context_column_name].append(tokenized_p)
-
- answer_spans = []
- for orig_answer_text, answer_offset in zip(examples[answer_column_name][i]['text'], examples[answer_column_name][i]['answer_start']):
- answer_length = len(orig_answer_text)
- try:
- start_position = char_to_word_offset[answer_offset]
- end_position = char_to_word_offset[answer_offset + answer_length - 1]
- token_ids = tokenizer.encode(orig_answer_text)
- except RuntimeError:
- logger.info(f'Reading example {i} failed')
- start_position = 0
- end_position = 0
- token_ids = []  # ensure token_ids is defined when reading the answer fails
- answer_spans.append({'start': start_position, 'end': end_position,
- 'text': orig_answer_text, 'token_ids': token_ids})
- new_examples["answer_spans"].append(answer_spans)
-
- for key in examples:
- if key != context_column_name:
- new_examples[key] = examples[key]
- return new_examples
-
- # preprocessing
- def prepare_features(examples, max_question_len=55, max_doc_len=4096, max_num_answers=64, ignore_seq_with_no_answers=False, mode="eval"):
-
- tokenized_examples = {}
- tokenized_examples["input_ids"] = []
- tokenized_examples["attention_mask"] = []
- if mode == "train":
- tokenized_examples["start_positions"] = []
- tokenized_examples["end_positions"] = []
- elif mode == "eval":
- tokenized_examples["example_id"] = []
- else:
- raise NotImplementedError("not implemented yet.")
-
- # not used for roberta
- #tokenized_examples["token_type_ids"] = []
-
- # Some of the questions have lots of whitespace on the left, which is not useful and will make the
- # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
- # left whitespace
- examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
-
- for example_index in range(len(examples[question_column_name])):
- question_text = examples[question_column_name][example_index]
- query_tokens = tokenizer.tokenize(question_text)
- query_tokens = query_tokens[:max_question_len]
- doc_tokens = examples[context_column_name][example_index].split(" ")
- answer_spans = examples["answer_spans"][example_index]
- tok_to_orig_index = []
- orig_to_tok_index = []
- all_doc_tokens = []
- for (i, token) in enumerate(doc_tokens):
- orig_to_tok_index.append(len(all_doc_tokens))
- sub_tokens = tokenizer.tokenize(f'. 
{token}')[1:] if i > 0 else tokenizer.tokenize(token)
- for sub_token in sub_tokens:
- tok_to_orig_index.append(i)
- all_doc_tokens.append(sub_token)
- all_doc_tokens = all_doc_tokens[:max_doc_len]
- # The -3 accounts for the CLS token and the two SEP tokens added below
- max_tokens_per_doc_slice = max_seq_length - len(query_tokens) - 3
- assert max_tokens_per_doc_slice > 0
-
- if args.doc_stride < 0:
- # negative doc_stride indicates no sliding window, but using first slice
- args.doc_stride = -100 * len(all_doc_tokens) # large -ve value for the next loop to execute once
-
- input_ids_list = []
- input_mask_list = []
- segment_ids_list = []
- start_positions_list = []
- end_positions_list = []
- answer_token_ids_list = []
-
- for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - args.doc_stride):
- slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))
- doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
- tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] \
- + doc_slice_tokens + [tokenizer.sep_token]
-
- # segment ids are built here but not used for roberta-style models
- segment_ids = [0] * (len(query_tokens) + 2) + [1] * (len(doc_slice_tokens) + 1)
- assert len(segment_ids) == len(tokens)
-
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
- input_mask = [1] * len(input_ids)
-
- #if data_args.pad_to_max_length: # no need to pad if document is not strided
- if False:
- # Zero-pad up to the sequence length.
- padding_len = max_seq_length - len(input_ids)
- input_ids.extend([tokenizer.pad_token_id] * padding_len)
- input_mask.extend([0] * padding_len)
- segment_ids.extend([0] * padding_len)
-
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
-
- doc_offset = len(query_tokens) + 2 - slice_start
-
- start_positions = []
- end_positions = []
- answer_token_ids = []
- for answer_span in answer_spans:
- start_position = answer_span['start']
- end_position = answer_span['end']
- tok_start_position_in_doc = orig_to_tok_index[start_position]
- not_end_of_doc = int(end_position + 1 < len(orig_to_tok_index))
- tok_end_position_in_doc = orig_to_tok_index[end_position + not_end_of_doc] - not_end_of_doc
- if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
- # this answer is outside the current slice
- continue
-
- start_positions.append(tok_start_position_in_doc + doc_offset)
- end_positions.append(tok_end_position_in_doc + doc_offset)
- answer_token_ids.append(answer_span['token_ids'])
-
- assert len(start_positions) == len(end_positions)
- if ignore_seq_with_no_answers and len(start_positions) == 0:
- continue
-
- # keep at most max_num_answers answers from start_positions and end_positions
- start_positions = start_positions[:max_num_answers]
- end_positions = end_positions[:max_num_answers]
- answer_token_ids = answer_token_ids[:max_num_answers]
-
- # -1 padding up to max_num_answers
- # -1 means empty answer in last token, while normal squad in [CLS] token
- padding_len = max_num_answers - len(start_positions)
- start_positions.extend([-1] * padding_len)
- end_positions.extend([-1] * padding_len)
- answer_token_ids.extend([[]] * padding_len)
-
- # replace duplicate start/end positions with `-1` because duplicates can result in negative loss values
- found_start_positions = set()
- found_end_positions = set()
- found_answer_token_ids = set()
- for i, (start_position, end_position, answer_tokens) in enumerate(
- zip(start_positions, end_positions, answer_token_ids)
- ):
- if start_position in
-            input_ids_list = []
-            input_mask_list = []
-            segment_ids_list = []
-            start_positions_list = []
-            end_positions_list = []
-            answer_token_ids_list = []
-
-            for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - args.doc_stride):
-                slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))
-                doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
-                tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] \
-                    + doc_slice_tokens + [tokenizer.sep_token]
-
-                # segment ids for the query part and the document slice; not used for roberta
-                segment_ids = [0] * (len(query_tokens) + 2) + [1] * (len(doc_slice_tokens) + 1)
-                assert len(segment_ids) == len(tokens)
-
-                input_ids = tokenizer.convert_tokens_to_ids(tokens)
-                input_mask = [1] * len(input_ids)
-
-                #if data_args.pad_to_max_length:  # no need to pad if document is not strided
-                if False:
-                    # Zero-pad up to the sequence length.
-                    padding_len = max_seq_length - len(input_ids)
-                    input_ids.extend([tokenizer.pad_token_id] * padding_len)
-                    input_mask.extend([0] * padding_len)
-                    segment_ids.extend([0] * padding_len)
-
-                    assert len(input_ids) == max_seq_length
-                    assert len(input_mask) == max_seq_length
-                    assert len(segment_ids) == max_seq_length
-
-                doc_offset = len(query_tokens) + 2 - slice_start
-
-                start_positions = []
-                end_positions = []
-                answer_token_ids = []
-                for answer_span in answer_spans:
-                    start_position = answer_span['start']
-                    end_position = answer_span['end']
-                    tok_start_position_in_doc = orig_to_tok_index[start_position]
-                    not_end_of_doc = int(end_position + 1 < len(orig_to_tok_index))
-                    tok_end_position_in_doc = orig_to_tok_index[end_position + not_end_of_doc] - not_end_of_doc
-                    if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
-                        # this answer is outside the current slice
-                        continue
-
-                    start_positions.append(tok_start_position_in_doc + doc_offset)
-                    end_positions.append(tok_end_position_in_doc + doc_offset)
-                    answer_token_ids.append(answer_span['token_ids'])
-
-                assert len(start_positions) == len(end_positions)
-                if ignore_seq_with_no_answers and len(start_positions) == 0:
-                    continue
-
-                # keep at most max_num_answers answers from start_positions and end_positions
-                start_positions = start_positions[:max_num_answers]
-                end_positions = end_positions[:max_num_answers]
-                answer_token_ids = answer_token_ids[:max_num_answers]
-
-                # -1 padding up to max_num_answers
-                # -1 means empty answer in last token, while normal squad in [CLS] token
-                padding_len = max_num_answers - len(start_positions)
-                start_positions.extend([-1] * padding_len)
-                end_positions.extend([-1] * padding_len)
-                answer_token_ids.extend([[]] * padding_len)
-
-                # replace duplicate start/end positions with `-1` because duplicates can result in negative loss values
-                found_start_positions = set()
-                found_end_positions = set()
-                found_answer_token_ids = set()
-                for i, (start_position, end_position, answer_tokens) in enumerate(
-                    zip(start_positions, end_positions, answer_token_ids)
-                ):
-                    if start_position in found_start_positions:
-                        start_positions[i] = -1
-                    if end_position in found_end_positions:
-                        end_positions[i] = -1
-                    answer_tokens_as_str = ','.join([str(x) for x in answer_tokens])
-                    if answer_tokens_as_str in found_answer_token_ids:
-                        answer_token_ids[i] = []
-
-                    found_start_positions.add(start_position)
-                    found_end_positions.add(end_position)
-                    found_answer_token_ids.add(answer_tokens_as_str)
-
-                input_ids_list.append(input_ids)
-                input_mask_list.append(input_mask)
-                segment_ids_list.append(segment_ids)
-                start_positions_list.append(start_positions)
-                end_positions_list.append(end_positions)
-                answer_token_ids_list.append(answer_token_ids)
-
-            # pad answers in answer_token_ids_list to the longest answer
-            max_answer_len = max([len(item) for sublist in answer_token_ids_list for item in sublist])  # flat list
-            if max_answer_len == 0:
-                max_answer_len = 2
-            for answers_of_one_slice in answer_token_ids_list:
-                for answer_tokens in answers_of_one_slice:
-                    if len(answer_tokens) == 0:
-                        # TODO: <s> or </s>?
-                        padding_len = max_answer_len - len(answer_tokens) - 2
-                        answer_tokens.extend([tokenizer.bos_token_id, tokenizer.eos_token_id] +
-                                             ([tokenizer.pad_token_id] * padding_len))
-                    else:
-                        padding_len = max_answer_len - len(answer_tokens)
-                        answer_tokens.extend([tokenizer.pad_token_id] * padding_len)
-
-
-            tokenized_examples["input_ids"].extend(input_ids_list)
-            tokenized_examples["attention_mask"].extend(input_mask_list)
-
-            if mode == "train":
-                # only one answer used for training
-                #tokenized_examples["start_positions"].extend([each[0] for each in start_positions_list])
-                #tokenized_examples["end_positions"].extend([each[0] for each in end_positions_list])
-                tokenized_examples["start_positions"].append(start_positions_list[0])
-                tokenized_examples["end_positions"].append(end_positions_list[0])
-            elif mode == "eval":
-                tokenized_examples["example_id"].append(examples["id"][example_index])
-
-        return tokenized_examples
-
-    prepare_train_features = partial(prepare_features, mode="train")
-    if args.do_train:
-        if "train" not in raw_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = raw_datasets["train"]
-        if args.max_train_samples is not None:
-            # We will select samples from the whole dataset if augmentation is specified
-            train_dataset = train_dataset.select(range(args.max_train_samples))
-        with accelerator.main_process_first():
-            # preprocess
-            train_dataset = train_dataset.map(
-                preprocess_context,
-                batched=True,
-                num_proc=args.preprocessing_num_workers,
-                remove_columns=column_names,
-                load_from_cache_file=not args.overwrite_cache,
-            )
-
-            # Create train feature from dataset
-            train_dataset = train_dataset.map(
-                prepare_train_features,
-                batched=True,
-                num_proc=args.preprocessing_num_workers,
-                remove_columns=column_names + ["answer_spans"],
-                load_from_cache_file=not args.overwrite_cache,
-            )
-            if args.max_train_samples is not None:
-                # The number of samples might increase during feature creation, so we select only the specified max samples
-                train_dataset = train_dataset.select(range(args.max_train_samples))
-
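# A minimal sketch of the two-stage datasets.map() pipeline used above and
# below (toy data, hypothetical column names): a first batched map rewrites
# the context column, and a second batched map derives model features. A
# batched map may return more rows than it received (one per slice), which is
# why max_train_samples is applied again afterwards.
from datasets import Dataset

_raw = Dataset.from_dict({"question": ["Where is Rome?"],
                          "context": ["Rome   is in Italy"]})

def _clean_context(batch):
    return {"context": [" ".join(c.split()) for c in batch["context"]]}

def _featurize(batch):
    return {"n_words": [len(c.split()) for c in batch["context"]]}

_ds = _raw.map(_clean_context, batched=True).map(_featurize, batched=True)
assert _ds[0]["n_words"] == 4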
-    prepare_validation_features = partial(prepare_features, mode="eval")
-
-    if args.do_eval:
-        if "validation" not in raw_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = raw_datasets["validation"]
-        if args.max_eval_samples is not None:
-            # We will select samples from the whole dataset
-            eval_examples = eval_examples.select(range(args.max_eval_samples))
-        with accelerator.main_process_first():
-            # preprocess
-            eval_examples = eval_examples.map(
-                preprocess_context,
-                batched=True,
-                num_proc=args.preprocessing_num_workers,
-                remove_columns=column_names,
-                load_from_cache_file=not args.overwrite_cache,
-            )
-            # Validation Feature Creation
-            eval_dataset = eval_examples.map(
-                prepare_validation_features,
-                batched=True,
-                num_proc=args.preprocessing_num_workers,
-                remove_columns=column_names,
-                load_from_cache_file=not args.overwrite_cache,
-            )
-
-            if args.max_eval_samples is not None:
-                # The number of samples might increase during feature creation, so we select the required samples again
-                eval_dataset = eval_dataset.select(range(args.max_eval_samples))
-
-
-    # DataLoaders creation:
-    if args.pad_to_max_length:
-        # If padding was already done to max length, we use the default data collator that will just convert everything
-        # to tensors.
-        data_collator = default_data_collator
-    else:
-        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
-        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
-        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
-
-    train_dataloader = DataLoader(
-        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
-    )
-
-    eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "answer_spans"])
-    eval_dataloader = DataLoader(
-        eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
-    )
-
-    # Post-processing:
-    def post_processing_function(examples, features, predictions, stage="eval"):
-        # Post-processing: we match the start logits and end logits to answers in the original context.
-        predictions = postprocess_qa_predictions(
-            examples=examples,
-            features=features,
-            predictions=predictions,
-            tokenizer=tokenizer,
-            version_2_with_negative=args.version_2_with_negative,
-            n_best_size=args.n_best_size,
-            max_answer_length=args.max_answer_length,
-            null_score_diff_threshold=args.null_score_diff_threshold,
-            output_dir=args.output_dir,
-            prefix=stage,
-        )
-        # Format the result to the format the metric expects.
-        if args.version_2_with_negative:
-            formatted_predictions = [
-                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
-            ]
-        else:
-            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
-
-        references = [{"id": ex["id"], "answers": ex[answer_column_name], "aliases": ex["aliases"]} for ex in examples]
-
-        # evaluate_triviaqa() consumes the raw {qid: text} dict, so the raw predictions are returned here
-        return EvalPrediction(predictions=predictions, label_ids=references)
-
-    # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
-    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
-        """
-        Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
-        Args:
-            start_or_end_logits(:obj:`tensor`):
-                This is the output predictions of the model. We can only enter either start or end logits.
-            eval_dataset: Evaluation dataset
-            max_len(:obj:`int`):
-                The maximum length of the output tensor. ( See the model.eval() part for more details )
-        """
-
-        step = 0
-        # create a numpy array and fill it with -100.
-        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
-        # Now that we have created the array, we populate it with the outputs gathered using accelerator.gather
-        for i, output_logit in enumerate(start_or_end_logits):  # populate columns
-            # Copy each gathered batch into the padded array, advancing `step`
-            # by the batch size after every iteration
-
-            batch_size = output_logit.shape[0]
-            cols = output_logit.shape[1]
-
-            if step + batch_size < len(dataset):
-                logits_concat[step: step + batch_size, :cols] = output_logit
-            else:
-                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
-
-            step += batch_size
-
-        return logits_concat
-
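# Hedged illustration (toy shapes) of create_and_fill_np_array() above:
# gathered logit batches can have different sequence lengths, so each batch is
# copied into a -100 padded array of shape (num_examples, max_len); the final
# batch may be truncated because distributed gathering can duplicate samples.
import numpy as np

_batches = [np.ones((2, 3)), 2.0 * np.ones((1, 2))]  # 3 examples in total
_out = np.full((3, 3), -100, dtype=np.float64)
_step = 0
for _logits in _batches:
    _bs, _cols = _logits.shape
    _out[_step:_step + _bs, :_cols] = _logits
    _step += _bs
# _out rows: [1, 1, 1], [1, 1, 1], [2, 2, -100] -- the shorter batch stays padded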
-    # Optimizer
-    # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
-    no_decay_outputs = ["bias", "LayerNorm.weight", "qa_outputs"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-            "weight_decay": 0.0,
-        },
-    ]
-    if args.do_prune:
-        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas=[0.9, 0.9])
-    else:
-        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
-    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
-
-    lr_scheduler = get_scheduler(
-        name=args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
-    )
-
-    if args.distill_loss_weight > 0:
-        teacher_model, model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
-            teacher_model, model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
-        )
-        teacher_model.eval()
-    else:
-        # Prepare everything with our `accelerator`.
-        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
-            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
-        )
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    # Afterwards we recalculate our number of training epochs
-    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    # Figure out how often we should save the Accelerator states
-    if hasattr(args.checkpointing_steps, "isdigit"):
-        checkpointing_steps = args.checkpointing_steps
-        if args.checkpointing_steps.isdigit():
-            checkpointing_steps = int(args.checkpointing_steps)
-    else:
-        checkpointing_steps = None
-
-    # We need to initialize the trackers we use, and also store our configuration.
-    # We initialize the trackers only on main process because `accelerator.log`
-    # only logs on main process and we don't want empty logs/runs on other processes.
- if args.with_tracking: - if accelerator.is_main_process: - experiment_config = vars(args) - # TensorBoard cannot log Enums, need the raw value - experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value - accelerator.init_trackers("qa_no_trainer", experiment_config) - - # Train! - total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. - progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) - completed_steps = 0 - starting_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": - accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") - accelerator.load_state(args.resume_from_checkpoint) - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] - dirs.sort(key=os.path.getctime) - path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - # Extract `epoch_{i}` or `step_{i}` - training_difference = os.path.splitext(path)[0] - - if "epoch" in training_difference: - starting_epoch = int(training_difference.replace("epoch_", "")) + 1 - resume_step = None - else: - resume_step = int(training_difference.replace("step_", "")) - starting_epoch = resume_step // len(train_dataloader) - resume_step -= starting_epoch * len(train_dataloader) - - # Pruning preparation - num_iterations = len(train_dataset) / total_batch_size - num_warm = int(args.warm_epochs * num_iterations) + args.num_warmup_steps - total_iterations = int(num_iterations * (args.num_train_epochs - args.cooldown_epochs)) - frequency = int((total_iterations - num_warm + 1) / 40) if args.pruning_frequency == -1 \ - else args.pruning_frequency - - pruning_start = num_warm - pruning_end = total_iterations - if not args.do_prune: - pruning_start = num_iterations * args.num_train_epochs + 1 - pruning_end = pruning_start - - pruning_configs=[ - { - "pruning_type": "snip_momentum", - "pruning_scope": "global", - "sparsity_decay_type": "exp", - "excluded_op_names": ["qa_outputs", "pooler", ".*embeddings*"], - "pruning_op_types": ["Linear"], - "max_sparsity_ratio_per_op": 0.98 - } - ] - - configs = WeightPruningConfig( - pruning_configs, - pruning_scope=args.pruning_scope, - target_sparsity=args.target_sparsity, - pattern=args.pruning_pattern, - pruning_frequency=frequency, - start_step=pruning_start, - end_step=pruning_end - ) - - compression_manager = prepare_compression(model=model, confs=configs) - compression_manager.callbacks.on_train_begin() - model = compression_manager.model - - - for epoch in range(starting_epoch, args.num_train_epochs): - model.train() - if epoch >= args.warm_epochs: - if args.with_tracking: - total_loss = 0 - for step, batch in 
enumerate(train_dataloader): - compression_manager.callbacks.on_step_begin(step) - - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - if args.distill_loss_weight > 0: - distill_loss_weight = args.distill_loss_weight - with torch.no_grad(): - teacher_outputs = teacher_model(**batch) - loss = (distill_loss_weight) / 2 * get_loss_one_logit(outputs['start_logits'], - teacher_outputs['start_logits']) \ - + (distill_loss_weight) / 2 * get_loss_one_logit(outputs['end_logits'], - teacher_outputs['end_logits']) - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: - compression_manager.callbacks.on_before_optimizer_step() - optimizer.step() - compression_manager.callbacks.on_after_optimizer_step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - - - if isinstance(checkpointing_steps, int): - if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" - if args.output_dir is not None: - output_dir = os.path.join(args.output_dir, output_dir) - accelerator.save_state(output_dir) - - if completed_steps >= args.max_train_steps: - break - else: - for step, batch in enumerate(train_dataloader): - outputs = model(**batch) - loss = outputs.loss - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - - if completed_steps >= args.max_train_steps: - break - - if args.checkpointing_steps == "epoch": - output_dir = f"epoch_{epoch}" - if args.output_dir is not None: - output_dir = os.path.join(args.output_dir, output_dir) - accelerator.save_state(output_dir) - - if args.push_to_hub and epoch < args.num_train_epochs - 1: - accelerator.wait_for_everyone() - unwrapped_model = accelerator.unwrap_model(model) - unwrapped_model.save_pretrained( - args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save - ) - if accelerator.is_main_process: - tokenizer.save_pretrained(args.output_dir) - repo.push_to_hub( - commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True - ) - - # eval each epoch - logger.info(f"***** Running Evaluation*****") - all_start_logits = [] - all_end_logits = [] - - # pruner.on_before_eval() - model.eval() - for step, batch in enumerate(eval_dataloader): - with torch.no_grad(): - outputs = model(**batch) - start_logits = outputs.start_logits - end_logits = outputs.end_logits - - if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered - start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) - end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) - - all_start_logits.append(accelerator.gather(start_logits).cpu().numpy()) - all_end_logits.append(accelerator.gather(end_logits).cpu().numpy()) - - max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor - # pruner.on_after_eval() - - # concatenate the numpy array - start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len) - end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len) - - # delete the list 
of numpy arrays
-        del all_start_logits
-        del all_end_logits
-
-        outputs_numpy = (start_logits_concat, end_logits_concat)
-        eval_preds = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
-
-        metrics = utils_qa.evaluate_triviaqa(eval_preds.label_ids, eval_preds.predictions)
-        logger.info(metrics)
-
-
-    if args.output_dir is not None:
-        accelerator.wait_for_everyone()
-        unwrapped_model = accelerator.unwrap_model(model.model)
-        unwrapped_model.save_pretrained(
-            args.output_dir + f"eph{args.num_train_epochs}_lr{args.learning_rate}_bs{total_batch_size}",
-            is_main_process=accelerator.is_main_process, save_function=accelerator.save
-        )
-        if accelerator.is_main_process:
-            tokenizer.save_pretrained(args.output_dir)
-            if args.push_to_hub:
-                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
-
-        logger.info(json.dumps(metrics, indent=4))
-        save_prefixed_metrics(metrics, args.output_dir)
-
-
-def _mp_fn(index):
-    # For xla_spawn (TPUs)
-    main()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh
deleted file mode 100644
index f0d7d0f3fa4..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-# from http://nlp.cs.washington.edu/triviaqa/ and https://github.com/mandarjoshi90/triviaqa
-wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz
-
-tar -xvzf triviaqa-rc.tar.gz
-
-# the code below is from the original paper's repo: https://github.com/allenai/longformer
-python -m utils.convert_to_squad_format \
-    --triviaqa_file ./qa/wikipedia-train.json \
-    --wikipedia_dir ./evidence/wikipedia/ \
-    --web_dir ./evidence/web/ \
-    --max_num_tokens 4096 \
-    --squad_file squad-wikipedia-train-4096.json
-
-python -m utils.convert_to_squad_format \
-    --triviaqa_file ./qa/wikipedia-dev.json \
-    --wikipedia_dir ./evidence/wikipedia/ \
-    --web_dir ./evidence/web/ \
-    --max_num_tokens 4096 \
-    --squad_file squad-wikipedia-dev-4096.json
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh
deleted file mode 100644
index ce21e329c16..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-set -x
-
-train_file=./squad-wikipedia-train-4096.json
-validation_file=./squad-wikipedia-dev-4096.json
-pretrained_model=allenai/longformer-base-4096
-
-accelerate launch --main_process_port 29245 run_qa_no_trainer.py \
-    --model_name_or_path $pretrained_model \
-    --do_train \
-    --do_eval \
-    --train_file $train_file \
-    --validation_file $validation_file \
-    --cache_dir ./tmp_cached \
-    --max_seq_length 4096 \
-    --doc_stride -1 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 16 \
-    --per_device_eval_batch_size 1 \
-    --num_warmup_steps 1000 \
-    --learning_rate 3.5e-5 \
-    --num_train_epochs 4 \
-    --output_dir longformer-base-4096-dense-baseline
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh
b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh deleted file mode 100644 index 3c08207aa62..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -x - -train_file=./squad-wikipedia-train-4096.json -validation_file=./squad-wikipedia-dev-4096.json -teacher_model=Intel/longformer-base-4096-finetuned-triviaqa - -accelerate launch --main_process_port 29745 run_qa_no_trainer.py \ - --model_name_or_path $teacher_model \ - --do_train \ - --do_eval \ - --train_file $train_file \ - --validation_file $validation_file \ - --cache_dir ./tmp_cached \ - --max_seq_length 4096 \ - --doc_stride -1 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 8 \ - --per_device_eval_batch_size 1 \ - --num_warmup_steps 1000 \ - --do_prune \ - --target_sparsity 0.8 \ - --pruning_scope "global" \ - --pruning_pattern "4x1" \ - --pruning_frequency 1000 \ - --cooldown_epochs 10 \ - --learning_rate 1e-4 \ - --num_train_epochs 18 \ - --weight_decay 0.01 \ - --output_dir longformer-base-4096-pruned-global-sparse80 \ - --teacher_model_name_or_path $teacher_model \ - --distill_loss_weight 3 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py deleted file mode 100644 index b9a2847449d..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""SQUAD: The Stanford Question Answering Dataset.""" - - -import json - -import datasets -from datasets.tasks import QuestionAnsweringExtractive - - -logger = datasets.logging.get_logger(__name__) - - -_CITATION = """\ -@article{2016arXiv160605250R, - author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev}, - Konstantin and {Liang}, Percy}, - title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}", - journal = {arXiv e-prints}, - year = 2016, - eid = {arXiv:1606.05250}, - pages = {arXiv:1606.05250}, -archivePrefix = {arXiv}, - eprint = {1606.05250}, -} -""" - -_DESCRIPTION = """\ -Stanford Question Answering Dataset (SQuAD) is a reading comprehension \ -dataset, consisting of questions posed by crowdworkers on a set of Wikipedia \ -articles, where the answer to every question is a segment of text, or span, \ -from the corresponding reading passage, or the question might be unanswerable. 
-""" - -_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/" -_URLS = { - "train": _URL + "train-v1.1.json", - "dev": _URL + "dev-v1.1.json", -} - - -class SquadConfig(datasets.BuilderConfig): - """BuilderConfig for SQUAD.""" - - def __init__(self, **kwargs): - """BuilderConfig for SQUAD. - Args: - **kwargs: keyword arguments forwarded to super. - """ - super(SquadConfig, self).__init__(**kwargs) - - -class Squad(datasets.GeneratorBasedBuilder): - """SQUAD: The Stanford Question Answering Dataset. Version 1.1.""" - - BUILDER_CONFIGS = [ - SquadConfig( - name="plain_text", - version=datasets.Version("1.0.0", ""), - description="Plain text", - ), - ] - print(BUILDER_CONFIGS) - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "title": datasets.Value("string"), - "context": datasets.Value("string"), - "question": datasets.Value("string"), - "answers": datasets.features.Sequence( - { - "text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - } - ), - "aliases": datasets.features.Sequence(datasets.Value("string")), - } - ), - # No default supervised_keys (as we have to pass both question - # and context as input). - supervised_keys=None, - homepage="https://rajpurkar.github.io/SQuAD-explorer/", - citation=_CITATION, - task_templates=[ - QuestionAnsweringExtractive( - question_column="question", context_column="context", answers_column="answers" - ) - ], - ) - - def _split_generators(self, dl_manager): - #downloaded_files = dl_manager.download_and_extract(_URLS) - downloaded_files = self.config.data_files - return [ - datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"][0]}), - datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"][0]}), - ] - - def _generate_examples(self, filepath): - """This function returns the examples in the raw (text) form.""" - logger.info("generating examples from = %s", filepath) - key = 0 - with open(filepath, encoding="utf-8") as f: - squad = json.load(f) - for article in squad["data"]: - title = article.get("title", "") - for paragraph in article["paragraphs"]: - context = paragraph["context"] # do not strip leading blank spaces GH-2585 - for qa in paragraph["qas"]: - answer_starts = [answer["answer_start"] for answer in qa["answers"]] - answers = [answer["text"] for answer in qa["answers"]] - # Features currently used are "context", "question", and "answers". - # Others are extracted here for the ease of future expansions. - yield key, { - "title": title, - "context": context, - "question": qa["question"], - "id": qa["id"].split('--')[0], - "answers": { - "answer_start": answer_starts, - "text": answers, - }, - "aliases": qa["aliases"] if qa.get("aliases") is not None else [], - } - key += 1 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py deleted file mode 100644 index af237521e8a..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py +++ /dev/null @@ -1,150 +0,0 @@ -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A subclass of `Trainer` specific to Question-Answering tasks -""" -""" -This script is based on HuggingFace/transformers example: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/trainer_qa.py -""" - -from transformers import Trainer, is_torch_tpu_available -from transformers.trainer_utils import PredictionOutput -import utils_qa -import collections -from collections import defaultdict -import numpy as np -import torch -import json - - -if is_torch_tpu_available(): - import torch_xla.core.xla_model as xm - import torch_xla.debug.metrics as met - - -class QuestionAnsweringTrainer(Trainer): - def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): - super().__init__(*args, **kwargs) - self.eval_examples = eval_examples - self.post_process_function = post_process_function - - def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None): - eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset - eval_dataloader = self.get_eval_dataloader(eval_dataset) - eval_examples = self.eval_examples if eval_examples is None else eval_examples - - # Temporarily disable metric computation, we will do it in the loop here. - compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=None, - ignore_keys=ignore_keys, - ) - finally: - self.compute_metrics = compute_metrics - - if self.post_process_function is not None and self.compute_metrics is None: - eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) - metrics = utils_qa.evaluate_triviaqa(eval_preds.label_ids, eval_preds.predictions) - #metrics = self.compute_metrics(eval_preds) - - #self.log(metrics) - else: - metrics = {} - - #if self.args.tpu_metrics_debug or self.args.debug: - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - # xm.master_print(met.metrics_report()) - - #self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) - return metrics - - def predict(self, predict_dataset, predict_examples, ignore_keys=None, n_best_size=20, max_answer_length=30): - predict_dataloader = self.get_test_dataloader(predict_dataset) - - # Temporarily disable metric computation, we will do it in the loop here. 
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - output = eval_loop( - predict_dataloader, - description="Prediction", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=None, - ignore_keys=ignore_keys, - ) - - all_start_logits, all_end_logits = output.predictions - - all_predictions = collections.OrderedDict() - - qa_with_duplicates = defaultdict(list) - - for example_index, example in enumerate(predict_examples): - input_ids = torch.tensor([predict_dataset[example_index]["input_ids"]]) - qid = predict_dataset[example_index]["example_id"] - - eos_token_indices = (input_ids == self.tokenizer.eos_token_id).nonzero() - question_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 0, 1] - start_logits = all_start_logits[example_index] - end_logits = all_end_logits[example_index] - start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - potential_answers = [] - for start_index in start_indexes: - for end_index in end_indexes: - if start_index <= question_end_index[0]: - continue - if end_index <= question_end_index[0]: - continue - if start_index > end_index: - continue - answer_len = end_index - start_index + 1 - if answer_len > max_answer_length: - continue - potential_answers.append({'start': start_index, 'end': end_index, - 'start_logit': start_logits[start_index].item(), - 'end_logit': end_logits[end_index].item()}) - sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) - if len(sorted_answers) == 0: - answer = {'text': 'NoAnswerFound', 'score': -1000000} - else: - answer = sorted_answers[0] - answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1] - answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist()) - text = self.tokenizer.convert_tokens_to_string(answer_tokens) - score = answer['start_logit'] + answer['end_logit'] - answer = {'text': text, 'score': score} - qa_with_duplicates[qid].append({'answer_score': answer['score'], 'answer_text': answer['text'], }) - - qid_to_answer_text = {} - for qid, answer_metrics in qa_with_duplicates.items(): - top_answer = sorted(answer_metrics, key=lambda x: x['answer_score'], reverse=True)[0] - qid_to_answer_text[qid] = top_answer['answer_text'] - - with open('predictions.json', 'w') as f: - f.write(json.dumps(qid_to_answer_text, indent=4) + "\n") diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/__init__.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py deleted file mode 100644 index 6279320e045..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import file_utils -from . import dataset_utils -import os -from tqdm import tqdm -import random -import nltk -import argparse - - -def get_text(qad, domain): - local_file = os.path.join(args.web_dir, qad['Filename']) if domain == 'SearchResults' else os.path.join(args.wikipedia_dir, qad['Filename']) - return file_utils.get_file_contents(local_file, encoding='utf-8') - - -def select_relevant_portion(text): - paras = text.split('\n') - selected = [] - done = False - for para in paras: - # nltk is slow, but we have to use its word tokenizer for the distant supervision matching to work - # TODO: try both see which one works better - # words = para.split() - # extra_words = args.max_num_tokens - len(selected) - # selected.extend(words[:extra_words]) - # if len(selected) >= args.max_num_tokens: - # break - sents = sent_tokenize.tokenize(para) - for sent in sents: - words = nltk.word_tokenize(sent) - for word in words: - selected.append(word) - if len(selected) >= args.max_num_tokens: - done = True - break - if done: - break - if done: - break - selected.append('\n') - st = ' '.join(selected).strip() - return st - - -def add_triple_data(datum, page, domain): - qad = {'Source': domain} - for key in ['QuestionId', 'Question', 'Answer']: - if key == 'Answer' and key not in datum: - qad[key] = {'NormalizedAliases': []} - qid = datum['QuestionId'] - print(f'qid: {qid} does not have an answer.') - else: - qad[key] = datum[key] - for key in page: - qad[key] = page[key] - return qad - - -def get_qad_triples(data): - qad_triples = [] - for datum in data['Data']: - for key in ['EntityPages', 'SearchResults']: - for page in datum.get(key, []): - qad = add_triple_data(datum, page, key) - qad_triples.append(qad) - return qad_triples - - -def convert_to_squad_format(qa_json_file, squad_file): - qa_json = dataset_utils.read_triviaqa_data(qa_json_file) - qad_triples = get_qad_triples(qa_json) - random.seed(args.seed) - random.shuffle(qad_triples) - - data = [] - for qad in tqdm(qad_triples): - qid = qad['QuestionId'] - - text = get_text(qad, qad['Source']) - selected_text = select_relevant_portion(text) - - question = qad['Question'] - para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]} - data.append({'paragraphs': [para]}) - qa = para['qas'][0] - qa['id'] = dataset_utils.get_question_doc_string(qid, qad['Filename']) - qa['qid'] = qid - - answers_in_doc = dataset_utils.answer_index_in_document(qad['Answer'], selected_text) - qa['answers'] = answers_in_doc - # We want all answers in the document, not just the first answer - # if index == -1: - # if qa_json['Split'] == 'train': - # continue - # else: - # qa['answers'].append({'text': ans_string, 'answer_start': index}) - - # This doesn't fit the squad format, but we need it for evaluation - qa['aliases'] = qad['Answer']['NormalizedAliases'] - - if qa_json['Split'] == 'train' and len(data) >= args.sample_size and qa_json['Domain'] == 'Web': - break - - if len(data) >= args.sample_size: - break - - squad = {'data': data, 'version': qa_json['Version']} - file_utils.write_json_to_file(squad, squad_file) - print('Added', 
len(data))
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--triviaqa_file', help='Triviaqa file')
-    parser.add_argument('--squad_file', help='Squad file')
-    parser.add_argument('--wikipedia_dir', help='Wikipedia doc dir')
-    parser.add_argument('--web_dir', help='Web doc dir')
-
-    parser.add_argument('--seed', default=10, type=int, help='Random seed')
-    parser.add_argument('--max_num_tokens', default=800, type=int, help='Maximum number of tokens from a document')
-    parser.add_argument('--sample_size', default=8000000000000, type=int, help='Maximum number of samples')
-    parser.add_argument('--tokenizer', default='tokenizers/punkt/english.pickle', help='Sentence tokenizer')
-    args = parser.parse_args()
-    return args
-
-
-if __name__ == '__main__':
-    args = get_args()
-    sent_tokenize = nltk.data.load(args.tokenizer)
-    convert_to_squad_format(args.triviaqa_file, args.squad_file)
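# Sketch of a single record emitted by convert_to_squad_format() above (all
# values hypothetical): standard SQuAD structure plus the non-standard
# "aliases" field, which evaluate_triviaqa() later relies on.
_example_record = {
    "data": [{
        "paragraphs": [{
            "context": "selected document text",
            "qas": [{
                "question": "Who wrote Hamlet?",
                "id": "tc_1--Hamlet.txt",  # question id joined with the source filename
                "qid": "tc_1",
                "answers": [{"text": "Shakespeare", "answer_start": 42}],
                "aliases": ["william shakespeare", "shakespeare"],
            }],
        }],
    }],
    "version": "1.0",
}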
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py
deleted file mode 100644
index dd42c6cac2a..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import file_utils
-import re
-
-
-# Key for wikipedia eval is the question id. Key for web eval is the (question_id, filename) tuple.
-def get_key_to_ground_truth(data):
-    if data['Domain'] == 'Wikipedia':
-        return {datum['QuestionId']: datum['Answer'] for datum in data['Data']}
-    else:
-        return get_qd_to_answer(data)
-
-
-def get_question_doc_string(qid, doc_name):
-    return '{}--{}'.format(qid, doc_name)
-
-
-def get_qd_to_answer(data):
-    key_to_answer = {}
-    for datum in data['Data']:
-        for page in datum.get('EntityPages', []) + datum.get('SearchResults', []):
-            qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename'])
-            key_to_answer[qd_tuple] = datum['Answer']
-    return key_to_answer
-
-
-def read_clean_part(datum):
-    for key in ['EntityPages', 'SearchResults']:
-        new_page_list = []
-        for page in datum.get(key, []):
-            if page['DocPartOfVerifiedEval']:
-                new_page_list.append(page)
-        datum[key] = new_page_list
-    assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0
-    return datum
-
-
-def read_triviaqa_data(qajson):
-    data = file_utils.read_json(qajson)
-    # read only documents and questions that are a part of the clean data set
-    if data['VerifiedEval']:
-        clean_data = []
-        for datum in data['Data']:
-            if datum['QuestionPartOfVerifiedEval']:
-                if data['Domain'] == 'Web':
-                    datum = read_clean_part(datum)
-                clean_data.append(datum)
-        data['Data'] = clean_data
-    return data
-
-
-def answer_index_in_document(answer, document):
-    answer_list = answer['NormalizedAliases']
-    answers_in_doc = []
-    for answer_string_in_doc in answer_list:
-        # escape the alias so regex metacharacters inside an answer cannot break the search
-        indices = [m.start() for m in re.finditer(re.escape(answer_string_in_doc), document, flags=re.IGNORECASE)]
-        for index in indices:
-            answers_in_doc.append({
-                'text': answer_string_in_doc,
-                'answer_start': index
-            })
-    return answers_in_doc
diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py
deleted file mode 100644
index ad165c545e4..00000000000
--- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import json - - -def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'): - with open(json_file, mode, encoding=encoding) as outfile: - json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False) - - -def get_file_contents(filename, encoding='utf-8'): - with open(filename, encoding=encoding) as f: - content = f.read() - return content - - -def read_json(filename, encoding='utf-8'): - contents = get_file_contents(filename, encoding=encoding) - return json.loads(contents) - - -def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True): - contents = get_file_contents(file_path, encoding=encoding) - lines = contents.split('\n') - lines = [line for line in lines if line != ''] if ignore_blanks else lines - return lines diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py deleted file mode 100644 index 53924013612..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py +++ /dev/null @@ -1,451 +0,0 @@ -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Post-processing utilities for question answering. -""" -""" -This script is based on HuggingFace/transformers examples: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/utils_qa.py -""" -import collections -import json -import logging -import os -from typing import Optional, Tuple - -import numpy as np -from tqdm.auto import tqdm -import sys -from collections import Counter -import string -import re -from collections import defaultdict -import torch - - -logger = logging.getLogger(__name__) - - -def postprocess_qa_predictions( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - tokenizer=None, - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - null_score_diff_threshold: float = 0.0, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - is_world_process_zero: bool = True, -): - """ - Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the - original contexts. This is the base postprocessing functions for models that only return start and end logits. - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. 
- version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): - The threshold used to select the null answer: if the best answer has a score that is less than the score of - the null answer minus this threshold, the null answer is selected for this example (note that the score of - the null answer for an example giving several features is the minimum of the scores for the null answer on - each feature: all features must be aligned on the fact they `want` to predict a null answer). - Only useful when :obj:`version_2_with_negative` is :obj:`True`. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this process is the main process or not (used to determine if logging/saves should be done). - """ - assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)." - all_start_logits, all_end_logits = predictions - - assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features." - - # Build a map example to its corresponding features. - example_id_to_index = {} - index = 0 - for qid in examples["id"]: - if qid in example_id_to_index: - continue - example_id_to_index[qid] = index - index += 1 - - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # Logging. - logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - qa_with_duplicates = defaultdict(list) - - for qid in tqdm(example_id_to_index): - - feature_indices = features_per_example[example_id_to_index[qid]] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - potential_answers = [] - # We grab the predictions of the model for this feature. - start_logits = all_start_logits[feature_index] - end_logits = all_end_logits[feature_index] - - input_ids = torch.tensor([features[feature_index]["input_ids"]]) - - # Go through all possibilities for the `n_best_size` greater start and end logits. 
- start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - - eos_token_indices = (input_ids == tokenizer.eos_token_id).nonzero() - question_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 0, 1] - doc_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 1, 1] - for start_index in start_indexes: - for end_index in end_indexes: - if start_index >= doc_end_index[0]: - continue - if end_index >= doc_end_index[0]: - continue - if start_index <= question_end_index[0]: - continue - if end_index <= question_end_index[0]: - continue - if start_index > end_index: - continue - answer_len = end_index - start_index + 1 - if answer_len > max_answer_length: - continue - potential_answers.append({'start': start_index, 'end': end_index, - 'start_logit': start_logits[start_index].item(), - 'end_logit': end_logits[end_index].item()}) - sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) - - if len(sorted_answers) == 0: - answer = {'text': 'NoAnswerFound', 'score': -1000000} - else: - answer = sorted_answers[0] - answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1] - answer_tokens = tokenizer.convert_ids_to_tokens(answer_token_ids.tolist()) - text = tokenizer.convert_tokens_to_string(answer_tokens) - score = answer['start_logit'] + answer['end_logit'] - answer = {'text': text, 'score': score} - - qa_with_duplicates[qid].append({'answer_score': answer['score'], 'answer_text': answer['text'], }) - - qid_to_answer_text = {} - for qid, answer_metrics in qa_with_duplicates.items(): - top_answer = sorted(answer_metrics, key=lambda x: x['answer_score'], reverse=True)[0] - qid_to_answer_text[qid] = top_answer['answer_text'] - - - # If we have an output_dir, let's save all those dicts. - if output_dir is not None: - assert os.path.isdir(output_dir), f"{output_dir} is not a directory." - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(qid_to_answer_text, indent=4) + "\n") - return qid_to_answer_text - - -def postprocess_qa_predictions_with_beam_search( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - start_n_top: int = 5, - end_n_top: int = 5, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - is_world_process_zero: bool = True, -): - """ - Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the - original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as - cls token predictions. - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. 
-        n_best_size (:obj:`int`, `optional`, defaults to 20):
-            The total number of n-best predictions to generate when looking for an answer.
-        max_answer_length (:obj:`int`, `optional`, defaults to 30):
-            The maximum length of an answer that can be generated. This is needed because the start and end predictions
-            are not conditioned on one another.
-        start_n_top (:obj:`int`, `optional`, defaults to 5):
-            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
-        end_n_top (:obj:`int`, `optional`, defaults to 5):
-            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
-        output_dir (:obj:`str`, `optional`):
-            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
-            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
-            answers, are saved in `output_dir`.
-        prefix (:obj:`str`, `optional`):
-            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
-    """
-    assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
-    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
-
-    assert len(predictions[0]) == len(
-        features
-    ), f"Got {len(predictions[0])} predictions and {len(features)} features."
-
-    # Build a map from each example to its corresponding features.
-    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
-    features_per_example = collections.defaultdict(list)
-    for i, feature in enumerate(features):
-        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
-
-    # The dictionaries we have to fill.
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
-
-    # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
-    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
-
-    # Let's loop over all the examples!
-    for example_index, example in enumerate(tqdm(examples)):
-        # Those are the indices of the features associated to the current example.
-        feature_indices = features_per_example[example_index]
-
-        min_null_score = None
-        prelim_predictions = []
-
-        # Looping through all the features associated to the current example.
-        for feature_index in feature_indices:
-            # We grab the predictions of the model for this feature.
-            start_log_prob = start_top_log_probs[feature_index]
-            start_indexes = start_top_index[feature_index]
-            end_log_prob = end_top_log_probs[feature_index]
-            end_indexes = end_top_index[feature_index]
-            feature_null_score = cls_logits[feature_index]
-            # This is what will allow us to map some of the positions in our logits to spans of text in the original
-            # context.
-            offset_mapping = features[feature_index]["offset_mapping"]
-            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
-            # available in the current feature.
-            token_is_max_context = features[feature_index].get("token_is_max_context", None)
-
-            # Update minimum null prediction
-            if min_null_score is None or feature_null_score < min_null_score:
-                min_null_score = feature_null_score
-
-            # Go through all possibilities for the `n_start_top`/`n_end_top` top start and end logits.
-            for i in range(start_n_top):
-                for j in range(end_n_top):
-                    start_index = int(start_indexes[i])
-                    j_index = i * end_n_top + j
-                    end_index = int(end_indexes[j_index])
-                    # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
-                    # p_mask but let's not take any risk)
-                    if (
-                        start_index >= len(offset_mapping)
-                        or end_index >= len(offset_mapping)
-                        or offset_mapping[start_index] is None
-                        or offset_mapping[end_index] is None
-                    ):
-                        continue
-                    # Don't consider answers with a length that is negative or greater than max_answer_length.
-                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
-                        continue
-                    # Don't consider answers that don't have the maximum context available (if such information is
-                    # provided).
-                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
-                        continue
-                    prelim_predictions.append(
-                        {
-                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
-                            "score": start_log_prob[i] + end_log_prob[j_index],
-                            "start_log_prob": start_log_prob[i],
-                            "end_log_prob": end_log_prob[j_index],
-                        }
-                    )
-
-        # Only keep the best `n_best_size` predictions.
-        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
-
-        # Use the offsets to gather the answer text in the original context.
-        context = example["context"]
-        for pred in predictions:
-            offsets = pred.pop("offsets")
-            pred["text"] = context[offsets[0] : offsets[1]]
-
-        # In the very rare edge case where we do not have a single non-null prediction, we create a fake prediction to
-        # avoid failure.
-        if len(predictions) == 0:
-            predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
-
-        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
-        # the LogSumExp trick).
-        scores = np.array([pred.pop("score") for pred in predictions])
-        exp_scores = np.exp(scores - np.max(scores))
-        probs = exp_scores / exp_scores.sum()
-
-        # Include the probabilities in our predictions.
-        for prob, pred in zip(probs, predictions):
-            pred["probability"] = prob
-
-        # Pick the best prediction and set the probability for the null answer.
-        all_predictions[example["id"]] = predictions[0]["text"]
-        if version_2_with_negative:
-            scores_diff_json[example["id"]] = float(min_null_score)
-
-        # Make `predictions` JSON-serializable by casting np.float back to float.
-        all_nbest_json[example["id"]] = [
-            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
-            for pred in predictions
-        ]
-
-    # If we have an output_dir, let's save all those dicts.
-    if output_dir is not None:
-        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
- - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - print(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - print(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - print(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions, scores_diff_json - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def handle_punc(text): - exclude = set(string.punctuation + "".join([u"‘", u"’", u"´", u"`"])) - return ''.join(ch if ch not in exclude else ' ' for ch in text) - - def lower(text): - return text.lower() - - def replace_underscore(text): - return text.replace('_', ' ') - - return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(s))))).strip() - -def f1_score(prediction, ground_truth): - prediction_tokens = normalize_answer(prediction).split() - ground_truth_tokens = normalize_answer(ground_truth).split() - common = Counter(prediction_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - -def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): - scores_for_ground_truths = [] - for ground_truth in ground_truths: - score = metric_fn(prediction, ground_truth) - scores_for_ground_truths.append(score) - return max(scores_for_ground_truths) - - -# Note: `get_ground_truths` comes from the official TriviaQA evaluation utilities; it is not defined in this file. -def is_exact_match(answer_object, prediction): - ground_truths = get_ground_truths(answer_object) - for ground_truth in ground_truths: - if exact_match_score(prediction, ground_truth): - return True - return False - - -def has_exact_match(ground_truths, candidates): - for ground_truth in ground_truths: - if ground_truth in candidates: - return True - return False - -def exact_match_score(prediction, ground_truth): - return int(normalize_answer(prediction) == normalize_answer(ground_truth)) - -def evaluate_triviaqa(references, predictions): - f1 = exact_match = common = total = 0 - for qa in references: - total += 1 - if qa["id"] not in predictions: - message = "Unanswered question " + qa["id"] + " will receive score 0."
- print(message, file=sys.stderr) - continue - common += 1 - prediction = predictions[qa["id"]] - ground_truths = qa["answers"]["text"] + qa["aliases"] - em_for_this_question = metric_max_over_ground_truths( - exact_match_score, prediction, ground_truths) - - exact_match += em_for_this_question - - f1_for_this_question = metric_max_over_ground_truths( - f1_score, prediction, ground_truths) - f1 += f1_for_this_question - exact_match = 100.0 * exact_match / total - f1 = 100.0 * f1 / total - - return {"exact_match": exact_match, "f1": f1} diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/README.md b/examples/huggingface/pytorch/question-answering/pruning/magnitude/README.md similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/README.md rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/README.md diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/magnitude/requirements.txt similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/requirements.txt rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/requirements.txt diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_benchmark.sh b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_benchmark.sh similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_benchmark.sh rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_benchmark.sh diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py similarity index 99% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py index 8a335ce64b3..7bc9b835440 100644 --- a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py +++ b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py @@ -657,7 +657,7 @@ def compute_metrics(p: EvalPrediction): max_eval_samples = data_args.max_eval_samples \ if data_args.max_eval_samples is not None else len(eval_dataset) eval_samples = min(max_eval_samples, len(eval_dataset)) - samples = eval_samples - (eval_samples % batch_size) \ + samples = eval_samples - (eval_samples % optim_args.batch_size) \ if training_args.dataloader_drop_last else eval_samples logger.info("metrics keys: {}".format(results.keys())) bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation', diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_tuning.sh b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_tuning.sh similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_tuning.sh rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_tuning.sh diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/trainer_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/trainer_qa.py similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/trainer_qa.py rename to 
examples/huggingface/pytorch/question-answering/pruning/magnitude/trainer_qa.py diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/utils_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/utils_qa.py similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/utils_qa.py rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/utils_qa.py diff --git a/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh b/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh index 8930936be2d..d4d1d24fe36 100644 --- a/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh @@ -70,7 +70,7 @@ function run_tuning { --evaluation_strategy steps \ --save_strategy steps \ --save_total_limit 1 \ - --safe_serialization False" + --save_safetensors False" elif [ "${topology}" = "bert_large_SQuAD_static" ]; then DATASET_NAME="squad" model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad" diff --git a/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py b/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py deleted file mode 100644 index 384464a9350..00000000000 --- a/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py +++ /dev/null @@ -1,575 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Finetuning a ๐Ÿค— Transformers model for sequence classification on GLUE.""" -import argparse -import logging -import math -import os -import random -import time -from pathlib import Path - -import pandas as pd # to read in different data - -import datasets -from datasets import load_dataset, load_metric -from torch.utils.data import DataLoader - -import transformers -from accelerate import Accelerator -from huggingface_hub import Repository -from intel_extension_for_transformers.transformers import metrics, NoTrainerOptimizer, objectives, OptimizedModel -from neural_compressor.config import ( - PostTrainingQuantConfig, - TuningCriterion, - AccuracyCriterion -) -from transformers import ( - AdamW, - AutoConfig, - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - PretrainedConfig, - SchedulerType, - default_data_collator, - get_scheduler, - set_seed, -) -from transformers.file_utils import get_full_repo_name -from transformers.utils.versions import require_version - -logger = logging.getLogger(__name__) - -require_version("datasets>=1.8.0", - "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Finetune a transformers model on a text classification task") - parser.add_argument( - "--task_name", - type=str, - default=None, - help="The name of the glue task to train on.", - choices=list(task_to_keys.keys()), - ) - parser.add_argument("--train_file", - type=str, - default=None, - help="A csv or a json file containing the training data.") - parser.add_argument("--validation_file", - type=str, - default=None, - help="A csv or a json file containing the validation data.") - parser.add_argument( - "--max_length", - type=int, - default=128, - help= - ("The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," - " sequences shorter will be padded if `--pad_to_max_lengh` is passed."), - ) - parser.add_argument( - "--pad_to_max_length", - action="store_true", - help="If passed, pad all samples to `max_length`. 
Otherwise, dynamic padding is used.", - ) - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=True, - ) - parser.add_argument( - "--use_slow_tokenizer", - action="store_true", - help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=8, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=8, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") - parser.add_argument("--num_train_epochs", - type=int, - default=3, - help="Total number of training epochs to perform.") - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of update steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="linear", - help="The scheduler type to use.", - choices=[ - "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", - "constant_with_warmup" - ], - ) - parser.add_argument("--num_warmup_steps", - type=int, - default=0, - help="Number of steps for the warmup in the lr scheduler.") - parser.add_argument("--output_dir", - type=str, - default=None, - help="Where to store the final model.") - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument("--push_to_hub", - action="store_true", - help="Whether or not to push the model to the Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - help="The name of the repository to keep in sync with the local `output_dir`.") - parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") - parser.add_argument("--tune", action="store_true", help="Tune the best model with Intel Extension for Transformers.") - parser.add_argument("--quantization_approach", - type=str, - default="static", - help="Quantization approach. 
Supported approaches are static, " - "dynamic and qat.") - parser.add_argument("--metric_name", - type=str, - default=None, - help="Metric name used for the tuning strategy.") - parser.add_argument("--is_relative", - type=bool, - default=True, - help="Metric tolerance mode, expected to be relative or absolute.") - parser.add_argument("--perf_tol", - type=float, - default=0.01, - help="Performance tolerance when optimizing the model.") - parser.add_argument("--benchmark", action="store_true", help="Run benchmark.") - parser.add_argument("--int8", action="store_true", help="Run benchmark with int8 model.") - parser.add_argument("--accuracy_only", - action="store_true", - help="Whether to only test accuracy for model tuned by Neural Compressor.") - parser.add_argument('-i', "--iter", default=0, type=int, help='For accuracy measurement only.') - parser.add_argument('-w', - "--warmup_iter", - default=1, - type=int, - help='For benchmark measurement only.') - args = parser.parse_args() - - # Sanity checks - if args.task_name is None and args.train_file is None and args.validation_file is None: - raise ValueError("Need either a task name or a training/validation file.") - else: - if args.train_file is not None: - extension = args.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if args.validation_file is not None: - extension = args.validation_file.split(".")[-1] - assert extension in ["csv", - "json"], "`validation_file` should be a csv or a json file." - - if args.push_to_hub: - assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." - - return args - - -def eval_func(args, model, accelerator, eval_dataloader, metric): - # Evaluation - batch_time = AverageMeter('Time', ':6.3f') - is_regression = args.task_name == "stsb" - model.eval() - for step, batch in enumerate(eval_dataloader): - if step >= args.warmup_iter: - start = time.time() - # forward pass - outputs = model(**batch) - # measure elapsed time - if step >= args.warmup_iter: - batch_time.update(time.time() - start) - predictions = outputs.logits.argmax( - dim=-1) if not is_regression else outputs.logits.squeeze() - metric.add_batch( - predictions=accelerator.gather(predictions), - references=accelerator.gather(batch["labels"]), - ) - eval_metric = metric.compute() - batch_size = args.per_device_eval_batch_size - print('Batch size = {}'.format(batch_size)) - print('Latency: %.3f ms' % (batch_time.avg / batch_size * 1000)) - print('Throughput: %.3f samples/sec' % (batch_size / batch_time.avg)) - logger.info(f"{eval_metric}") - return eval_metric - - -def main(): - # read in the arguments - args = parse_args() - - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state) - - # Set up logging: we only want one process per machine to log things on the screen. - # accelerator.is_local_main_process is only True for one process per machine.
- logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - repo = Repository(args.output_dir, clone_from=repo_name) - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - accelerator.wait_for_everyone() - - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). - - # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the - # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named - # label if at least two columns are provided. - - # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this - # single column. You can easily tweak this behavior (see below) - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - if args.task_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset("glue", args.task_name) - ''' - 06/25/2022 - Distilled-sparse training for bert_mini, on sst2 - pre-load an augmented dataset - ''' - else: - # Loading the dataset from local csv or json file. - data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - if args.validation_file is not None: - data_files["validation"] = args.validation_file - extension = (args.train_file - if args.train_file is not None else args.validation_file).split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - if args.task_name is not None: - is_regression = args.task_name == "stsb" - if not is_regression: - label_list = raw_datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - else: - # Trying to have good defaults here, don't hesitate to tweak to your needs. - is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] - if is_regression: - num_labels = 1 - else: - # A useful fast method: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique - label_list = raw_datasets["train"].unique("label") - label_list.sort() # Let's sort it for determinism - num_labels = len(label_list) - - # Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab.
- config = AutoConfig.from_pretrained(args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name) - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, - use_fast=not args.use_slow_tokenizer) - if args.int8: - # Load the model obtained after Intel Neural Compressor (INC) quantization - model = OptimizedModel.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config - ) - else: - model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config - ) - - # Preprocessing the datasets - if args.task_name is not None: - sentence1_key, sentence2_key = task_to_keys[args.task_name] - else: - # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. - non_label_column_names = [ - name for name in raw_datasets["train"].column_names if name != "label" - ] - if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: - sentence1_key, sentence2_key = "sentence1", "sentence2" - else: - if len(non_label_column_names) >= 2: - sentence1_key, sentence2_key = non_label_column_names[:2] - else: - sentence1_key, sentence2_key = non_label_column_names[0], None - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id - and args.task_name is not None and not is_regression): - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): - logger.info( - f"The configuration of the model provided the following label correspondence: {label_name_to_id}. " - "Using it!") - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: " - f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." - "\nIgnoring the model labels as a result." - ) - elif args.task_name is None: - label_to_id = {v: i for i, v in enumerate(label_list)} - - if label_to_id is not None: - model.config.label2id = label_to_id - model.config.id2label = {id: label for label, id in config.label2id.items()} - elif args.task_name is not None and not is_regression: - model.config.label2id = {l: i for i, l in enumerate(label_list)} - model.config.id2label = {id: label for label, id in config.label2id.items()} - - padding = "max_length" if args.pad_to_max_length else False - - def preprocess_function(examples): - # Tokenize the texts - texts = ((examples[sentence1_key], ) if sentence2_key is None else - (examples[sentence1_key], examples[sentence2_key])) - result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True) - - if "label" in examples: - if label_to_id is not None: - # Map labels to IDs (not necessary for GLUE tasks) - result["labels"] = [label_to_id[l] for l in examples["label"]] - else: - # In all cases, rename the column to labels because the model will expect that.
- result["labels"] = examples["label"] - - return result - - with accelerator.main_process_first(): - - # original process - processed_datasets = raw_datasets.map( - preprocess_function, - batched=True, - remove_columns=raw_datasets["train"].column_names, - desc="Running tokenizer on dataset", - ) - train_dataset = processed_datasets["train"] - eval_dataset = processed_datasets["validation_matched" if args.task_name == - "mnli" else "validation"] - #if use_augmented: - # test_dataset = processed_datasets["test"] - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - - # DataLoaders creation: - if args.pad_to_max_length: - # If padding was already done ot max length, we use the default data collator that will just convert everything - # to tensors. - data_collator = default_data_collator - else: - # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of - # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple - # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) - - train_dataloader = DataLoader(train_dataset, - shuffle=True, - collate_fn=data_collator, - batch_size=args.per_device_train_batch_size) - eval_dataloader = DataLoader(eval_dataset, - collate_fn=data_collator, - batch_size=args.per_device_eval_batch_size) - - # Optimizer - # Split weights in two groups, one with weight decay and the other not. - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": - [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - - # Prepare everything with our `accelerator`. - model, optimizer = accelerator.prepare(model, optimizer) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, - optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, - ) - - # Get the metric function - if args.task_name is not None: - metric = load_metric("glue", args.task_name) - else: - metric = load_metric("accuracy") - - metric_name = (args.metric_name if args.metric_name is not None else - ("pearson" if args.task_name == "stsb" else - "matthews_correlation" if args.task_name == "cola" else "accuracy")) - - def eval_func_nc(model): - ret = eval_func(args, model, accelerator, eval_dataloader, metric) - return ret[metric_name] - - # Train! 
- if args.tune: - if accelerator.is_main_process: - tokenizer.save_pretrained(args.output_dir) - config.save_pretrained(args.output_dir, - is_main_process=accelerator.is_main_process, - save_function=accelerator.save) - tune_metric = metrics.Metric(name=metric_name, - is_relative=args.is_relative, - criterion=args.perf_tol) - objective = objectives.performance - tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) - accuracy_criterion = AccuracyCriterion( - higher_is_better=True, # optional; accuracy-style metrics are higher-is-better. - criterion="relative" if args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". - tolerable_loss=args.perf_tol, # optional. - ) - quantization_config = PostTrainingQuantConfig( - approach=args.quantization_approach, - tuning_criterion=tuning_criterion, - accuracy_criterion=accuracy_criterion - ) - quantizer = NoTrainerOptimizer(model, args.output_dir) - quantizer.metrics = tune_metric - model = quantizer.quantize(quantization_config, - eval_func=eval_func_nc, - calib_dataloader=train_dataloader) - - if args.benchmark or args.accuracy_only: - results = eval_func(args, model, accelerator, eval_dataloader, metric) - print("Final eval {} accuracy: {:.5f}".format(metric_name, results[metric_name])) - - -class AverageMeter(object): - """Computes and stores the average and current value.""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py index 609f904cce2..15c0929350c 100644 --- a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py +++ b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py @@ -30,12 +30,13 @@ from accelerate.utils import set_seed from diffusers import StableDiffusionPipeline -from intel_extension_for_transformers.transformers import metrics , NoTrainerOptimizer +from intel_extension_for_transformers.transformers import metrics from neural_compressor.config import ( PostTrainingQuantConfig, TuningCriterion, AccuracyCriterion ) +from neural_compressor.quantization import fit from intel_extension_for_transformers.transformers.config import WEIGHTS_NAME from pytorch_fid import fid_score @@ -318,9 +319,8 @@ def eval_func(model): accuracy_criterion=accuracy_criterion ) os.makedirs(args.output_dir, exist_ok=True) - quantizer = NoTrainerOptimizer(model, args.output_dir) - quantizer.metrics = tune_metric - model = quantizer.quantize(quantization_config, + model = fit(model, + quantization_config, eval_func=eval_func, calib_func=calibration_func, calib_dataloader=DataLoader(CalibDataset(), batch_size=1), diff --git a/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh b/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh index 5a8ced4ea90..ce43f4c35b9 100644 --- a/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh @@ -69,7 +69,7 @@ function run_tuning {
--evaluation_strategy steps \ --save_strategy steps \ --save_total_limit 1 \ - --safe_serialization False" + --save_safetensors False" fi python -u ./run_ner.py \ diff --git a/intel_extension_for_transformers/transformers/trainer.py b/intel_extension_for_transformers/transformers/trainer.py index 0d14d3843e4..4d4b378353e 100644 --- a/intel_extension_for_transformers/transformers/trainer.py +++ b/intel_extension_for_transformers/transformers/trainer.py @@ -289,7 +289,7 @@ def quantize( """The main entry point of automatic quantization tuning. Args: - quant_config: The path to the YAML configuration file or QuantizationConfig class containing + quant_config: QuantizationConfig class containing accuracy goal, quantization objective and related dataloaders etc. provider: The provider used to quantize. eval_func (:obj:`Callable`, optional): The function used to evaluate the model. diff --git a/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py b/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py index 9e3ba13c89d..3cdab98655f 100644 --- a/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py +++ b/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py @@ -24,9 +24,6 @@ Trainer, ) from intel_extension_for_transformers.transformers import ( - QuantizationConfig, - PruningConfig, - PrunerConfig, metrics, objectives, )
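Taken together, these changes converge on one post-training quantization flow: build a `PostTrainingQuantConfig` from a `TuningCriterion` and an `AccuracyCriterion`, then hand it to `neural_compressor.quantization.fit` together with an evaluation function and a calibration dataloader. The sketch below restates that flow in one place; `model`, `calib_dataloader`, and `eval_func` are illustrative placeholders (any PyTorch module, a `DataLoader` of calibration samples, and a callable returning a single accuracy number), not names from this patch.

```python
# Minimal sketch of the fit()-based PTQ flow the examples above migrate to.
# `model`, `calib_dataloader`, and `eval_func` are assumed to be defined by the caller.
from neural_compressor.config import (
    AccuracyCriterion,
    PostTrainingQuantConfig,
    TuningCriterion,
)
from neural_compressor.quantization import fit

tuning_criterion = TuningCriterion(max_trials=600)  # stop after at most 600 tuning trials
accuracy_criterion = AccuracyCriterion(
    criterion="relative",   # interpret tolerable_loss as a relative drop
    tolerable_loss=0.01,    # accept at most a 1% accuracy regression
)
conf = PostTrainingQuantConfig(
    approach="static",      # or "dynamic"
    tuning_criterion=tuning_criterion,
    accuracy_criterion=accuracy_criterion,
)

# fit() calibrates on calib_dataloader, quantizes, and keeps tuning until
# eval_func(q_model) stays within the accuracy criterion.
q_model = fit(model, conf, eval_func=eval_func, calib_dataloader=calib_dataloader)
```

The trainer-based path reaches the same configuration through `trainer.quantize(quant_config=...)`, whose updated docstring above now expects a quantization config object rather than a YAML path, with the tuning metric assigned via the trainer's `metrics` attribute.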