From 7939e13e160a36fe5321de23af41b29aeb1e104b Mon Sep 17 00:00:00 2001 From: Juheon Chu Date: Sun, 27 Nov 2022 04:52:52 -0500 Subject: [PATCH 01/16] Added support for Tapas Model --- optimum/bettertransformer/models/__init__.py | 5 + .../models/encoder_models.py | 106 ++++++++++++++++++ .../test_bettertransformer_encoder.py | 7 +- 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index fd9c7a1330c..931fe1a8aa7 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -23,6 +23,8 @@ ViTLayerBetterTransformer, Wav2Vec2EncoderLayerBetterTransformer, WhisperEncoderLayerBetterTransformer, + TapasLayerBetterTransformer, + ) @@ -70,6 +72,9 @@ # FSMTModel: "EncoderLayer": FSMTEncoderLayerBetterTransformer, "ViltLayer": ViltLayerBetterTransformer, + + # Tapas Model + "TapasLayer": TapasLayerBetterTransformer, } diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index 1acea21f4e3..ab3c1b8256b 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -16,6 +16,112 @@ from .base import BetterTransformerBaseLayer +class TapasLayerBetterTransformer: + def _init_(self, tapas_layer, config): + r""" + A simple conversion of the TAPAS layer to its `BetterTransformer` implementation. + + Args: + tapas_layer (`torch.nn.Module`): + The original TAPAS Layer where the weights needs to be retrieved. + """ + super().__init__(config) + # In_proj layer + self.in_proj_weight = nn.Parameter( + torch.cat( + [ + tapas_layer.attention.query.weight, + tapas_layer.attention.key.weight, + tapas_layer.attention.value.weight, + ] + ) + ) + self.in_proj_bias = nn.Parameter( + torch.cat( + [ + tapas_layer.attention.query.bias, + tapas_layer.attention.key.bias, + tapas_layer.attention.value.bias, + ] + ) + ) + + # Out proj layer + self.out_proj_weight = tapas_layer.attention.dense.weight + self.out_proj_bias = tapas_layer.attention.dense.bias + + # Linear layer 1 + self.linear1_weight = tapas_layer.ffn.weight + self.linear1_bias = tapas_layer.ffn.bias + + # Linear layer 2 + self.linear2_weight = tapas_layer.ffn_output.weight + self.linear2_bias = tapas_layer.ffn_output.bias + + # Layer norm 1 + self.norm1_eps = tapas_layer.attention.LayerNorm.eps + self.norm1_weight = tapas_layer.attention.LayerNorm.weight + self.norm1_bias = tapas_layer.attention.LayerNorm.bias + + # Layer norm 2 + self.norm2_eps = tapas_layer.full_layer_layer_norm.eps + self.norm2_weight = tapas_layer.full_layer_layer_norm.weight + self.norm2_bias = tapas_layer.full_layer_layer_norm.bias + + # Model hyper parameters + self.num_heads = tapas_layer.attention.num_attention_heads + self.embed_dim = tapas_layer.attention.all_head_size + + # Last step: set the last layer to `False` -> this will be set to `True` when converting the model + self.is_last_layer = False + + self.validate_bettertransformer() + + def forward(self, hidden_states, attention_mask, *_): + r""" + This is just a wrapper around the forward function proposed in: + https://github.com/huggingface/transformers/pull/19553 + """ + super().forward_checker() + + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + seqlen = attention_mask.shape[1] + lengths = torch.sum(~attention_mask, 1) + if not all([l == seqlen for l in lengths]): + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + return (hidden_states,) class AlbertLayerBetterTransformer(BetterTransformerBaseLayer): def __init__(self, albert_layer, config): diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index 4fe12b45849..f4f22ca1b85 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -49,6 +49,7 @@ ALL_ENCODER_DECODER_MODELS_TO_TEST = [ "hf-internal-testing/tiny-random-FSMTModel", "hf-internal-testing/tiny-random-BartModel", + "hf-internal-testing/tiny-random-TapasModel", ] @@ -87,6 +88,10 @@ def _loop_all_classes(self): elif layer_class == "TransformerBlock": # Hardcode it for distilbert - see https://github.com/huggingface/transformers/pull/19966 class_name = "DistilBert" + + elif layer_class == "TapasLayer": + class_name = "Tapas" + elif "EncoderLayer" in layer_class: class_name = layer_class[:-12] else: @@ -275,7 +280,7 @@ def tearDown(self): gc.collect() def prepare_inputs_for_class(self, model_id=None): - input_dict = { + input_dlsict = { "input_ids": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]), "attention_mask": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]]), "decoder_input_ids": torch.LongTensor([[0], [0]]), From 283186a174b23f4ea8abcb35489e35d443a5327c Mon Sep 17 00:00:00 2001 From: Juheon Chu Date: Sun, 27 Nov 2022 06:17:09 -0500 Subject: [PATCH 02/16] Added support for Tapas Model --- docs/source/bettertransformer/overview.mdx | 33 ++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/source/bettertransformer/overview.mdx b/docs/source/bettertransformer/overview.mdx index 441075e57ec..a542ca33488 100644 --- a/docs/source/bettertransformer/overview.mdx +++ b/docs/source/bettertransformer/overview.mdx @@ -14,7 +14,6 @@ specific language governing permissions and limitations under the License. 🤗 Optimum provides an integration with `BetterTransformer`, a stable API from PyTorch to benefit from interesting speedups on CPU & GPU through sparsity and fused kernels. - ## Quickstart Since its 1.13 version, PyTorch released the stable version of `BetterTransformer` in its library. You can benefit from interesting speedup on most consumer-type devices, including CPUs, older and newer versions of NIVIDIA GPUs. 
@@ -23,6 +22,7 @@ You can now use this feature in 🤗 Optimum together with Transformers and use ### Supported models The list of supported model below: + - [AlBERT](https://arxiv.org/abs/1909.11942) - [BART](https://arxiv.org/abs/1910.13461) - [BERT](https://arxiv.org/abs/1810.04805) @@ -39,6 +39,7 @@ The list of supported model below: - [MarkupLM](https://arxiv.org/abs/2110.08518) - [RoBERTa](https://arxiv.org/abs/1907.11692) - [Splinter](https://arxiv.org/abs/2101.00438) +- [Tapas](https://arxiv.org/abs/2211.06550) - [ViLT](https://arxiv.org/abs/2102.03334) - [ViT](https://arxiv.org/abs/2010.11929) - [ViT-MAE](https://arxiv.org/abs/2111.06377) @@ -60,20 +61,36 @@ In order to use the `BetterTransformer` API just run the following commands: >>> model_hf = AutoModelForSequenceClassification.from_pretrained("bert-base-cased") >>> model = BetterTransformer.transform(model_hf, keep_original_model=True) ``` + You can leave `keep_original_model=False` in case you want to overwrite the current model with its `BetterTransformer` version. More details on `tutorials` section to deeply understand how to use it, or check the [Google colab demo](https://colab.research.google.com/drive/1Lv2RCG_AT6bZNdlL1oDDNNiwBBuirwI-?usp=sharing)! - From 7c24a426905aa0fe5956576352a59c1c3f616b76 Mon Sep 17 00:00:00 2001 From: Juheon Chu Date: Mon, 28 Nov 2022 08:50:56 -0500 Subject: [PATCH 03/16] reformatted files with black --- optimum/bettertransformer/models/__init__.py | 2 -- optimum/bettertransformer/models/encoder_models.py | 2 ++ tests/bettertransformer/test_bettertransformer_encoder.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index 931fe1a8aa7..642c5d29736 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -24,7 +24,6 @@ Wav2Vec2EncoderLayerBetterTransformer, WhisperEncoderLayerBetterTransformer, TapasLayerBetterTransformer, - ) @@ -72,7 +71,6 @@ # FSMTModel: "EncoderLayer": FSMTEncoderLayerBetterTransformer, "ViltLayer": ViltLayerBetterTransformer, - # Tapas Model "TapasLayer": TapasLayerBetterTransformer, } diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index ab3c1b8256b..e491f729314 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -16,6 +16,7 @@ from .base import BetterTransformerBaseLayer + class TapasLayerBetterTransformer: def _init_(self, tapas_layer, config): r""" @@ -123,6 +124,7 @@ def forward(self, hidden_states, attention_mask, *_): hidden_states = hidden_states.to_padded_tensor(0.0) return (hidden_states,) + class AlbertLayerBetterTransformer(BetterTransformerBaseLayer): def __init__(self, albert_layer, config): r""" diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index f4f22ca1b85..f3a7726e4fd 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -88,10 +88,10 @@ def _loop_all_classes(self): elif layer_class == "TransformerBlock": # Hardcode it for distilbert - see https://github.com/huggingface/transformers/pull/19966 class_name = "DistilBert" - + elif layer_class == "TapasLayer": class_name = "Tapas" - + elif "EncoderLayer" in layer_class: class_name = layer_class[:-12] else: From 
12d61545ef3f07062dc41b62e648fe0b77b05036 Mon Sep 17 00:00:00 2001 From: JuheonChu <35699839+JuheonChu@users.noreply.github.com> Date: Mon, 28 Nov 2022 08:52:00 -0500 Subject: [PATCH 04/16] Update tests/bettertransformer/test_bettertransformer_encoder.py test_better_encoder Co-authored-by: Michael Benayoun --- tests/bettertransformer/test_bettertransformer_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index f3a7726e4fd..70ef5cc886d 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -280,7 +280,7 @@ def tearDown(self): gc.collect() def prepare_inputs_for_class(self, model_id=None): - input_dlsict = { + input_dict = { "input_ids": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]), "attention_mask": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]]), "decoder_input_ids": torch.LongTensor([[0], [0]]), From 14b35ca29974b4d31dd360914137773c95165632 Mon Sep 17 00:00:00 2001 From: JuheonChu <35699839+JuheonChu@users.noreply.github.com> Date: Mon, 28 Nov 2022 08:52:17 -0500 Subject: [PATCH 05/16] Update optimum/bettertransformer/models/encoder_models.py Co-authored-by: Michael Benayoun --- optimum/bettertransformer/models/encoder_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index e491f729314..9b2847ea8f8 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -78,7 +78,7 @@ def _init_(self, tapas_layer, config): self.validate_bettertransformer() - def forward(self, hidden_states, attention_mask, *_): + def forward(self, hidden_states, attention_mask, *_, **__): r""" This is just a wrapper around the forward function proposed in: https://github.com/huggingface/transformers/pull/19553 From d2e6ebb62ee2569be86d8a79e84a13546cbc9391 Mon Sep 17 00:00:00 2001 From: JuheonChu <35699839+JuheonChu@users.noreply.github.com> Date: Mon, 28 Nov 2022 10:16:46 -0500 Subject: [PATCH 06/16] Update tests/bettertransformer/test_bettertransformer_encoder.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- tests/bettertransformer/test_bettertransformer_encoder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index 70ef5cc886d..721f6a530a1 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -88,10 +88,8 @@ def _loop_all_classes(self): elif layer_class == "TransformerBlock": # Hardcode it for distilbert - see https://github.com/huggingface/transformers/pull/19966 class_name = "DistilBert" - elif layer_class == "TapasLayer": class_name = "Tapas" - elif "EncoderLayer" in layer_class: class_name = layer_class[:-12] else: From 762a804b3d7bba63267fc46befe11f0636c73404 Mon Sep 17 00:00:00 2001 From: Juheon Chu Date: Mon, 28 Nov 2022 11:25:58 -0500 Subject: [PATCH 07/16] Styled optimum files --- optimum/bettertransformer/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index 642c5d29736..d22a5151ec4 100644 
--- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -19,11 +19,11 @@ BertLayerBetterTransformer, DistilBertLayerBetterTransformer, FSMTEncoderLayerBetterTransformer, + TapasLayerBetterTransformer, ViltLayerBetterTransformer, ViTLayerBetterTransformer, Wav2Vec2EncoderLayerBetterTransformer, WhisperEncoderLayerBetterTransformer, - TapasLayerBetterTransformer, ) From 4047267cb2de3005064e67ff94024afa9cfb9594 Mon Sep 17 00:00:00 2001 From: JuheonChu <35699839+JuheonChu@users.noreply.github.com> Date: Mon, 28 Nov 2022 12:26:38 -0500 Subject: [PATCH 08/16] Update optimum/bettertransformer/models/encoder_models.py Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- optimum/bettertransformer/models/encoder_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index 9b2847ea8f8..cd41e425221 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -18,7 +18,7 @@ class TapasLayerBetterTransformer: - def _init_(self, tapas_layer, config): + def __init__(self, tapas_layer, config): r""" A simple conversion of the TAPAS layer to its `BetterTransformer` implementation. From ac716cc249816ede5410f486cb2652b4b3f430c2 Mon Sep 17 00:00:00 2001 From: JuheonChu <35699839+JuheonChu@users.noreply.github.com> Date: Mon, 28 Nov 2022 16:39:54 -0500 Subject: [PATCH 09/16] Update optimum/bettertransformer/models/encoder_models.py Call super() in the _init() to inherit from BetterTransformerBaseLayer Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- optimum/bettertransformer/models/encoder_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index cd41e425221..f23a24985e5 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -17,7 +17,7 @@ from .base import BetterTransformerBaseLayer -class TapasLayerBetterTransformer: +class TapasLayerBetterTransformer(BetterTransformerBaseLayer): def __init__(self, tapas_layer, config): r""" A simple conversion of the TAPAS layer to its `BetterTransformer` implementation. 
From bbd625e5cb1cd88cc4b91aeca85d47d2dec1f2e8 Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 04:52:12 -0500 Subject: [PATCH 10/16] Update tests/bettertransformer/test_bettertransformer_encoder.py Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- tests/bettertransformer/test_bettertransformer_encoder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index 721f6a530a1..859320b1270 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -88,8 +88,6 @@ def _loop_all_classes(self): elif layer_class == "TransformerBlock": # Hardcode it for distilbert - see https://github.com/huggingface/transformers/pull/19966 class_name = "DistilBert" - elif layer_class == "TapasLayer": - class_name = "Tapas" elif "EncoderLayer" in layer_class: class_name = layer_class[:-12] else: From fded9ac7e2c5942b1335ece80f8530ee78af4d2b Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 04:55:45 -0500 Subject: [PATCH 11/16] Moved Tapas Encoder model to Encoder --- tests/bettertransformer/test_bettertransformer_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bettertransformer/test_bettertransformer_encoder.py b/tests/bettertransformer/test_bettertransformer_encoder.py index 859320b1270..4d66bc0cdaf 100644 --- a/tests/bettertransformer/test_bettertransformer_encoder.py +++ b/tests/bettertransformer/test_bettertransformer_encoder.py @@ -44,12 +44,12 @@ "hf-internal-testing/tiny-random-MarkupLMModel", "hf-internal-testing/tiny-random-BertModel", "ybelkada/random-tiny-BertGenerationModel", + "hf-internal-testing/tiny-random-TapasModel", ] ALL_ENCODER_DECODER_MODELS_TO_TEST = [ "hf-internal-testing/tiny-random-FSMTModel", "hf-internal-testing/tiny-random-BartModel", - "hf-internal-testing/tiny-random-TapasModel", ] From 7fcc61605b5d7a36fde11c550c5eeaf2f44b9aea Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 06:21:05 -0500 Subject: [PATCH 12/16] change mapping in __init_.py --- optimum/bettertransformer/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index d22a5151ec4..2ce196f1a56 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -72,7 +72,7 @@ "EncoderLayer": FSMTEncoderLayerBetterTransformer, "ViltLayer": ViltLayerBetterTransformer, # Tapas Model - "TapasLayer": TapasLayerBetterTransformer, + "TapasLayer": BertLayerBetterTransformer, } From 579f889871e860675e27af8c7096319df0cf9cec Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 08:21:54 -0500 Subject: [PATCH 13/16] deleted --- optimum/bettertransformer/models/encoder_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index f23a24985e5..6e77f0793d2 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -17,7 +17,6 @@ from .base import BetterTransformerBaseLayer -class 
TapasLayerBetterTransformer(BetterTransformerBaseLayer): def __init__(self, tapas_layer, config): r""" A simple conversion of the TAPAS layer to its `BetterTransformer` implementation. From fe547eb2b7b6589edac6ecad4b9a69ab6724f6bd Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 08:25:21 -0500 Subject: [PATCH 14/16] Update __init__.py --- optimum/bettertransformer/models/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index 2ce196f1a56..f344e00ed7c 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -19,7 +19,6 @@ BertLayerBetterTransformer, DistilBertLayerBetterTransformer, FSMTEncoderLayerBetterTransformer, - TapasLayerBetterTransformer, ViltLayerBetterTransformer, ViTLayerBetterTransformer, Wav2Vec2EncoderLayerBetterTransformer, @@ -28,6 +27,8 @@ BETTER_TRANFORMER_LAYERS_MAPPING_DICT = { + # Tapas Model + "TapasLayer": BertLayerBetterTransformer, # Bert Family "BertLayer": BertLayerBetterTransformer, "ElectraLayer": BertLayerBetterTransformer, @@ -71,8 +72,6 @@ # FSMTModel: "EncoderLayer": FSMTEncoderLayerBetterTransformer, "ViltLayer": ViltLayerBetterTransformer, - # Tapas Model - "TapasLayer": BertLayerBetterTransformer, } From 1d64709842e29c90b798bfeb76879e7220364abc Mon Sep 17 00:00:00 2001 From: Juheon Chu <35699839+JuheonChu@users.noreply.github.com> Date: Tue, 29 Nov 2022 08:56:17 -0500 Subject: [PATCH 15/16] Update __init__.py --- optimum/bettertransformer/models/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index f344e00ed7c..46931f0189a 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -27,9 +27,9 @@ BETTER_TRANFORMER_LAYERS_MAPPING_DICT = { - # Tapas Model - "TapasLayer": BertLayerBetterTransformer, + # Bert Family + "TapasLayer": BertLayerBetterTransformer, "BertLayer": BertLayerBetterTransformer, "ElectraLayer": BertLayerBetterTransformer, "Data2VecTextLayer": BertLayerBetterTransformer, From 64e644cc164efad79d139055bdef68795adc4874 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 10:26:02 +0000 Subject: [PATCH 16/16] refactor doc + remove class + styling --- docs/source/bettertransformer/overview.mdx | 30 ++--- optimum/bettertransformer/models/__init__.py | 1 - .../models/encoder_models.py | 107 ------------------ 3 files changed, 7 insertions(+), 131 deletions(-) diff --git a/docs/source/bettertransformer/overview.mdx b/docs/source/bettertransformer/overview.mdx index a542ca33488..427c0ec381c 100644 --- a/docs/source/bettertransformer/overview.mdx +++ b/docs/source/bettertransformer/overview.mdx @@ -61,36 +61,20 @@ In order to use the `BetterTransformer` API just run the following commands: >>> model_hf = AutoModelForSequenceClassification.from_pretrained("bert-base-cased") >>> model = BetterTransformer.transform(model_hf, keep_original_model=True) ``` - You can leave `keep_original_model=False` in case you want to overwrite the current model with its `BetterTransformer` version. More details on `tutorials` section to deeply understand how to use it, or check the [Google colab demo](https://colab.research.google.com/drive/1Lv2RCG_AT6bZNdlL1oDDNNiwBBuirwI-?usp=sharing)! 
+ diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index 46931f0189a..02616b39e34 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -27,7 +27,6 @@ BETTER_TRANFORMER_LAYERS_MAPPING_DICT = { - # Bert Family "TapasLayer": BertLayerBetterTransformer, "BertLayer": BertLayerBetterTransformer, diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index 6e77f0793d2..1acea21f4e3 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -17,113 +17,6 @@ from .base import BetterTransformerBaseLayer - def __init__(self, tapas_layer, config): - r""" - A simple conversion of the TAPAS layer to its `BetterTransformer` implementation. - - Args: - tapas_layer (`torch.nn.Module`): - The original TAPAS Layer where the weights needs to be retrieved. - """ - super().__init__(config) - # In_proj layer - self.in_proj_weight = nn.Parameter( - torch.cat( - [ - tapas_layer.attention.query.weight, - tapas_layer.attention.key.weight, - tapas_layer.attention.value.weight, - ] - ) - ) - self.in_proj_bias = nn.Parameter( - torch.cat( - [ - tapas_layer.attention.query.bias, - tapas_layer.attention.key.bias, - tapas_layer.attention.value.bias, - ] - ) - ) - - # Out proj layer - self.out_proj_weight = tapas_layer.attention.dense.weight - self.out_proj_bias = tapas_layer.attention.dense.bias - - # Linear layer 1 - self.linear1_weight = tapas_layer.ffn.weight - self.linear1_bias = tapas_layer.ffn.bias - - # Linear layer 2 - self.linear2_weight = tapas_layer.ffn_output.weight - self.linear2_bias = tapas_layer.ffn_output.bias - - # Layer norm 1 - self.norm1_eps = tapas_layer.attention.LayerNorm.eps - self.norm1_weight = tapas_layer.attention.LayerNorm.weight - self.norm1_bias = tapas_layer.attention.LayerNorm.bias - - # Layer norm 2 - self.norm2_eps = tapas_layer.full_layer_layer_norm.eps - self.norm2_weight = tapas_layer.full_layer_layer_norm.weight - self.norm2_bias = tapas_layer.full_layer_layer_norm.bias - - # Model hyper parameters - self.num_heads = tapas_layer.attention.num_attention_heads - self.embed_dim = tapas_layer.attention.all_head_size - - # Last step: set the last layer to `False` -> this will be set to `True` when converting the model - self.is_last_layer = False - - self.validate_bettertransformer() - - def forward(self, hidden_states, attention_mask, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - - if hidden_states.is_nested: - attention_mask = None - - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - seqlen = attention_mask.shape[1] - lengths = torch.sum(~attention_mask, 1) - if not all([l == seqlen for l in lengths]): - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) - return (hidden_states,) - - class AlbertLayerBetterTransformer(BetterTransformerBaseLayer): def __init__(self, albert_layer, config): r"""
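
The patch series above only ends up touching the layer mapping (`"TapasLayer": BertLayerBetterTransformer`), the supported-models list in the docs, and the encoder tests. For readers who want to see what the added support looks like from the user side, here is a minimal sketch, not part of the patches themselves: it reuses the tiny test checkpoint name and the dummy `input_ids`/`attention_mask` tensors that appear in `test_bettertransformer_encoder.py`, and follows the `BetterTransformer.transform(..., keep_original_model=True)` call shown in the overview quickstart. The use of `AutoModel` and the final shape print are assumptions added for illustration.

```python
# Hedged sketch: exercising the TAPAS -> BetterTransformer mapping added by this PR.
# Checkpoint name and dummy tensors are taken from the PR's test file; everything
# else (AutoModel, the shape print) is illustrative only.
import torch
from transformers import AutoModel
from optimum.bettertransformer import BetterTransformer

# Tiny random TAPAS checkpoint used by the encoder tests in this PR.
model_hf = AutoModel.from_pretrained("hf-internal-testing/tiny-random-TapasModel")

# Swap the TAPAS encoder layers for their BetterTransformer counterparts.
# keep_original_model=True returns a converted copy and leaves model_hf untouched;
# keep_original_model=False would convert in place instead.
model = BetterTransformer.transform(model_hf, keep_original_model=True)

# Toy forward pass mirroring the padded/unpadded batch built in
# prepare_inputs_for_class() in the test file.
input_ids = torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]])

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

print(outputs.last_hidden_state.shape)  # (2, 6, hidden_size) for this tiny model
```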