From 9ade58f0555430cec851e307c83c3a56c4a77d0b Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Tue, 2 May 2023 17:20:02 +0200 Subject: [PATCH 001/935] [ONNX] Sam fix (#23110) * [WIP] Fix for the ONNX export * Apply changes * Remove commented code * Resolve todo * empty -> zeros * fix slow tests --------- Co-authored-by: younesbelkada --- src/transformers/models/sam/modeling_sam.py | 49 +++++++++++---------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 393f20cfc83e..bf14a4b2413a 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -223,9 +223,7 @@ def _separate_heads(self, hidden_states: Tensor, num_attention_heads: int) -> Te def _recombine_heads(self, hidden_states: Tensor, point_batch_size: int) -> Tensor: batch, n_heads, n_tokens, c_per_head = hidden_states.shape hidden_states = hidden_states.transpose(1, 2) - return hidden_states.reshape( - batch // max(1, point_batch_size), point_batch_size, n_tokens, n_heads * c_per_head - ) + return hidden_states.reshape(batch // point_batch_size, point_batch_size, n_tokens, n_heads * c_per_head) def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tensor: # Input projections @@ -482,7 +480,7 @@ def forward( Whether or not to return the attentions tensors of all attention layers. """ batch_size, num_channels, height, width = image_embeddings.shape - point_batch_size = max(1, sparse_prompt_embeddings.shape[1]) + point_batch_size = sparse_prompt_embeddings.shape[1] # Concatenate output tokens output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0) output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1) @@ -634,8 +632,18 @@ def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) - torch.tensor(0.0, dtype=point_embedding.dtype, device=point_embedding.device), ) - point_embedding[labels == 0] += self.point_embed[0].weight - point_embedding[labels == 1] += self.point_embed[1].weight + point_embedding = torch.where( + (labels == 0)[:, :, :, None], + point_embedding + self.point_embed[0].weight[None, None, :, :], + point_embedding, + ) + + point_embedding = torch.where( + (labels == 1)[:, :, :, None], + point_embedding + self.point_embed[1].weight[None, None, :, :], + point_embedding, + ) + return point_embedding def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: @@ -675,8 +683,7 @@ def forward( if input_labels is None: raise ValueError("If points are provided, labels must also be provided.") point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None)) - sparse_embeddings = torch.empty((batch_size, point_batch_size, 0, self.hidden_size), device=target_device) - sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=2) + sparse_embeddings = point_embeddings if input_boxes is not None: batch_size = input_boxes.shape[0] box_embeddings = self._embed_boxes(input_boxes) @@ -692,7 +699,7 @@ def forward( ) if sparse_embeddings is None: - sparse_embeddings = torch.empty((batch_size, 0, 1, self.hidden_size), device=target_device) + sparse_embeddings = torch.zeros((batch_size, 1, 1, self.hidden_size), device=target_device) return sparse_embeddings, dense_embeddings @@ -742,17 +749,13 @@ def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch. Extracted positional embeddings according to relative positions. 
""" max_rel_dist = int(2 * max(q_size, k_size) - 1) - # Interpolate rel pos if needed. - if rel_pos.shape[0] != max_rel_dist: - # Interpolate rel pos. - rel_pos_resized = F.interpolate( - rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), - size=max_rel_dist, - mode="linear", - ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) - else: - rel_pos_resized = rel_pos + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) @@ -865,8 +868,7 @@ def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> Tup pad_h = (window_size - height % window_size) % window_size pad_w = (window_size - width % window_size) % window_size - if pad_h > 0 or pad_w > 0: - hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) pad_height, pad_width = height + pad_h, width + pad_w hidden_states = hidden_states.reshape( @@ -902,8 +904,7 @@ def window_unpartition( hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1) ) - if pad_height > height or pad_width > width: - hidden_states = hidden_states[:, :height, :width, :].contiguous() + hidden_states = hidden_states[:, :height, :width, :].contiguous() return hidden_states def forward( From 805db1fe13b3155d61ac5571439f5d619e47022f Mon Sep 17 00:00:00 2001 From: Alex Punnen Date: Tue, 2 May 2023 22:37:30 +0530 Subject: [PATCH 002/935] num_noise_spans should be <= num_items #22246 (#22938) --- examples/flax/language-modeling/run_t5_mlm_flax.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index 152760f4bf4b..f3cec97b2e82 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -418,13 +418,14 @@ def random_spans_noise_mask(self, length): orig_length = length num_noise_tokens = int(np.round(length * self.noise_density)) + num_nonnoise_tokens = length - num_noise_tokens # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) - num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length)) + # num_noise_tokens should be less than num_noise_tokens and num_nonnoise_tokens + num_noise_spans = int(np.round(min(num_noise_tokens, num_nonnoise_tokens) / self.mean_noise_span_length)) # avoid degeneracy by ensuring positive number of noise spans num_noise_spans = max(num_noise_spans, 1) - num_nonnoise_tokens = length - num_noise_tokens # pick the lengths of the noise spans and the non-noise spans def _random_segmentation(num_items, num_segments): From 3ff89f29f542dc0e16b48e7b8d7710555f454d5c Mon Sep 17 00:00:00 2001 From: "Gregory (Gabriel) Barello" <48561156+gbarello-uipath@users.noreply.github.com> Date: Tue, 2 May 2023 10:40:41 -0700 Subject: [PATCH 003/935] Fixed default config for `Pix2Struct` model to set `Pix2StructTextModel` to `is_decoder=True` (#23051) added as default keyword arg. 
to in order to correctly configure the decoder --- src/transformers/models/pix2struct/configuration_pix2struct.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 244cb2705867..32aa34941f8f 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -118,6 +118,7 @@ def __init__( pad_token_id=0, eos_token_id=1, tie_word_embeddings=False, + is_decoder=True, **kwargs, ): self.vocab_size = vocab_size @@ -144,6 +145,7 @@ def __init__( eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, tie_word_embeddings=tie_word_embeddings, + is_decoder=is_decoder, **kwargs, ) From 4b6aecb48e4961efef9edb8062dbbdd1f3d9385e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 2 May 2023 22:02:39 -0400 Subject: [PATCH 004/935] Pin numba for now (#23118) --- setup.py | 4 +++- src/transformers/dependency_versions_table.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1eaee04a67b9..abafc03bc6f2 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ "librosa", "nltk", "natten>=0.14.6", + "numba<0.57.0", # Can be removed once unpinned. "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", @@ -286,7 +287,8 @@ def run(self): extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") -extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm") +# numba can be removed here once unpinned +extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm", "numba") # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["speech"] = deps_list("torchaudio") + extras["audio"] extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 68ad7a1587f4..bae19acd3e1d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -36,6 +36,7 @@ "librosa": "librosa", "nltk": "nltk", "natten": "natten>=0.14.6", + "numba": "numba<0.57.0", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", From b61d5b47f640308068139561f673765b2af39874 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 3 May 2023 11:21:59 +0200 Subject: [PATCH 005/935] =?UTF-8?q?[`Doctest`]=C2=A0Fix=20pix2struct=20doc?= =?UTF-8?q?test=20(#23121)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix pix2struct doctest --- src/transformers/models/pix2struct/modeling_pix2struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 97fbbb57ac2a..ffaebc20372e 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1702,7 +1702,7 @@ def forward( >>> outputs = model(**inputs, labels=labels) >>> loss = outputs.loss >>> print(f"{loss.item():.5f}") - 4.58370 + 5.95566 ```""" use_cache = use_cache if use_cache 
is not None else self.config.text_config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict From ce31e3c8bf039ef42294e9434cc68bcf234bd5ef Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:24:50 +0100 Subject: [PATCH 006/935] Generate: slow assisted generation test (#23125) --- tests/generation/test_utils.py | 1 + tests/models/roberta/test_modeling_roberta.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 0dfb4368d7d0..3b96f2b2bdff 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1457,6 +1457,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): for output in (output_contrastive, output_generate): self._check_outputs(output, input_ids, model.config, use_cache=True) + @slow # TODO(Joao): remove this. Some models (e.g. data2vec, xcom, roberta) have an error rate between 1 and 10%. def test_assisted_decoding_matches_greedy_search(self): # This test ensures that the assisted generation does not introduce output changes over greedy search. # It breaks the pattern in the tests above, for multiple reasons: diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 11b400115678..49caa67d4f6c 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -397,10 +397,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi ) fx_compatible = True - @unittest.skip(reason="Fix me @gante") - def test_assisted_greedy_search_matches_greedy_search(self): - super().test_assisted_greedy_search_matches_greedy_search() - def setUp(self): self.model_tester = RobertaModelTester(self) self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) From a0bd464776149e8eb5b73f19d7b3c3bebe6886f9 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:29:55 +0100 Subject: [PATCH 007/935] Generate: correct beam search length on score calculation for multi batch generation (#23127) --- src/transformers/generation/beam_search.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index 5c423a6a1807..d8bd8010dce3 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -212,7 +212,7 @@ def process( eos_token_id: Optional[Union[int, List[int]]] = None, beam_indices: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor]: - cur_len = input_ids.shape[-1] + cur_len = input_ids.shape[-1] + 1 # add up to the length which the next_scores is calculated on batch_size = len(self._beam_hyps) if not (batch_size == (input_ids.shape[0] // self.group_size)): if self.num_beam_groups > 1: @@ -287,7 +287,6 @@ def process( ) # Check if we are done so that we can save a pad step if all(done) - cur_len += 1 # add up to the length which the next_scores is calculated on self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( next_scores[batch_idx].max().item(), cur_len ) @@ -532,7 +531,7 @@ def process( indicating to which beam the next tokens shall be added. 
""" - cur_len = input_ids.shape[-1] + cur_len = input_ids.shape[-1] + 1 # add up to the length which the next_scores is calculated on batch_size = len(self._beam_hyps) if not (batch_size == (input_ids.shape[0] // self.group_size)): if self.num_beam_groups > 1: @@ -617,7 +616,6 @@ def process( ) # Check if we are done so that we can save a pad step if all(done) - cur_len += 1 # add up to the length which the next_scores is calculated on self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( next_scores[batch_idx].max().item(), cur_len ) From 2a16d8b275e4a76ee07c8c92ddb2e570ba3cbd9d Mon Sep 17 00:00:00 2001 From: Manuel <43467008+ManuelFay@users.noreply.github.com> Date: Wed, 3 May 2023 15:36:30 +0200 Subject: [PATCH 008/935] improve unclear documentation (#23123) --- src/transformers/trainer_callback.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 8749e5f3f574..808c1b4702f9 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -534,7 +534,8 @@ class EarlyStoppingCallback(TrainerCallback): specified metric must improve to satisfy early stopping conditions. ` This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric - in [`TrainerState`]. + in [`TrainerState`]. Note that if the [`TrainingArguments`] argument *save_steps* differs from *eval_steps*, the + early stopping will not occur until the next save step. """ def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0): From 3a08dc63fd788f768e1f16a97db14d0015368940 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:43:17 +0100 Subject: [PATCH 009/935] Generate: better warnings with pipelines (#23128) --- src/transformers/pipelines/base.py | 4 +++- src/transformers/pipelines/text2text_generation.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index b728e94f34ee..de6c9a8ec4d9 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -803,10 +803,12 @@ def __init__( self.torch_dtype = torch_dtype self.binary_output = binary_output - # Update config with task specific parameters + # Update config and generation_config with task specific parameters task_specific_params = self.model.config.task_specific_params if task_specific_params is not None and task in task_specific_params: self.model.config.update(task_specific_params.get(task)) + if self.model.can_generate(): + self.model.generation_config.update(**task_specific_params.get(task)) self.call_count = 0 self._batch_size = kwargs.pop("batch_size", None) diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py index dbd45e6ff1be..48df10336a65 100644 --- a/src/transformers/pipelines/text2text_generation.py +++ b/src/transformers/pipelines/text2text_generation.py @@ -273,7 +273,8 @@ def check_inputs(self, input_length: int, min_length: int, max_length: int) -> b if input_length < max_length: logger.warning( - f"Your max_length is set to {max_length}, but your input_length is only {input_length}. You might " + f"Your max_length is set to {max_length}, but your input_length is only {input_length}. 
Since this is " + "a summarization task, where outputs shorter than the input are typically wanted, you might " f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})" ) From b53004fdcefa6b74ce883c3853c983d0beb14bdd Mon Sep 17 00:00:00 2001 From: Samin Yasar Date: Wed, 3 May 2023 19:53:00 +0600 Subject: [PATCH 010/935] Add resources for LayoutLmV2 and reformat documentation resources (#23115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add resources for layoutlmv2 * remove 🌎 from some resources --- docs/source/en/model_doc/layoutlmv2.mdx | 29 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/layoutlmv2.mdx b/docs/source/en/model_doc/layoutlmv2.mdx index 031cce83deb2..6d2a9dc3bbee 100644 --- a/docs/source/en/model_doc/layoutlmv2.mdx +++ b/docs/source/en/model_doc/layoutlmv2.mdx @@ -121,6 +121,28 @@ section below. In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on [LayoutXLM's documentation page](layoutxlm). +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A notebook on how to [finetune LayoutLMv2 for text-classification on RVL-CDIP dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/RVL-CDIP/Fine_tuning_LayoutLMv2ForSequenceClassification_on_RVL_CDIP.ipynb). +- See also: [Text classification task guide](../tasks/sequence_classification) + + + +- A notebook on how to [finetune LayoutLMv2 for question-answering on DocVQA dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/DocVQA/Fine_tuning_LayoutLMv2ForQuestionAnswering_on_DocVQA.ipynb). +- See also: [Question answering task guide](../tasks/question_answering) +- See also: [Document question answering task guide](../tasks/document_question_answering) + + + + +- A notebook on how to [finetune LayoutLMv2 for token-classification on CORD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/CORD/Fine_tuning_LayoutLMv2ForTokenClassification_on_CORD.ipynb). +- A notebook on how to [finetune LayoutLMv2 for token-classification on FUNSD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/Fine_tuning_LayoutLMv2ForTokenClassification_on_FUNSD_using_HuggingFace_Trainer.ipynb). 
+- See also: [Token classification task guide](../tasks/token_classification) + ## Usage: LayoutLMv2Processor The easiest way to prepare data for the model is to use [`LayoutLMv2Processor`], which internally @@ -266,13 +288,6 @@ print(encoding.keys()) # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) ``` -## Documentation resources - -- [Document question answering task guide](../tasks/document_question_answering) -- [Text classification task guide](../tasks/sequence_classification) -- [Token classification task guide](../tasks/token_classification) -- [Question answering task guide](../tasks/question_answering) - ## LayoutLMv2Config [[autodoc]] LayoutLMv2Config From 56b8d49ddfceb4278e7d893f6ef7d13c86b67078 Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 17:21:27 +0300 Subject: [PATCH 011/935] Fix ConvNext V2 paramater naming issue (#23122) Fixes the parameter naming issue in ConvNextV2GRN module --- .../models/convnextv2/convert_convnextv2_to_pytorch.py | 4 ++++ src/transformers/models/convnextv2/modeling_convnextv2.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py index b2a0a52d27e8..8094ecf0d615 100644 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py @@ -99,6 +99,10 @@ def rename_key(name): if "stages" in name and "downsampling_layer" not in name: # stages.0.0. for instance should be renamed to stages.0.layers.0. name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] + if "gamma" in name: + name = name.replace("gamma", "weight") + if "beta" in name: + name = name.replace("beta", "bias") if "stages" in name: name = name.replace("stages", "encoder.stages") if "norm" in name: diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index 5fbf7831ef12..c309cdc3b6e2 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -100,14 +100,14 @@ class ConvNextV2GRN(nn.Module): def __init__(self, dim: int): super().__init__() - self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim)) - self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim)) def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: # Compute and normalize global spatial feature maps global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True) norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6) - hidden_states = self.gamma * (hidden_states * norm_features) + self.beta + hidden_states + hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states return hidden_states From ee4bc07474015d574742ad25b48dd7f6ccba297f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 3 May 2023 22:49:54 +0800 Subject: [PATCH 012/935] Support union types `X | Y` syntax for `HfArgumentParser` for Python 3.10+ (#23126) * Support union types `X | Y` syntax for `HfArgumentParser` for Python 3.10+ * Add tests for PEP 604 for `HfArgumentParser` * Reorganize tests --- src/transformers/hf_argparser.py | 18 +++++++- tests/utils/test_hf_argparser.py | 73 
+++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index b1fa67f45823..f808acebe902 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -15,6 +15,7 @@ import dataclasses import json import sys +import types from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError from copy import copy from enum import Enum @@ -159,7 +160,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): aliases = [aliases] origin_type = getattr(field.type, "__origin__", field.type) - if origin_type is Union: + if origin_type is Union or (hasattr(types, "UnionType") and isinstance(origin_type, types.UnionType)): if str not in field.type.__args__ and ( len(field.type.__args__) != 2 or type(None) not in field.type.__args__ ): @@ -245,10 +246,23 @@ def _add_dataclass_arguments(self, dtype: DataClassType): type_hints: Dict[str, type] = get_type_hints(dtype) except NameError: raise RuntimeError( - f"Type resolution failed for f{dtype}. Try declaring the class in global scope or " + f"Type resolution failed for {dtype}. Try declaring the class in global scope or " "removing line of `from __future__ import annotations` which opts in Postponed " "Evaluation of Annotations (PEP 563)" ) + except TypeError as ex: + # Remove this block when we drop Python 3.9 support + if sys.version_info[:2] < (3, 10) and "unsupported operand type(s) for |" in str(ex): + python_version = ".".join(map(str, sys.version_info[:3])) + raise RuntimeError( + f"Type resolution failed for {dtype} on Python {python_version}. Try removing " + "line of `from __future__ import annotations` which opts in union types as " + "`X | Y` (PEP 604) via Postponed Evaluation of Annotations (PEP 563). To " + "support Python versions that lower than 3.10, you need to use " + "`typing.Union[X, Y]` instead of `X | Y` and `typing.Optional[X]` instead of " + "`X | None`." 
+ ) from ex + raise for field in dataclasses.fields(dtype): if not field.init: diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index 0ad3c9c2ac46..a9db072f04d1 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -15,6 +15,7 @@ import argparse import json import os +import sys import tempfile import unittest from argparse import Namespace @@ -36,6 +37,10 @@ # For Python 3.7 from typing_extensions import Literal +# Since Python 3.10, we can use the builtin `|` operator for Union types +# See PEP 604: https://peps.python.org/pep-0604 +is_python_no_less_than_3_10 = sys.version_info >= (3, 10) + def list_field(default=None, metadata=None): return field(default_factory=lambda: default, metadata=metadata) @@ -125,6 +130,23 @@ class StringLiteralAnnotationExample: foo_str: "List[str]" = list_field(default=["Hallo", "Bonjour", "Hello"]) +if is_python_no_less_than_3_10: + + @dataclass + class WithDefaultBoolExamplePep604: + foo: bool = False + baz: bool = True + opt: bool | None = None + + @dataclass + class OptionalExamplePep604: + foo: int | None = None + bar: float | None = field(default=None, metadata={"help": "help message"}) + baz: str | None = None + ces: list[str] | None = list_field(default=[]) + des: list[int] | None = list_field(default=[]) + + class HfArgumentParserTest(unittest.TestCase): def argparsersEqual(self, a: argparse.ArgumentParser, b: argparse.ArgumentParser): """ @@ -167,8 +189,6 @@ def test_with_default(self): self.argparsersEqual(parser, expected) def test_with_default_bool(self): - parser = HfArgumentParser(WithDefaultBoolExample) - expected = argparse.ArgumentParser() expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?") expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?") @@ -176,22 +196,29 @@ def test_with_default_bool(self): # and its default must be set to False expected.add_argument("--no_baz", action="store_false", default=False, dest="baz") expected.add_argument("--opt", type=string_to_bool, default=None) - self.argparsersEqual(parser, expected) - args = parser.parse_args([]) - self.assertEqual(args, Namespace(foo=False, baz=True, opt=None)) + dataclass_types = [WithDefaultBoolExample] + if is_python_no_less_than_3_10: + dataclass_types.append(WithDefaultBoolExamplePep604) - args = parser.parse_args(["--foo", "--no_baz"]) - self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) + for dataclass_type in dataclass_types: + parser = HfArgumentParser(dataclass_type) + self.argparsersEqual(parser, expected) - args = parser.parse_args(["--foo", "--baz"]) - self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=False, baz=True, opt=None)) - args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"]) - self.assertEqual(args, Namespace(foo=True, baz=True, opt=True)) + args = parser.parse_args(["--foo", "--no_baz"]) + self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) - args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"]) - self.assertEqual(args, Namespace(foo=False, baz=False, opt=False)) + args = parser.parse_args(["--foo", "--baz"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) + + args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=True)) + + args = parser.parse_args(["--foo", 
"False", "--baz", "False", "--opt", "False"]) + self.assertEqual(args, Namespace(foo=False, baz=False, opt=False)) def test_with_enum(self): parser = HfArgumentParser(MixedTypeEnumExample) @@ -266,21 +293,27 @@ def test_with_list(self): self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7])) def test_with_optional(self): - parser = HfArgumentParser(OptionalExample) - expected = argparse.ArgumentParser() expected.add_argument("--foo", default=None, type=int) expected.add_argument("--bar", default=None, type=float, help="help message") expected.add_argument("--baz", default=None, type=str) expected.add_argument("--ces", nargs="+", default=[], type=str) expected.add_argument("--des", nargs="+", default=[], type=int) - self.argparsersEqual(parser, expected) - args = parser.parse_args([]) - self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[])) + dataclass_types = [OptionalExample] + if is_python_no_less_than_3_10: + dataclass_types.append(OptionalExamplePep604) + + for dataclass_type in dataclass_types: + parser = HfArgumentParser(dataclass_type) + + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[])) - args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split()) - self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3])) + args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split()) + self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3])) def test_with_required(self): parser = HfArgumentParser(RequiredExample) From c4e32e206fe8eff7ccd9ca64ed061b54e7597193 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Wed, 3 May 2023 20:20:34 +0530 Subject: [PATCH 013/935] Add support for beam search's num_return_sequencs flag in flax (#23082) * add code for numReturnSeq * add flax support for num return sequences * Make Fix up for changes * add test for num return sequences * lint --- src/transformers/generation/flax_utils.py | 11 ++++++++--- tests/generation/test_flax_utils.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index c28e0afbb998..58a2bf13ba61 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -463,6 +463,7 @@ def generate( logits_processor=logits_processor, trace=trace, params=params, + num_return_sequences=generation_config.num_return_sequences, model_kwargs=model_kwargs, ) else: @@ -749,6 +750,7 @@ def _beam_search( logits_processor: Optional[FlaxLogitsProcessorList] = None, trace: bool = True, params: Optional[Dict[str, jnp.ndarray]] = None, + num_return_sequences: Optional[int] = None, model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, ): """ @@ -793,6 +795,9 @@ def gather_fn(tensor): eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences + ) batch_size, num_beams, cur_len = input_ids.shape @@ -996,8 +1001,8 @@ def 
beam_search_body_fn(state, input_ids_length=1): sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences) scores = jnp.where(none_finished[:, None], state.scores, state.running_scores) - # take best beam for each batch - sequences = sequences[:, 0] - scores = scores[:, 0] + # Take best beams for each batch (the score is sorted in descending order) + sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :]) + scores = flatten_beam_dim(scores[:, :num_return_sequences]) return FlaxBeamSearchOutput(sequences=sequences, scores=scores) diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index c6182a2386e5..647482b88cd8 100644 --- a/tests/generation/test_flax_utils.py +++ b/tests/generation/test_flax_utils.py @@ -158,6 +158,19 @@ def test_beam_search_generate(self): self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) + def test_beam_search_generate_num_return_sequences(self): + config, input_ids, _, max_length = self._get_input_ids_and_config() + config.do_sample = False + config.max_length = max_length + config.num_beams = 2 + config.num_return_sequences = 2 + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids).sequences + self.assertEqual(generation_outputs.shape[0], input_ids.shape[0] * config.num_return_sequences) + def test_sample_generate_logits_warper(self): config, input_ids, _, max_length = self._get_input_ids_and_config() config.do_sample = True From fbe0178f08c219313986092f4c9b994a7bd4b4a1 Mon Sep 17 00:00:00 2001 From: Nayeon Han Date: Thu, 4 May 2023 00:04:58 +0900 Subject: [PATCH 014/935] docs: ko: update `_toctree.yml` (#23112) * docs: ko: update `_toctree.yml` * fix: ko: update toc * fix: resolve suggestions * fix: resolve build issue --------- Co-authored-by: Wonhyeong Seo --- docs/source/ko/_toctree.yml | 146 +++++++++++++++++++---------------- docs/source/ko/notebooks.mdx | 1 - 2 files changed, 79 insertions(+), 68 deletions(-) delete mode 100644 docs/source/ko/notebooks.mdx diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index a9e1ff921d6b..bd24ee4e5ce1 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -8,45 +8,22 @@ title: 시작하기 - sections: - local: pipeline_tutorial - title: 추론을 위한 Pipeline + title: Pipeline으로 추론하기 - local: autoclass_tutorial - title: 자동 클래스로 사전 학습된 인스턴스 로드하기 + title: AutoClass로 사전 학습된 인스턴스 로드하기 - local: preprocessing - title: 전처리 + title: 데이터 전처리하기 - local: training title: 사전 학습된 모델 미세 조정하기 + - local: run_scripts + title: 스크립트로 학습하기 - local: accelerate - title: 🤗 Accelerate를 활용한 분산 학습 + title: 🤗 Accelerate로 분산 학습 구성하기 - local: model_sharing - title: 모델 공유하기 - title: (번역중) 튜토리얼 + title: 만든 모델 공유하기 + title: 튜토리얼 - sections: - sections: - - local: create_a_model - title: 맞춤형 아키텍처 만들기 - - local: custom_models - title: 사용자 정의 모델 공유하기 - - local: run_scripts - title: 스크립트로 학습하기 - - local: sagemaker - title: Amazon SageMaker에서 학습 실행하기 - - local: in_translation - title: (번역중) Converting from TensorFlow checkpoints - - local: serialization - title: ONNX로 내보내기 - - local: torchscript - title: TorchScript로 내보내기 - - local: in_translation - title: (번역중) Troubleshoot - title: (번역중) 일반적인 사용방법 - - sections: - - local: in_translation - title: (번역중) Use tokenizers from 🤗 Tokenizers - - local: multilingual - title: 다국어 모델 추론하기 - - local: in_translation - title: (번역중) Text generation strategies - - sections: - local: 
tasks/sequence_classification title: 텍스트 분류 - local: tasks/token_classification @@ -62,39 +39,68 @@ - local: tasks/summarization title: 요약 - local: in_translation - title: (번역중) Multiple choice - title: (번역중) 태스크별 가이드 - isExpanded: false - title: (번역중) 자연어처리 + title: (번역중) Multiple Choice + title: 자연어처리 + isExpanded: false - sections: - - local: in_translation - title: (번역중) Audio classification - - local: in_translation - title: (번역중) Automatic speech recognition + - local: in_translation + title: (번역중) Audio classification + - local: in_translation + title: (번역중) Automatic speech recognition title: (번역중) 오디오 + isExpanded: false - sections: - - local: tasks/image_classification - title: 이미지 분류 + - local: tasks/image_classification + title: 이미지 분류 + - local: in_translation + title: (번역중) Semantic segmentation + - local: in_translation + title: (번역중) Video classification + - local: in_translation + title: (번역중) Object detection + - local: in_translation + title: (번역중) Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: 제로샷(zero-shot) 이미지 분류 + - local: in_translation + title: (번역중) Depth estimation + title: (번역중) 컴퓨터 비전 + isExpanded: false + - sections: + - local: tasks/image_captioning + title: 이미지 캡셔닝 + - local: in_translation + title: (번역중) Document Question Answering + title: (번역중) 멀티모달 + isExpanded: false + title: 태스크 가이드 +- sections: - local: in_translation - title: (번역중) Semantic segmentation + title: (번역중) Use fast tokenizers from 🤗 Tokenizers + - local: multilingual + title: 다국어 모델 추론하기 - local: in_translation - title: (번역중) Video classification + title: (번역중) Customize text generation strategy + - local: create_a_model + title: 모델별 API 사용하기 + - local: custom_models + title: 사용자 정의 모델 공유하기 + - local: sagemaker + title: Amazon SageMaker에서 학습 실행하기 + - local: serialization + title: ONNX로 내보내기 + - local: torchscript + title: TorchScript로 내보내기 - local: in_translation - title: (번역중) Object detection + title: (번역중) Benchmarks - local: in_translation - title: (번역중) Zero-shot object detection - - local: tasks/zero_shot_image_classification - title: 제로샷(zero-shot) 이미지 분류 + title: (번역중) Notebooks with examples - local: in_translation - title: (번역중) Depth estimation - title: (번역중) 컴퓨터 비전 - - sections: - - local: tasks/image_captioning - title: 이미지 캡셔닝 + title: (번역중) Community resources - local: in_translation - title: (번역중) Document Question Answering - title: (번역중) 멀티모달 - - sections: + title: (번역중) Troubleshoot + title: (번역중) 개발자 가이드 +- sections: - local: in_translation title: (번역중) Overview - local: in_translation @@ -129,8 +135,8 @@ title: (번역중) Hyperparameter Search using Trainer API - local: in_translation title: (번역중) XLA Integration for TensorFlow Models - title: (번역중) 성능 및 확장성 - - sections: + title: (번역중) 성능 및 확장성 +- sections: - local: in_translation title: (번역중) How to contribute to transformers? 
- local: in_translation @@ -143,16 +149,8 @@ title: (번역중) Testing - local: in_translation title: (번역중) Checks on a Pull Request - title: (번역중) 기여하기 - - local: notebooks - title: (번역중) 🤗 Transformers Notebooks - - local: in_translation - title: (번역중) Community resources - - local: in_translation - title: (번역중) Benchmarks - - local: in_translation - title: (번역중) Migrating from previous packages - title: (번역중) How-to 가이드 + title: (번역중) 기여하기 + - sections: - local: in_translation title: (번역중) Philosophy @@ -263,6 +261,8 @@ title: (번역중) ConvBERT - local: in_translation title: (번역중) CPM + - local: in_translation + title: (번역중) CPMANT - local: in_translation title: (번역중) CTRL - local: in_translation @@ -309,6 +309,8 @@ title: (번역중) GPT-J - local: in_translation title: (번역중) GPT2 + - local: in_translation + title: (번역중) GPTBigCode - local: in_translation title: (번역중) GPTSAN Japanese - local: in_translation @@ -361,6 +363,8 @@ title: (번역중) NLLB-MoE - local: in_translation title: (번역중) Nyströmformer + - local: in_translation + title: (번역중) Open-Llama - local: in_translation title: (번역중) OPT - local: in_translation @@ -460,6 +464,8 @@ title: (번역중) EfficientFormer - local: in_translation title: (번역중) EfficientNet + - local: in_translation + title: (번역중) FocalNet - local: in_translation title: (번역중) GLPN - local: in_translation @@ -572,6 +578,8 @@ title: (번역중) CLIPSeg - local: in_translation title: (번역중) Data2Vec + - local: in_translation + title: (번역중) DePlot - local: in_translation title: (번역중) Donut - local: in_translation @@ -592,6 +600,8 @@ title: (번역중) LiLT - local: in_translation title: (번역중) LXMERT + - local: in_translation + title: (번역중) MatCha - local: in_translation title: (번역중) MGP-STR - local: in_translation @@ -602,6 +612,8 @@ title: (번역중) Perceiver - local: in_translation title: (번역중) Pix2Struct + - local: in_translation + title: (번역중) Segment Anything - local: in_translation title: (번역중) Speech Encoder Decoder Models - local: in_translation diff --git a/docs/source/ko/notebooks.mdx b/docs/source/ko/notebooks.mdx deleted file mode 100644 index ead906183348..000000000000 --- a/docs/source/ko/notebooks.mdx +++ /dev/null @@ -1 +0,0 @@ -# 열심히 번역 중입니다. 조금 이따 만나요! \ No newline at end of file From ca7eb27ed590dd583bc028ba1a0f78eb00dbb243 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 3 May 2023 18:23:09 +0200 Subject: [PATCH 015/935] =?UTF-8?q?[doc]=20Try=20a=20few=20=E2=89=A0=20way?= =?UTF-8?q?s=20of=20linking=20to=20Papers,=20users,=20and=20org=20profiles?= =?UTF-8?q?=20(#22611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [doc] Try a few ≠ ways of linking to Papers, users, and org profiles * Empty commit * Empty commit now that the backend is fixed --------- Co-authored-by: Lysandre --- docs/source/en/model_doc/distilbert.mdx | 5 ++++- docs/source/en/model_doc/gpt2.mdx | 2 +- docs/source/en/model_doc/roberta.mdx | 5 ++++- docs/source/en/model_doc/t5.mdx | 7 +++++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/distilbert.mdx b/docs/source/en/model_doc/distilbert.mdx index cc1e03715118..837f0319ec9e 100644 --- a/docs/source/en/model_doc/distilbert.mdx +++ b/docs/source/en/model_doc/distilbert.mdx @@ -19,13 +19,16 @@ specific language governing permissions and limitations under the License. 
Spaces + +Paper page + ## Overview The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a -distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a +distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/papers/1910.01108). DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than *bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark. diff --git a/docs/source/en/model_doc/gpt2.mdx b/docs/source/en/model_doc/gpt2.mdx index ee80eb2f8b9c..6288e46eb555 100644 --- a/docs/source/en/model_doc/gpt2.mdx +++ b/docs/source/en/model_doc/gpt2.mdx @@ -24,7 +24,7 @@ specific language governing permissions and limitations under the License. ## Overview OpenAI GPT-2 model was proposed in [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) by Alec -Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional) +Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever from [OpenAI](https://huggingface.co/openai). It's a causal (unidirectional) transformer pretrained using language modeling on a very large corpus of ~40 GB of text data. The abstract from the paper is the following: diff --git a/docs/source/en/model_doc/roberta.mdx b/docs/source/en/model_doc/roberta.mdx index 7c0818a0144d..49007409ad39 100644 --- a/docs/source/en/model_doc/roberta.mdx +++ b/docs/source/en/model_doc/roberta.mdx @@ -19,11 +19,14 @@ specific language governing permissions and limitations under the License. Spaces + +Paper page + ## Overview -The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer +The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx index f7665c11ae4a..58074f1403b1 100644 --- a/docs/source/en/model_doc/t5.mdx +++ b/docs/source/en/model_doc/t5.mdx @@ -19,12 +19,15 @@ specific language governing permissions and limitations under the License. Spaces + +Paper page + ## Overview -The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, -Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. 
+The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang, +Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu). The abstract from the paper is the following: From 441658dd6c7a788400d566aaeefe74990c9d040f Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 19:32:42 +0300 Subject: [PATCH 016/935] Add focalnet backbone (#23104) Adds FocalNet backbone to return features from all stages --- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/focalnet/__init__.py | 2 + .../models/focalnet/configuration_focalnet.py | 47 +++++++++++ .../focalnet/convert_focalnet_to_hf_format.py | 6 +- .../models/focalnet/modeling_focalnet.py | 82 ++++++++++++++++++- src/transformers/utils/dummy_pt_objects.py | 7 ++ .../models/focalnet/test_modeling_focalnet.py | 71 ++++++++++++++-- tests/test_backbone_common.py | 2 + utils/check_repo.py | 1 + 10 files changed, 210 insertions(+), 11 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 445fbb53e269..46be8c9d2c6f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1623,6 +1623,7 @@ _import_structure["models.focalnet"].extend( [ "FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST", + "FocalNetBackbone", "FocalNetForImageClassification", "FocalNetForMaskedImageModeling", "FocalNetModel", @@ -5178,6 +5179,7 @@ ) from .models.focalnet import ( FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST, + FocalNetBackbone, FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ccdda1af33f0..14847c7ad2ab 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -980,6 +980,7 @@ ("convnext", "ConvNextBackbone"), ("convnextv2", "ConvNextV2Backbone"), ("dinat", "DinatBackbone"), + ("focalnet", "FocalNetBackbone"), ("maskformer-swin", "MaskFormerSwinBackbone"), ("nat", "NatBackbone"), ("resnet", "ResNetBackbone"), diff --git a/src/transformers/models/focalnet/__init__.py b/src/transformers/models/focalnet/__init__.py index e082ae26d2d1..b043a006f937 100644 --- a/src/transformers/models/focalnet/__init__.py +++ b/src/transformers/models/focalnet/__init__.py @@ -30,6 +30,7 @@ "FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST", "FocalNetForImageClassification", "FocalNetForMaskedImageModeling", + "FocalNetBackbone", "FocalNetModel", "FocalNetPreTrainedModel", ] @@ -45,6 +46,7 @@ else: from .modeling_focalnet import ( FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST, + FocalNetBackbone, FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel, diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index 5bfecb5737a4..c6814e1dda14 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -47,6 +47,8 @@ class FocalNetConfig(PretrainedConfig): use_conv_embed (`bool`, *optional*, defaults to `False`): Whether to use convolutional embedding. 
The authors noted that using convolutional embedding usually improve the performance, but it's not used by default. + hidden_sizes (`List[int]`, *optional*, defaults to `[192, 384, 768, 768]`): + Dimensionality (hidden size) at each stage. depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): Depth (number of layers) of each stage in the encoder. focal_levels (`list(int)`, *optional*, defaults to `[2, 2, 2, 2]`): @@ -78,6 +80,14 @@ class FocalNetConfig(PretrainedConfig): The epsilon used by the layer normalization layers. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Example: @@ -102,6 +112,7 @@ def __init__( num_channels=3, embed_dim=96, use_conv_embed=False, + hidden_sizes=[192, 384, 768, 768], depths=[2, 2, 6, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], @@ -117,6 +128,8 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, encoder_stride=32, + out_features=None, + out_indices=None, **kwargs, ): super().__init__(**kwargs) @@ -126,6 +139,7 @@ def __init__( self.num_channels = num_channels self.embed_dim = embed_dim self.use_conv_embed = use_conv_embed + self.hidden_sizes = hidden_sizes self.depths = depths self.focal_levels = focal_levels self.focal_windows = focal_windows @@ -141,3 +155,36 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.encoder_stride = encoder_stride + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + + if out_features is not None and out_indices is not None: + if len(out_features) != len(out_indices): + raise ValueError("out_features and out_indices should have the same length if both are set") + elif out_features != [self.stage_names[idx] for idx in out_indices]: + raise ValueError("out_features and out_indices should correspond to the same stages if both are set") + + if out_features is None and out_indices is not None: + out_features = [self.stage_names[idx] for idx in out_indices] + elif out_features is not None and out_indices is None: + out_indices = [self.stage_names.index(feature) for feature in out_features] + elif out_features is None and out_indices is None: + out_features = [self.stage_names[-1]] + out_indices = [len(self.stage_names) - 1] + + if out_features is not None: + if not isinstance(out_features, list): + raise ValueError("out_features should be a list") + for feature in out_features: + if feature not in self.stage_names: + raise ValueError( + f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" + ) + if out_indices is not None: + if not isinstance(out_indices, (list, tuple)): + raise ValueError("out_indices should be a list or tuple") + for idx in out_indices: + if idx >= len(self.stage_names): + raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") + + self.out_features = out_features + self.out_indices = out_indices diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py index a23383e3abc9..4aed15928062 100644 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py @@ -56,7 +56,6 @@ def get_focalnet_config(model_name): embed_dim = 128 elif "large" in model_name: embed_dim = 192 - focal_windows = [5, 5, 5, 5] elif "xlarge" in model_name: embed_dim = 256 elif "huge" in model_name: @@ -130,7 +129,10 @@ def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hu "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", + "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", + "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", + "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", + "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", } # fmt: on diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index ff1b75e14b5e..cfd64689763b 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -26,7 +26,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_utils import PreTrainedModel +from ...modeling_outputs import BackboneOutput +from ...modeling_utils import BackboneMixin, PreTrainedModel from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -209,7 +210,6 @@ def forward( embeddings = embeddings * (1.0 - mask) + mask_tokens * mask embeddings = self.dropout(embeddings) - return embeddings, output_dimensions @@ -971,3 +971,81 @@ def forward( hidden_states=outputs.hidden_states, reshaped_hidden_states=outputs.reshaped_hidden_states, ) + + +@add_start_docstrings( + """ + FocalNet backbone, to be used with frameworks like X-Decoder. 
+ """, + FOCALNET_START_DOCSTRING, +) +class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + + self.stage_names = config.stage_names + self.focalnet = FocalNetModel(config) + + self.num_features = [config.embed_dim] + config.hidden_sizes + self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] + if config.out_indices is not None: + self.out_indices = config.out_indices + else: + self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf") + >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.reshaped_hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7fe538eccc9c..1e9845ba9bcc 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3002,6 +3002,13 @@ def __init__(self, *args, **kwargs): FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST = None +class FocalNetBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class FocalNetForImageClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py index 4ddf8e63b944..75127e5fd382 100644 --- a/tests/models/focalnet/test_modeling_focalnet.py +++ b/tests/models/focalnet/test_modeling_focalnet.py @@ -22,6 +22,7 @@ from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available +from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, 
_config_zero_init, floats_tensor, ids_tensor @@ -30,7 +31,12 @@ import torch from torch import nn - from transformers import FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel + from transformers import ( + FocalNetBackbone, + FocalNetForImageClassification, + FocalNetForMaskedImageModeling, + FocalNetModel, + ) from transformers.models.focalnet.modeling_focalnet import FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): @@ -48,6 +54,7 @@ def __init__( patch_size=2, num_channels=3, embed_dim=16, + hidden_sizes=[32, 64, 128], depths=[1, 2, 1], num_heads=[2, 2, 4], window_size=2, @@ -67,6 +74,7 @@ def __init__( type_sequence_label_size=10, encoder_stride=8, out_features=["stage1", "stage2"], + out_indices=[1, 2], ): self.parent = parent self.batch_size = batch_size @@ -74,6 +82,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.embed_dim = embed_dim + self.hidden_sizes = hidden_sizes self.depths = depths self.num_heads = num_heads self.window_size = window_size @@ -93,6 +102,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.encoder_stride = encoder_stride self.out_features = out_features + self.out_indices = out_indices def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -111,6 +121,7 @@ def get_config(self): patch_size=self.patch_size, num_channels=self.num_channels, embed_dim=self.embed_dim, + hidden_sizes=self.hidden_sizes, depths=self.depths, num_heads=self.num_heads, window_size=self.window_size, @@ -126,6 +137,7 @@ def get_config(self): initializer_range=self.initializer_range, encoder_stride=self.encoder_stride, out_features=self.out_features, + out_indices=self.out_indices, ) def create_and_check_model(self, config, pixel_values, labels): @@ -139,6 +151,35 @@ def create_and_check_model(self, config, pixel_values, labels): self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + def create_and_check_backbone(self, config, pixel_values, labels): + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size, 8, 8]) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[:-1]) + + # verify backbone works with out_features=None + config.out_features = None + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size * 2, 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): model = FocalNetForMaskedImageModeling(config=config) model.to(torch_device) @@ -191,6 +232,7 @@ class FocalNetModelTest(ModelTesterMixin, unittest.TestCase): FocalNetModel, FocalNetForImageClassification, FocalNetForMaskedImageModeling, + FocalNetBackbone, ) if is_torch_available() else () @@ -204,7 +246,7 @@ 
class FocalNetModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = FocalNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=FocalNetConfig, embed_dim=37) + self.config_tester = ConfigTester(self, config_class=FocalNetConfig, embed_dim=37, has_text_modality=False) def test_config(self): self.create_and_test_config_common_properties() @@ -222,6 +264,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + def test_for_masked_image_modeling(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) @@ -234,14 +280,14 @@ def test_for_image_classification(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="FocalNet Transformer does not use feedforward chunking") + @unittest.skip(reason="FocalNet does not use feedforward chunking") def test_feed_forward_chunking(self): pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) x = model.get_output_embeddings() @@ -250,7 +296,7 @@ def test_model_common_attributes(self): def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: model = model_class(config) signature = inspect.signature(model.forward) # signature.parameters is an OrderedDict => so arg_names order is deterministic @@ -309,7 +355,7 @@ def test_hidden_states_output(self): else (self.model_tester.image_size, self.model_tester.image_size) ) - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: inputs_dict["output_hidden_states"] = True self.check_hidden_states_output(inputs_dict, config, model_class, image_size) @@ -337,7 +383,7 @@ def test_hidden_states_output_with_padding(self): padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: inputs_dict["output_hidden_states"] = True self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) @@ -393,3 +439,14 @@ def test_inference_image_classification_head(self): expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) self.assertTrue(outputs.logits.argmax(dim=-1).item(), 281) + + +@require_torch +class FocalNetBackboneTest(BackboneTesterMixin, unittest.TestCase): + all_model_classes = (FocalNetBackbone,) if is_torch_available() else () + config_class = FocalNetConfig + + has_attentions = False + + def setUp(self): + self.model_tester = FocalNetModelTester(self) diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 80e68a2f44ad..6bcf47004bd2 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -135,6 +135,8 @@ def 
test_backbone_common_attributes(self): # Verify num_features has been initialized in the backbone init self.assertIsNotNone(backbone.num_features) self.assertTrue(len(backbone.channels) == len(backbone.out_indices)) + print(backbone.stage_names) + print(backbone.num_features) self.assertTrue(len(backbone.stage_names) == len(backbone.num_features)) self.assertTrue(len(backbone.channels) <= len(backbone.num_features)) self.assertTrue(len(backbone.out_feature_channels) == len(backbone.stage_names)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 5bdec16b9ece..7280381faf97 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -836,6 +836,7 @@ def find_all_documented_objects(): "ConvNextBackbone", "ConvNextV2Backbone", "DinatBackbone", + "FocalNetBackbone", "MaskFormerSwinBackbone", "MaskFormerSwinConfig", "MaskFormerSwinModel", From e3ee45aa54680bc2c7d57acb568d0570bc28044c Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 3 May 2023 18:47:36 +0200 Subject: [PATCH 017/935] Enable to use custom tracer in FX `symbolic_trace` (#23105) * Enable to use custom tracer in FX `symbolic_trace` * Integrate feedback from review * Formatting Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/utils/fx.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 2f5b4810ecf2..e82d44c80281 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -1207,6 +1207,7 @@ def symbolic_trace( model: PreTrainedModel, input_names: Optional[List[str]] = None, disable_check: bool = False, + tracer_cls: Type[HFTracer] = HFTracer, ) -> GraphModule: """ Performs symbolic tracing on the model. @@ -1218,6 +1219,8 @@ def symbolic_trace( The names of the inputs of the traced model. If unset, model.dummy_inputs.keys() are used instead. disable_check (`bool`, *optional*, defaults to `False`): If `True`, no check is done before trying to trace the model, this is mostly usesul for debugging purposes. + tracer_cls (`Type[HFTracer]`, *optional*, defaults to `HFTracer`): + The tracer class to use for instantiating the tracer. If unset, `HFTracer` is used instead. Returns: `torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model. @@ -1240,7 +1243,7 @@ def symbolic_trace( check_if_model_is_supported(model) # Tracing. 
- tracer = HFTracer() + tracer = tracer_cls() traced_graph = tracer.trace(model, concrete_args=concrete_args) traced = torch.fx.GraphModule(model, traced_graph) From b0a78091a5b2f7e872140cf2d3795e4c56c9c95d Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 20:04:48 +0300 Subject: [PATCH 018/935] Remove redundant print statements (#23133) remove redundant print statements --- tests/test_backbone_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 6bcf47004bd2..80e68a2f44ad 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -135,8 +135,6 @@ def test_backbone_common_attributes(self): # Verify num_features has been initialized in the backbone init self.assertIsNotNone(backbone.num_features) self.assertTrue(len(backbone.channels) == len(backbone.out_indices)) - print(backbone.stage_names) - print(backbone.num_features) self.assertTrue(len(backbone.stage_names) == len(backbone.num_features)) self.assertTrue(len(backbone.channels) <= len(backbone.num_features)) self.assertTrue(len(backbone.out_feature_channels) == len(backbone.stage_names)) From b6933d76d27fd14a835b9ea095d56725c69f4796 Mon Sep 17 00:00:00 2001 From: Robert Stone Date: Wed, 3 May 2023 12:50:41 -0700 Subject: [PATCH 019/935] Tidy Pytorch GLUE benchmark example (#23134) Migration to Evaluate for metric is not quite complete --- examples/pytorch/text-classification/run_glue.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 1bb4c7bee7b8..dd81d535df7b 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -486,6 +486,8 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) + elif is_regression: + metric = evaluate.load("mse") else: metric = evaluate.load("accuracy") @@ -494,15 +496,10 @@ def preprocess_function(examples): def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - elif is_regression: - return {"mse": ((preds - p.label_ids) ** 2).mean().item()} - else: - return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. 
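For illustration, a minimal sketch of the consolidated metric path that `run_glue.py` follows after the patch above when the task is a regression one such as STS-B; the prediction and label arrays are toy values, the snippet is not taken from the patch, and it assumes the `evaluate` package (plus the dependencies of its `mse` and `accuracy` metrics, e.g. scikit-learn) is installed:

```python
import evaluate
import numpy as np

# Toy setup mirroring the branch above: a regression task loads the "mse"
# metric, classification tasks without a GLUE task name load "accuracy".
is_regression = True
metric = evaluate.load("mse") if is_regression else evaluate.load("accuracy")

preds = np.array([0.1, 2.4, 4.9])   # illustrative squeezed model predictions
labels = np.array([0.0, 2.5, 5.0])  # illustrative gold scores

result = metric.compute(predictions=preds, references=labels)
if len(result) > 1:
    result["combined_score"] = np.mean(list(result.values())).item()
print(result)  # e.g. {'mse': 0.01} up to float rounding
```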
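Likewise, for the FX patch above (the new `tracer_cls` argument of `symbolic_trace`), a minimal usage sketch; `MyTracer` and the choice of checkpoint are illustrative assumptions rather than part of the patch:

```python
from transformers import AutoModelForSequenceClassification
from transformers.utils.fx import HFTracer, symbolic_trace

# Hypothetical tracer subclass, used purely for illustration: it logs every
# module that the tracer decides to keep as a leaf during tracing.
class MyTracer(HFTracer):
    def is_leaf_module(self, module, module_qualified_name):
        leaf = super().is_leaf_module(module, module_qualified_name)
        if leaf:
            print(f"leaf module: {module_qualified_name}")
        return leaf

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
traced = symbolic_trace(
    model,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    tracer_cls=MyTracer,  # the argument introduced by the FX patch above
)
```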
From 78b7debf56efb907c6af767882162050d4fbb294 Mon Sep 17 00:00:00 2001 From: peter-sk Date: Wed, 3 May 2023 21:59:19 +0200 Subject: [PATCH 020/935] GPTNeoForQuestionAnswering (#23057) * first draft - gives index error in question_answering.py * maturing * no labels * pipeline should know about QA * fixing checks * formatting * fixed docstring * initial commit * formatting * adding the class to many places * towards less unhappy checks * nearly there * Update src/transformers/models/gpt_neo/modeling_gpt_neo.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * avoid error * moving to device of star/end_logits --------- Co-authored-by: Prof. Peter Schneider-Kamp Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/gpt_neo.mdx | 5 + docs/source/en/tasks/question_answering.mdx | 2 +- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/gpt_neo/__init__.py | 2 + .../models/gpt_neo/modeling_gpt_neo.py | 102 ++++++++++++++++++ src/transformers/models/gptj/modeling_gptj.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 7 ++ tests/models/gpt_neo/test_modeling_gpt_neo.py | 25 ++++- 9 files changed, 146 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/gpt_neo.mdx b/docs/source/en/model_doc/gpt_neo.mdx index 4d6e8c58ce4b..a766a85cf151 100644 --- a/docs/source/en/model_doc/gpt_neo.mdx +++ b/docs/source/en/model_doc/gpt_neo.mdx @@ -69,6 +69,11 @@ The `generate()` method can be used to generate text using GPT Neo model. [[autodoc]] GPTNeoForCausalLM - forward +## GPTNeoForQuestionAnswering + +[[autodoc]] GPTNeoForQuestionAnswering + - forward + ## GPTNeoForSequenceClassification [[autodoc]] GPTNeoForSequenceClassification diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx index a079a9265c84..3dd11140c253 100644 --- a/docs/source/en/tasks/question_answering.mdx +++ b/docs/source/en/tasks/question_answering.mdx @@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit -[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), 
[RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 46be8c9d2c6f..1c74d242fc6b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1690,6 +1690,7 @@ [ "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoForCausalLM", + "GPTNeoForQuestionAnswering", "GPTNeoForSequenceClassification", "GPTNeoForTokenClassification", "GPTNeoModel", @@ -5234,6 +5235,7 @@ from .models.gpt_neo import ( GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 14847c7ad2ab..a53128a61216 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -736,6 +736,7 @@ ("fnet", "FNetForQuestionAnswering"), ("funnel", "FunnelForQuestionAnswering"), ("gpt2", "GPT2ForQuestionAnswering"), + ("gpt_neo", "GPTNeoForQuestionAnswering"), ("gptj", "GPTJForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), diff --git a/src/transformers/models/gpt_neo/__init__.py 
b/src/transformers/models/gpt_neo/__init__.py index a7ddaf848883..02ca0a11949b 100644 --- a/src/transformers/models/gpt_neo/__init__.py +++ b/src/transformers/models/gpt_neo/__init__.py @@ -29,6 +29,7 @@ _import_structure["modeling_gpt_neo"] = [ "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoForCausalLM", + "GPTNeoForQuestionAnswering", "GPTNeoForSequenceClassification", "GPTNeoForTokenClassification", "GPTNeoModel", @@ -61,6 +62,7 @@ from .modeling_gpt_neo import ( GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 808c3f39ebb1..02f7d5534bde 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -29,6 +29,7 @@ BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1012,3 +1013,104 @@ def forward( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The GPT-Neo Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + GPT_NEO_START_DOCSTRING, +) +class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPTNeoModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + real_checkpoint=_CHECKPOINT_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 43b93234569a..18985cb3bce4 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -1127,9 +1127,9 @@ def forward( if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) + start_positions = start_positions.squeeze(-1).to(start_logits.device) if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + end_positions = end_positions.squeeze(-1).to(end_logits.device) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions = start_positions.clamp(0, ignored_index) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1e9845ba9bcc..73827a3937a1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3287,6 +3287,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GPTNeoForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GPTNeoForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index 0b5f69639471..c0334bd05aa3 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -34,6 +34,7 @@ 
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPT2Tokenizer, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, @@ -325,6 +326,17 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mas self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_gpt_neo_for_question_answering( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def create_and_check_gpt_neo_for_sequence_classification( self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args ): @@ -385,7 +397,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (GPTNeoModel, GPTNeoForCausalLM, GPTNeoForSequenceClassification, GPTNeoForTokenClassification) + ( + GPTNeoModel, + GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, + GPTNeoForSequenceClassification, + GPTNeoForTokenClassification, + ) if is_torch_available() else () ) @@ -393,6 +411,7 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix pipeline_model_mapping = ( { "feature-extraction": GPTNeoModel, + "question-answering": GPTNeoForQuestionAnswering, "text-classification": GPTNeoForSequenceClassification, "token-classification": GPTNeoForTokenClassification, "text-generation": GPTNeoForCausalLM, @@ -438,6 +457,10 @@ def test_gpt_neo_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + def test_gpt_neo_question_answering_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_question_answering(*config_and_inputs) + def test_gpt_neo_sequence_classification_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs) From 90e8263d912daf8bdb6c3849d35b6588cf5cc39c Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 May 2023 10:15:06 +0100 Subject: [PATCH 021/935] Add methods to update and verify out_features out_indices (#23031) * Add methods to update and verify out_features out_indices * Safe update for config attributes * Fix function names * Save config correctly * PR comments - use property setters * PR comment - directly set attributes * Update test * Add updates to recently merged focalnet backbone --- src/transformers/modeling_utils.py | 26 --- .../models/bit/configuration_bit.py | 38 +--- src/transformers/models/bit/modeling_bit.py | 11 +- .../models/convnext/configuration_convnext.py | 38 +--- .../models/convnext/modeling_convnext.py | 13 +- .../convnextv2/configuration_convnextv2.py | 38 +--- .../models/convnextv2/modeling_convnextv2.py | 13 +- .../models/dinat/configuration_dinat.py | 38 +--- 
.../models/dinat/modeling_dinat.py | 13 +- .../models/focalnet/configuration_focalnet.py | 38 +--- .../models/focalnet/modeling_focalnet.py | 11 +- .../configuration_maskformer_swin.py | 38 +--- .../maskformer/modeling_maskformer_swin.py | 12 +- .../models/nat/configuration_nat.py | 38 +--- src/transformers/models/nat/modeling_nat.py | 11 +- .../models/resnet/configuration_resnet.py | 38 +--- .../models/resnet/modeling_resnet.py | 11 +- .../models/swin/configuration_swin.py | 38 +--- src/transformers/models/swin/modeling_swin.py | 13 +- .../models/upernet/modeling_upernet.py | 3 +- src/transformers/utils/backbone_utils.py | 203 ++++++++++++++++++ tests/test_backbone_common.py | 21 +- tests/utils/test_backbone_utils.py | 102 +++++++++ 23 files changed, 420 insertions(+), 385 deletions(-) create mode 100644 src/transformers/utils/backbone_utils.py create mode 100644 tests/utils/test_backbone_utils.py diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 46e1c09dba1f..bf06d9c40538 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1006,32 +1006,6 @@ def floating_point_ops( return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) -class BackboneMixin: - @property - def out_feature_channels(self): - # the current backbones will output the number of channels for each stage - # even if that stage is not in the out_features list. - return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)} - - @property - def channels(self): - return [self.out_feature_channels[name] for name in self.out_features] - - def forward_with_filtered_kwargs(self, *args, **kwargs): - signature = dict(inspect.signature(self.forward).parameters) - filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} - return self(*args, **filtered_kwargs) - - def forward( - self, - pixel_values: Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - raise NotImplementedError("This method should be implemented by the derived class.") - - class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin): r""" Base class for all models. diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index da53807f3f06..bfac3ab03f00 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -25,7 +26,7 @@ } -class BitConfig(PretrainedConfig): +class BitConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -128,35 +129,6 @@ def __init__( self.width_factor = width_factor self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 6a63a7a3e29c..d440f180757b 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -31,7 +31,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -39,6 +39,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_bit import BitConfig @@ -848,12 +849,10 @@ def __init__(self, config): self.stage_names = config.stage_names self.bit = BitModel(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.embedding_size] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index a4b7272295c7..0cba78040579 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -22,6 +22,7 @@ from 
...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -32,7 +33,7 @@ } -class ConvNextConfig(PretrainedConfig): +class ConvNextConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConvNextModel`]. It is used to instantiate an ConvNeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -119,38 +120,9 @@ def __init__( self.drop_path_rate = drop_path_rate self.image_size = image_size self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class ConvNextOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index 35302bffed66..1748e68aeec1 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -29,7 +29,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -37,6 +37,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_convnext import ConvNextConfig @@ -485,16 +486,14 @@ def __init__(self, config): self.embeddings = ConvNextEmbeddings(config) self.encoder = ConvNextEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = ConvNextLayerNorm(num_channels, data_format="channels_first") self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/convnextv2/configuration_convnextv2.py b/src/transformers/models/convnextv2/configuration_convnextv2.py index f02a21371b20..14dfcf85124e 100644 --- a/src/transformers/models/convnextv2/configuration_convnextv2.py +++ b/src/transformers/models/convnextv2/configuration_convnextv2.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class ConvNextV2Config(PretrainedConfig): +class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConvNextV2Model`]. It is used to instantiate an ConvNeXTV2 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -109,35 +110,6 @@ def __init__( self.drop_path_rate = drop_path_rate self.image_size = image_size self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index c309cdc3b6e2..c4cac4eb39fc 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -29,7 +29,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -37,6 +37,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_convnextv2 import ConvNextV2Config @@ -508,16 +509,14 @@ def __init__(self, config): self.embeddings = ConvNextV2Embeddings(config) self.encoder = ConvNextV2Encoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, 
data_format="channels_first") self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 7c6a84ecdd13..963c72f29bd4 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class DinatConfig(PretrainedConfig): +class DinatConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -145,35 +146,6 @@ def __init__( self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.layer_scale_init_value = layer_scale_init_value self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 5b2394f122f1..7e3809c1a303 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -26,7 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, @@ -39,6 +39,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_dinat import DinatConfig @@ -890,16 +891,14 @@ def __init__(self, config): self.embeddings = DinatEmbeddings(config) self.encoder = DinatEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = nn.LayerNorm(num_channels) self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index c6814e1dda14..f4bcd0ddce3b 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -25,7 +26,7 @@ } -class FocalNetConfig(PretrainedConfig): +class FocalNetConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a FocalNet model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -156,35 +157,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.encoder_stride = encoder_stride self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index cfd64689763b..e7ebdda5e5d4 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -36,6 +36,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_focalnet import FocalNetConfig @@ -987,11 +988,9 @@ def __init__(self, config): self.focalnet = FocalNetModel(config) self.num_features = [config.embed_dim] + config.hidden_sizes - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index ca60b6176eed..7c3ac54bd80d 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ 
b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -16,12 +16,13 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) -class MaskFormerSwinConfig(PretrainedConfig): +class MaskFormerSwinConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`MaskFormerSwinModel`]. It is used to instantiate a Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -141,35 +142,6 @@ def __init__( # this indicates the channel dimension after the last stage of the model self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 8684a58a4e7b..c7b74a6f2bd7 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -27,8 +27,9 @@ from ...activations import ACT2FN from ...file_utils import ModelOutput from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_maskformer_swin import MaskFormerSwinConfig @@ -855,14 +856,13 @@ def __init__(self, config: MaskFormerSwinConfig): self.stage_names = config.stage_names self.model = MaskFormerSwinModel(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] + self._out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] if "stem" in self.out_features: raise ValueError("This backbone does not support 'stem' in the `out_features`.") - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] self.hidden_states_norms = nn.ModuleList( [nn.LayerNorm(num_channels) for num_channels in self.num_features[1:]] diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index a74b8c9165c7..5d8bd6b3c6eb 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class NatConfig(PretrainedConfig): +class NatConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -141,35 +142,6 @@ def __init__( self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.layer_scale_init_value = layer_scale_init_value self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py index c5e83b29da67..7634a08ad95b 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -26,7 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, @@ -39,6 +39,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_nat import NatConfig @@ -868,11 +869,9 @@ def __init__(self, config): self.embeddings = NatEmbeddings(config) self.encoder = NatEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 6a88935f3b02..f12fe542a067 100644 --- 
a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -22,6 +22,7 @@ from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -31,7 +32,7 @@ } -class ResNetConfig(PretrainedConfig): +class ResNetConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ResNetModel`]. It is used to instantiate an ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -108,38 +109,9 @@ def __init__( self.hidden_act = hidden_act self.downsample_in_first_stage = downsample_in_first_stage self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class ResNetOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 6926f6a43116..b177cdeda6c1 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -28,7 +28,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -36,6 +36,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_resnet import ResNetConfig @@ -436,11 +437,9 @@ def __init__(self, config): self.embedder = ResNetEmbeddings(config) self.encoder = ResNetEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embedding_size] + config.hidden_sizes # initialize weights and apply final processing diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 612bc9949fb2..757112f8cebf 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -22,6 +22,7 @@ from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -34,7 +35,7 @@ } -class SwinConfig(PretrainedConfig): +class SwinConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`SwinModel`]. It is used to instantiate a Swin model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -158,38 +159,9 @@ def __init__( # this indicates the channel dimension after the last stage of the model self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class SwinOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 2f7cfeb1adbd..6482ff1b5bf2 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -28,7 +28,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, @@ -38,6 +38,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_swin import SwinConfig @@ -1264,16 +1265,14 @@ def __init__(self, config: SwinConfig): self.embeddings = SwinEmbeddings(config) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in 
zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = nn.LayerNorm(num_channels) self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index a00866f77dc4..6143c57e92d5 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -22,8 +22,9 @@ from ... import AutoBackbone from ...modeling_outputs import SemanticSegmenterOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...utils.backbone_utils import BackboneMixin from .configuration_upernet import UperNetConfig diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py new file mode 100644 index 000000000000..8c6b7107eb0e --- /dev/null +++ b/src/transformers/utils/backbone_utils.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Collection of utils to be used by backbones and their components.""" + +import inspect +from typing import Iterable, List, Optional, Tuple, Union + + +def verify_out_features_out_indices( + out_features: Optional[Iterable[str]], out_indices: Optional[Iterable[int]], stage_names: Optional[Iterable[str]] +): + """ + Verify that out_indices and out_features are valid for the given stage_names. + """ + if stage_names is None: + raise ValueError("Stage_names must be set for transformers backbones") + + if out_features is not None: + if not isinstance(out_features, (list,)): + raise ValueError(f"out_features must be a list {type(out_features)}") + if any(feat not in stage_names for feat in out_features): + raise ValueError(f"out_features must be a subset of stage_names: {stage_names} got {out_features}") + + if out_indices is not None: + if not isinstance(out_indices, (list, tuple)): + raise ValueError(f"out_indices must be a list or tuple, got {type(out_indices)}") + if any(idx >= len(stage_names) for idx in out_indices): + raise ValueError("out_indices must be valid indices for stage_names {stage_names}, got {out_indices}") + + if out_features is not None and out_indices is not None: + if len(out_features) != len(out_indices): + raise ValueError("out_features and out_indices should have the same length if both are set") + if out_features != [stage_names[idx] for idx in out_indices]: + raise ValueError("out_features and out_indices should correspond to the same stages if both are set") + + +def _align_output_features_output_indices( + out_features: Optional[List[str]], + out_indices: Optional[Union[List[int], Tuple[int]]], + stage_names: List[str], +): + """ + Finds the corresponding `out_features` and `out_indices` for the given `stage_names`. 
+ + The logic is as follows: + - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the + `out_indices`. + - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the + `out_features`. + - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. + - `out_indices` and `out_features` set: input `out_indices` and `out_features` are returned. + + Args: + out_features (`List[str]`): The names of the features for the backbone to output. + out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. + stage_names (`List[str]`): The names of the stages of the backbone. + """ + if out_indices is None and out_features is None: + out_indices = [len(stage_names) - 1] + out_features = [stage_names[-1]] + elif out_indices is None and out_features is not None: + out_indices = [stage_names.index(layer) for layer in stage_names if layer in out_features] + elif out_features is None and out_indices is not None: + out_features = [stage_names[idx] for idx in out_indices] + return out_features, out_indices + + +def get_aligned_output_features_output_indices( + out_features: Optional[List[str]], + out_indices: Optional[Union[List[int], Tuple[int]]], + stage_names: List[str], +) -> Tuple[List[str], List[int]]: + """ + Get the `out_features` and `out_indices` so that they are aligned. + + The logic is as follows: + - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the + `out_indices`. + - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the + `out_features`. + - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. + - `out_indices` and `out_features` set: they are verified to be aligned. + + Args: + out_features (`List[str]`): The names of the features for the backbone to output. + out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. + stage_names (`List[str]`): The names of the stages of the backbone. + """ + # First verify that the out_features and out_indices are valid + verify_out_features_out_indices(out_features=out_features, out_indices=out_indices, stage_names=stage_names) + output_features, output_indices = _align_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=stage_names + ) + # Verify that the aligned out_features and out_indices are valid + verify_out_features_out_indices(out_features=output_features, out_indices=output_indices, stage_names=stage_names) + return output_features, output_indices + + +class BackboneMixin: + @property + def out_feature_channels(self): + # the current backbones will output the number of channels for each stage + # even if that stage is not in the out_features list. 
+ return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)} + + @property + def channels(self): + return [self.out_feature_channels[name] for name in self.out_features] + + def forward_with_filtered_kwargs(self, *args, **kwargs): + signature = dict(inspect.signature(self.forward).parameters) + filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} + return self(*args, **filtered_kwargs) + + def forward( + self, + pixel_values, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + raise NotImplementedError("This method should be implemented by the derived class.") + + @property + def out_features(self): + return self._out_features + + @out_features.setter + def out_features(self, out_features: List[str]): + """ + Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=None, stage_names=self.stage_names + ) + + @property + def out_indices(self): + return self._out_indices + + @out_indices.setter + def out_indices(self, out_indices: Union[Tuple[int], List[int]]): + """ + Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=None, out_indices=out_indices, stage_names=self.stage_names + ) + + +class BackboneConfigMixin: + """ + A Mixin to support handling the `out_features` and `out_indices` attributes for the backbone configurations. + """ + + @property + def out_features(self): + return self._out_features + + @out_features.setter + def out_features(self, out_features: List[str]): + """ + Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=None, stage_names=self.stage_names + ) + + @property + def out_indices(self): + return self._out_indices + + @out_indices.setter + def out_indices(self, out_indices: Union[Tuple[int], List[int]]): + """ + Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=None, out_indices=out_indices, stage_names=self.stage_names + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig` to + include the `out_features` and `out_indices` attributes. 
+ """ + output = super().to_dict() + output["out_features"] = output.pop("_out_features") + output["out_indices"] = output.pop("_out_indices") + return output diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 80e68a2f44ad..fd9bbe3bfbfe 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -81,9 +81,15 @@ def test_channels(self): out_channels = [num_features[idx] for idx in out_indices] self.assertListEqual(model.channels, out_channels) - config.out_features = None - config.out_indices = None - model = model_class(config) + new_config = copy.deepcopy(config) + new_config.out_features = None + model = model_class(new_config) + self.assertEqual(len(model.channels), 1) + self.assertListEqual(model.channels, [num_features[-1]]) + + new_config = copy.deepcopy(config) + new_config.out_indices = None + model = model_class(new_config) self.assertEqual(len(model.channels), 1) self.assertListEqual(model.channels, [num_features[-1]]) @@ -102,6 +108,15 @@ def test_create_from_modified_config(self): # Check output of last stage is taken if out_features=None, out_indices=None modified_config = copy.deepcopy(config) modified_config.out_features = None + model = model_class(modified_config) + model.to(torch_device) + model.eval() + result = model(**inputs_dict) + + self.assertEqual(len(result.feature_maps), 1) + self.assertEqual(len(model.channels), 1) + + modified_config = copy.deepcopy(config) modified_config.out_indices = None model = model_class(modified_config) model.to(torch_device) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py new file mode 100644 index 000000000000..66b7087da246 --- /dev/null +++ b/tests/utils/test_backbone_utils.py @@ -0,0 +1,102 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers.utils.backbone_utils import ( + BackboneMixin, + get_aligned_output_features_output_indices, + verify_out_features_out_indices, +) + + +class BackboneUtilsTester(unittest.TestCase): + def test_get_aligned_output_features_output_indices(self): + stage_names = ["a", "b", "c"] + + # Defaults to last layer if both are None + out_features, out_indices = get_aligned_output_features_output_indices(None, None, stage_names) + self.assertEqual(out_features, ["c"]) + self.assertEqual(out_indices, [2]) + + # Out indices set to match out features + out_features, out_indices = get_aligned_output_features_output_indices(["a", "c"], None, stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [0, 2]) + + # Out features set to match out indices + out_features, out_indices = get_aligned_output_features_output_indices(None, [0, 2], stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [0, 2]) + + # Out features selected from negative indices + out_features, out_indices = get_aligned_output_features_output_indices(None, [-3, -1], stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [-3, -1]) + + def test_verify_out_features_out_indices(self): + # Stage names must be set + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 1), None) + + # Out features must be a list + with self.assertRaises(ValueError): + verify_out_features_out_indices(("a", "b"), (0, 1), ["a", "b"]) + + # Out features must be a subset of stage names + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 1), ["a"]) + + # Out indices must be a list or tuple + with self.assertRaises(ValueError): + verify_out_features_out_indices(None, 0, ["a", "b"]) + + # Out indices must be a subset of stage names + with self.assertRaises(ValueError): + verify_out_features_out_indices(None, (0, 1), ["a"]) + + # Out features and out indices must be the same length + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0,), ["a", "b", "c"]) + + # Out features should match out indices + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 2), ["a", "b", "c"]) + + # Out features and out indices should be in order + with self.assertRaises(ValueError): + verify_out_features_out_indices(["b", "a"], (0, 1), ["a", "b"]) + + # Check passes with valid inputs + verify_out_features_out_indices(["a", "b", "d"], (0, 1, -1), ["a", "b", "c", "d"]) + + def test_backbone_mixin(self): + backbone = BackboneMixin() + + backbone.stage_names = ["a", "b", "c"] + backbone._out_features = ["a", "c"] + backbone._out_indices = [0, 2] + + # Check that the output features and indices are set correctly + self.assertEqual(backbone.out_features, ["a", "c"]) + self.assertEqual(backbone.out_indices, [0, 2]) + + # Check out features and indices are updated correctly + backbone.out_features = ["a", "b"] + self.assertEqual(backbone.out_features, ["a", "b"]) + self.assertEqual(backbone.out_indices, [0, 1]) + + backbone.out_indices = [-3, -1] + self.assertEqual(backbone.out_features, ["a", "c"]) + self.assertEqual(backbone.out_indices, [-3, -1]) From 5eeb5564846297aea01b39621ed7fc32ed458246 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 4 May 2023 21:56:28 +0800 Subject: [PATCH 022/935] fix spelling error (#23143) change referrred to referred --- docs/source/en/preprocessing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx index 9896b6898931..c2933099b03d 100644 --- a/docs/source/en/preprocessing.mdx +++ b/docs/source/en/preprocessing.mdx @@ -41,7 +41,7 @@ The main tool for preprocessing textual data is a [tokenizer](main_classes/token -If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referrred to as the *vocab*) during pretraining. +If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining. From 3b74889e8f3109efb36a0002c8aaa3ede164e30e Mon Sep 17 00:00:00 2001 From: Victor Geislinger <9027783+MrGeislinger@users.noreply.github.com> Date: Thu, 4 May 2023 06:56:45 -0700 Subject: [PATCH 023/935] Remove typo in perf_train_gpu_many.mdx (#23144) - Excess `w` in the word `bottom` --- docs/source/en/perf_train_gpu_many.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/perf_train_gpu_many.mdx b/docs/source/en/perf_train_gpu_many.mdx index 17eb7b739925..e756732daf1a 100644 --- a/docs/source/en/perf_train_gpu_many.mdx +++ b/docs/source/en/perf_train_gpu_many.mdx @@ -272,7 +272,7 @@ It's easy to see from the bottom diagram how PP has less dead zones, where GPUs Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. -PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). +PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. 
From adb0760b5f008893a421109d72918ef2a47825ad Mon Sep 17 00:00:00 2001 From: Qingyang Wu Date: Thu, 4 May 2023 09:57:32 -0400 Subject: [PATCH 024/935] fix resume fsdp (#23111) * fix resume fsdp * fix rank 0 loading * fix style and quality --- src/transformers/trainer.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d30218dc4721..d18e3efeb87a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2114,7 +2114,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) if not any( - [os.path.isfile(f) for f in [weights_file, safe_weights_file, weights_index_file, safe_weights_index_file]] + os.path.isfile(f) for f in [weights_file, safe_weights_file, weights_index_file, safe_weights_index_file] ): raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") @@ -2364,6 +2364,12 @@ def _save_checkpoint(self, model, trial, metrics=None): if self.sharded_ddp == ShardedDDPOption.SIMPLE: self.optimizer.consolidate_state_dict() + if self.fsdp: + # FSDP has a different interface for saving optimizer states. + # Needs to be called on all ranks to gather all states. + # full_optim_state_dict will be deprecated after Pytorch 2.2! + full_osd = self.model.__class__.full_optim_state_dict(self.model, self.optimizer) + if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) @@ -2388,7 +2394,11 @@ def _save_checkpoint(self, model, trial, metrics=None): torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) elif self.args.should_save and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + if self.fsdp: + torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME)) + else: + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + with warnings.catch_warnings(record=True) as caught_warnings: torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) @@ -2498,9 +2508,18 @@ def opt_load_hook(mod, opt): # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more # likely to get OOM on CPU (since we load num_gpu times the optimizer state map_location = self.args.device if self.args.world_size > 1 else "cpu" - self.optimizer.load_state_dict( - torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) - ) + if self.fsdp: + full_osd = None + # In FSDP, we need to load the full optimizer state dict on rank 0 and then shard it + if self.args.process_index == 0: + full_osd = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME)) + # call scatter_full_optim_state_dict on all ranks + sharded_osd = self.model.__class__.scatter_full_optim_state_dict(full_osd, self.model) + self.optimizer.load_state_dict(sharded_osd) + else: + self.optimizer.load_state_dict( + torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) + ) with warnings.catch_warnings(record=True) as caught_warnings: self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME))) reissue_pt_warnings(caught_warnings) From 510ad0a8b82cfe47cb368afcfcedf521ce0dbbed Mon Sep 17 00:00:00 2001 From: 
peter-sk Date: Thu, 4 May 2023 15:58:38 +0200 Subject: [PATCH 025/935] gpt2 multi-gpu fix (#23149) Co-authored-by: Prof. Peter Schneider-Kamp --- src/transformers/models/gpt2/modeling_gpt2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index afc79463bd7b..b69b910a76d0 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1670,9 +1670,9 @@ def forward( if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) + start_positions = start_positions.squeeze(-1).to(start_logits.device) if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + end_positions = end_positions.squeeze(-1).to(end_logits.device) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions = start_positions.clamp(0, ignored_index) From 83b38fbea8c2a6fbd9af0d9561db3afde2f4f4e2 Mon Sep 17 00:00:00 2001 From: peter-sk Date: Thu, 4 May 2023 16:15:15 +0200 Subject: [PATCH 026/935] GPTNeoXForQuestionAnswering (#23059) * first draft - gives index error in question_answering.py * maturing * no labels * pipeline should know about QA * fixing checks * formatting * fixed docstring * initial commit * formatting * adding the class to many places * towards less unhappy checks * nearly there * and gpt neox for qa * use right model * forgot this one * base_model_prefix is "gpt_neox" for GPTNeoX* models * unnecessary stuff * Update src/transformers/models/gpt_neox/modeling_gpt_neox.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * format * Update src/transformers/models/gpt_neox/modeling_gpt_neox.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * removed gpt2 stuff --------- Co-authored-by: Prof. Peter Schneider-Kamp Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/gpt_neox.mdx | 5 + docs/source/en/tasks/question_answering.mdx | 2 +- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/gpt2/modeling_gpt2.py | 3 +- src/transformers/models/gpt_neox/__init__.py | 2 + .../models/gpt_neox/modeling_gpt_neox.py | 101 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 7 ++ .../models/gpt_neox/test_modeling_gpt_neox.py | 23 +++- 9 files changed, 142 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/gpt_neox.mdx b/docs/source/en/model_doc/gpt_neox.mdx index 0c86a81a3ba1..bc8bbd67331f 100644 --- a/docs/source/en/model_doc/gpt_neox.mdx +++ b/docs/source/en/model_doc/gpt_neox.mdx @@ -79,6 +79,11 @@ The `generate()` method can be used to generate text using GPT Neo model. 
[[autodoc]] GPTNeoXForCausalLM - forward +## GPTNeoXForQuestionAnswering + +[[autodoc]] GPTNeoXForQuestionAnswering + - forward + ## GPTNeoXForSequenceClassification [[autodoc]] GPTNeoXForSequenceClassification diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx index 3dd11140c253..fa39b640494d 100644 --- a/docs/source/en/tasks/question_answering.mdx +++ b/docs/source/en/tasks/question_answering.mdx @@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit -[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), 
[MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1c74d242fc6b..7bf322ca8e1e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1702,6 +1702,7 @@ [ "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoXForCausalLM", + "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", "GPTNeoXLayer", @@ -5245,6 +5246,7 @@ from .models.gpt_neox import ( GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXLayer, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a53128a61216..1ebc906d2d67 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -737,6 +737,7 @@ ("funnel", "FunnelForQuestionAnswering"), ("gpt2", "GPT2ForQuestionAnswering"), ("gpt_neo", "GPTNeoForQuestionAnswering"), + ("gpt_neox", "GPTNeoXForQuestionAnswering"), ("gptj", "GPTJForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b69b910a76d0..e6ee04200b34 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -52,7 +52,6 @@ logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "gpt2" -_REAL_CHECKPOINT_FOR_DOC = "gpt2" _CONFIG_FOR_DOC = "GPT2Config" GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1619,7 +1618,7 @@ def __init__(self, config): checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + real_checkpoint=_CHECKPOINT_FOR_DOC, ) def forward( self, diff --git a/src/transformers/models/gpt_neox/__init__.py b/src/transformers/models/gpt_neox/__init__.py index d5fd5cbda2b6..46f06b1991af 100644 --- a/src/transformers/models/gpt_neox/__init__.py +++ b/src/transformers/models/gpt_neox/__init__.py @@ -36,6 +36,7 @@ _import_structure["modeling_gpt_neox"] = [ "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoXForCausalLM", + "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", "GPTNeoXLayer", @@ -64,6 +65,7 @@ from .modeling_gpt_neox import ( GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXLayer, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 055fc9ee27cc..2058dbc1a56b 100755 
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -31,6 +31,7 @@ from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -955,3 +956,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ + The GPT-NeoX Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + GPT_NEOX_START_DOCSTRING, +) +class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.gpt_neox = GPTNeoXModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 73827a3937a1..7279117698db 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3336,6 +3336,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GPTNeoXForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GPTNeoXForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index ff226684ccae..927d097691c5 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -31,6 +31,7 @@ from transformers import ( GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXModel, @@ -149,6 +150,15 @@ def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_la result = model(input_ids, attention_mask=input_mask, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_for_question_answering(self, config, input_ids, input_mask, token_labels): + config.num_labels = self.num_labels + model = GPTNeoXForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def create_and_check_for_sequence_classification(self, config, 
input_ids, input_mask, token_labels): config.num_labels = self.num_labels model = GPTNeoXForSequenceClassification(config) @@ -213,7 +223,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (GPTNeoXModel, GPTNeoXForCausalLM, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification) + ( + GPTNeoXModel, + GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, + GPTNeoXForSequenceClassification, + GPTNeoXForTokenClassification, + ) if is_torch_available() else () ) @@ -221,6 +237,7 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi pipeline_model_mapping = ( { "feature-extraction": GPTNeoXModel, + "question-answering": GPTNeoXForQuestionAnswering, "text-classification": GPTNeoXForSequenceClassification, "token-classification": GPTNeoXForTokenClassification, "text-generation": GPTNeoXForCausalLM, @@ -265,6 +282,10 @@ def test_model_for_causal_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + def test_model_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + def test_model_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) From 57ffd8ab4c833e26b2288769f6031f94870a102c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 4 May 2023 16:31:19 +0200 Subject: [PATCH 027/935] [`GPT-J`] Fix causal mask dtype (#23147) * fix #23136 * better fix * same fix for `masked_bias` --- src/transformers/models/gptj/modeling_gptj.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 18985cb3bce4..3a1f99dd713a 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -89,8 +89,9 @@ def __init__(self, config): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) @@ -732,7 +733,7 @@ def custom_forward(*inputs): GPTJ_START_DOCSTRING, ) class GPTJForCausalLM(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] + _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] def __init__(self, config): super().__init__(config) From 3341bb41cd2e3bf69e2682fbfe042b7a98b6d4fb Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Thu, 4 May 2023 12:00:22 -0400 Subject: [PATCH 028/935] Pin urllib3 --- setup.py | 3 ++- src/transformers/dependency_versions_table.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abafc03bc6f2..d8f58360d7af 100644 --- a/setup.py +++ b/setup.py @@ -185,6 +185,7 @@ "tqdm>=4.27", "unidic>=1.0.2", "unidic_lite>=1.0.7", + "urllib3<2.0.0", "uvicorn", ] @@ -331,7 +332,7 @@ def run(self): 
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] -extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder") +extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder", "urllib3") extras["all"] = ( extras["tf"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bae19acd3e1d..dd1055ebd2e4 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -85,5 +85,6 @@ "tqdm": "tqdm>=4.27", "unidic": "unidic>=1.0.2", "unidic_lite": "unidic_lite>=1.0.7", + "urllib3": "urllib3<2.0.0", "uvicorn": "uvicorn", } From c8f2c5c56e942e8c45821d07555f2eab178b3f83 Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Thu, 4 May 2023 22:30:16 +0530 Subject: [PATCH 029/935] Add FlaxWhisperForAudioClassification model (#22883) * Add FlaxWhisperForAudioClassification model * Add models to init * Add models to init * Fix copies * Fix automapping --- docs/source/en/model_doc/whisper.mdx | 6 + src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 + src/transformers/models/whisper/__init__.py | 2 + .../models/whisper/modeling_flax_whisper.py | 160 ++++++++++++++ src/transformers/utils/dummy_flax_objects.py | 7 + .../whisper/test_modeling_flax_whisper.py | 205 +++++++++++++++++- 7 files changed, 390 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx index 22b08e4e61bc..52a8b5953c63 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,3 +105,9 @@ The original code can be found [here](https://github.com/openai/whisper). 
[[autodoc]] FlaxWhisperForConditionalGeneration - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7bf322ca8e1e..b0766b0946cd 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,6 +3779,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6903,7 +6904,12 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel + from .models.whisper import ( + FlaxWhisperForAudioClassification, + FlaxWhisperForConditionalGeneration, + FlaxWhisperModel, + FlaxWhisperPreTrainedModel, + ) from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index 755d1f07a344..e3b8d9cf5b52 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,6 +229,11 @@ ] ) +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("whisper", "FlaxWhisperForAudioClassification"), + ] +) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index 3b6015a56f6f..cd962478e34d 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,6 +75,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] @@ -126,6 +127,7 @@ pass else: from .modeling_flax_whisper import ( + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index b8d6f07242d8..e36131680d63 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,6 +36,7 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, + FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1506,3 +1507,162 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) + + +class FlaxWhisperForAudioClassificationModule(nn.Module): + config: WhisperConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) + self.config.is_encoder_decoder = False + num_layers = self.config.num_hidden_layers + 1 + if self.config.use_weighted_layer_sum: + self.layer_weights = jnp.repeat(1 / num_layers, num_layers) + self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + 
self, + input_features, + encoder_outputs=None, + output_attentions=None, + output_hidden_states: bool = True, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = jnp.stack(encoder_outputs, axis=1) + norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) + hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) + else: + hidden_states = encoder_outputs[0] + + hidden_states = self.projector(hidden_states) + pooled_output = jnp.mean(hidden_states, axis=1) + + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + encoder_outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) +class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): + module_class = FlaxWhisperForAudioClassificationModule + dtype: jnp.dtype = jnp.float32 + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_features = jnp.zeros(input_shape, dtype="f4") + input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_features=input_features, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + def __call__( + self, + input_features: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + input_features=jnp.array(input_features, dtype="f4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + +FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" + Returns: + + Transcription example: + + 
```python + >>> import jax.numpy as jnp + >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + >>> model = FlaxWhisperForAudioClassification.from_pretrained( + ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True + ... ) + >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) + + >>> sample = next(iter(ds)) + + >>> inputs = feature_extractor( + ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" + ... ) + >>> input_features = inputs.input_features + + >>> logits = model(input_features).logits + + >>> predicted_class_ids = jnp.argmax(logits).item() + >>> predicted_label = model.config.id2label[predicted_class_ids] + >>> predicted_label + 'af_za' + ``` +""" + +overwrite_call_docstring( + FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC +) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index eeec3277492d..ce571bc9f8d0 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,6 +1182,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) +class FlaxWhisperForAudioClassification(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 3f1e201d72d8..79a2c51039ac 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import functools import inspect import tempfile @@ -41,6 +39,7 @@ from transformers import ( FLAX_MODEL_MAPPING, + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -704,3 +703,205 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + + +class FlaxWhisperEncoderModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=60, + is_training=True, + use_labels=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + input_channels=1, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + max_source_positions=30, + num_mel_bins=80, + num_conv_layers=1, + suppress_tokens=None, + begin_suppress_tokens=None, + classifier_proj_size=4, + num_labels=2, + is_encoder_decoder=False, + is_decoder=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.input_channels = input_channels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_mel_bins = num_mel_bins + self.max_position_embeddings = max_position_embeddings + self.max_source_positions = max_source_positions + self.num_conv_layers = num_conv_layers + self.suppress_tokens = suppress_tokens + self.begin_suppress_tokens = begin_suppress_tokens + self.classifier_proj_size = classifier_proj_size + self.num_labels = num_labels + self.is_encoder_decoder = is_encoder_decoder + self.is_decoder = is_decoder + + def get_config(self): + return WhisperConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + input_channels=self.input_channels, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + max_source_positions=self.max_source_positions, + decoder_ffn_dim=self.hidden_size, + encoder_ffn_dim=self.hidden_size, + suppress_tokens=self.suppress_tokens, + begin_suppress_tokens=self.begin_suppress_tokens, + classifier_proj_size=self.classifier_proj_size, + num_labels=self.num_labels, + is_encoder_decoder=self.is_encoder_decoder, + is_decoder=self.is_decoder, + ) + + def prepare_whisper_encoder_inputs_dict( + self, + input_features, + ): + return { + "input_features": input_features, + } + + def prepare_config_and_inputs(self): + input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) + + config = self.get_config() + inputs_dict = self.prepare_whisper_encoder_inputs_dict( + input_features=input_features, + ) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def get_subsampled_output_lengths(self, input_lengths): + """ + Computes the output length of the convolutional layers + """ + + for i in range(self.num_conv_layers): + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + @property + def 
encoder_seq_length(self): + return self.get_subsampled_output_lengths(self.seq_length) + + +@require_flax +class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () + is_encoder_decoder = False + fx_compatible = False + test_pruning = False + test_missing_keys = False + + input_name = "input_features" + + def setUp(self): + self.model_tester = FlaxWhisperEncoderModelTester(self) + _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] + + self.all_model_classes = ( + make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes + ) + self.config_tester = ConfigTester(self, config_class=WhisperConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because of `input_features` + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_features, **kwargs): + return model(input_features=input_features, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + # overwrite because of `input_features` + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_features", "attention_mask", "output_attentions"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_inputs_embeds(self): + pass + + # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented + def test_model_common_attributes(self): + pass + + # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_to_base(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_from_base(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_from_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_to_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_bf16_to_base_pt(self): + pass From 516dc6305f2abca3f48218310ea872e3b1c9c995 Mon Sep 17 00:00:00 2001 From: Maria Khalusova Date: Thu, 4 May 2023 13:17:13 -0400 Subject: [PATCH 030/935] [docs] Text to speech task guide (#23107) * First draft * Some polishing * Text polishing * added TOC entry for TTS * make style * added links to images * fixed links to 
images * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * feedback addressed * feedback from Matthijs addresed * Update docs/source/en/tasks/text-to-speech.mdx Co-authored-by: Matthijs Hollemans --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Matthijs Hollemans --- docs/source/en/_toctree.yml | 2 + docs/source/en/tasks/text-to-speech.mdx | 558 ++++++++++++++++++++++++ 2 files changed, 560 insertions(+) create mode 100644 docs/source/en/tasks/text-to-speech.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b9e346b3000c..f6e9684f79e3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -71,6 +71,8 @@ title: Image captioning - local: tasks/document_question_answering title: Document Question Answering + - local: tasks/text-to-speech + title: Text to speech title: Multimodal isExpanded: false title: Task Guides diff --git a/docs/source/en/tasks/text-to-speech.mdx b/docs/source/en/tasks/text-to-speech.mdx new file mode 100644 index 000000000000..a368fcb35f7e --- /dev/null +++ b/docs/source/en/tasks/text-to-speech.mdx @@ -0,0 +1,558 @@ + + +# Text to speech + +[[open-in-colab]] + +Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple +languages and for multiple speakers. The only text-to-speech model currently available in 🤗 Transformers +is [SpeechT5](model_doc/speecht5), though more will be added in the future. SpeechT5 is pre-trained on a combination of +speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text +and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 +supports multiple speakers through x-vector speaker embeddings. + +This guide illustrates how to: + +1. Fine-tune [SpeechT5](model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset. +2. Use your fine-tuned model for inference. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install datasets soundfile speechbrain accelerate +``` + +Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet: + +```bash +pip install git+https://github.com/huggingface/transformers.git +``` + + + +To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: + +```bash +!nvidia-smi +``` + + + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the dataset + +[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of +data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 +European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. + +Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable +option for training TTS models. 
The features that make it beneficial for ASR, such as excessive background noise, are +typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite +challenging. + +Let's load the data: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train") +>>> len(dataset) +20968 +``` + +20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so +make sure the examples in the dataset meet this requirement: + +```py +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +``` + +## Preprocess the data + +Let's begin by defining the model checkpoint to use and loading the appropriate processor: + +```py +>>> from transformers import SpeechT5Processor + +>>> checkpoint = "microsoft/speecht5_tts" +>>> processor = SpeechT5Processor.from_pretrained(checkpoint) +``` + +### Text cleanup for SpeechT5 tokenization + +Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text: + +```py +>>> tokenizer = processor.tokenizer +``` + +The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, +consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written +out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text. + +Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If +left as is, these characters will be converted to `` tokens. However, in Dutch, certain characters like `à` are +used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`. + +To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which +works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates +the transcriptions from all examples into one string and converts it to a set of characters. +Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for +the mapping function. + +```py +>>> def extract_all_chars(batch): +... all_text = " ".join(batch["normalized_text"]) +... vocab = list(set(all_text)) +... return {"vocab": [vocab], "all_text": [all_text]} + + +>>> vocabs = dataset.map( +... extract_all_chars, +... batched=True, +... batch_size=-1, +... keep_in_memory=True, +... remove_columns=dataset.column_names, +... ) + +>>> dataset_vocab = set(vocabs["vocab"][0]) +>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()} +``` + +Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. +To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting +set will contain the characters that are in the dataset but not in the tokenizer. + +```py +>>> dataset_vocab - tokenizer_vocab +{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'} +``` + +To handle the unsupported characters identified in the previous step, define a function that maps these characters to +valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately. + +```py +>>> replacements = [ +... ("à", "a"), +... ("ç", "c"), +... 
("è", "e"), +... ("ë", "e"), +... ("í", "i"), +... ("ï", "i"), +... ("ö", "o"), +... ("ü", "u"), +... ] + + +>>> def cleanup_text(inputs): +... for src, dst in replacements: +... inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst) +... return inputs + + +>>> dataset = dataset.map(cleanup_text) +``` + +Now that you have dealt with special characters in the text, it's time to shift focus to the audio data. + +### Speakers + +The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To +determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +speakers and examples in the data. + +```py +>>> from collections import defaultdict + +>>> speaker_counts = defaultdict(int) + +>>> for speaker_id in dataset["speaker_id"]: +... speaker_counts[speaker_id] += 1 +``` + +By plotting a histogram you can get a sense of how much data there is for each speaker. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.hist(speaker_counts.values(), bins=20) +>>> plt.ylabel("Speakers") +>>> plt.xlabel("Examples") +>>> plt.show() +``` + +
+ [figure: Speakers histogram]
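The distribution can also be summarised numerically from the `speaker_counts` dictionary built above. This is a quick sketch (it assumes NumPy is available in your environment) that backs up the observations in the next paragraph:

```py
>>> import numpy as np

>>> counts = np.array(list(speaker_counts.values()))
>>> share_under_100 = float((counts < 100).mean())  # fraction of speakers with fewer than 100 examples
>>> speakers_over_500 = int((counts > 500).sum())  # number of speakers with more than 500 examples
```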
+ +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. + +```py +>>> def select_speaker(speaker_id): +... return 100 <= speaker_counts[speaker_id] <= 400 + + +>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) +``` + +Let's check how many speakers remain: + +```py +>>> len(set(dataset["speaker_id"])) +42 +``` + +Let's see how many examples are left: + +```py +>>> len(dataset) +9973 +``` + +You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. + +Note that some speakers with few examples may actually have more audio available if the examples are long. However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here. + +### Speaker embeddings + +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. +The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. +To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) +model from SpeechBrain. + +Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector +containing the corresponding speaker embedding. + +```py +>>> import os +>>> import torch +>>> from speechbrain.pretrained import EncoderClassifier + +>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> speaker_model = EncoderClassifier.from_hparams( +... source=spk_model_name, +... run_opts={"device": device}, +... savedir=os.path.join("/tmp", spk_model_name), +... ) + + +>>> def create_speaker_embedding(waveform): +... with torch.no_grad(): +... speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) +... speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) +... speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() +... return speaker_embeddings +``` + +It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate +reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. + +For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model +is better able to capture the unique voice characteristics present in the Dutch language. + +### Processing the dataset + +Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a +single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. +It should also add the speaker embeddings as an additional input. + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example = processor( +... text=example["normalized_text"], +... 
audio_target=audio["array"], +... sampling_rate=audio["sampling_rate"], +... return_attention_mask=False, +... ) + +... # strip off the batch dimension +... example["labels"] = example["labels"][0] + +... # use SpeechBrain to obtain x-vector +... example["speaker_embeddings"] = create_speaker_embedding(audio["array"]) + +... return example +``` + +Verify the processing is correct by looking at a single example: + +```py +>>> processed_example = prepare_dataset(dataset[0]) +>>> list(processed_example.keys()) +['input_ids', 'labels', 'stop_labels', 'speaker_embeddings'] +``` + +Speaker embeddings should be a 512-element vector: + +```py +>>> processed_example["speaker_embeddings"].shape +(512,) +``` + +The labels should be a log-mel spectrogram with 80 mel bins. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.imshow(processed_example["labels"].T) +>>> plt.show() +``` + +
+ [figure: Log-mel spectrogram with 80 mel bins]
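If you prefer the conventional orientation with low frequencies at the bottom, matplotlib can flip the vertical axis for you. This is a small variation on the plotting snippet above, reusing the same `processed_example`:

```py
>>> plt.figure()
>>> plt.imshow(processed_example["labels"].T, origin="lower", aspect="auto")
>>> plt.show()
```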
+ +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, +the y-axis is flipped and the spectrograms appear upside down. + +Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes. + +```py +>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) +``` + +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). +Remove those examples from the dataset. Here we go even further and to allow for larger batch sizes we remove anything over 200 tokens. + +```py +>>> def is_not_too_long(input_ids): +... input_length = len(input_ids) +... return input_length < 200 + + +>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"]) +>>> len(dataset) +8259 +``` + +Next, create a basic train/test split: + +```py +>>> dataset = dataset.train_test_split(test_size=0.1) +``` + +### Data collator + +In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value +instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. + +```py +>>> from dataclasses import dataclass +>>> from typing import Any, Dict, List, Union + + +>>> @dataclass +... class TTSDataCollatorWithPadding: +... processor: Any + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... input_ids = [{"input_ids": feature["input_ids"]} for feature in features] +... label_features = [{"input_values": feature["labels"]} for feature in features] +... speaker_features = [feature["speaker_embeddings"] for feature in features] + +... # collate the inputs and targets into a batch +... batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt") + +... # replace padding with -100 to ignore loss correctly +... batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100) + +... # not used during fine-tuning +... del batch["decoder_attention_mask"] + +... # round down target lengths to multiple of reduction factor +... if model.config.reduction_factor > 1: +... target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features]) +... target_lengths = target_lengths.new( +... [length - length % model.config.reduction_factor for length in target_lengths] +... ) +... max_length = max(target_lengths) +... batch["labels"] = batch["labels"][:, :max_length] + +... # also add in the speaker embeddings +... batch["speaker_embeddings"] = torch.tensor(speaker_features) + +... return batch +``` + +In SpeechT5, the input to the decoder part of the model is reduced by a factor 2. In other words, it throws away every +other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +multiple of 2. 
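To make the rounding concrete, here is a small self-contained sketch of the same arithmetic the collator applies to the target lengths (the lengths themselves are made up for illustration):

```py
>>> import torch

>>> reduction_factor = 2  # SpeechT5 reduces the decoder inputs by this factor
>>> target_lengths = torch.tensor([37, 24, 31])  # hypothetical spectrogram lengths in a batch
>>> target_lengths - target_lengths % reduction_factor
tensor([36, 24, 30])
```

Only the maximum of these rounded lengths is used, and the batch labels are truncated to it.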
+ +```py +>>> data_collator = TTSDataCollatorWithPadding(processor=processor) +``` + +## Train the model + +Load the pre-trained model from the same checkpoint as you used for loading the processor: + +```py +>>> from transformers import SpeechT5ForTextToSpeech + +>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint) +``` + +The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training. + +```py +>>> model.config.use_cache = False +``` + +Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll +only look at the loss: + +```python +>>> from transformers import Seq2SeqTrainingArguments + +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice +... per_device_train_batch_size=4, +... gradient_accumulation_steps=8, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=4000, +... gradient_checkpointing=True, +... fp16=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=2, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... report_to=["tensorboard"], +... load_best_model_at_end=True, +... greater_is_better=False, +... label_names=["labels"], +... push_to_hub=True, +... ) +``` + +Instantiate the `Trainer` object and pass the model, dataset, and data collator to it. + +```py +>>> from transformers import Seq2SeqTrainer + +>>> trainer = Seq2SeqTrainer( +... args=training_args, +... model=model, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... data_collator=data_collator, +... tokenizer=processor.tokenizer, +... ) +``` + +And with that, you're ready to start training! Training will take several hours. Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce +the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate. + +```py +>>> trainer.train() +``` + +Push the final model to the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Great, now that you've fine-tuned a model, you can use it for inference! +Load the model from the 🤗 Hub (make sure to use your account name in the following code snippet): + +```py +>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl") +``` + +Pick an example, here we'll take one from the test dataset. Obtain a speaker embedding. + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Define some input text and tokenize it. + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +``` + +Preprocess the input text: + +```py +>>> inputs = processor(text=text, return_tensors="pt") +``` + +Create a spectrogram with your model: + +```py +>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) +``` + +Visualize the spectrogram, if you'd like to: + +```py +>>> plt.figure() +>>> plt.imshow(spectrogram.T) +>>> plt.show() +``` + +
+ [figure: Generated log-mel spectrogram]
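The final step below calls a `vocoder` object to turn the spectrogram into a waveform. If you have not created it earlier in your session, it can be loaded from the HiFi-GAN vocoder checkpoint released alongside SpeechT5 (checkpoint name assumed to be the standard pairing):

```py
>>> from transformers import SpeechT5HifiGan

>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
```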
+ +Finally, use the vocoder to turn the spectrogram into sound. + +```py +>>> with torch.no_grad(): +... speech = vocoder(spectrogram) + +>>> from IPython.display import Audio + +>>> Audio(speech.numpy(), rate=16000) +``` + +In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker +embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best +when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding. + +Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does +capture the voice characteristics of the speaker (compare to the original audio in the example). +Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to +see if this improves the results. + +Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please +use TTS judiciously and responsibly. \ No newline at end of file From b369e507aaa78103baf5d3f3563952b44a0408a1 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 4 May 2023 18:36:23 +0100 Subject: [PATCH 031/935] Generate: text generation pipeline no longer emits `max_length` warning when it is not set (#23139) --- src/transformers/generation/flax_utils.py | 2 +- src/transformers/generation/tf_utils.py | 2 +- src/transformers/generation/utils.py | 2 +- src/transformers/pipelines/text_generation.py | 32 +++++++++++++------ .../test_pipelines_text_generation.py | 32 ++++++++++++++++++- 5 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 58a2bf13ba61..65d65869afd2 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -385,7 +385,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -393,6 +392,7 @@ def generate( "Please refer to the documentation for more information. " "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index 5cd8153c2bf1..5e4bc58c8407 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -858,7 +858,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -866,6 +865,7 @@ def generate( "Please refer to the documentation for more information. 
" "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length # If the input length is a tensor (i.e. dynamic length), skip length checks if not isinstance(input_ids_seq_length, tf.Tensor): diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 06836d4d4ae2..0f0191fb144f 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1348,7 +1348,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -1356,6 +1355,7 @@ def generate( "Please refer to the documentation for more information. " "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index f95acf7d307f..60037339072d 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -1,3 +1,4 @@ +import copy import enum import warnings @@ -105,17 +106,8 @@ def _sanitize_parameters( prefix_inputs = self.tokenizer( prefix, padding=False, add_special_tokens=False, return_tensors=self.framework ) - prefix_length = prefix_inputs["input_ids"].shape[-1] + generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1] - if "max_new_tokens" in generate_kwargs: - pass - elif "max_length" in generate_kwargs: - generate_kwargs["max_length"] += prefix_length - else: - generate_kwargs["max_length"] = self.model.config.max_length + prefix_length - - if "min_length" in generate_kwargs: - generate_kwargs["min_length"] += prefix_length if handle_long_generation is not None: if handle_long_generation not in {"hole"}: raise ValueError( @@ -247,6 +239,26 @@ def _forward(self, model_inputs, **generate_kwargs): else: in_b = input_ids.shape[0] prompt_text = model_inputs.pop("prompt_text") + + # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying + # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. 
+ generate_kwargs = copy.deepcopy(generate_kwargs) + prefix_length = generate_kwargs.pop("prefix_length", 0) + if prefix_length > 0: + has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].max_new_tokens is not None + ) + if not has_max_new_tokens: + generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length + generate_kwargs["max_length"] += prefix_length + has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].min_new_tokens is not None + ) + if not has_min_new_tokens and "min_length" in generate_kwargs: + generate_kwargs["min_length"] += prefix_length + # BS x SL generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) out_b = generated_sequence.shape[0] diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 34dbef6df2d0..84b144905217 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -14,8 +14,15 @@ import unittest -from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING, TextGenerationPipeline, pipeline +from transformers import ( + MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TextGenerationPipeline, + logging, + pipeline, +) from transformers.testing_utils import ( + CaptureLogger, is_pipeline_test, require_accelerate, require_tf, @@ -323,3 +330,26 @@ def test_pipeline_accelerate_top_p(self): pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device_map="auto", torch_dtype=torch.float16) pipe("This is a test", do_sample=True, top_p=0.5) + + def test_pipeline_length_setting_warning(self): + prompt = """Hello world""" + text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2") + if text_generator.model.framework == "tf": + logger = logging.get_logger("transformers.generation.tf_utils") + else: + logger = logging.get_logger("transformers.generation.utils") + logger_msg = "Both `max_new_tokens`" # The beggining of the message to be checked in this test + + # Both are set by the user -> log warning + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_length=10, max_new_tokens=1) + self.assertIn(logger_msg, cl.out) + + # The user only sets one -> no warning + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_new_tokens=1) + self.assertNotIn(logger_msg, cl.out) + + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_length=10) + self.assertNotIn(logger_msg, cl.out) From 01734dba842c29408c96caa5c345c9e415c7569b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 4 May 2023 13:47:07 -0400 Subject: [PATCH 032/935] Revert "Add FlaxWhisperForAudioClassification model" (#23154) Revert "Add FlaxWhisperForAudioClassification model (#22883)" This reverts commit c8f2c5c56e942e8c45821d07555f2eab178b3f83. 
--- docs/source/en/model_doc/whisper.mdx | 6 - src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 - src/transformers/models/whisper/__init__.py | 2 - .../models/whisper/modeling_flax_whisper.py | 160 -------------- src/transformers/utils/dummy_flax_objects.py | 7 - .../whisper/test_modeling_flax_whisper.py | 205 +----------------- 7 files changed, 3 insertions(+), 390 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx index 52a8b5953c63..22b08e4e61bc 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,9 +105,3 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] FlaxWhisperForConditionalGeneration - __call__ - -## FlaxWhisperForAudioClassification - -[[autodoc]] FlaxWhisperForAudioClassification - - __call__ - diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b0766b0946cd..7bf322ca8e1e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,7 +3779,6 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", - "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6904,12 +6903,7 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import ( - FlaxWhisperForAudioClassification, - FlaxWhisperForConditionalGeneration, - FlaxWhisperModel, - FlaxWhisperPreTrainedModel, - ) + from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index e3b8d9cf5b52..755d1f07a344 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,11 +229,6 @@ ] ) -FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - ("whisper", "FlaxWhisperForAudioClassification"), - ] -) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index cd962478e34d..3b6015a56f6f 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,7 +75,6 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", - "FlaxWhisperForAudioClassification", ] @@ -127,7 +126,6 @@ pass else: from .modeling_flax_whisper import ( - FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index e36131680d63..b8d6f07242d8 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,7 +36,6 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, - FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1507,162 +1506,3 @@ def update_inputs_for_generation(self, model_outputs, 
model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) - - -class FlaxWhisperForAudioClassificationModule(nn.Module): - config: WhisperConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self) -> None: - self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) - self.config.is_encoder_decoder = False - num_layers = self.config.num_hidden_layers + 1 - if self.config.use_weighted_layer_sum: - self.layer_weights = jnp.repeat(1 / num_layers, num_layers) - self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) - self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) - - def __call__( - self, - input_features, - encoder_outputs=None, - output_attentions=None, - output_hidden_states: bool = True, - return_dict: bool = True, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_features, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if self.config.use_weighted_layer_sum: - hidden_states = jnp.stack(encoder_outputs, axis=1) - norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) - hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) - else: - hidden_states = encoder_outputs[0] - - hidden_states = self.projector(hidden_states) - pooled_output = jnp.mean(hidden_states, axis=1) - - logits = self.classifier(pooled_output) - - if not return_dict: - return (logits,) + encoder_outputs[1:] - - return FlaxSequenceClassifierOutput( - logits=logits, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) -class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): - module_class = FlaxWhisperForAudioClassificationModule - dtype: jnp.dtype = jnp.float32 - - def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: - # init input tensors - input_features = jnp.zeros(input_shape, dtype="f4") - input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) - - params_rng, dropout_rng = jax.random.split(rng) - rngs = {"params": params_rng, "dropout": dropout_rng} - - random_params = self.module.init( - rngs, - input_features=input_features, - )["params"] - - if params is not None: - random_params = flatten_dict(unfreeze(random_params)) - params = flatten_dict(unfreeze(params)) - for missing_key in self._missing_keys: - params[missing_key] = random_params[missing_key] - self._missing_keys = set() - return freeze(unflatten_dict(params)) - else: - return random_params - - @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) - def __call__( - self, - input_features: jnp.ndarray, - attention_mask: Optional[jnp.ndarray] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - train: bool = False, - params: dict = None, - dropout_rng: PRNGKey = None, - **kwargs, - ): - output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - - # Handle any PRNG if needed - rngs = {} - if dropout_rng is not None: - rngs["dropout"] = dropout_rng - - return self.module.apply( - {"params": params or self.params}, - input_features=jnp.array(input_features, dtype="f4"), - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - rngs=rngs, - ) - - -FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" - Returns: - - Transcription example: - - ```python - >>> import jax.numpy as jnp - >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification - >>> from datasets import load_dataset - - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") - >>> model = FlaxWhisperForAudioClassification.from_pretrained( - ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True - ... ) - >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) - - >>> sample = next(iter(ds)) - - >>> inputs = feature_extractor( - ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" - ... ) - >>> input_features = inputs.input_features - - >>> logits = model(input_features).logits - - >>> predicted_class_ids = jnp.argmax(logits).item() - >>> predicted_label = model.config.id2label[predicted_class_ids] - >>> predicted_label - 'af_za' - ``` -""" - -overwrite_call_docstring( - FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING -) -append_replace_return_docstrings( - FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC -) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index ce571bc9f8d0..eeec3277492d 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,13 +1182,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) -class FlaxWhisperForAudioClassification(metaclass=DummyObject): - _backends = ["flax"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["flax"]) - - class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 79a2c51039ac..3f1e201d72d8 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import functools import inspect import tempfile @@ -39,7 +41,6 @@ from transformers import ( FLAX_MODEL_MAPPING, - FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -703,205 +704,3 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - -class FlaxWhisperEncoderModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=60, - is_training=True, - use_labels=True, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - input_channels=1, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=30, - num_mel_bins=80, - num_conv_layers=1, - suppress_tokens=None, - begin_suppress_tokens=None, - classifier_proj_size=4, - num_labels=2, - is_encoder_decoder=False, - is_decoder=False, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_mel_bins = num_mel_bins - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.num_conv_layers = num_conv_layers - self.suppress_tokens = suppress_tokens - self.begin_suppress_tokens = begin_suppress_tokens - self.classifier_proj_size = classifier_proj_size - self.num_labels = num_labels - self.is_encoder_decoder = is_encoder_decoder - self.is_decoder = is_decoder - - def get_config(self): - return WhisperConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - decoder_ffn_dim=self.hidden_size, - encoder_ffn_dim=self.hidden_size, - suppress_tokens=self.suppress_tokens, - begin_suppress_tokens=self.begin_suppress_tokens, - classifier_proj_size=self.classifier_proj_size, - num_labels=self.num_labels, - is_encoder_decoder=self.is_encoder_decoder, - is_decoder=self.is_decoder, - ) - - def prepare_whisper_encoder_inputs_dict( - self, - input_features, - ): - return { - "input_features": input_features, - } - - def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) - - config = self.get_config() - inputs_dict = self.prepare_whisper_encoder_inputs_dict( - input_features=input_features, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - @property - def 
encoder_seq_length(self): - return self.get_subsampled_output_lengths(self.seq_length) - - -@require_flax -class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () - is_encoder_decoder = False - fx_compatible = False - test_pruning = False - test_missing_keys = False - - input_name = "input_features" - - def setUp(self): - self.model_tester = FlaxWhisperEncoderModelTester(self) - _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] - - self.all_model_classes = ( - make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes - ) - self.config_tester = ConfigTester(self, config_class=WhisperConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - # overwrite because of `input_features` - def test_jit_compilation(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) - model = model_class(config) - - @jax.jit - def model_jitted(input_features, **kwargs): - return model(input_features=input_features, **kwargs) - - with self.subTest("JIT Enabled"): - jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() - - with self.subTest("JIT Disabled"): - with jax.disable_jit(): - outputs = model_jitted(**prepared_inputs_dict).to_tuple() - - self.assertEqual(len(outputs), len(jitted_outputs)) - for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) - - # overwrite because of `input_features` - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.__call__) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_features", "attention_mask", "output_attentions"] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_inputs_embeds(self): - pass - - # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented - def test_model_common_attributes(self): - pass - - # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # WhisperEncoder does not have any base model - def test_save_load_to_base(self): - pass - - # WhisperEncoder does not have any base model - def test_save_load_from_base(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_from_base_pt(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_to_base_pt(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_bf16_to_base_pt(self): - pass From 1b9c352e55f5316586ad0b3378fd33864e81cd09 Mon Sep 17 00:00:00 2001 From: Perry Huang Date: Fri, 5 May 2023 08:29:20 -0700 Subject: [PATCH 033/935] Add TrOCR resources (#23142) * Add TrOCR resources * Made fixes suggested by stevhliu --- docs/source/en/model_doc/trocr.mdx | 21 +++++++++++++++++++++ 1 file changed, 21 
insertions(+)

diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx
index 3e3a6c100753..8ad65668627d 100644
--- a/docs/source/en/model_doc/trocr.mdx
+++ b/docs/source/en/model_doc/trocr.mdx
@@ -50,6 +50,27 @@ Tips:
   information, see the [official models](https://huggingface.co/models?other=trocr>).
 - TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
 
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR.
+- A blog post on how to do [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR.
+- A notebook on how to [fine-tune TrOCR on the IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb).
+- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and a Gradio demo.
+- A notebook on [fine-tuning TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch.
+- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb).
+
+
+
+- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide.
+
+⚡️ Inference
+
+- An interactive demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten).
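For orientation, a minimal sketch of the `VisionEncoderDecoder`-style inference flow these resources cover; the `microsoft/trocr-base-handwritten` checkpoint and the `line.png` image path are illustrative assumptions, not prescribed by this patch:

```py
from PIL import Image

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Illustrative checkpoint; any TrOCR checkpoint on the Hub follows the same flow.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Placeholder path to an image containing a single line of handwritten text.
image = Image.open("line.png").convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

`TrOCRProcessor` wraps both the image processor and the tokenizer, so it handles resizing and normalization on the way in and decoding of the generated ids on the way out.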
+ ## Inference TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of From 77412343c8825dd2ed40729367e7708c3d2ec7ac Mon Sep 17 00:00:00 2001 From: Andrei Filatov <43551010+anvilarth@users.noreply.github.com> Date: Fri, 5 May 2023 18:36:15 +0300 Subject: [PATCH 034/935] fixed whisper positional encoding (#23167) --- src/transformers/models/whisper/modeling_tf_whisper.py | 2 +- src/transformers/models/whisper/modeling_whisper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 12f6e7db5eef..d9a175062b36 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -128,7 +128,7 @@ def build(self, input_shape): def call(self, input_ids, past_key_values_length=0): past_key_values_length = tf.cast(past_key_values_length, tf.int32) - gather_indices = tf.range(tf.shape(input_ids)[-1], delta=1) + past_key_values_length + gather_indices = tf.range(tf.shape(input_ids)[1], delta=1) + past_key_values_length return tf.gather(self.weight, gather_indices) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index ed845febac82..bde800911600 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -226,7 +226,7 @@ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional super().__init__(num_positions, embedding_dim) def forward(self, input_ids, past_key_values_length=0): - return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]] + return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[1]] class WhisperAttention(nn.Module): From 40082d598b756c4f1fb048c571ed95c31f05a69c Mon Sep 17 00:00:00 2001 From: Gabriel Yang Date: Sat, 6 May 2023 00:36:56 +0900 Subject: [PATCH 035/935] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20docs:=20ko:?= =?UTF-8?q?=20Translate=20`multiple=5Fchoice.mdx`=20(#23064)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update doctree * doc: ko: translate multiple choice * Update reviews --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/tasks/multiple_choice.mdx | 461 +++++++++++++++++++++++ 2 files changed, 463 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/tasks/multiple_choice.mdx diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index bd24ee4e5ce1..35744420e93e 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -38,8 +38,8 @@ title: 번역 - local: tasks/summarization title: 요약 - - local: in_translation - title: (번역중) Multiple Choice + - local: tasks/multiple_choice + title: 객관식 문제(Multiple Choice) title: 자연어처리 isExpanded: false - sections: diff --git a/docs/source/ko/tasks/multiple_choice.mdx b/docs/source/ko/tasks/multiple_choice.mdx new file mode 100644 index 000000000000..9a259ee77ae6 --- /dev/null +++ b/docs/source/ko/tasks/multiple_choice.mdx @@ -0,0 +1,461 @@ + + +# 객관식 문제[[multiple-choice]] + +[[open-in-colab]] + +객관식 과제는 문맥과 함께 여러 개의 후보 답변이 제공되고 모델이 정답을 선택하도록 학습된다는 점을 제외하면 질의응답과 유사합니다. + +진행하는 방법은 아래와 같습니다: + +1. [SWAG](https://huggingface.co/datasets/swag) 데이터 세트의 'regular' 구성으로 [BERT](https://huggingface.co/bert-base-uncased)를 미세 조정하여 여러 옵션과 일부 컨텍스트가 주어졌을 때 가장 적합한 답을 선택합니다. +2. 추론에 미세 조정된 모델을 사용합니다. 
+ + +이 튜토리얼에서 설명하는 작업은 다음 모델 아키텍처에서 지원됩니다: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요: + +```bash +pip install transformers datasets evaluate +``` + +모델을 업로드하고 커뮤니티와 공유할 수 있도록 허깅페이스 계정에 로그인하는 것이 좋습니다. 메시지가 표시되면 토큰을 입력하여 로그인합니다: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## SWAG 데이터 세트 가져오기[[load-swag-dataset]] + +먼저 🤗 Datasets 라이브러리에서 SWAG 데이터셋의 '일반' 구성을 가져옵니다: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +이제 데이터를 살펴봅니다: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +여기에는 많은 필드가 있는 것처럼 보이지만 실제로는 매우 간단합니다: + +- `sent1` 및 `sent2`: 이 필드는 문장이 어떻게 시작되는지 보여주며, 이 두 필드를 합치면 `시작 구절(startphrase)` 필드가 됩니다. +- `종료 구절(ending)`: 문장이 어떻게 끝날 수 있는지에 대한 가능한 종료 구절를 제시하지만 그 중 하나만 정답입니다. +- `레이블(label)`: 올바른 문장 종료 구절을 식별합니다. + +## 전처리[[preprocess]] + +다음 단계는 문장의 시작과 네 가지 가능한 구절을 처리하기 위해 BERT 토크나이저를 불러옵니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +생성하려는 전처리 함수는 다음과 같아야 합니다: + +1. `sent1` 필드를 네 개 복사한 다음 각각을 `sent2`와 결합하여 문장이 시작되는 방식을 재현합니다. +2. `sent2`를 네 가지 가능한 문장 구절 각각과 결합합니다. +3. 이 두 목록을 토큰화할 수 있도록 평탄화(flatten)하고, 각 예제에 해당하는 `input_ids`, `attention_mask` 및 `labels` 필드를 갖도록 다차원화(unflatten) 합니다. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... 
first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +전체 데이터 집합에 전처리 기능을 적용하려면 🤗 Datasets [`~datasets.Dataset.map`] 메소드를 사용합니다. `batched=True`를 설정하여 데이터 집합의 여러 요소를 한 번에 처리하면 `map` 함수의 속도를 높일 수 있습니다: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers에는 객관식용 데이터 콜레이터가 없으므로 예제 배치를 만들려면 [`DataCollatorWithPadding`]을 조정해야 합니다. 데이터 정렬 중에 전체 데이터 집합을 최대 길이로 패딩하는 대신 배치 중 가장 긴 길이로 문장을 *동적 패딩*하는 것이 더 효율적입니다. + +`DataCollatorForMultipleChoice`는 모든 모델 입력을 평탄화하고 패딩을 적용하며 그 결과를 결과를 다차원화합니다: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... 
return batch +``` + + + +## 평가 하기[[evaluate]] + +훈련 중에 메트릭을 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 🤗[Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하여 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는 [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 지표를 가져옵니다(🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하여 지표를 가져오고 계산하는 방법에 대해 자세히 알아보세요): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +그리고 예측과 레이블을 [`~evaluate.EvaluationModule.compute`]에 전달하여 정확도를 계산하는 함수를 만듭니다: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +이제 `compute_metrics` 함수를 사용할 준비가 되었으며, 훈련을 설정할 때 이 함수로 돌아가게 됩니다. + +## 훈련 하기[[train]] + + + + + +[`Trainer`]로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-with-pytorch-trainer)를 살펴보세요! + + + +이제 모델 훈련을 시작할 준비가 되었습니다! [`AutoModelForMultipleChoice`]로 BERT를 로드합니다: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +이제 세 단계만 남았습니다: + +1. 훈련 하이퍼파라미터를 [`TrainingArguments`]에 정의합니다. 유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. `push_to_hub=True`를 설정하여 이 모델을 허브에 푸시합니다(모델을 업로드하려면 허깅 페이스에 로그인해야 합니다). 각 에폭이 끝날 때마다 [`Trainer`]가 정확도를 평가하고 훈련 체크포인트를 저장합니다. +2. 모델, 데이터 세트, 토크나이저, 데이터 콜레이터, `compute_metrics` 함수와 함께 훈련 인자를 [`Trainer`]에 전달합니다. +3. [`~Trainer.train`]을 사용하여 모델을 미세 조정합니다. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +훈련이 완료되면 모든 사람이 모델을 사용할 수 있도록 [`~transformers.Trainer.push_to_hub`] 메소드를 사용하여 모델을 허브에 공유하세요: + +```py +>>> trainer.push_to_hub() +``` + + + + +Keras로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-a-tensorflow-model-with-keras)를 살펴보시기 바랍니다! + + +TensorFlow에서 모델을 미세 조정하려면 최적화 함수, 학습률 스케쥴 및 몇 가지 학습 하이퍼파라미터를 설정하는 것부터 시작하세요: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +그리고 [`TFAutoModelForMultipleChoice`]로 BERT를 가져올 수 있습니다: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +[`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... 
tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +[`compile`](https://keras.io/api/models/model_training_apis/#compile-method)을 사용하여 훈련 모델을 구성합니다: + +```py +>>> model.compile(optimizer=optimizer) +``` + +훈련을 시작하기 전에 설정해야 할 마지막 두 가지는 예측의 정확도를 계산하고 모델을 허브로 푸시하는 방법을 제공하는 것입니다. 이 두 가지 작업은 모두 [Keras 콜백](../main_classes/keras_callbacks)을 사용하여 수행할 수 있습니다. + +`compute_metrics`함수를 [`~transformers.KerasMetricCallback`]에 전달하세요: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +모델과 토크나이저를 업로드할 위치를 [`~transformers.PushToHubCallback`]에서 지정하세요: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +그리고 콜백을 함께 묶습니다: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +이제 모델 훈련을 시작합니다! 훈련 및 검증 데이터 세트, 에폭 수, 콜백을 사용하여 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 호출하고 모델을 미세 조정합니다: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +훈련이 완료되면 모델이 자동으로 허브에 업로드되어 누구나 사용할 수 있습니다! + + + + + + +객관식 모델을 미세 조정하는 방법에 대한 보다 심층적인 예는 아래 문서를 참조하세요. +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). + + + +## 추론 하기[[inference]] + +이제 모델을 미세 조정했으니 추론에 사용할 수 있습니다! + +텍스트와 두 개의 후보 답안을 작성합니다: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +각 프롬프트와 후보 답변 쌍을 토큰화하여 PyTorch 텐서를 반환합니다. 
또한 `labels`을 생성해야 합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +>>> labels = torch.tensor(0).unsqueeze(0) +``` + +입력과 레이블을 모델에 전달하고 `logits`을 반환합니다: + +```py +>>> from transformers import AutoModelForMultipleChoice + +>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +>>> logits = outputs.logits +``` + +가장 높은 확률을 가진 클래스를 가져옵니다: + +```py +>>> predicted_class = logits.argmax().item() +>>> predicted_class +'0' +``` + + +각 프롬프트와 후보 답안 쌍을 토큰화하여 텐서플로 텐서를 반환합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) +``` + +모델에 입력을 전달하고 `logits`를 반환합니다: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +>>> outputs = model(inputs) +>>> logits = outputs.logits +``` + +가장 높은 확률을 가진 클래스를 가져옵니다: + +```py +>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +>>> predicted_class +'0' +``` + + From 17083b9b847c71e8c303e9cb0798a8928e99a6e0 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Fri, 5 May 2023 11:52:19 -0400 Subject: [PATCH 036/935] fix: Passing language as acronym to Whisper generate (#23141) * add fix * address comments * remove error formatting --- .../models/whisper/modeling_whisper.py | 8 ++++++-- tests/models/whisper/test_modeling_whisper.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index bde800911600..91de6810b17e 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1562,6 +1562,7 @@ def generate( generation_config.return_timestamps = False if language is not None: + language = language.lower() generation_config.language = language if task is not None: generation_config.task = task @@ -1573,10 +1574,13 @@ def generate( language_token = generation_config.language elif generation_config.language in TO_LANGUAGE_CODE.keys(): language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" + elif generation_config.language in TO_LANGUAGE_CODE.values(): + language_token = f"<|{generation_config.language}|>" else: + is_language_code = len(generation_config.language) == 2 raise ValueError( - f"Unsupported language: {self.language}. Language should be one of:" - f" {list(TO_LANGUAGE_CODE.keys()) if generation_config.language in TO_LANGUAGE_CODE.keys() else list(TO_LANGUAGE_CODE.values())}." + f"Unsupported language: {generation_config.language}. Language should be one of:" + f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." 
) forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) else: diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index dd6ad07eb494..0591c6f46436 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -414,6 +414,21 @@ def test_generate_fp16(self): model.generate(input_features) model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + def test_generate_language(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_features = input_dict["input_features"] + model = WhisperForConditionalGeneration(config).to(torch_device) + # Hack to keep the test fast and not require downloading a model with a generation_config + model.generation_config.__setattr__("lang_to_id", {"<|en|>": 1}) + model.generation_config.__setattr__("task_to_id", {"transcribe": 2}) + + # test language code + model.generate(input_features, language="en") + # test tokenizer code + model.generate(input_features, language="<|en|>") + # test language name + model.generate(input_features, language="English") + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() From fc6c8b0eaa8755bff60a48757d2e37e3a03a33dd Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Fri, 5 May 2023 22:52:49 +0530 Subject: [PATCH 037/935] Add `no_trainer` scripts to pre-train Vision Transformers (#23156) * Add run_mim_no_trainer.py draft from #20412 Add parse_args method and copy over other dependencies Add Method call for sending telemetry Initialize Accelerator Make one log on every process Set seed and Handle repository creation Initialize dataset and Set validation split Create Config Adapt Config Update Config Create Feature Extractor Create model Set column names Create transforms Create mask generator Create method to preprocess images Shuffle datasets if needed and set transforms Create Dataloaders Add optimizer Add learning rate scheduler Prepare everything with our accelerator Tie weights for TPU training Recalculate training steps and training epochs Set accelerator checkpointing steps Initialize trackers and store configuration Set total batch size Fix typo: mlm -> mim Log info at the start of training Load in the weights and states from previous save update the progress_bar if load from checkpoint Define train loop Add evaluation loop to training Add to parse_args method Push repo to hub Save accelerator state End training and save model and feature extractor Remove unused imports Fix trailing whitespace * Update code based on comments, Rename feature_extractor to image_processor * Fix linting * Add argument for learning rate * Add argument for setting number of training epochs * Remove incorrect logger argument * Convert max_train_steps to int for tqdm --------- Co-authored-by: Saad Mahmud --- .../image-pretraining/run_mim_no_trainer.py | 771 ++++++++++++++++++ 1 file changed, 771 insertions(+) create mode 100644 examples/pytorch/image-pretraining/run_mim_no_trainer.py diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py new file mode 100644 index 000000000000..a94585d39698 --- /dev/null +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -0,0 +1,771 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import logging +import math +import os +from pathlib import Path + +import datasets +import numpy as np +import torch +from accelerate import Accelerator, DistributedType +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import Repository +from torch.utils.data import DataLoader +from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor +from tqdm.auto import tqdm + +import transformers +from transformers import ( + CONFIG_MAPPING, + IMAGE_PROCESSOR_MAPPING, + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + AutoConfig, + AutoImageProcessor, + AutoModelForMaskedImageModeling, + SchedulerType, + get_scheduler, +) +from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry +from transformers.utils.versions import require_version + + +""" Pre-training a 🤗 Transformers model for simple masked image modeling (SimMIM) +without using HuggingFace Trainer. +Any model supported by the AutoModelForMaskedImageModeling API can be used. +""" + +logger = logging.getLogger(__name__) + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.25.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a simple Masked Image Modeling task" + ) + parser.add_argument( + "--dataset_name", + type=str, + default="cifar10", + help="Name of a dataset from the datasets package", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--image_column_name", + type=str, + default=None, + help="The column name of the images in the files. If not set, will try to use 'image' or 'img'.", + ) + parser.add_argument( + "--train_dir", + type=str, + default=None, + help="A folder containing the training data.", + ) + parser.add_argument( + "--validation_dir", + type=None, + default=None, + help="A folder containing the validation data.", + ) + parser.add_argument( + "--train_val_split", + type=float, + default=0.15, + help="Percent to split off of train for validation.", + ) + parser.add_argument( + "--mask_patch_size", + type=int, + default=32, + help="The size of the square patches to use for masking.", + ) + parser.add_argument( + "--mask_ratio", + type=float, + default=0.6, + help="Percentage of patches to mask.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ), + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ), + ) + parser.add_argument( + "--model_name_or_path", + type=str, + default=None, + help=( + "The model checkpoint for weights initialization. Can be a local path to a pytorch_model.bin or a " + "checkpoint identifier on the hub. " + "Don't set if you want to train a model from scratch." + ), + ) + parser.add_argument( + "--model_type", + type=str, + default=None, + help="If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES), + ) + parser.add_argument( + "--config_name_or_path", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--config_overrides", + type=str, + default=None, + help=( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ), + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="Where do you want to store (cache) the pretrained models/datasets downloaded from the hub", + ) + parser.add_argument( + "--model_revision", + type=str, + default="main", + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--image_processor_name", + type=str, + default=None, + help="Name or path of preprocessor config.", + ) + parser.add_argument( + "--use_auth_token", + type=bool, + default=False, + help=( + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " + "with private models)." + ), + ) + parser.add_argument( + "--image_size", + type=int, + default=None, + help="The size (resolution) of each image. If not specified, will use `image_size` of the configuration.", + ) + parser.add_argument( + "--patch_size", + type=int, + default=None, + help="The size (resolution) of each patch. If not specified, will use `patch_size` of the configuration.", + ) + parser.add_argument( + "--encoder_stride", + type=int, + default=None, + help={"help": "Stride to use for the encoder."}, + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the model to the Hub.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." 
+ ), + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="A seed for reproducible training.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="The initial learning rate for [`AdamW`] optimizer.", + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.0, + help="Weight decay to use.", + ) + parser.add_argument( + "--num_train_epochs", + type=float, + default=3.0, + help="Total number of training epochs to perform (if not an integer, will perform the decimal part percents of the last epoch before stopping training).", + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Where to store the final model.", + ) + args = parser.parse_args() + + # Sanity checks + data_files = {} + if args.train_dir is not None: + data_files["train"] = args.train_dir + if args.validation_dir is not None: + data_files["val"] = args.validation_dir + args.data_files = data_files if data_files else None + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +class MaskGenerator: + """ + A class to generate boolean masks for the pretraining task. + + A mask is a 1D tensor of shape (model_patch_size**2,) where the value is either 0 or 1, + where 1 indicates "masked". 
+ """ + + def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_ratio=0.6): + self.input_size = input_size + self.mask_patch_size = mask_patch_size + self.model_patch_size = model_patch_size + self.mask_ratio = mask_ratio + + if self.input_size % self.mask_patch_size != 0: + raise ValueError("Input size must be divisible by mask patch size") + if self.mask_patch_size % self.model_patch_size != 0: + raise ValueError("Mask patch size must be divisible by model patch size") + + self.rand_size = self.input_size // self.mask_patch_size + self.scale = self.mask_patch_size // self.model_patch_size + + self.token_count = self.rand_size**2 + self.mask_count = int(np.ceil(self.token_count * self.mask_ratio)) + + def __call__(self): + mask_idx = np.random.permutation(self.token_count)[: self.mask_count] + mask = np.zeros(self.token_count, dtype=int) + mask[mask_idx] = 1 + + mask = mask.reshape((self.rand_size, self.rand_size)) + mask = mask.repeat(self.scale, axis=0).repeat(self.scale, axis=1) + + return torch.tensor(mask.flatten()) + + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + mask = torch.stack([example["mask"] for example in examples]) + return {"pixel_values": pixel_values, "bool_masked_pos": mask} + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_mim_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + **accelerator_log_kwargs, + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Initialize our dataset. 
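+    # If --train_dir/--validation_dir were passed, `args.data_files` points `load_dataset` at those
+    # local folders; otherwise the dataset named by --dataset_name (default "cifar10") is fetched from the hub.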
+ ds = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=args.data_files, + cache_dir=args.cache_dir, + use_auth_token=True if args.use_auth_token else None, + ) + + # If we don't have a validation split, split off a percentage of train as validation. + args.train_val_split = None if "validation" in ds.keys() else args.train_val_split + if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: + split = ds["train"].train_test_split(args.train_val_split) + ds["train"] = split["train"] + ds["validation"] = split["test"] + + # Create config + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config_kwargs = { + "cache_dir": args.cache_dir, + "revision": args.model_revision, + "use_auth_token": True if args.use_auth_token else None, + } + if args.config_name_or_path: + config = AutoConfig.from_pretrained(args.config_name_or_path, **config_kwargs) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if args.config_overrides is not None: + logger.info(f"Overriding config: {args.config_overrides}") + config.update_from_string(args.config_overrides) + logger.info(f"New config: {config}") + + # make sure the decoder_type is "simmim" (only relevant for BEiT) + if hasattr(config, "decoder_type"): + config.decoder_type = "simmim" + + # adapt config + args.image_size = args.image_size if args.image_size is not None else config.image_size + args.patch_size = args.patch_size if args.patch_size is not None else config.patch_size + args.encoder_stride = args.encoder_stride if args.encoder_stride is not None else config.encoder_stride + + config.update( + { + "image_size": args.image_size, + "patch_size": args.patch_size, + "encoder_stride": args.encoder_stride, + } + ) + + # create image processor + if args.image_processor_name: + image_processor = AutoImageProcessor.from_pretrained(args.image_processor_name, **config_kwargs) + elif args.model_name_or_path: + image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path, **config_kwargs) + else: + IMAGE_PROCESSOR_TYPES = { + conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items() + } + image_processor = IMAGE_PROCESSOR_TYPES[args.model_type]() + + # create model + if args.model_name_or_path: + model = AutoModelForMaskedImageModeling.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir, + revision=args.model_revision, + use_auth_token=True if args.use_auth_token else None, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForMaskedImageModeling.from_config(config) + + column_names = ds["train"].column_names + + if args.image_column_name is not None: + image_column_name = args.image_column_name + elif "image" in column_names: + image_column_name = "image" + elif "img" in column_names: + image_column_name = "img" + else: + image_column_name = column_names[0] + + # transformations as done in original SimMIM paper + # source: https://github.com/microsoft/SimMIM/blob/main/data/data_simmim.py + transforms = Compose( + [ + Lambda(lambda img: img.convert("RGB")), + RandomResizedCrop(args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)), + 
RandomHorizontalFlip(), + ToTensor(), + Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + + # create mask generator + mask_generator = MaskGenerator( + input_size=args.image_size, + mask_patch_size=args.mask_patch_size, + model_patch_size=args.patch_size, + mask_ratio=args.mask_ratio, + ) + + def preprocess_images(examples): + """Preprocess a batch of images by applying transforms + creating a corresponding mask, indicating + which patches to mask.""" + + examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]] + examples["mask"] = [mask_generator() for i in range(len(examples[image_column_name]))] + + return examples + + if args.max_train_samples is not None: + ds["train"] = ds["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + ds["train"].set_transform(preprocess_images) + + if args.max_eval_samples is not None: + ds["validation"] = ds["validation"].shuffle(seed=args.seed).select(range(args.max_eval_samples)) + # Set the validation transforms + ds["validation"].set_transform(preprocess_images) + + # DataLoaders creation: + train_dataloader = DataLoader( + ds["train"], + shuffle=True, + collate_fn=collate_fn, + batch_size=args.per_device_train_batch_size, + ) + eval_dataloader = DataLoader( + ds["validation"], + collate_fn=collate_fn, + batch_size=args.per_device_eval_batch_size, + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # shorter in multiprocess) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, + optimizer, + train_dataloader, + eval_dataloader, + lr_scheduler, + ) + + # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. + if accelerator.distributed_type == DistributedType.TPU: + model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
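+    # (`accelerator.prepare` shards the dataloader across processes, so `len(train_dataloader)`
+    # can shrink under distributed training.)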
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("mim_no_trainer", experiment_config) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(ds['train'])}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(int(args.max_train_steps)), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + # update the progress_bar if load from checkpoint + progress_bar.update(starting_epoch * num_update_steps_per_epoch) + completed_steps = starting_epoch * num_update_steps_per_epoch + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + 
progress_bar.update(1) + completed_steps += 1 + continue + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps }" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if completed_steps >= args.max_train_steps: + break + + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + outputs = model(**batch) + + loss = outputs.loss + losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size))) + + losses = torch.cat(losses) + eval_loss = torch.mean(losses) + + logger.info(f"epoch {epoch}: eval_loss: {eval_loss}") + + if args.with_tracking: + accelerator.log( + { + "eval_loss": eval_loss, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + image_processor.save_pretrained(args.output_dir) + repo.push_to_hub( + commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + image_processor.save_pretrained(args.output_dir) + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + + +if __name__ == "__main__": + main() From 312b104ff65514736c0475814fec19e47425b0b5 Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Fri, 5 May 2023 22:53:46 +0530 Subject: [PATCH 038/935] Add FlaxWhisperForAudioClassification model (#23173) * Add FlaxWhisperForAudioClassification model * Add models to init * Add models to init * Fix copies * Fix automapping * Fix failing test --- docs/source/en/model_doc/whisper.mdx | 6 + src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 + src/transformers/models/whisper/__init__.py | 2 + .../models/whisper/modeling_flax_whisper.py | 161 ++++++++++++++ src/transformers/utils/dummy_flax_objects.py | 7 + .../whisper/test_modeling_flax_whisper.py | 205 +++++++++++++++++- tests/models/whisper/test_modeling_whisper.py | 8 +- 8 files changed, 395 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx 
b/docs/source/en/model_doc/whisper.mdx index 22b08e4e61bc..52a8b5953c63 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,3 +105,9 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] FlaxWhisperForConditionalGeneration - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7bf322ca8e1e..b0766b0946cd 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,6 +3779,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6903,7 +6904,12 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel + from .models.whisper import ( + FlaxWhisperForAudioClassification, + FlaxWhisperForConditionalGeneration, + FlaxWhisperModel, + FlaxWhisperPreTrainedModel, + ) from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index 755d1f07a344..e3b8d9cf5b52 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,6 +229,11 @@ ] ) +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("whisper", "FlaxWhisperForAudioClassification"), + ] +) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index 3b6015a56f6f..cd962478e34d 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,6 +75,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] @@ -126,6 +127,7 @@ pass else: from .modeling_flax_whisper import ( + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index b8d6f07242d8..1a994acea4df 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,6 +36,7 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, + FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1506,3 +1507,163 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) + + +class FlaxWhisperForAudioClassificationModule(nn.Module): + config: WhisperConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self) -> None: + self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) + self.config.is_encoder_decoder = False + num_layers = 
self.config.num_hidden_layers + 1 + if self.config.use_weighted_layer_sum: + self.layer_weights = jnp.repeat(1 / num_layers, num_layers) + self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_features, + encoder_outputs=None, + output_attentions=None, + output_hidden_states: bool = True, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = jnp.stack(encoder_outputs, axis=1) + norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) + hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) + else: + hidden_states = encoder_outputs[0] + + hidden_states = self.projector(hidden_states) + pooled_output = jnp.mean(hidden_states, axis=1) + + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + encoder_outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) +class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): + module_class = FlaxWhisperForAudioClassificationModule + dtype: jnp.dtype = jnp.float32 + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_features = jnp.zeros(input_shape, dtype="f4") + input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_features=input_features, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + def __call__( + self, + input_features: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": 
params or self.params}, + input_features=jnp.array(input_features, dtype="f4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + +FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" + Returns: + + Transcription example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + >>> model = FlaxWhisperForAudioClassification.from_pretrained( + ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True + ... ) + >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) + + >>> sample = next(iter(ds)) + + >>> inputs = feature_extractor( + ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" + ... ) + >>> input_features = inputs.input_features + + >>> logits = model(input_features).logits + + >>> predicted_class_ids = jnp.argmax(logits).item() + >>> predicted_label = model.config.id2label[predicted_class_ids] + >>> predicted_label + 'af_za' + ``` +""" + +overwrite_call_docstring( + FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC +) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index eeec3277492d..ce571bc9f8d0 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,6 +1182,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) +class FlaxWhisperForAudioClassification(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 3f1e201d72d8..79a2c51039ac 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import functools import inspect import tempfile @@ -41,6 +39,7 @@ from transformers import ( FLAX_MODEL_MAPPING, + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -704,3 +703,205 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + + +class FlaxWhisperEncoderModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=60, + is_training=True, + use_labels=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + input_channels=1, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + max_source_positions=30, + num_mel_bins=80, + num_conv_layers=1, + suppress_tokens=None, + begin_suppress_tokens=None, + classifier_proj_size=4, + num_labels=2, + is_encoder_decoder=False, + is_decoder=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.input_channels = input_channels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_mel_bins = num_mel_bins + self.max_position_embeddings = max_position_embeddings + self.max_source_positions = max_source_positions + self.num_conv_layers = num_conv_layers + self.suppress_tokens = suppress_tokens + self.begin_suppress_tokens = begin_suppress_tokens + self.classifier_proj_size = classifier_proj_size + self.num_labels = num_labels + self.is_encoder_decoder = is_encoder_decoder + self.is_decoder = is_decoder + + def get_config(self): + return WhisperConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + input_channels=self.input_channels, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + max_source_positions=self.max_source_positions, + decoder_ffn_dim=self.hidden_size, + encoder_ffn_dim=self.hidden_size, + suppress_tokens=self.suppress_tokens, + begin_suppress_tokens=self.begin_suppress_tokens, + classifier_proj_size=self.classifier_proj_size, + num_labels=self.num_labels, + is_encoder_decoder=self.is_encoder_decoder, + is_decoder=self.is_decoder, + ) + + def prepare_whisper_encoder_inputs_dict( + self, + input_features, + ): + return { + "input_features": input_features, + } + + def prepare_config_and_inputs(self): + input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) + + config = self.get_config() + inputs_dict = self.prepare_whisper_encoder_inputs_dict( + input_features=input_features, + ) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def get_subsampled_output_lengths(self, input_lengths): + """ + Computes the output length of the convolutional layers + """ + + for i in range(self.num_conv_layers): + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + @property + def 
encoder_seq_length(self): + return self.get_subsampled_output_lengths(self.seq_length) + + +@require_flax +class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () + is_encoder_decoder = False + fx_compatible = False + test_pruning = False + test_missing_keys = False + + input_name = "input_features" + + def setUp(self): + self.model_tester = FlaxWhisperEncoderModelTester(self) + _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] + + self.all_model_classes = ( + make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes + ) + self.config_tester = ConfigTester(self, config_class=WhisperConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because of `input_features` + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_features, **kwargs): + return model(input_features=input_features, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + # overwrite because of `input_features` + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_features", "attention_mask", "output_attentions"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_inputs_embeds(self): + pass + + # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented + def test_model_common_attributes(self): + pass + + # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_to_base(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_from_base(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_from_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_to_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_bf16_to_base_pt(self): + pass diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 0591c6f46436..0b5b375e9dd3 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -95,7 +95,7 @@ def __init__( self, parent, batch_size=13, - 
seq_length=60, + seq_length=1500, is_training=True, use_labels=False, vocab_size=200, @@ -107,7 +107,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=20, - max_source_positions=30, + max_source_positions=750, max_target_positions=40, bos_token_id=98, eos_token_id=98, @@ -1434,7 +1434,7 @@ def __init__( self, parent, batch_size=13, - seq_length=60, + seq_length=3000, is_training=True, use_labels=True, hidden_size=16, @@ -1445,7 +1445,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=20, - max_source_positions=30, + max_source_positions=1500, num_mel_bins=80, num_conv_layers=1, suppress_tokens=None, From ef42c2c487260c2a0111fa9d17f2507d84ddedea Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 6 May 2023 23:41:08 +0800 Subject: [PATCH 039/935] search buffers for dtype (#23159) --- src/transformers/modeling_utils.py | 36 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bf06d9c40538..8f13d4aa2300 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -207,21 +207,29 @@ def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtil # if no floating dtype was found return whatever the first dtype is return last_dtype - else: - # For nn.DataParallel compatibility in PyTorch > 1.5 - def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: - tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] - return tuples + for t in parameter.buffers(): + last_dtype = t.dtype + if t.is_floating_point(): + return t.dtype - gen = parameter._named_members(get_members_fn=find_tensor_attributes) - last_tuple = None - for tuple in gen: - last_tuple = tuple - if tuple[1].is_floating_point(): - return tuple[1].dtype - - # fallback to the last dtype - return last_tuple[1].dtype + if last_dtype is not None: + # if no floating dtype was found return whatever the first dtype is + return last_dtype + + # For nn.DataParallel compatibility in PyTorch > 1.5 + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + last_tuple = None + for tuple in gen: + last_tuple = tuple + if tuple[1].is_floating_point(): + return tuple[1].dtype + + # fallback to the last dtype + return last_tuple[1].dtype def get_state_dict_float_dtype(state_dict): From ef0c380c121881adc9ab981a227e1166d99272f3 Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Mon, 8 May 2023 04:22:44 +0530 Subject: [PATCH 040/935] Update LLaMA docs with arxiv link (#23191) * Update docs with arxiv link * Update llama model docs --- docs/source/en/model_doc/llama.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/llama.mdx b/docs/source/en/model_doc/llama.mdx index edcb0482097b..a5f0553358be 100644 --- a/docs/source/en/model_doc/llama.mdx +++ b/docs/source/en/model_doc/llama.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. 
## Overview -The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](LLaMA: Open and Efficient Foundation Language Models) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters. +The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters. The abstract from the paper is the following: From 6f8a02844a0eeaf15f29acb7fe40126768936fba Mon Sep 17 00:00:00 2001 From: Bartosz Szmelczynski <43574448+Bearnardd@users.noreply.github.com> Date: Mon, 8 May 2023 00:55:04 +0200 Subject: [PATCH 041/935] fix random attention for pytorch's bigbird/pegasus_bigbird (#23056) * fix random attention usage for bigbird and pegasus_bigbird * remove staticmethod, update tests target valus * revert style changes --- .../models/big_bird/modeling_big_bird.py | 14 ++- .../modeling_bigbird_pegasus.py | 12 ++- .../models/big_bird/test_modeling_big_bird.py | 88 ++++++++----------- .../big_bird/test_modeling_flax_big_bird.py | 16 +--- 4 files changed, 60 insertions(+), 70 deletions(-) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 4f90deb506b7..73a5c7d9b52e 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1052,9 +1052,8 @@ def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): return plan_from_length, plan_num_rand_blocks - @staticmethod def _bigbird_block_rand_mask( - from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 ): """ Create adjacency list of random attention. 
@@ -1077,6 +1076,9 @@ def _bigbird_block_rand_mask( raise ValueError("Error the number of blocks needs to be same!") rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) last = to_seq_length // to_block_size - 1 if last_idx > (2 * to_block_size): @@ -1160,11 +1162,17 @@ def _bigbird_block_rand_mask_with_head( plan_block_length = np.array(plan_from_length) // from_block_size # till when to follow plan max_plan_idx = plan_from_length.index(from_seq_length) + # Random Attention adjacency list rand_attn = [ np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) for i in range(num_heads) ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn # We will go iteratively over the plan blocks and pick random number of # Attention blocks from the legally allowed blocks @@ -1353,7 +1361,6 @@ def set_attention_type(self, value: str): attn_weights.key = self.self.key self.self = attn_weights self.attention_type = value - if not self.training: self.self.eval() @@ -1380,7 +1387,6 @@ def forward( from_mask = from_mask.to(hidden_states.dtype) if to_mask is not None: to_mask = to_mask.to(hidden_states.dtype) - if self.attention_type == "original_full": self_outputs = self.self( hidden_states, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 859e019ac953..e4c64e12b554 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -873,9 +873,8 @@ def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): return plan_from_length, plan_num_rand_blocks - @staticmethod def _bigbird_block_rand_mask( - from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 ): """ Create adjacency list of random attention. 
@@ -898,6 +897,9 @@ def _bigbird_block_rand_mask( raise ValueError("Error the number of blocks needs to be same!") rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) last = to_seq_length // to_block_size - 1 if last_idx > (2 * to_block_size): @@ -981,11 +983,17 @@ def _bigbird_block_rand_mask_with_head( plan_block_length = np.array(plan_from_length) // from_block_size # till when to follow plan max_plan_idx = plan_from_length.index(from_seq_length) + # Random Attention adjacency list rand_attn = [ np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) for i in range(num_heads) ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn # We will go iteratively over the plan blocks and pick random number of # Attention blocks from the legally allowed blocks diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 69455935e5e5..f86c6d0ac70a 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -20,7 +20,7 @@ from transformers import BigBirdConfig, is_torch_available from transformers.models.auto import get_values from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer -from transformers.testing_utils import is_pt_flax_cross_test, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -618,20 +618,6 @@ def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, n else: super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes) - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_pt_to_flax(self): - pass - @require_torch @slow @@ -664,18 +650,19 @@ def test_inference_block_sparse_pretraining(self): expected_prediction_logits_slice = torch.tensor( [ - [-0.2420, -0.6048, -0.0614, 7.8422], - [-0.0596, -0.0104, -1.8408, 9.3352], - [1.0588, 0.7999, 5.0770, 8.7555], - [-0.1385, -1.7199, -1.7613, 6.1094], + [-0.5583, 0.0475, -0.2508, 7.4423], + [0.7409, 1.4460, -0.7593, 7.7010], + [1.9150, 3.1395, 5.8840, 9.3498], + [-0.1854, -1.4640, -2.2052, 3.7968], ], device=torch_device, ) + self.assertTrue( torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) ) - expected_seq_relationship_logits = torch.tensor([[58.8196, 56.3629]], device=torch_device) + expected_seq_relationship_logits = torch.tensor([[46.9465, 47.9517]], device=torch_device) self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) def test_inference_full_pretraining(self): @@ -787,22 +774,23 @@ 
def test_block_sparse_context_layer(self): blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( attn_mask, config.block_size ) + targeted_cl = torch.tensor( [ - [0.1874, 1.5260, 0.2335, -0.0473, -0.0961, 1.8384, -0.0141, 0.1250, 0.0085, -0.0048], - [-0.0554, 0.0728, 0.1683, -0.1332, 0.1741, 0.1337, -0.2380, -0.1849, -0.0390, -0.0259], - [-0.0419, 0.0767, 0.1591, -0.1399, 0.1789, 0.1257, -0.2406, -0.1772, -0.0261, -0.0079], - [0.1860, 1.5172, 0.2326, -0.0473, -0.0953, 1.8291, -0.0147, 0.1245, 0.0082, -0.0046], - [0.1879, 1.5296, 0.2335, -0.0471, -0.0975, 1.8433, -0.0136, 0.1260, 0.0086, -0.0054], - [0.1854, 1.5147, 0.2334, -0.0480, -0.0956, 1.8250, -0.0149, 0.1222, 0.0082, -0.0060], - [0.1859, 1.5184, 0.2334, -0.0474, -0.0955, 1.8297, -0.0143, 0.1234, 0.0079, -0.0054], - [0.1885, 1.5336, 0.2335, -0.0467, -0.0979, 1.8481, -0.0130, 0.1269, 0.0085, -0.0049], - [0.1881, 1.5305, 0.2335, -0.0471, -0.0976, 1.8445, -0.0135, 0.1262, 0.0086, -0.0053], - [0.1852, 1.5148, 0.2333, -0.0480, -0.0949, 1.8254, -0.0151, 0.1225, 0.0079, -0.0055], - [0.1877, 1.5292, 0.2335, -0.0470, -0.0972, 1.8431, -0.0135, 0.1259, 0.0084, -0.0052], - [0.1874, 1.5261, 0.2334, -0.0472, -0.0968, 1.8393, -0.0140, 0.1251, 0.0084, -0.0052], - [0.1853, 1.5151, 0.2331, -0.0478, -0.0948, 1.8256, -0.0154, 0.1228, 0.0086, -0.0052], - [0.1867, 1.5233, 0.2334, -0.0475, -0.0965, 1.8361, -0.0139, 0.1247, 0.0084, -0.0054], + [0.1870, 1.5248, 0.2333, -0.0483, -0.0952, 1.8359, -0.0142, 0.1239, 0.0083, -0.0045], + [-0.0601, 0.1243, 0.1329, -0.1524, 0.2347, 0.0894, -0.2248, -0.2461, -0.0645, -0.0109], + [-0.0418, 0.1463, 0.1290, -0.1638, 0.2489, 0.0799, -0.2341, -0.2406, -0.0524, 0.0106], + [0.1859, 1.5182, 0.2324, -0.0473, -0.0952, 1.8295, -0.0148, 0.1242, 0.0080, -0.0045], + [0.1879, 1.5300, 0.2334, -0.0480, -0.0967, 1.8428, -0.0137, 0.1256, 0.0087, -0.0050], + [0.1852, 1.5149, 0.2330, -0.0492, -0.0936, 1.8236, -0.0154, 0.1210, 0.0080, -0.0048], + [0.1857, 1.5186, 0.2331, -0.0484, -0.0940, 1.8285, -0.0148, 0.1224, 0.0077, -0.0045], + [0.1884, 1.5336, 0.2334, -0.0469, -0.0974, 1.8477, -0.0132, 0.1266, 0.0085, -0.0046], + [0.1881, 1.5308, 0.2334, -0.0479, -0.0969, 1.8438, -0.0136, 0.1258, 0.0088, -0.0050], + [0.1849, 1.5143, 0.2329, -0.0491, -0.0930, 1.8230, -0.0156, 0.1209, 0.0074, -0.0047], + [0.1878, 1.5299, 0.2333, -0.0472, -0.0967, 1.8434, -0.0137, 0.1257, 0.0084, -0.0048], + [0.1873, 1.5260, 0.2333, -0.0478, -0.0961, 1.8383, -0.0142, 0.1245, 0.0083, -0.0048], + [0.1849, 1.5145, 0.2327, -0.0491, -0.0935, 1.8237, -0.0156, 0.1215, 0.0083, -0.0046], + [0.1866, 1.5232, 0.2332, -0.0488, -0.0950, 1.8342, -0.0143, 0.1237, 0.0084, -0.0047], ], device=torch_device, ) @@ -851,21 +839,22 @@ def test_tokenizer_inference(self): expected_prediction = torch.tensor( [ - [-0.0213, -0.2213, -0.0061, 0.0687], - [0.0977, 0.1858, 0.2374, 0.0483], - [0.2112, -0.2524, 0.5793, 0.0967], - [0.2473, -0.5070, -0.0630, 0.2174], - [0.2885, 0.1139, 0.6071, 0.2991], - [0.2328, -0.2373, 0.3648, 0.1058], - [0.2517, -0.0689, 0.0555, 0.0880], - [0.1021, -0.1495, -0.0635, 0.1891], - [0.0591, -0.0722, 0.2243, 0.2432], - [-0.2059, -0.2679, 0.3225, 0.6183], - [0.2280, -0.2618, 0.1693, 0.0103], - [0.0183, -0.1375, 0.2284, -0.1707], + [0.1887, -0.0474, 0.2604, 0.1453], + [0.0651, 0.1999, 0.1797, 0.1161], + [0.2833, -0.3036, 0.6910, 0.1123], + [0.2836, -0.4644, -0.0111, 0.1530], + [0.3919, -0.2823, 0.4192, 0.1687], + [0.2168, -0.1956, 0.4050, 0.0925], + [0.2597, -0.0884, 0.1258, 0.1119], + [0.1127, -0.1203, 0.1924, 0.2859], + [0.1362, -0.1315, 
0.2693, 0.1027], + [-0.3169, -0.2266, 0.4419, 0.6740], + [0.2366, -0.1452, 0.2589, 0.0579], + [0.0358, -0.2021, 0.3112, -0.1392], ], device=torch_device, ) + self.assertTrue(torch.allclose(prediction[0, 52:64, 320:324], expected_prediction, atol=1e-4)) def test_inference_question_answering(self): @@ -908,11 +897,12 @@ def test_inference_question_answering(self): # fmt: off target_start_logits = torch.tensor( - [[-8.9304, -10.3849, -14.4997, -9.6497, -13.9469, -7.8134, -8.9687, -13.3585, -9.7987, -13.8869, -9.2632, -8.9294, -13.6721, -7.3198, -9.5434, -11.2641, -14.3245, -9.5705, -12.7367, -8.6168, -11.083, -13.7573, -8.1151, -14.5329, -7.6876, -15.706, -12.8558, -9.1135, 8.0909, -3.1925, -11.5812, -9.4822], [-11.5595, -14.5591, -10.2978, -14.8445, -10.2092, -11.1899, -13.8356, -10.5644, -14.7706, -9.9841, -11.0052, -14.1862, -8.8173, -11.1098, -12.4686, -15.0531, -11.0196, -13.6614, -10.0236, -11.8151, -14.8744, -9.5123, -15.1605, -8.6472, -15.4184, -8.898, -9.6328, -7.0258, -11.3365, -14.4065, -10.2587, -8.9103]], # noqa: E231 + [[-8.5622, -9.6209, -14.3351, -8.7032, -11.8596, -7.7446, -9.6730, -13.6063, -8.9651, -11.7417, -8.2641, -8.7056, -13.4116, -5.6600, -8.8316, -10.4148, -12.2180, -7.7979, -12.5274, -6.0685, -10.3373, -11.3128, -6.6456, -14.4030, -6.8292, -14.5383, -11.5638, -6.3326, 11.5293, -1.8434, -10.0013, -7.6150], [-10.7384, -13.1179, -10.1837, -13.7700, -10.0186, -11.7335, -13.3411, -10.0188, -13.4235, -9.9381, -10.4252, -13.1281, -8.2022, -10.4326, -11.5542, -14.1549, -10.7546, -13.4691, -8.2744, -11.4324, -13.3773, -9.8284, -14.5825, -8.7471, -14.7050, -8.0364, -11.3627, -6.4638, -11.7031, -14.3446, -9.9425, -8.0088]], # noqa: E231 device=torch_device, ) + target_end_logits = torch.tensor( - [[-12.4131, -8.5959, -15.7163, -11.1524, -15.9913, -12.2038, -7.8902, -16.0296, -12.164, -16.5017, -13.3332, -6.9488, -15.7756, -13.8506, -11.0779, -9.2893, -15.0426, -10.1963, -17.3292, -12.2945, -11.5337, -16.4514, -9.1564, -17.5001, -9.1562, -16.2971, -13.3199, -7.5724, -5.1175, 7.2168, -10.3804, -11.9873], [-10.8654, -14.9967, -11.4144, -16.9189, -14.2673, -9.7068, -15.0182, -12.8846, -16.8716, -13.665, -10.3113, -15.1436, -14.9069, -13.3364, -11.2339, -16.0118, -11.8331, -17.0613, -13.8852, -12.4163, -16.8978, -10.7772, -17.2324, -10.6979, -16.9811, -10.3427, -9.497, -13.7104, -11.1107, -13.2936, -13.855, -14.1264]], # noqa: E231 + [[-12.1736, -8.8487, -14.8877, -11.6713, -15.1165, -12.2396, -7.6828, -15.4153, -12.2528, -14.3671, -12.3596, -7.4272, -14.9615, -13.6356, -11.7939, -9.9767, -14.8112, -8.9567, -15.8798, -11.5291, -9.4249, -14.7544, -7.9387, -16.2789, -8.9702, -15.3111, -11.5585, -7.9992, -4.1127, 10.3209, -8.3926, -10.2005], [-11.1375, -15.4027, -12.6861, -16.9884, -13.7093, -10.3560, -15.7228, -12.9290, -15.8519, -13.7953, -10.2460, -15.7198, -14.2078, -12.8477, -11.4861, -16.1017, -11.8900, -16.4488, -13.2959, -10.3980, -15.4874, -10.3539, -16.8263, -10.9973, -17.0344, -9.2751, -10.1196, -13.8907, -12.1025, -13.0628, -12.8530, -13.8173]], # noqa: E321 device=torch_device, ) # fmt: on @@ -954,7 +944,7 @@ def test_auto_padding(self): # fmt: off target = torch.tensor( - [[-0.045136, -0.068013, 0.12246, -0.01356, 0.018386, 0.025333, -0.0044439, -0.0030996, -0.064031, 0.0006439], [-0.045018, -0.067638, 0.12317, -0.013998, 0.019216, 0.025695, -0.0043705, -0.0031895, -0.063153, 0.00088899], [-0.045042, -0.067305, 0.1234, -0.014512, 0.020057, 0.026084, -0.004615, -0.0031728, -0.062442, 0.0010263], [-0.044589, -0.067655, 0.12416, -0.014287, 0.019416, 0.026065, -0.0050958, 
-0.002702, -0.063158, 0.0004827], [-0.044627, -0.067535, 0.1239, -0.014319, 0.019491, 0.026213, -0.0059482, -0.0025906, -0.063116, 0.00014669], [-0.044899, -0.067704, 0.12337, -0.014231, 0.019256, 0.026345, -0.0065565, -0.0022938, -0.063433, -0.00011409], [-0.045599, -0.067764, 0.12235, -0.014151, 0.019206, 0.026417, -0.0068965, -0.0024494, -0.063313, -4.4499e-06], [-0.045557, -0.068372, 0.12199, -0.013747, 0.017962, 0.026103, -0.0070607, -0.0023552, -0.06447, -0.00048756], [-0.045334, -0.068913, 0.1217, -0.013566, 0.01693, 0.025745, -0.006311, -0.0024903, -0.065575, -0.0006719], [-0.045171, -0.068726, 0.12164, -0.013688, 0.017139, 0.025629, -0.005213, -0.0029412, -0.065237, -0.00020669], [-0.044411, -0.069267, 0.12206, -0.013645, 0.016212, 0.025589, -0.0044121, -0.002972, -0.066277, -0.00067963], [-0.043487, -0.069792, 0.1232, -0.013663, 0.015303, 0.02613, -0.0036294, -0.0030616, -0.067483, -0.0012642], [-0.042622, -0.069287, 0.12469, -0.013936, 0.016204, 0.026474, -0.0040534, -0.0027365, -0.066994, -0.0014148], [-0.041879, -0.070031, 0.12593, -0.014047, 0.015082, 0.027751, -0.0040683, -0.0027189, -0.068985, -0.0027146]], # noqa: E231 + [[-0.129420, -0.164740, 0.042422, -0.336030, 0.094379, 0.033794, 0.384590, 0.229660, -0.196500, 0.108020], [-0.000154, -0.168800, 0.165820, -0.313670, 0.101240, 0.035145, 0.381880, 0.213730, -0.201080, 0.077443], [0.053754, -0.166350, 0.225520, -0.272900, 0.119670, 0.019987, 0.348670, 0.199190, -0.181600, 0.084640], [0.063636, -0.187110, 0.237010, -0.297380, 0.126300, 0.020025, 0.268490, 0.191820, -0.192300, 0.035077], [0.073893, -0.184790, 0.188870, -0.297860, 0.134280, 0.028972, 0.174650, 0.186890, -0.180530, 0.006851], [0.005253, -0.169360, 0.123100, -0.302550, 0.126930, 0.024188, 0.133410, 0.200600, -0.168210, -0.001006], [-0.093336, -0.175370, -0.004768, -0.333170, 0.114330, 0.034168, 0.120960, 0.203570, -0.162810, -0.005757], [-0.160210, -0.169310, -0.049064, -0.331950, 0.115730, 0.027062, 0.143600, 0.205310, -0.144580, 0.026746], [-0.193200, -0.156820, -0.079422, -0.351600, 0.106450, 0.032174, 0.245690, 0.210250, -0.173480, 0.043914], [-0.167980, -0.153050, -0.059764, -0.357890,0.103910, 0.031481, 0.334190, 0.208960,-0.178180, 0.072165], [-0.136990, -0.156950, -0.012099, -0.353140,0.096996, 0.025864, 0.376340, 0.216050, -0.171820, 0.089963], [-0.041143, -0.167060, 0.079754, -0.353220, 0.093247, 0.019867, 0.385810, 0.214340, -0.191800, 0.065946],[0.040373, -0.158610, 0.152570, -0.312930, 0.110590, 0.012282, 0.345270, 0.204040, -0.176500, 0.064972], [0.043762, -0.166450, 0.179500, -0.317930, 0.117280, -0.004040, 0.304490, 0.201380, -0.182780, 0.044000]], # noqa: E231 device=torch_device, ) # fmt: on diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py index 6c91106a8514..63b2237fbddc 100644 --- a/tests/models/big_bird/test_modeling_flax_big_bird.py +++ b/tests/models/big_bird/test_modeling_flax_big_bird.py @@ -15,7 +15,7 @@ import unittest from transformers import BigBirdConfig, is_flax_available -from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow +from transformers.testing_utils import require_flax, slow from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask @@ -221,17 +221,3 @@ def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, n return else: super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes) - - @is_pt_flax_cross_test - @unittest.skip( - 
reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_pt_to_flax(self): - pass From dbc12269ed5546b2da9236b9f1078b95b6a4d3d5 Mon Sep 17 00:00:00 2001 From: Robert Baruch Date: Sun, 7 May 2023 16:06:24 -0700 Subject: [PATCH 042/935] Fix hf_argparser.parse_json_file to open file with utf-8 encoding, close file when finished (#23194) * Open json args in utf-8 encoding, close file when finished * black formatted --- src/transformers/hf_argparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index f808acebe902..b31497dd103e 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -401,8 +401,8 @@ def parse_json_file(self, json_file: str, allow_extra_keys: bool = False) -> Tup - the dataclass instances in the same order as they were passed to the initializer. """ - open_json_file = open(Path(json_file)) - data = json.loads(open_json_file.read()) + with open(Path(json_file), encoding="utf-8") as open_json_file: + data = json.loads(open_json_file.read()) outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys) return tuple(outputs) From bbfb9fc22bdd49a45dd6ed850fc78c4d99b59afb Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 8 May 2023 10:45:40 +0100 Subject: [PATCH 043/935] =?UTF-8?q?Generate:=20starcoder=20=F0=9F=A4=9C=20?= =?UTF-8?q?=F0=9F=A4=9B=20assisted=20generation=20(#23182)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * starcoder has joined the chat * indexing that works for all --- src/transformers/generation/utils.py | 14 ++++++++++++-- tests/generation/test_utils.py | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 0f0191fb144f..8c8a67fa5cb0 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4221,6 +4221,9 @@ def assisted_decoding( # keep track of which sequences are already finished unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + # other auxiliary variables + max_len = stopping_criteria[0].max_length + this_peer_finished = False # used by synced_gpus only while True: if synced_gpus: @@ -4235,7 +4238,7 @@ def assisted_decoding( # Assistant: main logic start cur_len = input_ids.shape[-1] - max_len = stopping_criteria[0].max_length + assistant_kv_indexing = 0 if "bloom" not in assistant_model.__class__.__name__.lower() else 1 # 1. Forecast next N tokens using the assistant model. This `for` block can be replaced with a # `.generate()` call if we decide to add `past_key_values` as a possible output of generate, as we @@ -4244,7 +4247,7 @@ def assisted_decoding( for _ in range(int(assistant_model.max_assistant_tokens)): # 1.1. 
use the assistant model to obtain the next candidate logits if "assistant_past_key_values" in model_kwargs: - prev_seq_len = model_kwargs["assistant_past_key_values"][0][0].shape[2] + prev_seq_len = model_kwargs["assistant_past_key_values"][0][assistant_kv_indexing].shape[-2] # `new_token_len` can be 1 or 2 (next token in assistant + last token picked by the larger model) new_token_len = candidate_input_ids.shape[1] - prev_seq_len assist_inputs = candidate_input_ids[:, -new_token_len:] @@ -4505,6 +4508,13 @@ def _crop_past_key_values(model, past_key_values, maximum_length): ) ) past_key_values = tuple(new_past) + elif "gptbigcode" in model.__class__.__name__.lower(): # gptbigcode is too + if model.config.multi_query: + for idx in range(len(past_key_values)): + past_key_values[idx] = past_key_values[idx][:, :maximum_length, :] + else: + for idx in range(len(past_key_values)): + past_key_values[idx] = past_key_values[idx][:, :, :maximum_length, :] else: for idx in range(len(past_key_values)): new_past.append( diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3b96f2b2bdff..70de057d5fe7 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1473,7 +1473,7 @@ def test_assisted_decoding_matches_greedy_search(self): # may fix in the future: the following models fail with assisted decoding, and need model-specific fixes if any( model_name in model_class.__name__.lower() - for model_name in ["bigbirdpegasus", "gptbigcode", "led", "mega", "speech2text", "git", "prophetnet"] + for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"] ): return @@ -1529,7 +1529,7 @@ def test_assisted_decoding_sample(self): # may fix in the future: the following models fail with assisted decoding, and need model-specific fixes if any( model_name in model_class.__name__.lower() - for model_name in ["bigbirdpegasus", "gptbigcode", "led", "mega", "speech2text", "git", "prophetnet"] + for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"] ): return From 843fdf2e420a95bdf93f5c7b5c43151fdaa98e48 Mon Sep 17 00:00:00 2001 From: Orr Zohar <108689663+orrzohar@users.noreply.github.com> Date: Mon, 8 May 2023 04:35:04 -0700 Subject: [PATCH 044/935] Fixing class embedding selection in owl-vit (#23157) fixing class embedding selection in owl-vit --- src/transformers/models/owlvit/modeling_owlvit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 1a0f3ed0f6d8..93872fffcf41 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1499,7 +1499,7 @@ def embed_image_query( selected_inds = (ious[0] >= iou_threshold).nonzero() if selected_inds.numel(): - selected_embeddings = class_embeds[i][selected_inds[0]] + selected_embeddings = class_embeds[i][selected_inds.squeeze(1)] mean_embeds = torch.mean(class_embeds[i], axis=0) mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings) best_box_ind = selected_inds[torch.argmin(mean_sim)] From fd6970bc56afa5a011ab9b839f73fae1c9e72625 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 8 May 2023 08:52:44 -0400 Subject: [PATCH 045/935] Skip failing test --- examples/tensorflow/test_tensorflow_examples.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index 
956209baade4..d5ae4c71b869 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -297,6 +297,7 @@ def test_run_translation(self): result = get_results(tmp_dir) self.assertGreaterEqual(result["bleu"], 30) + @skip("Fix me Matt") def test_run_image_classification(self): tmp_dir = self.get_auto_remove_tmp_dir() testargs = f""" From 94056b57beb4499f4f74d5d88a41e8266cc01778 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 May 2023 09:47:08 -0400 Subject: [PATCH 046/935] New version of Accelerate for the Trainer (#23204) --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d8f58360d7af..41e52e7f7c8e 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py _deps = [ "Pillow", - "accelerate>=0.17.0", + "accelerate>=0.19.0", "av==9.2.0", # Latest version of PyAV (10.0.0) has issues with audio stream. "beautifulsoup4", "black~=23.1", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index dd1055ebd2e4..f325447a109d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -3,7 +3,7 @@ # 2. run `make deps_table_update`` deps = { "Pillow": "Pillow", - "accelerate": "accelerate>=0.17.0", + "accelerate": "accelerate>=0.19.0", "av": "av==9.2.0", "beautifulsoup4": "beautifulsoup4", "black": "black~=23.1", From 188a8bfcccc6b862fe7ccc2859d977c01dd98136 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Mon, 8 May 2023 14:56:42 -0400 Subject: [PATCH 047/935] docs: Fix broken link in 'How to add a model...' (#23216) fix link --- docs/source/en/add_new_model.mdx | 2 +- templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/add_new_model.mdx b/docs/source/en/add_new_model.mdx index 49dce27600ce..38efaa945ded 100644 --- a/docs/source/en/add_new_model.mdx +++ b/docs/source/en/add_new_model.mdx @@ -678,7 +678,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder") **7. Implement the forward pass** Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make -sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward +sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers implementation instead of the original one. 
It should look as follows: diff --git a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md index 10bbd2011096..201806837591 100644 --- a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md +++ b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md @@ -848,7 +848,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder") Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make sure that the forward pass is correctly implemented. In [Get familiar with the original -repository](#run-a-pretrained-checkpoint-using-the-original-repository), +repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers implementation instead of the original one. It From 006da469dd5a465f4551f4245f780e3b1e92b76c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 May 2023 18:36:22 -0400 Subject: [PATCH 048/935] Pin tensorflow-probability (#23220) * Pin tensorflow-probability * [all-test] * [all-test] Fix syntax for bash --- .circleci/create_circleci_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 53c0279c4a0a..7208d876a97c 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -186,7 +186,7 @@ def job_name(self): "git lfs install", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', "pip install git+https://github.com/huggingface/accelerate", ], marker="is_pt_tf_cross_test", @@ -227,7 +227,7 @@ def job_name(self): "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', ], parallelism=1, pytest_options={"rA": None}, @@ -266,7 +266,7 @@ def job_name(self): "sudo apt-get -y update && sudo apt-get install -y cmake", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', ], pytest_options={"rA": None}, marker="is_pipeline_test", From 431b04d8c410e3fc1a0f85d43d74ee6927bd95c5 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 9 May 2023 14:58:19 +0200 Subject: [PATCH 049/935] [SAM] Add resources (#23224) Add resources --- docs/source/en/model_doc/sam.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/sam.mdx b/docs/source/en/model_doc/sam.mdx index 70e93d2ae2cb..969b7e2b2290 100644 --- a/docs/source/en/model_doc/sam.mdx +++ b/docs/source/en/model_doc/sam.mdx @@ -22,7 +22,7 @@ The model can be used to predict segmentation masks of any object of interest gi The abstract from the paper is the following: -*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. 
Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at \href{https://segment-anything.com}{https://segment-anything.com} to foster research into foundation models for computer vision.* +*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.* Tips: @@ -63,8 +63,10 @@ scores = outputs.iou_scores Resources: -- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model -- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using automatic mask generation pipeline. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data. 
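The automatic mask generation pipeline referenced in the resources above can be invoked with only a few lines. The sketch below is illustrative rather than a recommended configuration: the checkpoint name, the image path, and the `points_per_batch` value are placeholders to adapt to your own setup:

```python
from transformers import pipeline

# placeholder checkpoint and image path, chosen only to illustrate the call pattern
generator = pipeline("mask-generation", model="facebook/sam-vit-base")
outputs = generator("path/to/image.png", points_per_batch=64)

masks = outputs["masks"]  # one binary mask per detected object
scores = outputs["scores"]  # the corresponding predicted IoU scores
```
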
## SamConfig From 7f9195090160d508c7afb2e444e34f181872dd10 Mon Sep 17 00:00:00 2001 From: Matthijs Hollemans Date: Tue, 9 May 2023 15:10:17 +0200 Subject: [PATCH 050/935] audio_utils improvements (#21998) * silly change to allow making a PR * clean up doc comments * simplify hertz_to_mel and mel_to_hertz * fixup * clean up power_to_db * also add amplitude_to_db * move functions * clean up mel_filter_bank * fixup * credit librosa & torchaudio authors * add unit tests * tests for power_to_db and amplitude_to_db * add mel_filter_bank tests * rewrite STFT * add convenience spectrogram function * missing transpose * fewer transposes * add integration test to M-CTC-T * frame length can be either window or FFT length * rewrite stft API * add preemphasis coefficient * move argument * add log option to spectrogram * replace M-CTC-T feature extractor * fix api thing * replace whisper STFT * replace whisper mel filters * replace tvlt's stft * allow alternate window names * replace speecht5 stft * fixup * fix integration tests * fix doc comments * remove manual FFT length calculation * fix docs * go away, deprecation warnings * combine everything into spectrogram function * add deprecated functions back * fixup --- docs/source/en/internal/audio_utils.mdx | 15 +- src/transformers/audio_utils.py | 648 ++++++++++++----- .../models/clap/feature_extraction_clap.py | 53 +- .../models/mctct/feature_extraction_mctct.py | 116 +--- .../speecht5/feature_extraction_speecht5.py | 60 +- .../models/tvlt/feature_extraction_tvlt.py | 153 +--- .../whisper/feature_extraction_whisper.py | 147 +--- ...xtraction_audio_spectrogram_transformer.py | 5 +- .../mctct/test_feature_extraction_mctct.py | 39 +- .../test_feature_extraction_speech_to_text.py | 24 + .../test_feature_extraction_speecht5.py | 4 +- .../tvlt/test_feature_extraction_tvlt.py | 6 +- .../test_feature_extraction_whisper.py | 5 +- tests/utils/test_audio_utils.py | 652 ++++++++++++++++++ 14 files changed, 1334 insertions(+), 593 deletions(-) create mode 100644 tests/utils/test_audio_utils.py diff --git a/docs/source/en/internal/audio_utils.mdx b/docs/source/en/internal/audio_utils.mdx index 8f1d6597149d..74c2fe82a363 100644 --- a/docs/source/en/internal/audio_utils.mdx +++ b/docs/source/en/internal/audio_utils.mdx @@ -12,10 +12,9 @@ specific language governing permissions and limitations under the License. # Utilities for `FeatureExtractors` -This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *Mel log spectrogram*. +This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *log mel spectrogram*. - -Most of those are only useful if you are studying the code of the image processors in the library. +Most of those are only useful if you are studying the code of the audio processors in the library. 
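Taken together, these helpers let a feature extractor go from a raw waveform to a log-mel spectrogram in a handful of calls. The sketch below only shows how the pieces fit: the frame length, hop length, FFT size, and mel-bin count are arbitrary illustrative values, the silent waveform stands in for real audio, and the keyword arguments passed to `spectrogram` assume the frame/hop/FFT/mel/log options this module introduces.

```python
import numpy as np

from transformers.audio_utils import mel_filter_bank, spectrogram, window_function

sampling_rate = 16000
waveform = np.zeros(sampling_rate, dtype=np.float32)  # one second of silence as a stand-in

# a 400-sample Hann window, matching the 400-sample analysis frames below
window = window_function(400, "hann")

# triangular filters projecting 257 frequency bins (fft_length // 2 + 1) onto 80 mel bins
# (illustrative values, not recommended settings)
mel_filters = mel_filter_bank(
    num_frequency_bins=257,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=sampling_rate,
    norm="slaney",
    mel_scale="slaney",
)

# power spectrogram projected onto the mel filters, then compressed with log10
log_mel_spec = spectrogram(
    waveform,
    window,
    frame_length=400,
    hop_length=160,
    fft_length=512,
    power=2.0,
    mel_filters=mel_filters,
    log_mel="log10",
)
```
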
## Audio Transformations @@ -23,12 +22,14 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] audio_utils.mel_to_hertz -[[autodoc]] audio_utils.get_mel_filter_banks +[[autodoc]] audio_utils.mel_filter_bank -[[autodoc]] audio_utils.stft +[[autodoc]] audio_utils.optimal_fft_length -[[autodoc]] audio_utils.power_to_db +[[autodoc]] audio_utils.window_function -[[autodoc]] audio_utils.fram_wave +[[autodoc]] audio_utils.spectrogram +[[autodoc]] audio_utils.power_to_db +[[autodoc]] audio_utils.amplitude_to_db diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 73bc041d6961..a34892af4123 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,66 +13,61 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Audio processing functions to extract feature from a raw audio. Should all be in numpy to support all frameworks, and - remmove unecessary dependencies. +Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks +and remove unnecessary dependencies. """ -import math import warnings -from typing import Optional +from typing import Optional, Union import numpy as np -from numpy.fft import fft -def hertz_to_mel(freq: float, mel_scale: str = "htk") -> float: - """Convert Hertz to Mels. +def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from hertz to mels. Args: - freqs (`float`): - Frequencies in Hertz + freq (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in hertz (Hz). mel_scale (`str`, *optional*, defaults to `"htk"`): - Scale to use, `htk` or `slaney`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - mels (`float`): - Frequency in Mels + `float` or `np.ndarray`: The frequencies on the mel scale. """ if mel_scale not in ["slaney", "htk"]: raise ValueError('mel_scale should be one of "htk" or "slaney".') if mel_scale == "htk": - return 2595.0 * math.log10(1.0 + (freq / 700.0)) - - # Fill in the linear part - frequency_min = 0.0 - f_sp = 200.0 / 3 + return 2595.0 * np.log10(1.0 + (freq / 700.0)) - mels = (freq - frequency_min) / f_sp - - # Fill in the log-scale part min_log_hertz = 1000.0 - min_log_mel = (min_log_hertz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 + min_log_mel = 15.0 + logstep = 27.0 / np.log(6.4) + mels = 3.0 * freq / 200.0 - if freq >= min_log_hertz: - mels = min_log_mel + math.log(freq / min_log_hertz) / logstep + if isinstance(freq, np.ndarray): + log_region = freq >= min_log_hertz + mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep + elif freq >= min_log_hertz: + mels = min_log_mel + np.log(freq / min_log_hertz) * logstep return mels -def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: - """Convert mel bin numbers to frequencies. +def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from mels to hertz. 
Args: - mels (`np.array`): - Mel frequencies + mels (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in mels. mel_scale (`str`, *optional*, `"htk"`): - Scale to use: `htk` or `slaney`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - freqs (`np.array`): - Mels converted to Hertz + `float` or `np.ndarray`: The frequencies in hertz. """ if mel_scale not in ["slaney", "htk"]: @@ -81,171 +76,509 @@ def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: if mel_scale == "htk": return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) - # Fill in the linear scale - frequency_min = 0.0 - f_sp = 200.0 / 3 - freqs = frequency_min + f_sp * mels - - # And now the nonlinear scale min_log_hertz = 1000.0 - min_log_mel = (min_log_hertz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 + min_log_mel = 15.0 + logstep = np.log(6.4) / 27.0 + freq = 200.0 * mels / 3.0 - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hertz * np.exp(logstep * (mels[log_t] - min_log_mel)) + if isinstance(mels, np.ndarray): + log_region = mels >= min_log_mel + freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) + elif mels >= min_log_mel: + freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) - return freqs + return freq -def _create_triangular_filterbank( - all_freqs: np.array, - f_pts: np.array, -) -> np.array: - """Create a triangular filter bank. +def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: + """ + Creates a triangular filter bank. + Adapted from *torchaudio* and *librosa*. Args: - all_freqs (`np.array` of shape (`nb_frequency_bins`, )): - Discrete frequencies used when the STFT was computed. - f_pts (`np.array`, of shape (`nb_mel_filters`, )): - Coordinates of the middle points of the triangular filters to create. + fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): + Discrete frequencies of the FFT bins in Hz. + filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): + Center frequencies of the triangular filters to create, in Hz. Returns: - fb (np.array): - The filter bank of size (`nb_frequency_bins`, `nb_mel_filters`). 
+ `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` """ - # Adapted from Librosa - # calculate the difference between each filter mid point and each stft freq point in hertz - f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) - slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (nb_frequency_bins, n_filter + 2) - # create overlapping triangles - zero = np.zeros(1) - down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (nb_frequency_bins, n_filter) - up_slopes = slopes[:, 2:] / f_diff[1:] # (nb_frequency_bins, n_filter) - fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) - - return fb - - -def get_mel_filter_banks( - nb_frequency_bins: int, - nb_mel_filters: int, - frequency_min: float, - frequency_max: float, - sample_rate: int, + filter_diff = np.diff(filter_freqs) + slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) + down_slopes = -slopes[:, :-2] / filter_diff[:-1] + up_slopes = slopes[:, 2:] / filter_diff[1:] + return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) + + +def mel_filter_bank( + num_frequency_bins: int, + num_mel_filters: int, + min_frequency: float, + max_frequency: float, + sampling_rate: int, norm: Optional[str] = None, mel_scale: str = "htk", -) -> np.array: +) -> np.ndarray: """ - Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*, - and various implementation exist, which differ in the number of filters, the shape of the filters, the way the - filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these + Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and + various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters + are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. - This code is heavily inspired from the *torchaudio* implementation, see - [here](https://pytorch.org/audio/stable/transforms.html) for more details. - - - Tips: - - Different banks of Mel filters were introduced in the litterature. The following variation are supported: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz - and a speech bandwidth of `[0, 4600]` Hertz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a - speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz). - - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate - of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling - rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz - - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` - uses the `"slaney"` implementation. + + Different banks of mel filters were introduced in the literature. The following variations are supported: + + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech + bandwidth of `[0, 4600]` Hz. 
+ - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech + bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and + speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of + 12.5 kHz and speech bandwidth of `[0, 6250]` Hz. + + This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's + `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. Args: - nb_frequency_bins (`int`): + num_frequency_bins (`int`): Number of frequencies used to compute the spectrogram (should be the same as in `stft`). - nb_mel_filters (`int`): - Number of Mel filers to generate. - frequency_min (`float`): - Minimum frequency of interest(Hertz). - frequency_max (`float`): - Maximum frequency of interest(Hertz). - sample_rate (`int`): + num_mel_filters (`int`): + Number of mel filters to generate. + min_frequency (`float`): + Lowest frequency of interest in Hz. + max_frequency (`float`): + Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. + sampling_rate (`int`): Sample rate of the audio waveform. norm (`str`, *optional*): - If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization). + If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). mel_scale (`str`, *optional*, defaults to `"htk"`): - Scale to use: `"htk"` or `"slaney"`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix - is a projection matrix to go from a spectrogram to a Mel Spectrogram. - + `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a + projection matrix to go from a spectrogram to a mel spectrogram. """ - if norm is not None and norm != "slaney": raise ValueError('norm must be one of None or "slaney"') - # freqency bins - all_freqs = np.linspace(0, sample_rate // 2, nb_frequency_bins) - - # Compute mim and max frequencies in mel scale - m_min = hertz_to_mel(frequency_min, mel_scale=mel_scale) - m_max = hertz_to_mel(frequency_max, mel_scale=mel_scale) + # frequencies of FFT bins in Hz + fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) - # create the centers of the triangular mel filters. 
- m_pts = np.linspace(m_min, m_max, nb_mel_filters + 2) - f_pts = mel_to_hertz(m_pts, mel_scale=mel_scale) + # center points of the triangular mel filters + mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) + mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) + mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) + filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) - # create the filterbank - filterbank = _create_triangular_filterbank(all_freqs, f_pts) + mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) if norm is not None and norm == "slaney": # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (f_pts[2 : nb_mel_filters + 2] - f_pts[:nb_mel_filters]) - filterbank *= np.expand_dims(enorm, 0) + enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) + mel_filters *= np.expand_dims(enorm, 0) - if (filterbank.max(axis=0) == 0.0).any(): + if (mel_filters.max(axis=0) == 0.0).any(): warnings.warn( - "At least one mel filterbank has all zero values. " - f"The value for `nb_mel_filters` ({nb_mel_filters}) may be set too high. " - f"Or, the value for `nb_frequency_bins` ({nb_frequency_bins}) may be set too low." + "At least one mel filter has all zero values. " + f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " + f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." + ) + + return mel_filters + + +def optimal_fft_length(window_length: int) -> int: + """ + Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not + already a power of two, rounds it up to the next power or two. + + The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size + of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples + is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, + it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). + """ + return 2 ** int(np.ceil(np.log2(window_length))) + + +def window_function( + window_length: int, + name: str = "hann", + periodic: bool = True, + frame_length: Optional[int] = None, + center: bool = True, +) -> np.ndarray: + """ + Returns an array containing the specified window. This window is intended to be used with `stft`. + + The following window types are supported: + + - `"boxcar"`: a rectangular window + - `"hamming"`: the Hamming window + - `"hann"`: the Hann window + + Args: + window_length (`int`): + The length of the window in samples. + name (`str`, *optional*, defaults to `"hann"`): + The name of the window function. + periodic (`bool`, *optional*, defaults to `True`): + Whether the window is periodic or symmetric. + frame_length (`int`, *optional*): + The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller + than the frame length, so that it will be zero-padded. + center (`bool`, *optional*, defaults to `True`): + Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. + + Returns: + `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. 
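A small usage sketch for `window_function`, using the same call as the new tests; the zero-count simply follows from the `center=True` default:

from transformers.audio_utils import window_function

# periodic 400-sample Hann window, centered inside a 512-sample analysis frame
window = window_function(400, "hann", frame_length=512)
print(window.shape)              # (512,)
print((window[:56] == 0).all())  # True: (512 - 400) // 2 = 56 leading zeros of padding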
+ """ + length = window_length + 1 if periodic else window_length + + if name == "boxcar": + window = np.ones(length) + elif name in ["hamming", "hamming_window"]: + window = np.hamming(length) + elif name in ["hann", "hann_window"]: + window = np.hanning(length) + else: + raise ValueError(f"Unknown window function '{name}'") + + if periodic: + window = window[:-1] + + if frame_length is None: + return window + + if window_length > frame_length: + raise ValueError( + f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" ) - return filterbank + padded_window = np.zeros(frame_length) + offset = (frame_length - window_length) // 2 if center else 0 + padded_window[offset : offset + window_length] = window + return padded_window + + +# TODO This method does not support batching yet as we are mainly focused on inference. +def spectrogram( + waveform: np.ndarray, + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + dtype: np.dtype = np.float32, +) -> np.ndarray: + """ + Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. + + This function can create the following kinds of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and + `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms + can be constructed. + + Args: + waveform (`np.ndarray` of shape `(length,)`): + The input waveform. This must be a single real-valued, mono waveform. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. The actual window length may be + shorter than `frame_length`, but we're assuming the array has already been zero-padded. 
+ frame_length (`int`): + The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also + allow smaller sizes. + hop_length (`int`): + The stride between successive analysis frames in samples. + fft_length (`int`, *optional*): + The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. + For optimal speed, this should be a power of two. If `None`, uses `frame_length`. + power (`float`, *optional*, defaults to 1.0): + If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns + complex numbers. + center (`bool`, *optional*, defaults to `True`): + Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame + `t` will start at time `t * hop_length`. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` + (pad with edge values), `"reflect"` (pads with mirrored values). + onesided (`bool`, *optional*, defaults to `True`): + If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` + frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. + preemphasis (`float`, *optional*) + Coefficient for a low-pass filter that applies pre-emphasis before the DFT. + mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): + The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Minimum value of mel frequency banks. + log_mel (`str`, *optional*): + How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take + the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be + used when `power` is not `None`. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an + amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be + `np.complex64`. + + Returns: + `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape + `(num_mel_filters, length)` for a mel spectrogram. 
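A minimal end-to-end sketch of `spectrogram`, wired up the same way the Whisper feature extractor below uses it (power spectrum, "slaney" mel filters, `log_mel="log10"`); the frame, hop, and filter sizes here are illustrative and the random waveform is only a placeholder:

import numpy as np
from transformers.audio_utils import mel_filter_bank, spectrogram, window_function

waveform = np.random.randn(16000).astype(np.float32)  # stand-in for 1 s of 16 kHz audio

mel_filters = mel_filter_bank(
    num_frequency_bins=1 + 400 // 2,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=16000,
    norm="slaney",
    mel_scale="slaney",
)

log_mel = spectrogram(
    waveform,
    window_function(400, "hann"),
    frame_length=400,
    hop_length=160,
    power=2.0,
    mel_filters=mel_filters,
    log_mel="log10",
)
print(log_mel.shape)  # (80, 101) with these sizes: (num_mel_filters, num_frames)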
+ """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") -def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + + # center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform = np.pad(waveform, padding, mode=pad_mode) + + # promote to float64, since np.fft uses float64 internally + waveform = waveform.astype(np.float64) + window = window.astype(np.float64) + + # split waveform into frames of frame_length size + num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros(fft_length) + + timestep = 0 + for frame_idx in range(num_frames): + buffer[:frame_length] = waveform[timestep : timestep + frame_length] + + if preemphasis is not None: + buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] + buffer[0] *= 1 - preemphasis + + buffer[:frame_length] *= window + + spectrogram[frame_idx] = fft_func(buffer) + timestep += hop_length + + # note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + spectrogram = spectrogram.T + + if mel_filters is not None: + spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) + + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + return spectrogram + + +def power_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic + logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Based on the implementation of `librosa.power_to_db`. 
+ + Args: + spectrogram (`np.ndarray`): + The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the spectrogram in decibels """ - Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. - It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability. + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") - Tips: - - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on - a - linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much energy into - it. - - This means that large variations in energy may not sound all that different if the sound is loud to begin - with. This compression operation makes the mel features match more closely what humans actually hear. + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def amplitude_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-5, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using + basic logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. Args: - mel_spectrogram (`np.array`): - Input mel spectrogram. - top_db (`int`, *optional*): - The maximum decibel value. - a_min (`int`, *optional*, default to 1e-10): - Minimum value to use when cliping the mel spectrogram. - ref (`float`, *optional*, default to 1.0): - Maximum reference value used to scale the mel_spectrogram. + spectrogram (`np.ndarray`): + The input amplitude (mel) spectrogram. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. 
+ min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + Returns: + `np.ndarray`: the spectrogram in decibels """ - log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) - log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) - if top_db is not None: - if top_db < 0: - raise ValueError("top_db must be non-negative") - log_spec = np.clip(log_spec, min=np.maximum(log_spec) - top_db, max=np.inf) - return log_spec + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +### deprecated functions below this line ### + + +def get_mel_filter_banks( + nb_frequency_bins: int, + nb_mel_filters: int, + frequency_min: float, + frequency_max: float, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.array: + warnings.warn( + "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + return mel_filter_bank( + num_frequency_bins=nb_frequency_bins, + num_mel_filters=nb_mel_filters, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sample_rate, + norm=norm, + mel_scale=mel_scale, + ) -# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed @@ -270,6 +603,10 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): The framed waveforms that can be fed to `np.fft`. """ + warnings.warn( + "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) frames = [] for i in range(0, waveform.shape[0] + 1, hop_length): if center: @@ -298,9 +635,6 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = return frames -# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. - - def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. 
Should give the same results @@ -337,6 +671,10 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = spectrogram (`np.ndarray`): A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm """ + warnings.warn( + "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) frame_size = frames.shape[1] if fft_window_size is None: @@ -355,5 +693,5 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) else: fft_signal[:frame_size] = frame - spectrogram[f] = fft(fft_signal, axis=0)[:nb_frequency_bins] + spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins] return spectrogram.T diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index b73873e05652..6edd739fa16d 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -21,7 +21,7 @@ import numpy as np import torch -from ...audio_utils import fram_wave, get_mel_filter_banks, power_to_db, stft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging @@ -116,21 +116,21 @@ def __init__( self.sampling_rate = sampling_rate self.frequency_min = frequency_min self.frequency_max = frequency_max - self.mel_filters = get_mel_filter_banks( - nb_frequency_bins=self.nb_frequency_bins, - nb_mel_filters=feature_size, - frequency_min=frequency_min, - frequency_max=frequency_max, - sample_rate=sampling_rate, + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, norm=None, mel_scale="htk", ) - self.mel_filters_slaney = get_mel_filter_banks( - nb_frequency_bins=self.nb_frequency_bins, - nb_mel_filters=feature_size, - frequency_min=frequency_min, - frequency_max=frequency_max, - sample_rate=sampling_rate, + self.mel_filters_slaney = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, norm="slaney", mel_scale="slaney", ) @@ -153,24 +153,25 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In CLAP, two different - filter banks are used depending on the truncation pattern: - - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from + Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter + banks are used depending on the truncation pattern: + - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` is set to `"fusion"`. 
- - `self.mel_filteres_slaney` : they correspond to the defaults parameters of `torchlibrosa` which used + - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original implementation when the truncation mode is not `"fusion"`. """ - window = np.hanning(self.fft_window_size + 1)[:-1] - frames = fram_wave(waveform, self.hop_length, self.fft_window_size) - spectrogram = stft(frames, window, fft_window_size=self.fft_window_size) - - magnitudes = np.abs(spectrogram) ** 2 - mel_spectrogram = np.matmul(mel_filters.T, magnitudes) - log_mel_spectrogram = power_to_db(mel_spectrogram).T - log_mel_spectrogram = np.asarray(log_mel_spectrogram, np.float32) - return log_mel_spectrogram + log_mel_spectrogram = spectrogram( + waveform, + window_function(self.fft_window_size, "hann"), + frame_length=self.fft_window_size, + hop_length=self.hop_length, + power=2.0, + mel_filters=mel_filters, + log_mel="dB", + ) + return log_mel_spectrogram.T def _random_mel_fusion(self, mel, total_frames, chunk_frames): ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/mctct/feature_extraction_mctct.py index d517e3caf85e..467e654244b9 100644 --- a/src/transformers/models/mctct/feature_extraction_mctct.py +++ b/src/transformers/models/mctct/feature_extraction_mctct.py @@ -20,9 +20,8 @@ import numpy as np import torch -import torchaudio -from packaging import version +from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...file_utils import PaddingStrategy, TensorType @@ -31,13 +30,6 @@ logger = logging.get_logger(__name__) -parsed_torchaudio_version_base = version.parse(version.parse(torchaudio.__version__).base_version) -if not parsed_torchaudio_version_base >= version.parse("0.10"): - logger.warning( - f"You are using torchaudio=={torchaudio.__version__}, but torchaudio>=0.10.0 is required to use " - "MCTCTFeatureExtractor. This requires torch>=1.10.0. Please upgrade torch and torchaudio." 
- ) - class MCTCTFeatureExtractor(SequenceFeatureExtractor): r""" @@ -110,68 +102,9 @@ def __init__( self.sample_size = win_length * sampling_rate // 1000 self.sample_stride = hop_length * sampling_rate // 1000 - self.n_fft = 2 ** int(np.ceil(np.log2(self.sample_size))) + self.n_fft = optimal_fft_length(self.sample_size) self.n_freqs = (self.n_fft // 2) + 1 - @staticmethod - def _num_frames_calc(in_size, frame_size, frame_stride): - return int(1 + np.floor((in_size - frame_size) * 1 / frame_stride)) - - @staticmethod - def _frame_signal(one_waveform, n_frames, frame_signal_scale, window_length, sample_stride): - scale = frame_signal_scale - frames = np.zeros(n_frames * window_length) - for frame_idx in range(n_frames): - start = frame_idx * window_length - end = (frame_idx + 1) * window_length - wave_start = frame_idx * sample_stride - wave_end = frame_idx * sample_stride + window_length - frames[start:end] = scale * one_waveform[wave_start:wave_end] - - return frames - - @staticmethod - def _apply_preemphasis_inplace(frames, window_length, preemphasis_coeff): - if frames.size % window_length != 0: - raise ValueError( - f"`frames` is supposed to have length divisble by `window_length`, but is {frames.size} with" - f" window_length={window_length}." - ) - - n_frames = frames.size // window_length - for frame_idx in range(n_frames, 0, -1): - start = (frame_idx - 1) * window_length - end = frame_idx * window_length - 1 - frames[start + 1 : end + 1] -= preemphasis_coeff * frames[start:end] - frames[start] *= 1 - preemphasis_coeff - - @staticmethod - def _windowing(frames, window_length, window): - if frames.size % window_length != 0: - raise ValueError( - f"`frames` is supposed to have length divisble by `window_length`, but is {frames.size} with" - f" window_length={window_length}." - ) - - shaped = frames.reshape(-1, window_length) - shaped = window * shaped - return shaped - - @staticmethod - def _dft(frames, K, n_frames, n_samples, n_fft): - dft = np.zeros([n_frames, K]) - - for frame in range(n_frames): - begin = frame * n_samples - - inwards_buffer = frames[begin : begin + n_samples] - inwards_buffer = np.pad(inwards_buffer, (0, n_fft - n_samples), "constant") - out = np.fft.rfft(inwards_buffer) - - dft[frame] = np.abs(out[:K]) - - return dft - def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray: """ Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code. 
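The removed `_apply_preemphasis_inplace` helper is subsumed by the `preemphasis` argument of `spectrogram`, used in the next hunk. A minimal sketch of the per-frame filter it applies; the helper name and the 0.97 coefficient are illustrative, not part of the library:

import numpy as np

def preemphasize_frame(frame: np.ndarray, coeff: float) -> np.ndarray:
    # y[0] = (1 - coeff) * x[0]; y[n] = x[n] - coeff * x[n - 1] for n >= 1,
    # which is what `spectrogram` applies to each analysis frame (before the
    # window is applied) when `preemphasis` is not None.
    out = frame.astype(np.float64)
    out[1:] -= coeff * frame[:-1]
    out[0] *= 1.0 - coeff
    return out

frame = np.random.randn(400)
emphasized = preemphasize_frame(frame, coeff=0.97)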
@@ -183,36 +116,27 @@ def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray: window = window.numpy() - fbanks = torchaudio.functional.melscale_fbanks( - n_freqs=self.n_freqs, - f_min=0.0, # change this to zeros - f_max=self.sampling_rate / 2.0, - n_mels=self.feature_size, - sample_rate=self.sampling_rate, + fbanks = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.feature_size, + min_frequency=0.0, + max_frequency=self.sampling_rate / 2.0, + sampling_rate=self.sampling_rate, ) - fbanks = fbanks.numpy() - - n_frames = self._num_frames_calc(one_waveform.size, self.sample_size, self.sample_stride) - - frames = self._frame_signal( - one_waveform, n_frames, self.frame_signal_scale, self.sample_size, self.sample_stride + msfc_features = spectrogram( + one_waveform * self.frame_signal_scale, + window=window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + center=False, + preemphasis=self.preemphasis_coeff, + mel_filters=fbanks, + mel_floor=self.mel_floor, + log_mel="log", ) - - self._apply_preemphasis_inplace(frames, self.sample_size, self.preemphasis_coeff) - - frames = self._windowing(frames, self.sample_size, window) - - dft_out = self._dft(frames.flatten(), self.n_freqs, n_frames, self.sample_size, self.n_fft) - - # msfc_features = STFT * mel frequency banks. - msfc_features = np.einsum("...tf,fm->...tm", dft_out, fbanks) - - # clamp feature values then log scale, as implemented in flashlight - msfc_features = np.maximum(msfc_features, self.mel_floor) - msfc_features = np.log(msfc_features) - - return msfc_features + return msfc_features.T def _normalize_one(self, x, input_length, padding_value): # make sure we normalize float32 arrays diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py index 8ceb48dc03c6..5fe6ca39765c 100644 --- a/src/transformers/models/speecht5/feature_extraction_speecht5.py +++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py @@ -20,7 +20,7 @@ import numpy as np import torch -from ...audio_utils import get_mel_filter_banks +from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import PaddingStrategy, TensorType, logging @@ -110,18 +110,18 @@ def __init__( self.sample_size = win_length * sampling_rate // 1000 self.sample_stride = hop_length * sampling_rate // 1000 - self.n_fft = 2 ** int(np.ceil(np.log2(self.sample_size))) + self.n_fft = optimal_fft_length(self.sample_size) self.n_freqs = (self.n_fft // 2) + 1 window = getattr(torch, self.win_function)(window_length=self.sample_size, periodic=True) self.window = window.numpy().astype(np.float64) - self.mel_filters = get_mel_filter_banks( - nb_frequency_bins=self.n_freqs, - nb_mel_filters=self.num_mel_bins, - frequency_min=self.fmin, - frequency_max=self.fmax, - sample_rate=self.sampling_rate, + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.num_mel_bins, + min_frequency=self.fmin, + max_frequency=self.fmax, + sampling_rate=self.sampling_rate, norm="slaney", mel_scale="slaney", ) @@ -160,31 +160,6 @@ def zero_mean_unit_var_norm( return normed_input_values - @staticmethod - def _stft(waveform: np.ndarray, fft_length: int, hop_length: int, window: np.ndarray) -> np.ndarray: - """ - Calculates the magnitude spectrogram over one 
waveform array. - """ - # center pad the waveform - padding = [(int(fft_length // 2), int(fft_length // 2))] - waveform = np.pad(waveform, padding, mode="reflect") - waveform_size = waveform.size - - # promote to float64, since np.fft uses float64 internally - waveform = waveform.astype(np.float64) - - num_frames = int(1 + np.floor((waveform_size - fft_length) / hop_length)) - num_frequency_bins = (fft_length // 2) + 1 - spectrogram = np.empty((num_frames, num_frequency_bins)) - - start = 0 - for frame_idx in range(num_frames): - frame = waveform[start : start + fft_length] * window - spectrogram[frame_idx] = np.abs(np.fft.rfft(frame)) - start += hop_length - - return spectrogram - def _extract_mel_features( self, one_waveform: np.ndarray, @@ -192,14 +167,17 @@ def _extract_mel_features( """ Extracts log-mel filterbank features for one waveform array (unbatched). """ - if self.n_fft != self.sample_size: - raise NotImplementedError( - f"Currently the STFT frame size must be a power of two, but got {self.sample_size} for a window length of {self.win_length} and sampling rate of {self.sampling_rate}. Ensure `win_length * sampling_rate // 1000` is divisible by two." - ) - - stft_out = self._stft(one_waveform, self.n_fft, self.sample_stride, self.window) - - return np.log10(np.maximum(self.mel_floor, np.dot(stft_out, self.mel_filters))) + log_mel_spec = spectrogram( + one_waveform, + window=self.window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + mel_filters=self.mel_filters, + mel_floor=self.mel_floor, + log_mel="log10", + ) + return log_mel_spec.T def __call__( self, diff --git a/src/transformers/models/tvlt/feature_extraction_tvlt.py b/src/transformers/models/tvlt/feature_extraction_tvlt.py index ac219502f1bd..6d919550cf55 100644 --- a/src/transformers/models/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/tvlt/feature_extraction_tvlt.py @@ -18,8 +18,8 @@ from typing import List, Optional, Union import numpy as np -from numpy.fft import fft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor from ...utils import TensorType, logging @@ -83,143 +83,34 @@ def __init__( self.hop_length = sampling_rate // hop_length_to_sampling_rate self.sampling_rate = sampling_rate self.padding_value = padding_value - self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.get_mel_filters with 45.245640471924965->59.99247463746737 - def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) - - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = 0.0 - max_mel = 59.99247463746737 - - mels = np.linspace(min_mel, max_mel, n_mels + 2) - - mels = np.asanyarray(mels) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - # If we have vector data, vectorize - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - 
min_log_mel)) - - mel_f = freqs - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - return weights - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.fram_wave - def fram_wave(self, waveform, center=True): - """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. - - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. - """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - - frame = waveform[start:end] - - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - - frames.append(frame) - return np.stack(frames, 0) - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.stft - def stft(self, frames, window): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. - """ - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T + self.mel_filters = mel_filter_bank( + num_frequency_bins=1 + n_fft // 2, + num_mel_filters=feature_size, + min_frequency=0.0, + max_frequency=22050.0, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ).T def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch implementation with 1e-5 tolerance. 
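The `db_range=80.0` argument in the new call below reproduces the removed `np.maximum(log_spec, log_spec.max() - 80.0)` clamp. A standalone sketch of that equivalence on a toy mel power spectrogram:

import numpy as np
from transformers.audio_utils import power_to_db

mel_spec = np.random.rand(64, 100)  # toy mel power spectrogram

# removed TVLT-style post-processing
old_style = 10.0 * np.log10(np.maximum(1e-10, mel_spec))
old_style = np.maximum(old_style, old_style.max() - 80.0)

# what `log_mel="dB"` with `db_range=80.0` computes internally
new_style = power_to_db(mel_spec, reference=1.0, min_value=1e-10, db_range=80.0)

print(np.allclose(old_style, new_style))  # True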
""" - window = np.hanning(self.n_fft + 1)[:-1] - - frames = self.fram_wave(waveform) - stft = self.stft(frames, window=window) - magnitudes = np.abs(stft[:, :-1]) ** 2 - - filters = self.mel_filters - mel_spec = filters @ magnitudes - - log_spec = 10.0 * np.log10(np.maximum(1e-10, mel_spec)) - log_spec -= 10.0 * np.log10(np.maximum(1e-10, 1.0)) - log_spec = np.maximum(log_spec, log_spec.max() - 80.0) + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters.T, + log_mel="dB", + db_range=80.0, + ) + log_spec = log_spec[:, :-1] log_spec = log_spec - 20.0 log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0 - return log_spec def __call__( diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index da700f2d2257..e0b772216205 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -19,8 +19,8 @@ from typing import Any, Dict, List, Optional, Union import numpy as np -from numpy.fft import fft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging @@ -81,138 +81,33 @@ def __init__( self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.sampling_rate = sampling_rate - self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) - - def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) - - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = 0.0 - max_mel = 45.245640471924965 - - mels = np.linspace(min_mel, max_mel, n_mels + 2) - - mels = np.asanyarray(mels) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - # If we have vector data, vectorize - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) - - mel_f = freqs - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - return weights - - def fram_wave(self, waveform, center=True): - """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. - - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. 
- """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - - frame = waveform[start:end] - - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - - frames.append(frame) - return np.stack(frames, 0) - - def stft(self, frames, window): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. - """ - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T + self.mel_filters = mel_filter_bank( + num_frequency_bins=1 + n_fft // 2, + num_mel_filters=feature_size, + min_frequency=0.0, + max_frequency=8000.0, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ) def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch implementation with 1e-5 tolerance. 
""" - window = np.hanning(self.n_fft + 1)[:-1] - - frames = self.fram_wave(waveform) - stft = self.stft(frames, window=window) - magnitudes = np.abs(stft[:, :-1]) ** 2 - - filters = self.mel_filters - mel_spec = filters @ magnitudes - - log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters, + log_mel="log10", + ) + log_spec = log_spec[:, :-1] log_spec = np.maximum(log_spec, log_spec.max() - 8.0) log_spec = (log_spec + 4.0) / 4.0 - return log_spec @staticmethod diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 6fd035af8d04..a7a81dceb153 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -160,6 +160,7 @@ def test_integration(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = ASTFeatureExtractor() - input_values = feaure_extractor(input_speech, return_tensors="pt").input_values + feature_extractor = ASTFeatureExtractor() + input_values = feature_extractor(input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/mctct/test_feature_extraction_mctct.py b/tests/models/mctct/test_feature_extraction_mctct.py index 29b0cf899ad3..a3c07474d281 100644 --- a/tests/models/mctct/test_feature_extraction_mctct.py +++ b/tests/models/mctct/test_feature_extraction_mctct.py @@ -21,7 +21,7 @@ import numpy as np from transformers import is_speech_available -from transformers.testing_utils import require_torch, require_torchaudio +from transformers.testing_utils import require_torch from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -47,7 +47,6 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_torch -@require_torchaudio class MCTCTFeatureExtractionTester(unittest.TestCase): def __init__( self, @@ -102,7 +101,6 @@ def _flatten(list_of_lists): @require_torch -@require_torchaudio class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None @@ -271,3 +269,38 @@ def test_different_window(self): self.assertTrue(np_processed.input_features.dtype == np.float32) pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration(self): + # fmt: off + expected = np.array([ + [ + 1.1280, 1.1319, 1.2744, 1.4369, 1.4328, 1.3671, 1.2889, 1.3046, + 1.4419, 0.8387, 0.2995, 0.0404, 0.1068, 0.0472, 0.3728, 1.3356, + 1.4491, 0.4770, 0.3997, 0.2776, 0.3184, -0.1243, -0.1170, -0.0828 + ], + [ + 1.0826, 
1.0565, 1.2110, 1.3886, 1.3416, 1.2009, 1.1894, 1.2707, + 1.5153, 0.7005, 0.4916, 0.4017, 0.3743, 0.1935, 0.4228, 1.1084, + 0.9768, 0.0608, 0.2044, 0.1723, 0.0433, -0.2360, -0.2478, -0.2643 + ], + [ + 1.0590, 0.9923, 1.1185, 1.3309, 1.1971, 1.0067, 1.0080, 1.2036, + 1.5397, 1.0383, 0.7672, 0.7551, 0.4878, 0.8771, 0.7565, 0.8775, + 0.9042, 0.4595, 0.6157, 0.4954, 0.1857, 0.0307, 0.0199, 0.1033 + ], + ]) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features + self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4)) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 749323d3a8c9..aedd445e5d63 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -247,3 +247,27 @@ def test_double_precision_pad(self): self.assertTrue(np_processed.input_features.dtype == np.float32) pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration(self): + # fmt: off + expected = np.array([ + -1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241, + -1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128, + -1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625, + ]) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEquals(input_features.shape, (1, 584, 24)) + self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index d19c71dd56f9..da733614913f 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -395,7 +395,8 @@ def test_integration(self): input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) + self.assertEquals(input_values.shape, (1, 93680)) + self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6)) def test_integration_target(self): # fmt: off @@ -410,4 +411,5 @@ def test_integration_target(self): input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 366, 80)) self.assertTrue(torch.allclose(input_values[0, 0, 
:30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/tvlt/test_feature_extraction_tvlt.py b/tests/models/tvlt/test_feature_extraction_tvlt.py index 9f73a732f197..560abd78f92e 100644 --- a/tests/models/tvlt/test_feature_extraction_tvlt.py +++ b/tests/models/tvlt/test_feature_extraction_tvlt.py @@ -198,10 +198,10 @@ def _load_datasamples(self, num_samples): def test_integration(self): input_speech = self._load_datasamples(1) - feaure_extractor = TvltFeatureExtractor() - audio_values = feaure_extractor(input_speech, return_tensors="pt").audio_values + feature_extractor = TvltFeatureExtractor() + audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values - self.assertTrue(audio_values.shape, [1, 1, 192, 128]) + self.assertEquals(audio_values.shape, (1, 1, 192, 128)) expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]]) self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4)) diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index 57c12b86ddbc..31ea28b9ad62 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -218,8 +218,9 @@ def test_integration(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = WhisperFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt").input_features + feature_extractor = WhisperFeatureExtractor() + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEqual(input_features.shape, (1, 80, 3000)) self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) def test_zero_mean_unit_variance_normalization_trunc_np_longest(self): diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py new file mode 100644 index 000000000000..f0333113ea7e --- /dev/null +++ b/tests/utils/test_audio_utils.py @@ -0,0 +1,652 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import pytest + +from transformers.audio_utils import ( + amplitude_to_db, + hertz_to_mel, + mel_filter_bank, + mel_to_hertz, + power_to_db, + spectrogram, + window_function, +) + + +class AudioUtilsFunctionTester(unittest.TestCase): + def test_hertz_to_mel(self): + self.assertEqual(hertz_to_mel(0.0), 0.0) + self.assertAlmostEqual(hertz_to_mel(100), 150.48910241) + + inputs = np.array([100, 200]) + expected = np.array([150.48910241, 283.22989816]) + self.assertTrue(np.allclose(hertz_to_mel(inputs), expected)) + + self.assertEqual(hertz_to_mel(0.0, "slaney"), 0.0) + self.assertEqual(hertz_to_mel(100, "slaney"), 1.5) + + inputs = np.array([60, 100, 200, 1000, 1001, 2000]) + expected = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016]) + self.assertTrue(np.allclose(hertz_to_mel(inputs, "slaney"), expected)) + + with pytest.raises(ValueError): + hertz_to_mel(100, mel_scale=None) + + def test_mel_to_hertz(self): + self.assertEqual(mel_to_hertz(0.0), 0.0) + self.assertAlmostEqual(mel_to_hertz(150.48910241), 100) + + inputs = np.array([150.48910241, 283.22989816]) + expected = np.array([100, 200]) + self.assertTrue(np.allclose(mel_to_hertz(inputs), expected)) + + self.assertEqual(mel_to_hertz(0.0, "slaney"), 0.0) + self.assertEqual(mel_to_hertz(1.5, "slaney"), 100) + + inputs = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016]) + expected = np.array([60, 100, 200, 1000, 1001, 2000]) + self.assertTrue(np.allclose(mel_to_hertz(inputs, "slaney"), expected)) + + with pytest.raises(ValueError): + mel_to_hertz(100, mel_scale=None) + + def test_mel_filter_bank_shape(self): + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm=None, + mel_scale="htk", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm="slaney", + mel_scale="slaney", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + def test_mel_filter_bank_htk(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm=None, + mel_scale="htk", + ) + # fmt: off + expected = np.array([ + [0.0 , 0.0 , 0.0 , 0.0 ], + [0.61454786, 0.0 , 0.0 , 0.0 ], + [0.82511046, 0.17488954, 0.0 , 0.0 ], + [0.35597035, 0.64402965, 0.0 , 0.0 ], + [0.0 , 0.91360726, 0.08639274, 0.0 ], + [0.0 , 0.55547007, 0.44452993, 0.0 ], + [0.0 , 0.19733289, 0.80266711, 0.0 ], + [0.0 , 0.0 , 0.87724349, 0.12275651], + [0.0 , 0.0 , 0.6038449 , 0.3961551 ], + [0.0 , 0.0 , 0.33044631, 0.66955369], + [0.0 , 0.0 , 0.05704771, 0.94295229], + [0.0 , 0.0 , 0.0 , 0.83483975], + [0.0 , 0.0 , 0.0 , 0.62612982], + [0.0 , 0.0 , 0.0 , 0.41741988], + [0.0 , 0.0 , 0.0 , 0.20870994], + [0.0 , 0.0 , 0.0 , 0.0 ] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_mel_filter_bank_slaney(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm=None, + mel_scale="slaney", + ) + # fmt: off + expected = np.array([ + [0.0 , 0.0 , 0.0 , 0.0 ], + [0.39869419, 0.0 , 0.0 , 0.0 ], + [0.79738839, 0.0 , 0.0 , 0.0 ], + [0.80391742, 0.19608258, 0.0 , 0.0 ], + [0.40522322, 0.59477678, 0.0 , 0.0 ], + [0.00652903, 0.99347097, 0.0 , 0.0 ], + [0.0 , 0.60796161, 0.39203839, 0.0 ], + [0.0 , 
0.20939631, 0.79060369, 0.0 ], + [0.0 , 0.0 , 0.84685344, 0.15314656], + [0.0 , 0.0 , 0.52418477, 0.47581523], + [0.0 , 0.0 , 0.2015161 , 0.7984839 ], + [0.0 , 0.0 , 0.0 , 0.9141874 ], + [0.0 , 0.0 , 0.0 , 0.68564055], + [0.0 , 0.0 , 0.0 , 0.4570937 ], + [0.0 , 0.0 , 0.0 , 0.22854685], + [0.0 , 0.0 , 0.0 , 0.0 ] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_mel_filter_bank_slaney_norm(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm="slaney", + mel_scale="slaney", + ) + # fmt: off + expected = np.array([ + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [1.19217795e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [2.38435591e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [2.40387905e-03, 5.86232616e-04, 0.00000000e+00, 0.00000000e+00], + [1.21170110e-03, 1.77821783e-03, 0.00000000e+00, 0.00000000e+00], + [1.95231437e-05, 2.97020305e-03, 0.00000000e+00, 0.00000000e+00], + [0.00000000e+00, 1.81763684e-03, 1.04857612e-03, 0.00000000e+00], + [0.00000000e+00, 6.26036972e-04, 2.11460963e-03, 0.00000000e+00], + [0.00000000e+00, 0.00000000e+00, 2.26505954e-03, 3.07332945e-04], + [0.00000000e+00, 0.00000000e+00, 1.40202503e-03, 9.54861093e-04], + [0.00000000e+00, 0.00000000e+00, 5.38990521e-04, 1.60238924e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.83458185e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.37593638e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.17290923e-04], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.58645462e-04], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_window_function(self): + window = window_function(16, "hann") + self.assertEqual(len(window), 16) + + # fmt: off + expected = np.array([ + 0.0, 0.03806023, 0.14644661, 0.30865828, 0.5, 0.69134172, 0.85355339, 0.96193977, + 1.0, 0.96193977, 0.85355339, 0.69134172, 0.5, 0.30865828, 0.14644661, 0.03806023, + ]) + # fmt: on + self.assertTrue(np.allclose(window, expected)) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + return [x["array"] for x in speech_samples] + + def test_spectrogram_impulse(self): + waveform = np.zeros(40) + waveform[9] = 1.0 # impulse shifted in time + + spec = spectrogram( + waveform, + window_function(12, "hann", frame_length=16), + frame_length=16, + hop_length=4, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (9, 11)) + + expected = np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]) + self.assertTrue(np.allclose(spec, expected)) + + def test_spectrogram_integration_test(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.02464888, 0.04648664, 0.05872392, 0.02311783, 0.0327175 , + 0.02433643, 0.01198814, 0.02055709, 0.01559287, 0.01394357, + 0.01299037, 0.01728045, 0.0254554 , 0.02486533, 0.02011792, + 
0.01755333, 0.02100457, 0.02337024, 0.01436963, 0.01464558, + 0.0211017 , 0.0193489 , 0.01272165, 0.01858462, 0.03722598, + 0.0456542 , 0.03281558, 0.00620586, 0.02226466, 0.03618042, + 0.03508182, 0.02271432, 0.01051649, 0.01225771, 0.02315293, + 0.02331886, 0.01417785, 0.0106844 , 0.01791214, 0.017177 , + 0.02125114, 0.05028201, 0.06830665, 0.05216664, 0.01963666, + 0.06941418, 0.11513043, 0.12257859, 0.10948435, 0.08568069, + 0.05509328, 0.05047818, 0.047112 , 0.05060737, 0.02982424, + 0.02803827, 0.02933729, 0.01760491, 0.00587815, 0.02117637, + 0.0293578 , 0.03452379, 0.02194803, 0.01676056, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 400], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + fft_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertTrue(np.allclose(spec[:64, 400], expected)) + + def test_spectrogram_center_padding(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=True, + pad_mode="reflect", + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.1287945 , 0.12792738, 0.08311573, 0.03155122, 0.02470202, + 0.00727857, 0.00910694, 0.00686163, 0.01238981, 0.01473668, + 0.00336144, 0.00370314, 0.00600871, 0.01120164, 0.01942998, + 0.03132008, 0.0232842 , 0.01124642, 0.02754783, 0.02423725, + 0.00147893, 0.00038027, 0.00112299, 0.00596233, 0.00571529, + 0.02084235, 0.0231855 , 0.00810006, 0.01837943, 0.00651339, + 0.00093931, 0.00067426, 0.01058399, 0.01270507, 0.00151734, + 0.00331913, 0.00302416, 0.01081792, 0.00754549, 0.00148963, + 0.00111943, 0.00152573, 0.00608017, 0.01749986, 0.01205949, + 0.0143082 , 0.01910573, 0.00413786, 0.03916619, 0.09873404, + 0.08302026, 0.02673891, 0.00401255, 0.01397392, 0.00751862, + 0.01024884, 0.01544606, 0.00638907, 0.00623633, 0.0085103 , + 0.00217659, 0.00276204, 0.00260835, 0.00299299, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=True, + pad_mode="constant", + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.06558744, 0.06889656, 0.06263352, 0.04264418, 0.03404115, + 0.03244197, 0.02279134, 0.01646339, 0.01452216, 0.00826055, + 0.00062093, 0.0031821 , 0.00419456, 0.00689327, 0.01106367, + 0.01712119, 0.01721762, 0.00977533, 0.01606626, 0.02275621, + 0.01727687, 0.00992739, 0.01217688, 0.01049927, 0.01022947, + 0.01302475, 0.01166873, 0.01081812, 0.01057327, 0.00767912, + 0.00429567, 0.00089625, 0.00654583, 0.00912084, 0.00700984, + 0.00225026, 0.00290545, 0.00667712, 0.00730663, 0.00410813, + 0.00073102, 0.00219296, 0.00527618, 0.00996585, 0.01123781, + 0.00872816, 0.01165121, 0.02047945, 0.03681747, 0.0514379 , + 0.05137928, 0.03960042, 0.02821562, 0.01813349, 0.01201322, + 0.01260964, 0.00900654, 0.00207905, 0.00456714, 0.00850599, + 0.00788239, 0.00664407, 0.00824227, 0.00628301, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=False, + ) + self.assertEqual(spec.shape, (257, 728)) + + # fmt: off + expected = np.array([ + 0.00250445, 0.02161521, 0.06232229, 0.04339567, 0.00937727, + 0.01080616, 0.00248685, 
0.0095264 , 0.00727476, 0.0079152 , + 0.00839946, 0.00254932, 0.00716622, 0.005559 , 0.00272623, + 0.00581774, 0.01896395, 0.01829788, 0.01020514, 0.01632692, + 0.00870888, 0.02065827, 0.0136022 , 0.0132382 , 0.011827 , + 0.00194505, 0.0189979 , 0.026874 , 0.02194014, 0.01923883, + 0.01621437, 0.00661967, 0.00289517, 0.00470257, 0.00957801, + 0.00191455, 0.00431664, 0.00544359, 0.01126213, 0.00785778, + 0.00423469, 0.01322504, 0.02226548, 0.02318576, 0.03428908, + 0.03648811, 0.0202938 , 0.011902 , 0.03226198, 0.06347476, + 0.01306318, 0.05308729, 0.05474771, 0.03127991, 0.00998512, + 0.01449977, 0.01272741, 0.00868176, 0.00850386, 0.00313876, + 0.00811857, 0.00538216, 0.00685749, 0.00535275, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + def test_spectrogram_shapes(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (201, 732)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + power=1.0, + center=False, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (201, 729)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + fft_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=64, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 1464)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=64, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 1464)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 183)) + + def test_mel_spectrogram(self): + waveform = self._load_datasamples(1)[0] + + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm=None, + mel_scale="htk", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + spec = spectrogram( + waveform, + window_function(800, "hann", frame_length=1024), + frame_length=1024, + hop_length=128, + power=2.0, + ) + self.assertEqual(spec.shape, (513, 732)) + + spec = spectrogram( + waveform, + window_function(800, "hann", frame_length=1024), + frame_length=1024, + hop_length=128, + power=2.0, + mel_filters=mel_filters, + ) + self.assertEqual(spec.shape, (13, 732)) + + # fmt: off + expected = np.array([ + 1.08027889e+02, 1.48080673e+01, 7.70758213e+00, 9.57676639e-01, + 8.81639061e-02, 5.26073833e-02, 1.52736155e-02, 9.95350117e-03, + 7.95364356e-03, 1.01148004e-02, 4.29241020e-03, 9.90708797e-03, + 9.44153646e-04 + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:, 300], expected)) + + def test_spectrogram_power(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=None, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, 
np.complex64) + + # fmt: off + expected = np.array([ + 0.01452305+0.01820039j, -0.01737362-0.01641946j, + 0.0121028 +0.01565081j, -0.02794554-0.03021514j, + 0.04719803+0.04086519j, -0.04391563-0.02779365j, + 0.05682834+0.01571325j, -0.08604821-0.02023657j, + 0.07497991+0.0186641j , -0.06366091-0.00922475j, + 0.11003416+0.0114788j , -0.13677941-0.01523552j, + 0.10934535-0.00117226j, -0.11635598+0.02551187j, + 0.14708674-0.03469823j, -0.1328196 +0.06034218j, + 0.12667368-0.13973421j, -0.14764774+0.18912019j, + 0.10235471-0.12181523j, -0.00773012+0.04730498j, + -0.01487191-0.07312611j, -0.02739162+0.09619419j, + 0.02895459-0.05398273j, 0.01198589+0.05276592j, + -0.02117299-0.10123465j, 0.00666388+0.09526499j, + -0.01672773-0.05649684j, 0.02723125+0.05939891j, + -0.01879361-0.062954j , 0.03686557+0.04568823j, + -0.07394181-0.07949649j, 0.06238583+0.13905765j, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:96, 321], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=1.0, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, np.float64) + + # fmt: off + expected = np.array([ + 0.02328461, 0.02390484, 0.01978448, 0.04115711, 0.0624309 , + 0.05197181, 0.05896072, 0.08839577, 0.07726794, 0.06432579, + 0.11063128, 0.13762532, 0.10935163, 0.11911998, 0.15112405, + 0.14588428, 0.18860507, 0.23992978, 0.15910825, 0.04793241, + 0.07462307, 0.10001811, 0.06125769, 0.05411011, 0.10342509, + 0.09549777, 0.05892122, 0.06534349, 0.06569936, 0.05870678, + 0.10856833, 0.1524107 , 0.11463385, 0.05766969, 0.12385171, + 0.14472842, 0.11978184, 0.10353675, 0.07244056, 0.03461861, + 0.02624896, 0.02227475, 0.01238363, 0.00885281, 0.0110049 , + 0.00807005, 0.01033663, 0.01703181, 0.01445856, 0.00585615, + 0.0132431 , 0.02754132, 0.01524478, 0.0204908 , 0.07453328, + 0.10716327, 0.07195779, 0.08816078, 0.18340898, 0.16449876, + 0.12322842, 0.1621659 , 0.12334293, 0.06033659, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:128, 321], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=2.0, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, np.float64) + + # fmt: off + expected = np.array([ + 5.42173162e-04, 5.71441371e-04, 3.91425507e-04, 1.69390778e-03, + 3.89761780e-03, 2.70106923e-03, 3.47636663e-03, 7.81381316e-03, + 5.97033510e-03, 4.13780799e-03, 1.22392802e-02, 1.89407300e-02, + 1.19577805e-02, 1.41895693e-02, 2.28384770e-02, 2.12822221e-02, + 3.55718732e-02, 5.75663000e-02, 2.53154356e-02, 2.29751552e-03, + 5.56860259e-03, 1.00036217e-02, 3.75250424e-03, 2.92790355e-03, + 1.06967501e-02, 9.11982451e-03, 3.47171025e-03, 4.26977174e-03, + 4.31640586e-03, 3.44648538e-03, 1.17870830e-02, 2.32290216e-02, + 1.31409196e-02, 3.32579296e-03, 1.53392460e-02, 2.09463164e-02, + 1.43476883e-02, 1.07198600e-02, 5.24763530e-03, 1.19844836e-03, + 6.89007982e-04, 4.96164430e-04, 1.53354369e-04, 7.83722571e-05, + 1.21107812e-04, 6.51257360e-05, 1.06845939e-04, 2.90082477e-04, + 2.09049831e-04, 3.42945241e-05, 1.75379610e-04, 7.58524227e-04, + 2.32403356e-04, 4.19872697e-04, 5.55520924e-03, 1.14839673e-02, + 5.17792348e-03, 7.77232368e-03, 3.36388536e-02, 2.70598419e-02, + 1.51852425e-02, 2.62977779e-02, 1.52134784e-02, 3.64050455e-03, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:128, 321], expected)) + + def test_power_to_db(self): + spectrogram = np.zeros((2, 3)) + 
spectrogram[0, 0] = 2.0 + spectrogram[0, 1] = 0.5 + spectrogram[0, 2] = 0.707 + spectrogram[1, 1] = 1.0 + + output = power_to_db(spectrogram, reference=1.0) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-100.0, 0.0, -100.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-103.01029996, -3.01029996, -103.01029996]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, min_value=1e-6) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-60.0, 0.0, -60.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, db_range=80) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-76.98970004, 0.0, -76.98970004]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0, db_range=80) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-80.0, -3.01029996, -80.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0, min_value=1e-6, db_range=80) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-63.01029996, -3.01029996, -63.01029996]]) + self.assertTrue(np.allclose(output, expected)) + + with pytest.raises(ValueError): + power_to_db(spectrogram, reference=0.0) + with pytest.raises(ValueError): + power_to_db(spectrogram, min_value=0.0) + with pytest.raises(ValueError): + power_to_db(spectrogram, db_range=-80) + + def test_amplitude_to_db(self): + spectrogram = np.zeros((2, 3)) + spectrogram[0, 0] = 2.0 + spectrogram[0, 1] = 0.5 + spectrogram[0, 2] = 0.707 + spectrogram[1, 1] = 1.0 + + output = amplitude_to_db(spectrogram, reference=1.0) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-100.0, 0.0, -100.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-106.02059991, -6.02059991, -106.02059991]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, min_value=1e-3) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-60.0, 0.0, -60.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, db_range=80) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-73.97940009, 0.0, -73.97940009]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0, db_range=80) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-80.0, -6.02059991, -80.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0, min_value=1e-3, db_range=80) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-66.02059991, -6.02059991, -66.02059991]]) + self.assertTrue(np.allclose(output, expected)) + + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, reference=0.0) + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, min_value=0.0) + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, db_range=-80) From e02a8065e0515c10d7ebd19855adeb002c18268d Mon Sep 17 00:00:00 2001 From: dumpmemory <64742282+dumpmemory@users.noreply.github.com> Date: Tue, 9 May 2023 21:14:02 +0800 Subject: [PATCH 051/935] make opt checkpoint dir name correct (#21660) make opt checkpoint dir name corrent following 
https://github.com/huggingface/Megatron-LM/blob/100b522bb8044d98413398f9e71563af15b83325/megatron/checkpointing.py#L117 --- .../checkpoint_reshaping_and_interoperability.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index 49a462491f01..fffed940105e 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -690,9 +690,9 @@ def convert_checkpoint_from_transformers_to_megatron(args): for j in range(args.target_tensor_model_parallel_size): for k in range(args.target_data_parallel_size): if args.target_pipeline_model_parallel_size == 1: - checkpoint_dir = f"mp_rank_{i:02d}_{k:03d}" + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}" else: - checkpoint_dir = f"mp_rank_{i:02d}_{j:03d}_{k:03d}" + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}_{k:03d}" checkpoint_dir = os.path.join(release_dir, checkpoint_dir) os.makedirs(checkpoint_dir, exist_ok=True) torch.save( From 51ae566511280251e879ea6c3d69a56a7d3e83dc Mon Sep 17 00:00:00 2001 From: Furkan Akkurt <71407287+furkanakkurt1335@users.noreply.github.com> Date: Tue, 9 May 2023 16:19:38 +0300 Subject: [PATCH 052/935] Fix typo ; Update output.mdx (#23227) --- docs/source/en/main_classes/output.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/main_classes/output.mdx b/docs/source/en/main_classes/output.mdx index ca4e8dfc0ace..921031671cad 100644 --- a/docs/source/en/main_classes/output.mdx +++ b/docs/source/en/main_classes/output.mdx @@ -31,7 +31,7 @@ outputs = model(**inputs, labels=labels) ``` The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`], as we can see in the -documentation of that class below, it means it has an optional `loss`, a `logits` an optional `hidden_states` and +documentation of that class below, it means it has an optional `loss`, a `logits`, an optional `hidden_states` and an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have `hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or `output_attentions=True`. From 1a8f61110e4a6b90f75b0ac40ea8cea2817da5ac Mon Sep 17 00:00:00 2001 From: Sebastian Date: Tue, 9 May 2023 15:20:10 +0200 Subject: [PATCH 053/935] fix: Update run_qa.py to work with deepset/germanquad (#23225) Call str on id to make sure any ints are converted into the expected format for squad datasets --- examples/pytorch/question-answering/run_qa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index d3377611bdfd..b0dcf6e5d836 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -590,12 +590,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") From 9a50cb6195cb477c1feae8f0fb2bce20b679246f Mon Sep 17 00:00:00 2001 From: Rustin Welter <131769788+rustinwelter@users.noreply.github.com> Date: Tue, 9 May 2023 14:51:43 +0000 Subject: [PATCH 054/935] Add Japanese translation to accelerate.mdx (#23232) Co-authored-by: rustinwelter --- docs/source/ja/_toctree.yml | 4 ++ docs/source/ja/accelerate.mdx | 132 ++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 docs/source/ja/accelerate.mdx diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml index a85f0ee11dcf..8ac8b1e3183f 100644 --- a/docs/source/ja/_toctree.yml +++ b/docs/source/ja/_toctree.yml @@ -4,6 +4,10 @@ - local: installation title: インストール title: はじめに +- sections: + - local: accelerate + title: 🤗 Accelerate を用いた分散学習 + title: チュートリアル - sections: - sections: - local: multilingual diff --git a/docs/source/ja/accelerate.mdx b/docs/source/ja/accelerate.mdx new file mode 100644 index 000000000000..823ed0dcf72b --- /dev/null +++ b/docs/source/ja/accelerate.mdx @@ -0,0 +1,132 @@ + + +# 🤗 Accelerate を用いた分散学習 + +モデルが大きくなるにつれて、限られたハードウェアでより大きなモデルを訓練し、訓練速度を大幅に上昇させるための方法として並列処理が浮上してきました。1台のマシンに複数のGPUがあっても、複数のマシンにまたがる複数のGPUがあっても、あらゆるタイプの分散処理セットアップ上でユーザーが簡単に 🤗 Transformers モデルを訓練できるように、 Hugging Face では [🤗 Accelerate](https://huggingface.co/docs/accelerate) ライブラリを作成しました。このチュートリアルでは、PyTorch の訓練ループをカスタマイズして、分散処理環境での訓練を可能にする方法について学びます。 + +## セットアップ + +はじめに 🤗 Accelerate をインストールしましょう: + +```bash +pip install accelerate +``` + +そしたらインポートして [`~accelerate.Accelerator`] オブジェクトを作成しましょう。[`~accelerate.Accelerator`] は分散処理セットアップを自動的に検出し、訓練のために必要な全てのコンポーネントを初期化します。モデルをデバイスに明示的に配置する必要はありません。 + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Accelerate する準備をしましょう + +次に、関連する全ての訓練オブジェクトを [`~accelerate.Accelerator.prepare`] メソッドに渡します。これには、訓練と評価それぞれのDataloader、モデル、optimizer が含まれます: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +最後に訓練ループ内の `loss.backward()` を 🤗 Accelerate の [`~accelerate.Accelerator.backward`] メソッドで置き換えます: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +以下のコードで確認できる通り、訓練ループに4行のコードを追加するだけで分散学習が可能です! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 訓練する + +関連するコードを追加したら、スクリプトまたは Colaboratory などのノートブックで訓練を開始します。 + +### スクリプトで訓練する + +スクリプトから訓練をしている場合は、設定ファイルを作成・保存するために以下のコマンドを実行してください: + +```bash +accelerate config +``` + +そして次のようにして訓練を開始します: + +```bash +accelerate launch train.py +``` + +### ノートブックで訓練する + +Colaboratory の TPU の利用をお考えの場合、🤗 Accelerate はノートブック上で実行することもできます。訓練に必要な全てのコードを関数に含め、[`~accelerate.notebook_launcher`] に渡してください: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate と豊富な機能についてもっと知りたい方は[ドキュメント](https://huggingface.co/docs/accelerate)を参照してください。 From b4d4d6fe87ffcd7508307970cdf8fa3eda288701 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 13:04:10 -0400 Subject: [PATCH 055/935] Add RWKV-4 (#22797) * First draft of RWKV-4 * Add support for generate * Style post-rebase * Properly use state * Write doc * Fix doc * More math * Add model to README, dummies and clean config * Fix init * multiple fixes: - fix common tests - fix configuraion default values - add CI test for checking state computation - fix some CI tests * correct tokenizer * some tweaks - fix config docstring - fix failing tests * fix CI tests - add output_attention / output_hidden_states - override test_initialization - fix failing CIs * fix conversion script - fix sharded case - add new arguments * add slow tests + more fixes on conversion script * add another test * final fixes * change single name variable * add mock attention mask for pipeline to work * correct eos token id * fix nits * add checkpoints * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add `tie_word_embeddings` in docstring * change tensor name * fix final nits * Trigger CI --------- Co-authored-by: younesbelkada Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/rwkv.mdx | 129 +++ docs/source/en/tasks/language_modeling.mdx | 2 +- src/transformers/__init__.py | 16 + src/transformers/generation/utils.py | 2 + 
src/transformers/kernels/rwkv/wkv_cuda.cu | 187 ++++ .../kernels/rwkv/wkv_cuda_bf16.cu | 186 ++++ src/transformers/kernels/rwkv/wkv_op.cpp | 66 ++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 4 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/rwkv/__init__.py | 60 ++ .../models/rwkv/configuration_rwkv.py | 130 +++ .../rwkv/convert_rwkv_checkpoint_to_hf.py | 201 +++++ src/transformers/models/rwkv/modeling_rwkv.py | 804 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 24 + tests/models/rwkv/__init__.py | 0 tests/models/rwkv/test_modeling_rwkv.py | 451 ++++++++++ tests/test_configuration_common.py | 9 +- 28 files changed, 2284 insertions(+), 3 deletions(-) create mode 100644 docs/source/en/model_doc/rwkv.mdx create mode 100644 src/transformers/kernels/rwkv/wkv_cuda.cu create mode 100644 src/transformers/kernels/rwkv/wkv_cuda_bf16.cu create mode 100644 src/transformers/kernels/rwkv/wkv_op.cpp create mode 100644 src/transformers/models/rwkv/__init__.py create mode 100644 src/transformers/models/rwkv/configuration_rwkv.py create mode 100644 src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py create mode 100644 src/transformers/models/rwkv/modeling_rwkv.py create mode 100644 tests/models/rwkv/__init__.py create mode 100644 tests/models/rwkv/test_modeling_rwkv.py diff --git a/README.md b/README.md index be4f2d103148..fdceaa1643e0 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_es.md b/README_es.md index 58d6fc3cf338..c3c569c531d6 100644 --- a/README_es.md +++ b/README_es.md @@ -410,6 +410,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_hd.md b/README_hd.md index e580c37794b2..af9c58011455 100644 --- a/README_hd.md +++ b/README_hd.md @@ -382,6 +382,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. 
**[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित। +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा। diff --git a/README_ja.md b/README_ja.md index ebb173495c15..953ff5598e4a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -444,6 +444,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM) 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. 
Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) diff --git a/README_ko.md b/README_ko.md index cf01054dcc49..2707f191dad0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -359,6 +359,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index b46c84983d9b..74d39b1df3d6 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -383,6 +383,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 +1. 
**[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index c360cce25a83..4fd2a3caad54 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -395,6 +395,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6e9684f79e3..c92f21a93458 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -399,6 +399,8 @@ title: RoCBert - local: model_doc/roformer title: RoFormer + - local: model_doc/rwkv + title: RWKV - local: model_doc/splinter title: Splinter - local: model_doc/squeezebert
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index cc5724b5fd6c..c9af68ae078e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -196,6 +196,7 @@ The documentation is organized into five sections: 1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -396,6 +397,7 @@ Flax), PyTorch, and/or TensorFlow. | RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | | RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| RWKV | ❌ | ❌ | ✅ | ❌ | ❌ | | SAM | ❌ | ❌ | ✅ | ❌ | ❌ | | SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/rwkv.mdx b/docs/source/en/model_doc/rwkv.mdx new file mode 100644 index 000000000000..b8812516560f --- /dev/null +++ b/docs/source/en/model_doc/rwkv.mdx @@ -0,0 +1,129 @@ + + +# RWKV + +## Overview
+ +The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM).
+ +It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as a recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
+ +This can be more efficient than a regular Transformer and can deal with sentences of any length (even if the model uses a fixed context length for training).
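+
+The model also works with the standard text generation API. The snippet below is only a minimal sketch: it reuses the checkpoint name from the RNN example further down, and any RWKV checkpoint with a matching tokenizer should behave the same way.
+
+```py
+from transformers import AutoTokenizer, RwkvForCausalLM
+
+model = RwkvForCausalLM.from_pretrained("sgugger/rwkv-430M-pile")
+tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+
+inputs = tokenizer("In a shocking finding, scientists discovered", return_tensors="pt")
+# The recurrent state is carried between decoding steps internally, so no extra arguments are needed.
+output_ids = model.generate(inputs["input_ids"], max_new_tokens=20)
+print(tokenizer.decode(output_ids[0]))
+```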
+ +This model was contributed by [sgugger](https://huggingface.co/sgugger). +The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
+ +Example of use as an RNN:
+ +```py +import torch +from transformers import AutoTokenizer, RwkvConfig, RwkvModel
+ +model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile") +tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+ +inputs = tokenizer("This is an example.", return_tensors="pt") +# Feed everything to the model +outputs = model(inputs["input_ids"]) +output_whole = outputs.last_hidden_state
+ +outputs = model(inputs["input_ids"][:, :2]) +output_one = outputs.last_hidden_state
+ +# Using the state computed on the first inputs, we will get the same output +outputs = model(inputs["input_ids"][:, 2:], state=outputs.state) +output_two = outputs.last_hidden_state
+ +torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5) +```
+ +## RwkvConfig
+ +[[autodoc]] RwkvConfig
+ + +## RwkvModel
+ +[[autodoc]] RwkvModel + - forward
+ +## RwkvForCausalLM
+ +[[autodoc]] RwkvForCausalLM + - forward
+ +## Rwkv attention and the recurrent formulas
+ +In a traditional auto-regressive Transformer, attention is written as
+ +$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
+ +where \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the matrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.
+ +Replacing the softmax by its value gives:
+ +$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
+ +Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at j) because the attention is not allowed to look at future tokens (only past ones).
+ +In comparison, the RWKV attention is given by
+ +$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
+ +where \\(R\\) is a new matrix called receptance by the author, \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
+ +$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
+ +with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively.
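+
+Before deriving that recurrence, here is a small NumPy sketch of the naive formula for \\(O_{i}\\) above. It is a hypothetical helper written only to make the indices concrete; the actual implementation never materializes the double sum and adds numerical stabilization, as derived below.
+
+```py
+import numpy as np
+
+
+def naive_rwkv_attention(r, k, v, w, u):
+    # r, k, v have shape (seq_len, hidden_size); w (`time_decay`) and u (`time_first`) have shape (hidden_size,)
+    seq_len = k.shape[0]
+    output = np.zeros_like(v)
+    for i in range(seq_len):
+        # W_{i-j} is u when j == i, and (i - j - 1) * w when j < i
+        position_bias = np.stack([u if j == i else (i - j - 1) * w for j in range(i + 1)])
+        scores = np.exp(position_bias + k[: i + 1])
+        numerator = (scores * v[: i + 1]).sum(axis=0)
+        denominator = scores.sum(axis=0)
+        # O_i = sigmoid(R_i) * N_i / D_i; unlike softmax attention, everything is element-wise per channel
+        output[i] = 1.0 / (1.0 + np.exp(-r[i])) * numerator / denominator
+    return output
+```
+
+Computed this way, the attention costs quadratic time and the exponentials overflow easily, which is why the real kernels instead keep a running state for the numerator and denominator.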
Naming them \\(N_{i}\\) and \\(D_{i}\\) we have: + +$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} \cdots + e^{(i-2)w + K_{1}} V_{1}$$ + +so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satistfies + +$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$ + +and + +$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} \cdots + e^{(i-2)w + K_{1}}$$ + +so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satistfies + +$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$ + +The actual recurrent formula used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is, but the exponential of the maximum term is divided of the numerator and denominator: + +$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$ + +with \\(M\\) the maximum of all \\(x_{j}\\). So here on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)) we also keep track of the maximum of all terms encountered in the exponentials. So we actually use + +$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$ + +defined by the following recurrent formulas: + +$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ + +and + +$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ + +and \\(M_{j+1} = q\\). With those, we can then compute + +$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i}} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ + +and + +$$D_{i} = e^{u + K_{i} - q} + e^{M_{i}} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ + +which finally gives us + +$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$ \ No newline at end of file diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx index 33bd8c5a1ab6..ea25e17efa47 100644 --- a/docs/source/en/tasks/language_modeling.mdx +++ b/docs/source/en/tasks/language_modeling.mdx @@ -33,8 +33,8 @@ You can finetune other architectures for causal language modeling following the Choose one of the following architectures: +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), 
[OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b0766b0946cd..375966131e0d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -431,6 +431,7 @@ "models.roberta_prelayernorm": ["ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaPreLayerNormConfig"], "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"], "models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"], + "models.rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig"], "models.sam": [ "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP", "SamConfig", @@ -2364,6 +2365,14 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rwkv"].extend( + [ + "RWKV_PRETRAINED_MODEL_ARCHIVE_LIST", + "RwkvForCausalLM", + 
"RwkvModel", + "RwkvPreTrainedModel", + ] + ) _import_structure["models.sam"].extend( [ "SAM_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4169,6 +4178,7 @@ ) from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig, RoCBertTokenizer from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer + from .models.rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig from .models.sam import ( SAM_PRETRAINED_CONFIG_ARCHIVE_MAP, SamConfig, @@ -5783,6 +5793,12 @@ RoFormerPreTrainedModel, load_tf_weights_in_roformer, ) + from .models.rwkv import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + RwkvPreTrainedModel, + ) from .models.sam import ( SAM_PRETRAINED_MODEL_ARCHIVE_LIST, SamModel, diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 8c8a67fa5cb0..9f6b072a9a0c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -753,6 +753,8 @@ def _update_model_kwargs_for_generation( model_kwargs["past_key_values"] = self._extract_past_from_model_output( outputs, standardize_cache_format=standardize_cache_format ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state # update token_type_ids with last value if "token_type_ids" in model_kwargs: diff --git a/src/transformers/kernels/rwkv/wkv_cuda.cu b/src/transformers/kernels/rwkv/wkv_cuda.cu new file mode 100644 index 000000000000..571d5a8a8307 --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_cuda.cu @@ -0,0 +1,187 @@ +#include +#include + +#define MIN_VALUE (-1e38) + +template +__global__ void kernel_forward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +template +__global__ void kernel_forward_with_state( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y, F *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + F *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = 
k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +template +__global__ void kernel_backward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _y, + const F *__restrict__ const _gy, F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, + F *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + const F *__restrict__ const y = _y + _offset; + const F *__restrict__ const gy = _gy + _offset; + F *__restrict__ const gk = _gk + _offset; + F *__restrict__ const gv = _gv + _offset; + + F q[Tmax], r[Tmax]; + + F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + const F qq = gy[ii] / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = gu; + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + const F qq = q[i]; + const F rr = r[i]; + + F e1 = qq * exp(rr); + F e2 = exp(kk + pp); + gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb); + gv[ii] = e1 + e2 * aa; + + const F ww = w + pp; + const F www = rr - u - kk; + const F p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward<<>>(B, T, C, w, u, k, v, y, gy, gw, 
gu, gk, gv); +} diff --git a/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu b/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu new file mode 100644 index 000000000000..042cb4aba1db --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu @@ -0,0 +1,186 @@ +#include +#include +#include "ATen/ATen.h" +#define MIN_VALUE (-1e38) +typedef at::BFloat16 bf16; + +__global__ void kernel_forward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2)); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +__global__ void kernel_forward_with_state_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y, + float *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + float *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16(e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +__global__ void kernel_backward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, const bf16 *__restrict__ const _y, + const bf16 *__restrict__ const _gy, bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, + bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + const bf16 *__restrict__ const y 
= _y + _offset; + const bf16 *__restrict__ const gy = _gy + _offset; + bf16 *__restrict__ const gk = _gk + _offset; + bf16 *__restrict__ const gv = _gv + _offset; + + float q[Tmax], r[Tmax]; + + float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + const float qq = float(gy[ii]) / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = bf16(gu); + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + const float qq = q[i]; + const float rr = r[i]; + + float e1 = qq * exp(rr); + float e2 = exp(kk + pp); + gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb)); + gv[ii] = bf16(e1 + e2 * aa); + + const float ww = w + pp; + const float www = rr - u - kk; + const float p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_bf16<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state_bf16<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward_bf16<<>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv); +} diff --git a/src/transformers/kernels/rwkv/wkv_op.cpp b/src/transformers/kernels/rwkv/wkv_op.cpp new file mode 100644 index 000000000000..55e728066592 --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_op.cpp @@ -0,0 +1,66 @@ +#include +#include "ATen/ATen.h" +typedef at::BFloat16 bf16; + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y); +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s); +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s); +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv); +void 
cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv); + +void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_with_state(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void forward_with_state_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} +void backward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), + gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &forward, "wkv forward"); + m.def("forward_bf16", &forward_bf16, "wkv forward bf16"); + m.def("forward_with_state", &forward_with_state, "wkv forward with state"); + m.def("forward_with_state_bf16", &forward_with_state_bf16, "wkv forward with state bf16"); + m.def("backward", &backward, "wkv backward"); + m.def("backward_bf16", &backward_bf16, "wkv backward bf16"); +} + +TORCH_LIBRARY(wkv, m) { + m.def("forward", forward); + m.def("forward_bf16", forward_bf16); + m.def("forward_with_state", forward_with_state); + m.def("forward_with_state_bf16", forward_with_state_bf16); + m.def("backward", backward); + m.def("backward_bf16", backward_bf16); +} diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 4753f253df74..9c955a008f30 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -162,6 +162,7 @@ roberta_prelayernorm, roc_bert, roformer, + rwkv, sam, segformer, sew, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c3623437348a..8da4a15b29ee 100755 --- 
a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -163,6 +163,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), ("roformer", "RoFormerConfig"), + ("rwkv", "RwkvConfig"), ("sam", "SamConfig"), ("segformer", "SegformerConfig"), ("sew", "SEWConfig"), @@ -343,6 +344,7 @@ ("roberta-prelayernorm", "ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roc_bert", "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("rwkv", "RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("sam", "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -545,6 +547,7 @@ ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), ("roformer", "RoFormer"), + ("rwkv", "RWKV"), ("sam", "SAM"), ("segformer", "SegFormer"), ("sew", "SEW"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1ebc906d2d67..8e1835e31f09 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -158,6 +158,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), ("roformer", "RoFormerModel"), + ("rwkv", "RwkvModel"), ("sam", "SamModel"), ("segformer", "SegformerModel"), ("sew", "SEWModel"), @@ -248,6 +249,7 @@ ("roberta", "RobertaForMaskedLM"), ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), ("roc_bert", "RoCBertForPreTraining"), + ("rwkv", "RwkvForCausalLM"), ("splinter", "SplinterForPreTraining"), ("squeezebert", "SqueezeBertForMaskedLM"), ("switch_transformers", "SwitchTransformersForConditionalGeneration"), @@ -332,6 +334,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), ("roc_bert", "RoCBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), + ("rwkv", "RwkvForCausalLM"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("squeezebert", "SqueezeBertForMaskedLM"), ("switch_transformers", "SwitchTransformersForConditionalGeneration"), @@ -395,6 +398,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormForCausalLM"), ("roc_bert", "RoCBertForCausalLM"), ("roformer", "RoFormerForCausalLM"), + ("rwkv", "RwkvForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), ("transfo-xl", "TransfoXLLMHeadModel"), ("trocr", "TrOCRForCausalLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index de954e206ae1..cb6c91521de9 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -297,6 +297,7 @@ ), ("roc_bert", ("RoCBertTokenizer", None)), ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), + ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), diff --git a/src/transformers/models/rwkv/__init__.py b/src/transformers/models/rwkv/__init__.py new file mode 100644 index 000000000000..e68eefe9f8aa --- /dev/null +++ b/src/transformers/models/rwkv/__init__.py @@ -0,0 +1,60 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig", "RwkvOnnxConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_rwkv"] = [ + "RWKV_PRETRAINED_MODEL_ARCHIVE_LIST", + "RwkvForCausalLM", + "RwkvModel", + "RwkvPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig, RwkvOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_rwkv import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + RwkvPreTrainedModel, + ) +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py new file mode 100644 index 000000000000..89b2f5fb6483 --- /dev/null +++ b/src/transformers/models/rwkv/configuration_rwkv.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" RWKV configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "RWKV/rwkv-4-169m-pile": "https://huggingface.co/RWKV/rwkv-4-169m-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-430m-pile": "https://huggingface.co/RWKV/rwkv-4-430m-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-1b5-pile": "https://huggingface.co/RWKV/rwkv-4-1b5-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-3b-pile": "https://huggingface.co/RWKV/rwkv-4-3b-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-7b-pile": "https://huggingface.co/RWKV/rwkv-4-7b-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-14b-pile": "https://huggingface.co/RWKV/rwkv-4-14b-pile/resolve/main/config.json",
+    "RWKV/rwkv-raven-1b5": "https://huggingface.co/RWKV/rwkv-raven-1b5/resolve/main/config.json",
+    "RWKV/rwkv-raven-3b": "https://huggingface.co/RWKV/rwkv-raven-3b/resolve/main/config.json",
+    "RWKV/rwkv-raven-7b": "https://huggingface.co/RWKV/rwkv-raven-7b/resolve/main/config.json",
+    "RWKV/rwkv-raven-14b": "https://huggingface.co/RWKV/rwkv-raven-14b/resolve/main/config.json",
+}
+
+
+class RwkvConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the RWKV-4
+    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50277):
+            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RwkvModel`].
+        context_length (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model can be used with in a single forward pass (using it in RNN
+            mode lets you use any sequence length).
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the model.
+        attention_hidden_size (`int`, *optional*):
+            Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
+        intermediate_size (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
+            as GPTNeoX.
+        eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
+            GPTNeoX.
+        rescale_every (`int`, *optional*, defaults to 6):
+            At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
+            `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the word embeddings with the input token embeddings.
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last state. + + + Example: + + ```python + >>> from transformers import RwkvConfig, RwkvModel + + >>> # Initializing a Rwkv configuration + >>> configuration = RwkvConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = RwkvModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rwkv" + attribute_map = {"max_position_embeddings": "context_length"} + + def __init__( + self, + vocab_size=50277, + context_length=1024, + hidden_size=4096, + num_hidden_layers=32, + attention_hidden_size=None, + intermediate_size=None, + layer_norm_epsilon=1e-5, + bos_token_id=0, + eos_token_id=0, + rescale_every=6, + tie_word_embeddings=False, + use_cache=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.context_length = context_length + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size + self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size + self.layer_norm_epsilon = layer_norm_epsilon + self.rescale_every = rescale_every + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs + ) diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py new file mode 100644 index 000000000000..b340b9f028b3 --- /dev/null +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format.""" + + +import argparse +import gc +import json +import os +import re + +import torch +from huggingface_hub import hf_hub_download + +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint + + +NUM_HIDDEN_LAYERS_MAPPING = { + "169M": 12, + "430M": 24, + "1B5": 24, + "3B": 32, + "7B": 32, + "14B": 40, +} + +HIDEN_SIZE_MAPPING = { + "169M": 768, + "430M": 1024, + "1B5": 2048, + "3B": 2560, + "7B": 4096, + "14B": 5120, +} + + +def convert_state_dict(state_dict): + state_dict_keys = list(state_dict.keys()) + for name in state_dict_keys: + weight = state_dict.pop(name) + # emb -> embedding + if name.startswith("emb."): + name = name.replace("emb.", "embeddings.") + # ln_0 -> pre_ln (only present at block 0) + if name.startswith("blocks.0.ln0"): + name = name.replace("blocks.0.ln0", "blocks.0.pre_ln") + # att -> attention + name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", name) + # ffn -> feed_forward + name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", name) + # time_mix_k -> time_mix_key and reshape + if name.endswith(".time_mix_k"): + name = name.replace(".time_mix_k", ".time_mix_key") + # time_mix_v -> time_mix_value and reshape + if name.endswith(".time_mix_v"): + name = name.replace(".time_mix_v", ".time_mix_value") + # time_mix_r -> time_mix_key and reshape + if name.endswith(".time_mix_r"): + name = name.replace(".time_mix_r", ".time_mix_receptance") + + if name != "head.weight": + name = "rwkv." + name + + state_dict[name] = weight + return state_dict + + +def convert_rmkv_checkpoint_to_hf_format( + repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None +): + # 1. If possible, build the tokenizer. + if tokenizer_file is None: + print("No `--tokenizer_file` provided, we will use the default tokenizer.") + vocab_size = 50277 + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + else: + tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file) + vocab_size = len(tokenizer) + tokenizer.save_pretrained(output_dir) + + # 2. Build the config + possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys()) + if size is None: + # Try to infer size from the checkpoint name + for candidate in possible_sizes: + if candidate in checkpoint_file: + size = candidate + break + if size is None: + raise ValueError("Could not infer the size, please provide it with the `--size` argument.") + if size not in possible_sizes: + raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.") + + config = RwkvConfig( + vocab_size=vocab_size, + num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size], + hidden_size=HIDEN_SIZE_MAPPING[size], + ) + config.save_pretrained(output_dir) + + # 3. Download model file then convert state_dict + model_file = hf_hub_download(repo_id, checkpoint_file) + state_dict = torch.load(model_file, map_location="cpu") + state_dict = convert_state_dict(state_dict) + + # 4. 
Split in shards and save + shards, index = shard_checkpoint(state_dict) + for shard_file, shard in shards.items(): + torch.save(shard, os.path.join(output_dir, shard_file)) + + if index is not None: + save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + # 5. Clean up shards (for some reason the file PyTorch saves take the same space as the whole state_dict + print( + "Cleaning up shards. This may error with an OOM error, it this is the case don't worry you still have converted the model." + ) + shard_files = list(shards.keys()) + + del state_dict + del shards + gc.collect() + + for shard_file in shard_files: + state_dict = torch.load(os.path.join(output_dir, shard_file)) + torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file)) + + del state_dict + gc.collect() + + if push_to_hub: + if model_name is None: + raise ValueError("Please provide a `model_name` to push the model to the Hub.") + model = AutoModelForCausalLM.from_pretrained(output_dir) + model.push_to_hub(model_name, max_shard_size="2GB") + tokenizer.push_to_hub(model_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint." + ) + parser.add_argument( + "--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo." + ) + parser.add_argument( + "--output_dir", default=None, type=str, required=True, help="Where to save the converted model." + ) + parser.add_argument( + "--tokenizer_file", + default=None, + type=str, + help="Path to the tokenizer file to use (if not provided, only the model is converted).", + ) + parser.add_argument( + "--size", + default=None, + type=str, + help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Push to the Hub the converted model.", + ) + parser.add_argument( + "--model_name", + default=None, + type=str, + help="Name of the pushed model on the Hub, including the username / organization.", + ) + + args = parser.parse_args() + convert_rmkv_checkpoint_to_hf_format( + args.repo_id, + args.checkpoint_file, + args.output_dir, + size=args.size, + tokenizer_file=args.tokenizer_file, + push_to_hub=args.push_to_hub, + model_name=args.model_name, + ) diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py new file mode 100644 index 000000000000..dd85c279daeb --- /dev/null +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -0,0 +1,804 @@ +# coding=utf-8 +# Copyright 2023 Bo Peng and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch RWKV model.""" + +import math +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_ninja_available, + is_torch_cuda_available, + logging, +) +from .configuration_rwkv import RwkvConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile" +_CONFIG_FOR_DOC = "RwkvConfig" + +RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "RWKV/rwkv-4-169m-pile", + "RWKV/rwkv-4-430m-pile", + "RWKV/rwkv-4-1b5-pile", + "RWKV/rwkv-4-3b-pile", + "RWKV/rwkv-4-7b-pile", + "RWKV/rwkv-4-14b-pile", + "RWKV/rwkv-raven-1b5", + "RWKV/rwkv-raven-3b", + "RWKV/rwkv-raven-7b", + "RWKV/rwkv-raven-14b", + # See all RWKV models at https://huggingface.co/models?filter=rwkv +] + + +rwkv_cuda_kernel = None + + +def load_wkv_cuda_kernel(context_length): + from torch.utils.cpp_extension import load as load_kernel + + global rwkv_cuda_kernel + + kernel_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv" + cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]] + + # Only load the kernel if it's not been loaded yet or if we changed the context length + if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length: + return + + logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.") + + flags = [ + "-res-usage", + "--maxrregcount 60", + "--use_fast_math", + "-O3", + "-Xptxas -O3", + "--extra-device-vectorization", + f"-DTmax={context_length}", + ] + rwkv_cuda_kernel = load_kernel( + name=f"wkv_{context_length}", + sources=cuda_kernel_files, + verbose=(logging.get_verbosity() == logging.DEBUG), + extra_cuda_cflags=flags, + ) + rwkv_cuda_kernel.max_seq_length = context_length + + +class RwkvLinearAttention(torch.autograd.Function): + @staticmethod + def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False): + batch_size, seq_len, hidden_size = key.size() + if seq_len > rwkv_cuda_kernel.max_seq_length: + raise ValueError( + f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of " + f"{rwkv_cuda_kernel.max_seq_length} with this model." + ) + if batch_size * hidden_size % min(hidden_size, 32) != 0: + raise ValueError( + f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round " + f"multiple of {min(hidden_size, 32)}." + ) + + ctx.input_dtype = key.dtype + + if ( + time_decay.device.type != "cuda" + or time_first.device.type != "cuda" + or key.device.type != "cuda" + or value.device.type != "cuda" + ): + raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.") + + time_decay = -torch.exp(time_decay.float().contiguous()) + if key.dtype == torch.float16: + time_first = time_first.float() + key = key.float() + value = value.float() + time_first = time_first.contiguous() + key = key.contiguous() + value = value.contiguous() + # The CUDA kernel will fill this tensor. 
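+        # `output` has one value per token and channel; `state` (allocated below when needed) packs, for every
+        # channel, the rescaled numerator, the rescaled denominator and the running maximum of the exponents
+        # described in the documentation of this model (the maximum slot is initialized to -1e38, i.e. roughly -inf).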
+ output = torch.empty_like(key, memory_format=torch.contiguous_format) + if return_state or state is not None: + if state is None: + state = torch.zeros( + batch_size, + hidden_size, + 3, + dtype=torch.float32, + device=key.device, + memory_format=torch.contiguous_format, + ) + state[:, :, 2] -= 1e38 + else: + state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous() + if key.dtype == torch.bfloat16: + forward_func = rwkv_cuda_kernel.forward_with_state_bf16 + else: + forward_func = rwkv_cuda_kernel.forward_with_state + forward_func(time_decay, time_first, key, value, output, state) + else: + forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward + forward_func(time_decay, time_first, key, value, output) + + ctx.save_for_backward(time_decay, time_first, key, value, output) + + if state is not None: + state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)] + + return output.to(ctx.input_dtype), state + + @staticmethod + # g stands for grad + def backward(ctx, g_output): + input_dtype = ctx.input_dtype + + time_decay, time_first, key, value, output = ctx.saved_tensors + # The CUDA kernel will fill those tensors. + g_time_decay = torch.empty_like( + time_decay, + memory_format=torch.contiguous_format, + dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32, + ) + g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format) + g_key = torch.empty_like(key, memory_format=torch.contiguous_format) + g_value = torch.empty_like(value, memory_format=torch.contiguous_format) + + if input_dtype == torch.float16: + g_output = g_output.float() + backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward + backward_func( + time_decay, + time_first, + key, + value, + output, + g_output.contiguous(), + g_time_decay, + g_time_first, + g_key, + g_value, + ) + g_time_decay = torch.sum(g_time_decay, dim=0) + g_time_first = torch.sum(g_time_first, dim=0) + + return ( + None, + None, + None, + g_time_decay.to(input_dtype), + g_time_first.to(input_dtype), + g_key.to(input_dtype), + g_value.to(input_dtype), + ) + + +def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False): + # For CPU fallback. Will be slower and probably take more memory than the custom CUDA kernel if not executed + # within a torch.no_grad. 
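+    # The loop below implements the numerically stable recurrence described in the RWKV documentation:
+    # `num_state`, `den_state` and `max_state` are the rescaled numerator, the rescaled denominator and the running
+    # maximum of the exponents; `time_first` corresponds to u, while w is obtained as -exp(time_decay) a few lines
+    # below.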
+ _, seq_length, _ = key.size() + output = torch.zeros_like(key) + + if state is None: + num_state = torch.zeros_like(key[:, 0], dtype=torch.float32) + den_state = torch.zeros_like(key[:, 0], dtype=torch.float32) + max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38 + else: + num_state, den_state, max_state = state + # For numerical stability + # real_numerator_state = num_state * torch.exp(max_state) + # real_denominator_state = den_state * torch.exp(max_state) + + time_decay = -torch.exp(time_decay) + + for current_index in range(seq_length): + current_key = key[:, current_index].float() + current_value = value[:, current_index] + + # wkv computation at time t + max_for_output = torch.maximum(max_state, current_key + time_first) + e1 = torch.exp(max_state - max_for_output) + e2 = torch.exp(current_key + time_first - max_for_output) + numerator = e1 * num_state + e2 * current_value + denominator = e1 * den_state + e2 + output[:, current_index] = (numerator / denominator).to(output.dtype) + + # Update state for next iteration + max_for_state = torch.maximum(max_state + time_decay, current_key) + e1 = torch.exp(max_state + time_decay - max_for_state) + e2 = torch.exp(current_key - max_for_state) + num_state = e1 * num_state + e2 * current_value + den_state = e1 * den_state + e2 + max_state = max_for_state + + if return_state or state is not None: + state = [num_state, den_state, max_state] + + return output, state + + +def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False): + no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value]) + # Launching the CUDA kernel for just one token will actually be slower (there is no for loop in the CPU version + # in this case). + one_token = key.size(1) == 1 + if rwkv_cuda_kernel is None or no_cuda or one_token: + return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state) + else: + return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state) + + +class RwkvSelfAttention(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.config = config + kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length + if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded: + try: + load_wkv_cuda_kernel(config.context_length) + except Exception: + logger.info("Could not load the custom CUDA kernel for RWKV attention.") + self.layer_id = layer_id + hidden_size = config.hidden_size + attention_hidden_size = ( + config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size + ) + self.attention_hidden_size = attention_hidden_size + + self.time_decay = nn.Parameter(torch.empty(attention_hidden_size)) + self.time_first = nn.Parameter(torch.empty(attention_hidden_size)) + + self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False) + + # TODO: maybe jit, otherwise move inside forward + def extract_key_value(self, 
hidden, state=None): + # Mix hidden with the previous timestep to produce key, value, receptance + if hidden.size(1) == 1 and state is not None: + shifted = state[1][:, :, self.layer_id] + else: + shifted = self.time_shift(hidden) + if state is not None: + shifted[:, 0] = state[1][:, :, self.layer_id] + key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) + value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value) + receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) + + key = self.key(key) + value = self.value(value) + receptance = torch.sigmoid(self.receptance(receptance)) + if state is not None: + state[1][:, :, self.layer_id] = hidden[:, -1] + return receptance, key, value, state + + def forward(self, hidden, state=None, use_cache=False): + receptance, key, value, state = self.extract_key_value(hidden, state=state) + layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None + rwkv, layer_state = rwkv_linear_attention( + self.time_decay, + self.time_first, + key, + value, + state=layer_state, + return_state=use_cache, + ) + + if layer_state is not None: + state[2][:, :, self.layer_id] = layer_state[0] + state[3][:, :, self.layer_id] = layer_state[1] + state[4][:, :, self.layer_id] = layer_state[2] + + return self.output(receptance * rwkv), state + + +class RwkvFeedForward(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.config = config + self.layer_id = layer_id + hidden_size = config.hidden_size + intermediate_size = ( + config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size + ) + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) + + self.key = nn.Linear(hidden_size, intermediate_size, bias=False) + self.receptance = nn.Linear(hidden_size, hidden_size, bias=False) + self.value = nn.Linear(intermediate_size, hidden_size, bias=False) + + def forward(self, hidden, state=None): + if hidden.size(1) == 1 and state is not None: + shifted = state[0][:, :, self.layer_id] + else: + shifted = self.time_shift(hidden) + if state is not None: + shifted[:, 0] = state[0][:, :, self.layer_id] + key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) + receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) + + key = torch.square(torch.relu(self.key(key))) + value = self.value(key) + receptance = torch.sigmoid(self.receptance(receptance)) + + if state is not None: + state[0][:, :, self.layer_id] = hidden[:, -1] + + return receptance * value, state + + +class RwkvBlock(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.config = config + self.layer_id = layer_id + + if layer_id == 0: + self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.attention = RwkvSelfAttention(config, layer_id) + self.feed_forward = RwkvFeedForward(config, layer_id) + + def forward(self, hidden, state=None, use_cache=False, output_attentions=False): + if self.layer_id == 0: + hidden = self.pre_ln(hidden) + + attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache) + hidden = hidden + attention + + feed_forward, state = 
self.feed_forward(self.ln2(hidden), state=state) + hidden = hidden + feed_forward + + outputs = (hidden, state) + if output_attentions: + outputs += (attention,) + else: + outputs += (None,) + + return outputs + + +class RwkvPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RwkvConfig + base_model_prefix = "rwkv" + _no_split_modules = ["RwkvBlock"] + _keep_in_fp32_modules = ["time_decay", "time_first"] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, RwkvSelfAttention): + layer_id = module.layer_id + num_hidden_layers = module.config.num_hidden_layers + hidden_size = module.config.hidden_size + attention_hidden_size = module.attention_hidden_size + + ratio_0_to_1 = layer_id / (num_hidden_layers - 1) # 0 to 1 + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + + time_weight = torch.tensor( + [i / hidden_size for i in range(hidden_size)], + dtype=module.time_mix_key.dtype, + device=module.time_mix_key.device, + ) + time_weight = time_weight[None, None, :] + + decay_speed = [ + -5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1) + for h in range(attention_hidden_size) + ] + decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device) + zigzag = ( + torch.tensor( + [(i + 1) % 3 - 1 for i in range(attention_hidden_size)], + dtype=module.time_first.dtype, + device=module.time_first.device, + ) + * 0.5 + ) + + with torch.no_grad(): + module.time_decay.data = decay_speed + module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag) + + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1 + module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0) + elif isinstance(module, RwkvFeedForward): + layer_id = module.layer_id + num_hidden_layers = module.config.num_hidden_layers + hidden_size = module.config.hidden_size + + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + + time_weight = torch.tensor( + [i / hidden_size for i in range(hidden_size)], + dtype=module.time_mix_key.dtype, + device=module.time_mix_key.device, + ) + time_weight = time_weight[None, None, :] + + with torch.no_grad(): + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RwkvModel): + module.gradient_checkpointing = value + + +@dataclass +class RwkvOutput(ModelOutput): + """ + Class for the RWKV model outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor = None + state: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class RwkvCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + state: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +RWKV_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RwkvConfig`]): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+RWKV_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
+            If passed along, the model uses the previous state in all the blocks (which will give the output for the
+            `input_ids` provided as if the model added `state_input_ids + input_ids` as context).
+        use_cache (`bool`, *optional*):
+            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+""" + + +@add_start_docstrings( + "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.", + RWKV_START_DOCSTRING, +) +class RwkvModel(RwkvPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)]) + self.ln_out = nn.LayerNorm(config.hidden_size) + + self.layers_are_rescaled = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=RwkvOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + state: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, RwkvOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.training == self.layers_are_rescaled: + self._rescale_layers() + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if use_cache and state is None: + shape = (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers) + state = [ + torch.zeros( + *shape, dtype=inputs_embeds.dtype if i <= 1 else torch.float32, device=inputs_embeds.device + ) + for i in range(5) + ] + state[4] -= 1e30 + + hidden_states = inputs_embeds + + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for idx, block in enumerate(self.blocks): + hidden_states, state, attentions = block( + hidden_states, state=state, use_cache=use_cache, output_attentions=output_attentions + ) + if ( + self.layers_are_rescaled + and self.config.rescale_every > 0 + and (idx + 1) % self.config.rescale_every == 0 + ): + hidden_states = hidden_states / 2 + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if output_attentions: + all_self_attentions = all_self_attentions + (attentions,) + + hidden_states = self.ln_out(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return (hidden_states, state, all_hidden_states, all_self_attentions) + + return RwkvOutput( + last_hidden_state=hidden_states, + state=state, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def _rescale_layers(self): + # Layers 
should be rescaled for inference only. + if self.layers_are_rescaled == (not self.training): + return + if self.config.rescale_every > 0: + with torch.no_grad(): + for block_id, block in enumerate(self.blocks): + if self.training: + block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every)) + block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every)) + else: + block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every)) + block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every)) + + self.layers_are_rescaled = not self.training + + +@add_start_docstrings( + """ + The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + RWKV_START_DOCSTRING, +) +class RwkvForCausalLM(RwkvPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.rwkv = RwkvModel(config) + self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.head + + def set_output_embeddings(self, new_embeddings): + self.head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs): + # only last token for inputs_ids if the state is passed along. + if state is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and state is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs["state"] = state + return model_inputs + + @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=RwkvCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + state: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, RwkvCausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + rwkv_outputs = self.rwkv( + input_ids, + inputs_embeds=inputs_embeds, + state=state, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = rwkv_outputs[0] + + logits = self.head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + rwkv_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return RwkvCausalLMOutput( + loss=loss, + logits=logits, + state=rwkv_outputs.state, + hidden_states=rwkv_outputs.hidden_states, + attentions=rwkv_outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7279117698db..eeb799e18875 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -6105,6 +6105,30 @@ def load_tf_weights_in_roformer(*args, **kwargs): requires_backends(load_tf_weights_in_roformer, ["torch"]) +RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class RwkvForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RwkvModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RwkvPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + SAM_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/rwkv/__init__.py b/tests/models/rwkv/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py new file mode 100644 index 000000000000..4afcc9b41f86 --- /dev/null +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -0,0 +1,451 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
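The `state` and `use_cache` arguments documented in `RWKV_INPUTS_DOCSTRING` above, and exercised by `create_and_check_state_equivalency` in the test module that follows, boil down to threading the recurrent state of one forward pass into the next. The sketch below is illustrative rather than part of the patch: it assumes an editable install of this branch (so the `Rwkv*` classes are importable), and the tiny config values are arbitrary.

```python
# Minimal sketch of RWKV state reuse; the config sizes are arbitrary and the model is
# randomly initialized, so only the chunked-vs-full equivalence matters here.
import torch

from transformers import RwkvConfig, RwkvModel

config = RwkvConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2)
model = RwkvModel(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))

with torch.no_grad():
    # One pass over the full sequence.
    full = model(input_ids).last_hidden_state

    # Same sequence in two chunks, threading the recurrent state through.
    first = model(input_ids[:, :4])
    second = model(input_ids[:, 4:], state=first.state)

chunked = torch.cat([first.last_hidden_state, second.last_hidden_state], dim=1)
print(torch.allclose(full, chunked, atol=1e-5))  # expected: True, mirroring the equivalency test below
```

This is the same contract `RwkvForCausalLM.prepare_inputs_for_generation` relies on when it keeps only the last token once a `state` is available.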
+ + +import unittest +from unittest.util import safe_repr + +from transformers import AutoTokenizer, RwkvConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + ) + + +class RwkvModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=False, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return RwkvConfig.from_pretrained("sgugger/rwkv-4-pile-7b") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + return ( + config, + input_ids, + input_mask, + None, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, 
scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return RwkvConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_rwkv_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + config.output_hidden_states = True + model = RwkvModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.hidden_states), config.num_hidden_layers + 1) + + def create_and_check_causl_lm(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = RwkvForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_state_equivalency(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = RwkvModel(config=config) + model.to(torch_device) + model.eval() + + outputs = model(input_ids) + output_whole = outputs.last_hidden_state + + outputs = model(input_ids[:, :2]) + output_one = outputs.last_hidden_state + + # Using the state computed on the first inputs, we will get the same output + outputs = model(input_ids[:, 2:], state=outputs.state) + output_two = outputs.last_hidden_state + + self.parent.assertTrue(torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = RwkvForCausalLM(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, 
+ input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids} + + return config, inputs_dict + + +@require_torch +class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (RwkvModel, RwkvForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": RwkvModel, + "text-generation": RwkvForCausalLM, + } + if is_torch_available() + else {} + ) + # all_generative_model_classes = (RwkvForCausalLM,) if is_torch_available() else () + fx_compatible = False + test_missing_keys = False + test_model_parallel = False + test_pruning = False + test_head_masking = False # Rwkv does not support head masking + + def setUp(self): + self.model_tester = RwkvModelTester(self) + self.config_tester = ConfigTester( + self, config_class=RwkvConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] + ) + + def assertInterval(self, member, container, msg=None): + r""" + Simple utility function to check if a member is inside an interval. + """ + if isinstance(member, torch.Tensor): + max_value, min_value = member.max().item(), member.min().item() + elif isinstance(member, list) or isinstance(member, tuple): + max_value, min_value = max(member), min(member) + + if not isinstance(container, list): + raise TypeError("container should be a list or tuple") + elif len(container) != 2: + raise ValueError("container should have 2 elements") + + expected_min, expected_max = container + + is_inside_interval = (min_value >= expected_min) and (max_value <= expected_max) + + if not is_inside_interval: + standardMsg = "%s not found in %s" % (safe_repr(member), safe_repr(container)) + self.fail(self._formatMessage(msg, standardMsg)) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_rwkv_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_rwkv_model(*config_and_inputs) + + def test_rwkv_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causl_lm(*config_and_inputs) + + def test_state_equivalency(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_state_equivalency(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, param in model.named_parameters(): + if "time_decay" in name: + if param.requires_grad: + self.assertTrue(param.data.max().item() == 3.0) + self.assertTrue(param.data.min().item() == -5.0) + elif "time_first" in name: + if param.requires_grad: + # check if it's a ones like + self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5)) + elif any([x in name for x in ["time_mix_key", "time_mix_receptance"]]): + if param.requires_grad: + self.assertInterval( + param.data, + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif "time_mix_value" in name: + if param.requires_grad: + self.assertInterval( + param.data, + [0.0, 1.3], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_attention_outputs(self): + r""" + Overriding the test_attention_outputs test as the 
attention outputs of Rwkv are different from other models + it has a shape `batch_size, seq_len, hidden_size`. + """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [batch_size, seq_len, config.hidden_size], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [batch_size, seq_len, config.hidden_size], + ) + + @slow + def test_model_from_pretrained(self): + for model_name in RWKV_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RwkvModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@slow +class RWKVIntegrationTests(unittest.TestCase): + def setUp(self): + self.model_id = "RWKV/rwkv-4-169m-pile" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + + def test_simple_generate(self): + expected_output = "Hello my name is Jasmine and I am a newbie to the" + model = RwkvForCausalLM.from_pretrained(self.model_id).to(torch_device) + + input_ids = self.tokenizer("Hello my name is", return_tensors="pt").input_ids.to(torch_device) + output = model.generate(input_ids, max_new_tokens=10) + output_sentence = self.tokenizer.decode(output[0].tolist()) + + self.assertEqual(output_sentence, expected_output) + + def test_simple_generate_bf16(self): + expected_output = "Hello my name is Jasmine and I am a newbie to the" + + input_ids = self.tokenizer("Hello my name is", return_tensors="pt").input_ids.to(torch_device) + model = RwkvForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device) + + output = model.generate(input_ids, max_new_tokens=10) + output_sentence = self.tokenizer.decode(output[0].tolist()) + + self.assertEqual(output_sentence, expected_output) diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 1d249c3b52dd..fdb679529d2b 100644 --- 
a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -93,15 +93,20 @@ class ConfigTester(object): - def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs): + def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): self.parent = parent self.config_class = config_class self.has_text_modality = has_text_modality self.inputs_dict = kwargs + self.common_properties = common_properties def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"] + common_properties = ( + ["hidden_size", "num_attention_heads", "num_hidden_layers"] + if self.common_properties is None + else self.common_properties + ) # Add common fields for text models if self.has_text_modality: From c34a525d2faea2976fbeeabbaaae929d05f8d8a7 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 9 May 2023 19:04:27 +0200 Subject: [PATCH 056/935] Proposed fix for TF example now running on safetensors. (#23208) * Proposed fix for TF example now running on safetensors. * Adding more warnings and returning keys. * Trigger CI * Trigger CI --------- Co-authored-by: Sylvain Gugger --- .../tensorflow/test_tensorflow_examples.py | 1 - src/transformers/modeling_tf_pytorch_utils.py | 35 +++++++++++++++++-- src/transformers/modeling_tf_utils.py | 1 + 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index d5ae4c71b869..956209baade4 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -297,7 +297,6 @@ def test_run_translation(self): result = get_results(tmp_dir) self.assertGreaterEqual(result["bleu"], 30) - @skip("Fix me Matt") def test_run_image_classification(self): tmp_dir = self.get_auto_remove_tmp_dir() testargs = f""" diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 402159cc6f9e..3b1c030699b9 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -246,6 +246,7 @@ def load_pytorch_state_dict_in_tf2_model( output_loading_info=False, _prefix=None, tf_to_pt_weight_rename=None, + ignore_mismatched_sizes=False, ): """Load a pytorch state_dict in a TF 2.0 model.""" import tensorflow as tf @@ -297,6 +298,7 @@ def load_pytorch_state_dict_in_tf2_model( weight_value_tuples = [] all_pytorch_weights = set(pt_state_dict.keys()) missing_keys = [] + mismatched_keys = [] for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name name, transpose = convert_tf_weight_name_to_pt_weight_name( @@ -319,7 +321,18 @@ def load_pytorch_state_dict_in_tf2_model( continue raise AttributeError(f"{name} not found in PyTorch model") - array = apply_transpose(transpose, pt_state_dict[name], symbolic_weight.shape) + try: + array = apply_transpose(transpose, pt_state_dict[name], symbolic_weight.shape) + except tf.errors.InvalidArgumentError as e: + if not ignore_mismatched_sizes: + error_msg = str(e) + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise tf.errors.InvalidArgumentError(error_msg) + else: + mismatched_keys.append((name, pt_state_dict[name].shape, symbolic_weight.shape)) + continue tf_loaded_numel += tensor_size(array) @@ -367,8 +380,26 @@ def load_pytorch_state_dict_in_tf2_model( f"you can already use {tf_model.__class__.__name__} for predictions without further training." ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {tf_model.__class__.__name__} were not initialized from the model checkpoint" + f" are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + if output_loading_info: - loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys} + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + } return tf_model, loading_info return tf_model diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index f48651a6e9cc..35c526379c88 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2820,6 +2820,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): allow_missing_keys=True, output_loading_info=output_loading_info, _prefix=load_weight_prefix, + ignore_mismatched_sizes=ignore_mismatched_sizes, ) # 'by_name' allow us to do transfer learning by skipping/adding layers From 650a71e157478cc8c9d9dc648a6a79108f49e047 Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Tue, 9 May 2023 19:05:13 +0200 Subject: [PATCH 057/935] Support ratios for `logging_steps`, `eval_steps`, and `save_steps` (#23235) * Ratio option for `logging_steps`, `eval_steps`, `save_steps` * Add guards if arguments are not set * Add more detailed comments + formatting * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Convert args values to `int` if bigger than 1 * `black` * `make fixup` --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 8 ++++ src/transformers/training_args.py | 72 +++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d18e3efeb87a..f7fb3558df7f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1712,6 +1712,14 @@ def _inner_training_loop( f" {args.max_steps}" ) + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps and args.logging_steps < 1: + args.logging_steps = math.ceil(max_steps * args.logging_steps) + if args.eval_steps and args.eval_steps < 1: + args.eval_steps = math.ceil(max_steps * args.eval_steps) + if args.save_steps and args.save_steps < 1: + args.save_steps = math.ceil(max_steps * args.save_steps) + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: # nn.DataParallel(model) replicates the model, 
creating new variables and module diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 4b0ea975b26d..44f28ff99ef4 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -251,8 +251,9 @@ class TrainingArguments: logging_first_step (`bool`, *optional*, defaults to `False`): Whether to log and evaluate the first `global_step` or not. - logging_steps (`int`, *optional*, defaults to 500): - Number of update steps between two logs if `logging_strategy="steps"`. + logging_steps (`int` or `float`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in + range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` or `inf` is filtered and the average loss of the current logging window is taken instead. @@ -270,8 +271,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. - save_steps (`int`, *optional*, defaults to 500): - Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_steps (`int` or `float`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. save_total_limit (`int`, *optional*): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. @@ -332,9 +334,10 @@ class TrainingArguments: dataloader_drop_last (`bool`, *optional*, defaults to `False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. - eval_steps (`int`, *optional*): + eval_steps (`int` or `float`, *optional*): Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same - value as `logging_steps` if not set. + value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, + will be interpreted as ratio of total training steps. dataloader_num_workers (`int`, *optional*, defaults to 0): Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. @@ -721,13 +724,29 @@ class TrainingArguments: metadata={"help": "The logging strategy to use."}, ) logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) - logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) + logging_steps: float = field( + default=500, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." 
+ ) + }, + ) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) save_strategy: Union[IntervalStrategy, str] = field( default="steps", metadata={"help": "The checkpoint save strategy to use."}, ) - save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_steps: float = field( + default=500, + metadata={ + "help": ( + "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) save_total_limit: Optional[int] = field( default=None, metadata={ @@ -854,7 +873,15 @@ class TrainingArguments: dataloader_drop_last: bool = field( default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} ) - eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."}) + eval_steps: Optional[float] = field( + default=None, + metadata={ + "help": ( + "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) dataloader_num_workers: int = field( default=0, metadata={ @@ -1186,6 +1213,19 @@ def __post_init__(self): if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1: + if self.logging_steps != int(self.logging_steps): + raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") + self.logging_steps = int(self.logging_steps) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_steps != int(self.eval_steps): + raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") + self.eval_steps = int(self.eval_steps) + if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: + if self.save_steps != int(self.save_steps): + raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") + self.save_steps = int(self.save_steps) + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. if self.load_best_model_at_end: if self.evaluation_strategy != self.save_strategy: @@ -1194,6 +1234,20 @@ def __post_init__(self): f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" ) if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_steps < 1 or self.save_steps < 1: + if not (self.eval_steps < 1 and self.save_steps < 1): + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps" + f"{self.save_steps} and eval_steps {self.eval_steps}." + ) + # Work around floating point precision issues + LARGE_MULTIPLIER = 1_000_000 + if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." 
+ ) raise ValueError( "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." From 627f44799a9f4948a6a1b8fe9e536eee0e29ea68 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 10 May 2023 03:34:48 +0900 Subject: [PATCH 058/935] [Doctests] Refactor doctests + add CI (#22987) * intiial commit * new styling * update * just run doctest in CI * remove more test for fast dev * update * update refs * update path and fetch upstream * update documentatyion trests * typo * parse pwd * don't check for files that are in hidden folders * just give paths relative to transformers * update * update * update * major refactoring * make sure options is ok * lest test that mdx is tested * doctest glob * nits * update doctest nightly * some cleaning * run correct test on diff * debug * run on a single worker * skip_cuda_test tampkate * updates * add rA and continue on failure * test options * parse `py` codeblock? * we don't need to replace ignore results, don't remember whyu I put it * cleanup * more cleaning * fix arg * more cleaning * clean an todo * more pre-processing * doctest-module has none so extra `- ` is needed * remove logs * nits * doctest-modules .... * oups * let's use sugar * make dataset go quiet * add proper timeout * nites * spleling timeout * update * properly skip tests that have CUDSA * proper skipping * cleaning main and get tests to run * remove make report? * remove tee * some updates * tee was removed but is the full output still available? * [all-test] * only our tests * don't touch tee in this PR * no atee-sys * proper sub * monkey * only replace call * fix sub * nits * nits * fix invalid syntax * add skip cuda doctest env variable * make sure all packages are installed * move file * update check repo * revert changes * nit * finish cleanup * fix re * findall * update don't test init files * ignore pycache * `-ignore-pycache` when running pytests * try to fix the import missmatch error * install dec * pytest is required as doctest_utils imports things from it * the only log issues were dataset, ignore results should work * more cleaning * Update .circleci/create_circleci_config.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * [ydshieh] empty string if cuda is found * [ydshieh] fix condition * style * [ydshieh] fix * Add comment * style * style * show failure * trigger CI --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: ydshieh --- .circleci/config.yml | 1 + .circleci/create_circleci_config.py | 85 ++++++++++- .github/workflows/doctests.yml | 8 - Makefile | 8 +- conftest.py | 12 +- docs/source/en/testing.mdx | 12 +- setup.cfg | 1 + src/transformers/utils/__init__.py | 1 + src/transformers/utils/doctest_utils.py | 189 ++++++++++++++++++++++++ utils/prepare_for_doc_test.py | 148 ------------------- 10 files changed, 287 insertions(+), 178 deletions(-) create mode 100644 src/transformers/utils/doctest_utils.py delete mode 100644 utils/prepare_for_doc_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 63c6162fc15d..9af649045803 100644 --- a/.circleci/config.yml 
+++ b/.circleci/config.yml @@ -166,6 +166,7 @@ jobs: - v0.6-repository_consistency - run: pip install --upgrade pip - run: pip install .[all,quality] + - run: pip install pytest - save_cache: key: v0.5-repository_consistency-{{ checksum "setup.py" }} paths: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7208d876a97c..30898f9e1c2a 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -51,6 +51,8 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" + # This should be only used for doctest job! + command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. @@ -107,11 +109,15 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = "" + if self.command_timeout: + test_command = f"timeout {self.command_timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + if self.parallelism == 1: if self.tests_to_run is None: test_command += " << pipeline.parameters.tests_to_run >>" @@ -161,12 +167,37 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = "" + if self.timeout: + test_command = f"timeout {self.timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - test_command += " | tee tests_output.txt" + + if self.name == "pr_documentation_tests": + # can't use ` | tee tee tests_output.txt` as usual + test_command += " > tests_output.txt" + # Save the return code, so we can check if it is timeout in the next step. + test_command += '; touch "$?".txt' + # Never fail the test step for the doctest job. We will check the results in the next step, and fail that + # step instead if the actual test failures are found. This is to avoid the timeout being reported as test + # failure. 
+ test_command = f"({test_command}) || true" + else: + test_command += " | tee tests_output.txt" steps.append({"run": {"name": "Run tests", "command": test_command}}) + + # return code `124` means the previous (pytest run) step is timeout + if self.name == "pr_documentation_tests": + checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; ' + checkout_doctest_command += 'then echo "some test failed"; ' + checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; ' + checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; ' + checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; ' + checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;' + steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}}) + steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -401,6 +432,51 @@ def job_name(self): tests_to_run="tests/repo_utils", ) +# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). +py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' +py_command = f"$(python3 -c '{py_command}')" +command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' +doc_test_job = CircleCIJob( + "pr_documentation_tests", + additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, + install_steps=[ + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", + "pip install --upgrade pip", + "pip install -e .[dev]", + "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade pytest pytest-sugar", + "find -name __pycache__ -delete", + "find . -name \*.pyc -delete", + # Add an empty file to keep the test step running correctly even no file is selected to be tested. + "touch dummy.py", + { + "name": "Get files to test", + "command": + "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" + "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' 
| grep -Ev '__' > pr_documentation_tests.txt" + }, + { + "name": "List files beings changed: pr_documentation_tests.txt", + "command": + "cat pr_documentation_tests.txt" + }, + { + "name": "Filter pr_documentation_tests.txt", + "command": + command + }, + { + "name": "List files beings tested: pr_documentation_tests_filtered.txt", + "command": + "cat pr_documentation_tests_filtered.txt" + }, + ], + tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa + pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, + command_timeout=1200, # test cannot run longer than 1200 seconds + pytest_num_workers=1, +) + REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -411,6 +487,7 @@ def job_name(self): hub_job, onnx_job, exotic_models_job, + doc_test_job ] EXAMPLES_TESTS = [ examples_torch_job, diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index a0efe40cbbe9..55c09b1acc82 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -37,18 +37,10 @@ jobs: - name: Show installed libraries and their versions run: pip freeze - - name: Prepare files for doctests - run: | - python3 utils/prepare_for_doc_test.py src docs - - name: Run doctests run: | python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" - - name: Clean files after doctests - run: | - python3 utils/prepare_for_doc_test.py src docs --remove_new_line - - name: Failure short reports if: ${{ failure() }} continue-on-error: true diff --git a/Makefile b/Makefile index 5e5a11a1fee0..d6d6966a1dad 100644 --- a/Makefile +++ b/Makefile @@ -47,10 +47,10 @@ repo-consistency: # this target runs checks on all files quality: - black --check $(check_dirs) setup.py + black --check $(check_dirs) setup.py conftest.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - ruff $(check_dirs) setup.py + ruff $(check_dirs) setup.py conftest.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python utils/check_doc_toc.py @@ -65,8 +65,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black $(check_dirs) setup.py - ruff $(check_dirs) setup.py --fix + black $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py conftest.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/conftest.py b/conftest.py index 53efec7a6c2d..c57fac2b1d9c 100644 --- a/conftest.py +++ b/conftest.py @@ -20,6 +20,10 @@ import warnings from os.path import abspath, dirname, join +import _pytest + +from transformers.utils.doctest_utils import HfDoctestModule, HfDocTestParser + # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. 
@@ -38,9 +42,7 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) - config.addinivalue_line( - "markers", "is_pipeline_test: mark test to run only when pipelines are tested" - ) + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") @@ -66,7 +68,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') +IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") OutputChecker = doctest.OutputChecker @@ -79,3 +81,5 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker +_pytest.doctest.DoctestModule = HfDoctestModule +doctest.DocTestParser = HfDocTestParser diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx index 4663b8ac4d93..5adbc8e44db7 100644 --- a/docs/source/en/testing.mdx +++ b/docs/source/en/testing.mdx @@ -212,20 +212,12 @@ Example: ```""" ``` -3 steps are required to debug the docstring examples: -1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: -```bash -python utils/prepare_for_doc_test.py -``` -2. Then, you can use the following line to automatically test every docstring example in the desired file: +Just run the following line to automatically test every docstring example in the desired file: ```bash pytest --doctest-modules ``` -3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: -```bash -python utils/prepare_for_doc_test.py --remove_new_line -``` +If the file has a markdown extention, you should add the `--doctest-glob="*.mdx"` argument. ### Run only modified tests diff --git a/setup.cfg b/setup.cfg index 5f47c5c6be69..8b84d3a6d9b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,3 @@ [tool:pytest] doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS +doctest_glob=**/*.mdx \ No newline at end of file diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 42e856d9e4ac..0600eb382818 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -27,6 +27,7 @@ copy_func, replace_return_docstrings, ) +from .doctest_utils import HfDocTestParser from .generic import ( ContextManagers, ExplicitEnum, diff --git a/src/transformers/utils/doctest_utils.py b/src/transformers/utils/doctest_utils.py new file mode 100644 index 000000000000..90f37e8ce694 --- /dev/null +++ b/src/transformers/utils/doctest_utils.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
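The new `doctest_utils` module that follows is built around `preprocess_string`, which rewrites a docstring or `.mdx` snippet before doctest sees it. The sketch below is an illustration rather than part of the patch; it assumes an editable install of this branch, and the snippet string is made up for the example.

```python
# Illustrative call to preprocess_string (defined in the module below). The expected
# behaviour in the comments follows from the implementation, not from a captured run.
from transformers.utils.doctest_utils import preprocess_string

fence = "`" * 3  # triple backtick, kept out of the literal so this example nests cleanly
snippet = (
    f"{fence}python\n"
    ">>> from datasets import load_dataset\n"
    '>>> ds = load_dataset("rotten_tomatoes")\n'
    f"{fence}"
)

# The load_dataset line comes back with " # doctest: +IGNORE_RESULT" appended, so the
# dataset download logs cannot fail the doctest. A snippet matching the cuda heuristic
# would instead make the function return an empty string when skip_cuda_tests=True.
print(preprocess_string(snippet, skip_cuda_tests=True))
```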
+"""
+Utils to run the documentation tests without having to overwrite any files.
+
+The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is
+made, since the printed output would otherwise make the corresponding line fail.
+
+To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules`
+"""
+import doctest
+import inspect
+import os
+import re
+from typing import Iterable
+
+from _pytest.doctest import (
+    Module,
+    _get_checker,
+    _get_continue_on_failure,
+    _get_runner,
+    _is_mocked,
+    _patch_unwrap_mock_aware,
+    get_optionflags,
+    import_path,
+)
+from _pytest.outcomes import skip
+from pytest import DoctestItem
+
+
+def preprocess_string(string, skip_cuda_tests):
+    """Prepare a docstring or a `.mdx` file to be run by doctest.
+
+    The argument `string` would be the whole file content if it is a `.mdx` file. For a python file, it would be one of
+    its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and
+    cuda usage is detected (with a heuristic), this method will return an empty string so no doctest will be run for
+    `string`.
+    """
+    codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)"
+    codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string)
+    is_cuda_found = False
+    for i, codeblock in enumerate(codeblocks):
+        if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock:
+            codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock)
+        if (
+            (">>>" in codeblock or "..." in codeblock)
+            and re.search(r"cuda|to\(0\)|device=0", codeblock)
+            and skip_cuda_tests
+        ):
+            is_cuda_found = True
+            break
+    modified_string = ""
+    if not is_cuda_found:
+        modified_string = "".join(codeblocks)
+    return modified_string
+
+
+class HfDocTestParser(doctest.DocTestParser):
+    """
+    Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This
+    means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also
+    added anywhere a `load_dataset` call is made, as a print would otherwise fail the corresponding line.
+
+    Tests involving cuda are skipped based on a naive pattern that should be updated if it is not enough.
+    """
+
+    # This regular expression is used to find doctest examples in a
+    # string. It defines three groups: `source` is the source code
+    # (including leading indentation and prompts); `indent` is the
+    # indentation of the first (PS1) line of the source code; and
+    # `want` is the expected output (including leading indentation).
+    # fmt: off
+    _EXAMPLE_RE = re.compile(r'''
+        # Source consists of a PS1 line followed by zero or more PS2 lines.
+        (?P<source>
+            (?:^(?P<indent> [ ]*) >>> .*)    # PS1 line
+            (?:\n           [ ]*  \.\.\. .*)*)  # PS2 lines
+        \n?
+        # Want consists of any non-blank lines that do not start with PS1.
+        (?P<want> (?:(?![ ]*$)    # Not a blank line
+             (?![ ]*>>>)          # Not a line starting with PS1
+             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+             (?:(?!```).)*        # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line)
+             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+             (?:\n|$)             # Match a new line or end of string
+          )*)
+        ''', re.MULTILINE | re.VERBOSE
+    )
+    # fmt: on
+
+    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+    skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False))
+    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+ + def parse(self, string, name=""): + """ + Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before + calling `super().parse` + """ + string = preprocess_string(string, self.skip_cuda_tests) + return super().parse(string, name) + + +class HfDoctestModule(Module): + """ + Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering + tests. + """ + + def collect(self) -> Iterable[DoctestItem]: + class MockAwareDocTestFinder(doctest.DocTestFinder): + """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. + + https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 + """ + + def _find_lineno(self, obj, source_lines): + """Doctest code does not take into account `@property`, this + is a hackish way to fix it. https://bugs.python.org/issue17446 + + Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be + reported upstream. #8796 + """ + if isinstance(obj, property): + obj = getattr(obj, "fget", obj) + + if hasattr(obj, "__wrapped__"): + # Get the main obj in case of it being wrapped + obj = inspect.unwrap(obj) + + # Type ignored because this is a private function. + return super()._find_lineno( # type:ignore[misc] + obj, + source_lines, + ) + + def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: + if _is_mocked(obj): + return + with _patch_unwrap_mock_aware(): + # Type ignored because this is a private function. + super()._find( # type:ignore[misc] + tests, obj, name, module, source_lines, globs, seen + ) + + if self.path.name == "conftest.py": + module = self.config.pluginmanager._importconftest( + self.path, + self.config.getoption("importmode"), + rootpath=self.config.rootpath, + ) + else: + try: + module = import_path( + self.path, + root=self.config.rootpath, + mode=self.config.getoption("importmode"), + ) + except ImportError: + if self.config.getvalue("doctest_ignore_import_errors"): + skip("unable to import module %r" % self.path) + else: + raise + + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + finder = MockAwareDocTestFinder(parser=HfDocTestParser()) + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + optionflags = get_optionflags(self) + runner = _get_runner( + verbose=False, + optionflags=optionflags, + checker=_get_checker(), + continue_on_failure=_get_continue_on_failure(self.config), + ) + for test in finder.find(module, module.__name__): + if test.examples: # skip empty doctests and cuda + yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) diff --git a/utils/prepare_for_doc_test.py b/utils/prepare_for_doc_test.py deleted file mode 100644 index c55f3540d994..000000000000 --- a/utils/prepare_for_doc_test.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Style utils to preprocess files for doc tests. 
- - The doc precossing function can be run on a list of files and/org - directories of files. It will recursively check if the files have - a python code snippet by looking for a ```python or ```py syntax. - In the default mode - `remove_new_line==False` the script will - add a new line before every python code ending ``` line to make - the docstrings ready for pytest doctests. - However, we don't want to have empty lines displayed in the - official documentation which is why the new line command can be - reversed by adding the flag `--remove_new_line` which sets - `remove_new_line==True`. - - When debugging the doc tests locally, please make sure to - always run: - - ```python utils/prepare_for_doc_test.py src docs``` - - before running the doc tests: - - ```pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"``` - - Afterwards you should revert the changes by running - - ```python utils/prepare_for_doc_test.py src docs --remove_new_line``` -""" - -import argparse -import os - - -def process_code_block(code, add_new_line=True): - if add_new_line: - return maybe_append_new_line(code) - else: - return maybe_remove_new_line(code) - - -def maybe_append_new_line(code): - """ - Append new line if code snippet is a - Python code snippet - """ - lines = code.split("\n") - - if lines[0] in ["py", "python"]: - # add new line before last line being ``` - last_line = lines[-1] - lines.pop() - lines.append("\n" + last_line) - - return "\n".join(lines) - - -def maybe_remove_new_line(code): - """ - Remove new line if code snippet is a - Python code snippet - """ - lines = code.split("\n") - - if lines[0] in ["py", "python"]: - # add new line before last line being ``` - lines = lines[:-2] + lines[-1:] - - return "\n".join(lines) - - -def process_doc_file(code_file, add_new_line=True): - """ - Process given file. - - Args: - code_file (`str` or `os.PathLike`): The file in which we want to style the docstring. - """ - with open(code_file, "r", encoding="utf-8", newline="\n") as f: - code = f.read() - - # fmt: off - splits = code.split("```") - if len(splits) % 2 != 1: - raise ValueError("The number of occurrences of ``` should be an even number.") - - splits = [s if i % 2 == 0 else process_code_block(s, add_new_line=add_new_line) for i, s in enumerate(splits)] - clean_code = "```".join(splits) - # fmt: on - - diff = clean_code != code - if diff: - print(f"Overwriting content of {code_file}.") - with open(code_file, "w", encoding="utf-8", newline="\n") as f: - f.write(clean_code) - - -def process_doc_files(*files, add_new_line=True): - """ - Applies doc styling or checks everything is correct in a list of files. - - Args: - files (several `str` or `os.PathLike`): The files to treat. - Whether to restyle file or just check if they should be restyled. - - Returns: - List[`str`]: The list of files changed or that should be restyled. 
- """ - for file in files: - # Treat folders - if os.path.isdir(file): - files = [os.path.join(file, f) for f in os.listdir(file)] - files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")] - process_doc_files(*files, add_new_line=add_new_line) - else: - try: - process_doc_file(file, add_new_line=add_new_line) - except Exception: - print(f"There is a problem in {file}.") - raise - - -def main(*files, add_new_line=True): - process_doc_files(*files, add_new_line=add_new_line) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.") - parser.add_argument( - "--remove_new_line", - action="store_true", - help="Whether to remove new line after each python code block instead of adding one.", - ) - args = parser.parse_args() - - main(*args.files, add_new_line=not args.remove_new_line) From a0c0a7823393f7276f835fef815eabcadd8eaf64 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 9 May 2023 14:59:38 -0400 Subject: [PATCH 059/935] v4.30.0.dev0 --- README.md | 8 ++++---- examples/flax/question-answering/run_qa.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- .../pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- .../pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- .../pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- .../pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- 
examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 45 files changed, 48 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index fdceaa1643e0..7b0ff6f293cf 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. @@ -400,7 +400,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. 
**[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. -1. **[OpenLlama](https://huggingface.co/docs/transformers/main/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. @@ -422,9 +422,9 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. 
**[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 5a9f5eb16e0f..78d51fa7e949 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ffd98152d77c..ac31c2a00db2 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 8e038ac13679..a2f8ba2e60b6 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index e9beb8dcf917..f689864271ee 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index c30349c37aaa..9fb0da7c35f2 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 4b4fee5b5175..10a8ece093f3 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index e30699435431..6a900ff76137 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 3f7ef47c6a67..7af1b3e0b5ba 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 874b7c651248..344f5d9b5b81 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index a94585d39698..126029150b9e 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.25.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 020a6f10dde9..3ffa7a5bba3e 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 4bb750a0b024..e7f5e88b2912 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 7b8d67980251..eced8377bcdc 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 9334de8c0331..30138bfbc31a 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index ab955d5b941a..1af82f5fc2da 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index a0660f0085f3..add403dfb8ad 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 6d3987c8feb4..21f2e1bf04bc 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index b0dcf6e5d836..29591a457793 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index fc4b0e0288be..42b5b7b6580b 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index ba7e82cc6470..05c85bdf50e3 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 67d9fb8b455d..2e363ae97098 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 3c0ac4dfbc22..314c4a201522 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index e1027f5d67b4..0f989351eefb 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 00d115646ac4..783810be64b7 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 8c4f4352489a..73eea0b79f33 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 419be107b164..22a29fc73054 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index e083e68848ef..c89881b0beb9 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 37ea3bcfbb9e..ea09c6b89e06 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index dd81d535df7b..8f9fa5c1a4b3 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index c71581f7811c..2fbacade06c6 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 88139986b286..4397b4a9433b 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 9e5dd8d31bd2..3391bcecfc2e 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 800312839440..bc51fab14e04 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index d31a6a8ca035..035bb3b06d98 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 29f3e49f0a07..e52050308ab2 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 35359a2fa708..2c696244f629 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index ee6ebfb46901..61c6cea2cd94 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 9c6a90f1dc1b..d3ddca3f134c 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 7059a9a03212..a42d5111814a 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index b60a2129166d..f3195d39d96d 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 8aa3d4c7fe80..df062c342e5d 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 5f45f752c503..1f31c69245fb 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 41e52e7f7c8e..1d8115f66845 100644 --- a/setup.py +++ b/setup.py @@ -428,7 +428,7 @@ def run(self): setup( name="transformers", - version="4.29.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.30.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 375966131e0d..2f1e04c3cb6b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.29.0.dev0" +__version__ = "4.30.0.dev0" from typing import TYPE_CHECKING From 69ee46243c40ea61f63d4b8f78d171ad27b4a046 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 15:26:15 -0400 Subject: [PATCH 060/935] Revert "[Doctests] Refactor doctests + add CI" (#23245) Revert "[Doctests] Refactor doctests + add CI (#22987)" This reverts commit 627f44799a9f4948a6a1b8fe9e536eee0e29ea68. --- .circleci/config.yml | 1 - .circleci/create_circleci_config.py | 85 +---------- .github/workflows/doctests.yml | 8 + Makefile | 8 +- conftest.py | 12 +- docs/source/en/testing.mdx | 12 +- setup.cfg | 1 - src/transformers/utils/__init__.py | 1 - src/transformers/utils/doctest_utils.py | 189 ------------------------ utils/prepare_for_doc_test.py | 148 +++++++++++++++++++ 10 files changed, 178 insertions(+), 287 deletions(-) delete mode 100644 src/transformers/utils/doctest_utils.py create mode 100644 utils/prepare_for_doc_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 9af649045803..63c6162fc15d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -166,7 +166,6 @@ jobs: - v0.6-repository_consistency - run: pip install --upgrade pip - run: pip install .[all,quality] - - run: pip install pytest - save_cache: key: v0.5-repository_consistency-{{ checksum "setup.py" }} paths: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 30898f9e1c2a..7208d876a97c 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -51,8 +51,6 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" - # This should be only used for doctest job! - command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. 
@@ -109,15 +107,11 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = "" - if self.command_timeout: - test_command = f"timeout {self.command_timeout} " - test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) - + test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) if self.parallelism == 1: if self.tests_to_run is None: test_command += " << pipeline.parameters.tests_to_run >>" @@ -167,37 +161,12 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) - test_command = "" - if self.timeout: - test_command = f"timeout {self.timeout} " - test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - - if self.name == "pr_documentation_tests": - # can't use ` | tee tee tests_output.txt` as usual - test_command += " > tests_output.txt" - # Save the return code, so we can check if it is timeout in the next step. - test_command += '; touch "$?".txt' - # Never fail the test step for the doctest job. We will check the results in the next step, and fail that - # step instead if the actual test failures are found. This is to avoid the timeout being reported as test - # failure. - test_command = f"({test_command}) || true" - else: - test_command += " | tee tests_output.txt" + test_command += " | tee tests_output.txt" steps.append({"run": {"name": "Run tests", "command": test_command}}) - - # return code `124` means the previous (pytest run) step is timeout - if self.name == "pr_documentation_tests": - checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; ' - checkout_doctest_command += 'then echo "some test failed"; ' - checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; ' - checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; ' - checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; ' - checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;' - steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}}) - steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -432,51 +401,6 @@ def job_name(self): tests_to_run="tests/repo_utils", ) -# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). 
-py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' -py_command = f"$(python3 -c '{py_command}')" -command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' -doc_test_job = CircleCIJob( - "pr_documentation_tests", - additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", - "pip install --upgrade pip", - "pip install -e .[dev]", - "pip install git+https://github.com/huggingface/accelerate", - "pip install --upgrade pytest pytest-sugar", - "find -name __pycache__ -delete", - "find . -name \*.pyc -delete", - # Add an empty file to keep the test step running correctly even no file is selected to be tested. - "touch dummy.py", - { - "name": "Get files to test", - "command": - "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" - "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt" - }, - { - "name": "List files beings changed: pr_documentation_tests.txt", - "command": - "cat pr_documentation_tests.txt" - }, - { - "name": "Filter pr_documentation_tests.txt", - "command": - command - }, - { - "name": "List files beings tested: pr_documentation_tests_filtered.txt", - "command": - "cat pr_documentation_tests_filtered.txt" - }, - ], - tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa - pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, - command_timeout=1200, # test cannot run longer than 1200 seconds - pytest_num_workers=1, -) - REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -487,7 +411,6 @@ def job_name(self): hub_job, onnx_job, exotic_models_job, - doc_test_job ] EXAMPLES_TESTS = [ examples_torch_job, diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 55c09b1acc82..a0efe40cbbe9 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -37,10 +37,18 @@ jobs: - name: Show installed libraries and their versions run: pip freeze + - name: Prepare files for doctests + run: | + python3 utils/prepare_for_doc_test.py src docs + - name: Run doctests run: | python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" + - name: Clean files after doctests + run: | + python3 utils/prepare_for_doc_test.py src docs --remove_new_line + - name: Failure short reports if: ${{ failure() }} continue-on-error: true diff --git a/Makefile b/Makefile index d6d6966a1dad..5e5a11a1fee0 100644 --- a/Makefile +++ b/Makefile @@ -47,10 +47,10 @@ repo-consistency: # this target runs checks on all files quality: - black --check $(check_dirs) setup.py conftest.py + black --check $(check_dirs) setup.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - ruff $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python 
utils/check_doc_toc.py @@ -65,8 +65,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black $(check_dirs) setup.py conftest.py - ruff $(check_dirs) setup.py conftest.py --fix + black $(check_dirs) setup.py + ruff $(check_dirs) setup.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/conftest.py b/conftest.py index c57fac2b1d9c..53efec7a6c2d 100644 --- a/conftest.py +++ b/conftest.py @@ -20,10 +20,6 @@ import warnings from os.path import abspath, dirname, join -import _pytest - -from transformers.utils.doctest_utils import HfDoctestModule, HfDocTestParser - # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. @@ -42,7 +38,9 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) - config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") + config.addinivalue_line( + "markers", "is_pipeline_test: mark test to run only when pipelines are tested" + ) config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") @@ -68,7 +66,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") +IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') OutputChecker = doctest.OutputChecker @@ -81,5 +79,3 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker -_pytest.doctest.DoctestModule = HfDoctestModule -doctest.DocTestParser = HfDocTestParser diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx index 5adbc8e44db7..4663b8ac4d93 100644 --- a/docs/source/en/testing.mdx +++ b/docs/source/en/testing.mdx @@ -212,12 +212,20 @@ Example: ```""" ``` +3 steps are required to debug the docstring examples: +1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: +```bash +python utils/prepare_for_doc_test.py +``` -Just run the following line to automatically test every docstring example in the desired file: +2. Then, you can use the following line to automatically test every docstring example in the desired file: ```bash pytest --doctest-modules ``` -If the file has a markdown extention, you should add the `--doctest-glob="*.mdx"` argument. +3. 
Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: +```bash +python utils/prepare_for_doc_test.py --remove_new_line +``` ### Run only modified tests diff --git a/setup.cfg b/setup.cfg index 8b84d3a6d9b9..5f47c5c6be69 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,2 @@ [tool:pytest] doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -doctest_glob=**/*.mdx \ No newline at end of file diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 0600eb382818..42e856d9e4ac 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -27,7 +27,6 @@ copy_func, replace_return_docstrings, ) -from .doctest_utils import HfDocTestParser from .generic import ( ContextManagers, ExplicitEnum, diff --git a/src/transformers/utils/doctest_utils.py b/src/transformers/utils/doctest_utils.py deleted file mode 100644 index 90f37e8ce694..000000000000 --- a/src/transformers/utils/doctest_utils.py +++ /dev/null @@ -1,189 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utils to run the documentation tests without having to overwrite any files. - -The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is -made as a print would otherwise fail the corresonding line. - -To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules -""" -import doctest -import inspect -import os -import re -from typing import Iterable - -from _pytest.doctest import ( - Module, - _get_checker, - _get_continue_on_failure, - _get_runner, - _is_mocked, - _patch_unwrap_mock_aware, - get_optionflags, - import_path, -) -from _pytest.outcomes import skip -from pytest import DoctestItem - - -def preprocess_string(string, skip_cuda_tests): - """Prepare a docstring or a `.mdx` file to be run by doctest. - - The argument `string` would be the whole file content if it is a `.mdx` file. For a python file, it would be one of - its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and a - cuda stuff is detective (with a heuristic), this method will return an empty string so no doctest will be run for - `string`. - """ - codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)" - codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string) - is_cuda_found = False - for i, codeblock in enumerate(codeblocks): - if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock: - codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock) - if ( - (">>>" in codeblock or "..." 
in codeblock) - and re.search(r"cuda|to\(0\)|device=0", codeblock) - and skip_cuda_tests - ): - is_cuda_found = True - break - modified_string = "" - if not is_cuda_found: - modified_string = "".join(codeblocks) - return modified_string - - -class HfDocTestParser(doctest.DocTestParser): - """ - Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This - means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also - added anywhere a `load_dataset` call is made as a print would otherwise fail the corresponding line. - - Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough. - """ - - # This regular expression is used to find doctest examples in a - # string. It defines three groups: `source` is the source code - # (including leading indentation and prompts); `indent` is the - # indentation of the first (PS1) line of the source code; and - # `want` is the expected output (including leading indentation). - # fmt: off - _EXAMPLE_RE = re.compile(r''' - # Source consists of a PS1 line followed by zero or more PS2 lines. - (?P - (?:^(?P [ ]*) >>> .*) # PS1 line - (?:\n [ ]* \.\.\. .*)*) # PS2 lines - \n? - # Want consists of any non-blank lines that do not start with PS1. - (?P (?:(?![ ]*$) # Not a blank line - (?![ ]*>>>) # Not a line starting with PS1 - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:(?!```).)* # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:\n|$) # Match a new line or end of string - )*) - ''', re.MULTILINE | re.VERBOSE - ) - # fmt: on - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False)) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - - def parse(self, string, name=""): - """ - Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before - calling `super().parse` - """ - string = preprocess_string(string, self.skip_cuda_tests) - return super().parse(string, name) - - -class HfDoctestModule(Module): - """ - Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering - tests. - """ - - def collect(self) -> Iterable[DoctestItem]: - class MockAwareDocTestFinder(doctest.DocTestFinder): - """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. - - https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 - """ - - def _find_lineno(self, obj, source_lines): - """Doctest code does not take into account `@property`, this - is a hackish way to fix it. https://bugs.python.org/issue17446 - - Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be - reported upstream. #8796 - """ - if isinstance(obj, property): - obj = getattr(obj, "fget", obj) - - if hasattr(obj, "__wrapped__"): - # Get the main obj in case of it being wrapped - obj = inspect.unwrap(obj) - - # Type ignored because this is a private function. - return super()._find_lineno( # type:ignore[misc] - obj, - source_lines, - ) - - def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: - if _is_mocked(obj): - return - with _patch_unwrap_mock_aware(): - # Type ignored because this is a private function. 
- super()._find( # type:ignore[misc] - tests, obj, name, module, source_lines, globs, seen - ) - - if self.path.name == "conftest.py": - module = self.config.pluginmanager._importconftest( - self.path, - self.config.getoption("importmode"), - rootpath=self.config.rootpath, - ) - else: - try: - module = import_path( - self.path, - root=self.config.rootpath, - mode=self.config.getoption("importmode"), - ) - except ImportError: - if self.config.getvalue("doctest_ignore_import_errors"): - skip("unable to import module %r" % self.path) - else: - raise - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - finder = MockAwareDocTestFinder(parser=HfDocTestParser()) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - optionflags = get_optionflags(self) - runner = _get_runner( - verbose=False, - optionflags=optionflags, - checker=_get_checker(), - continue_on_failure=_get_continue_on_failure(self.config), - ) - for test in finder.find(module, module.__name__): - if test.examples: # skip empty doctests and cuda - yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) diff --git a/utils/prepare_for_doc_test.py b/utils/prepare_for_doc_test.py new file mode 100644 index 000000000000..c55f3540d994 --- /dev/null +++ b/utils/prepare_for_doc_test.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Style utils to preprocess files for doc tests. + + The doc precossing function can be run on a list of files and/org + directories of files. It will recursively check if the files have + a python code snippet by looking for a ```python or ```py syntax. + In the default mode - `remove_new_line==False` the script will + add a new line before every python code ending ``` line to make + the docstrings ready for pytest doctests. + However, we don't want to have empty lines displayed in the + official documentation which is why the new line command can be + reversed by adding the flag `--remove_new_line` which sets + `remove_new_line==True`. 
+ + When debugging the doc tests locally, please make sure to + always run: + + ```python utils/prepare_for_doc_test.py src docs``` + + before running the doc tests: + + ```pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"``` + + Afterwards you should revert the changes by running + + ```python utils/prepare_for_doc_test.py src docs --remove_new_line``` +""" + +import argparse +import os + + +def process_code_block(code, add_new_line=True): + if add_new_line: + return maybe_append_new_line(code) + else: + return maybe_remove_new_line(code) + + +def maybe_append_new_line(code): + """ + Append new line if code snippet is a + Python code snippet + """ + lines = code.split("\n") + + if lines[0] in ["py", "python"]: + # add new line before last line being ``` + last_line = lines[-1] + lines.pop() + lines.append("\n" + last_line) + + return "\n".join(lines) + + +def maybe_remove_new_line(code): + """ + Remove new line if code snippet is a + Python code snippet + """ + lines = code.split("\n") + + if lines[0] in ["py", "python"]: + # add new line before last line being ``` + lines = lines[:-2] + lines[-1:] + + return "\n".join(lines) + + +def process_doc_file(code_file, add_new_line=True): + """ + Process given file. + + Args: + code_file (`str` or `os.PathLike`): The file in which we want to style the docstring. + """ + with open(code_file, "r", encoding="utf-8", newline="\n") as f: + code = f.read() + + # fmt: off + splits = code.split("```") + if len(splits) % 2 != 1: + raise ValueError("The number of occurrences of ``` should be an even number.") + + splits = [s if i % 2 == 0 else process_code_block(s, add_new_line=add_new_line) for i, s in enumerate(splits)] + clean_code = "```".join(splits) + # fmt: on + + diff = clean_code != code + if diff: + print(f"Overwriting content of {code_file}.") + with open(code_file, "w", encoding="utf-8", newline="\n") as f: + f.write(clean_code) + + +def process_doc_files(*files, add_new_line=True): + """ + Applies doc styling or checks everything is correct in a list of files. + + Args: + files (several `str` or `os.PathLike`): The files to treat. + Whether to restyle file or just check if they should be restyled. + + Returns: + List[`str`]: The list of files changed or that should be restyled. 
+ """ + for file in files: + # Treat folders + if os.path.isdir(file): + files = [os.path.join(file, f) for f in os.listdir(file)] + files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")] + process_doc_files(*files, add_new_line=add_new_line) + else: + try: + process_doc_file(file, add_new_line=add_new_line) + except Exception: + print(f"There is a problem in {file}.") + raise + + +def main(*files, add_new_line=True): + process_doc_files(*files, add_new_line=add_new_line) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.") + parser.add_argument( + "--remove_new_line", + action="store_true", + help="Whether to remove new line after each python code block instead of adding one.", + ) + args = parser.parse_args() + + main(*args.files, add_new_line=not args.remove_new_line) From 366a8ca09e8dd92dfb1956c8be3118b5a2b13639 Mon Sep 17 00:00:00 2001 From: Kunhao ZHENG Date: Tue, 9 May 2023 22:58:39 +0200 Subject: [PATCH 061/935] Fix `from_config` (#23246) fix --- src/transformers/models/auto/auto_factory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index f8bc266fe832..17cc7f95799f 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -407,8 +407,7 @@ def from_config(cls, config, **kwargs): repo_id, class_ref = class_ref.split("--") else: repo_id = config.name_or_path - module_file, class_name = class_ref.split(".") - model_class = get_class_from_dynamic_module(repo_id, module_file + ".py", class_name, **kwargs) + model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs) return model_class._from_config(config, **kwargs) elif type(config) in cls._model_mapping.keys(): model_class = _get_model_class(config, cls._model_mapping) From 3335724376319a0c453049d0cd883504f530ff52 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 20:37:57 -0400 Subject: [PATCH 062/935] Test composition (#23214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove nestedness in tool config * Really do it * Use remote tools descriptions * Work * Clean up eval * Changes * Tools * Tools * tool * Fix everything * Use last result/assign for evaluation * Prompt * Remove hardcoded selection * Evaluation for chat agents * correct some spelling * Small fixes * Change summarization model (#23172) * Fix link displayed * Update description of the tool * Fixes in chat prompt * Custom tools, custom prompt * Tool clean up * save_pretrained and push_to_hub for tool * Fix init * Tests * Fix tests * Tool save/from_hub/push_to_hub and tool->load_tool * Clean push_to_hub and add app file * Custom inference API for endpoints too * Clean up * old remote tool and new remote tool * Make a requirements * return_code adds tool creation * Avoid redundancy between global variables * Remote tools can be loaded * Tests * Text summarization tests * Quality * Properly mark tests * Test the python interpreter * And the CI shall be green. * fix loading of additional tools * Work on RemoteTool and fix tests * General clean up * Guard imports * Fix tools * docs: Fix broken link in 'How to add a model...' 
(#23216) fix link * Get default endpoint from the Hub * Add guide * Simplify tool config * Docs * Some fixes * Docs * Docs * Docs * Fix code returned by agent * Try this * Match args with signature in remote tool * Should fix python interpreter for Python 3.8 * Fix push_to_hub for tools * Other fixes to push_to_hub * Add API doc page * Docs * Docs * Custom tools * Pin tensorflow-probability (#23220) * Pin tensorflow-probability * [all-test] * [all-test] Fix syntax for bash * PoC for some chaining API * Text to speech * J'ai pris des libertés * Rename * Basic python interpreter * Add agents * Quality * Add translation tool * temp * GenQA + LID + S2T * Quality + word missing in translation * Add open assistance, support f-strings in evaluate * captioning + s2t fixes * Style * Refactor descriptions and remove chain * Support errors and rename OpenAssistantAgent * Add setup * Deal with typos + example of inference API * Some rename + README * Fixes * Update prompt * Unwanted change * Make sure everyone has a default * One prompt to rule them all. * SD * Description * Clean up remote tools * More remote tools * Add option to return code and update doc * Image segmentation * ControlNet * Gradio demo * Diffusers protection * Lib protection * ControlNet description * Cleanup * Style * Remove accelerate and try to be reproducible * No randomness * Male Basic optional in token * Clean description * Better prompts * Fix args eval in interpreter * Add tool wrapper * Tool on the Hub * Style post-rebase * Big refactor of descriptions, batch generation and evaluation for agents * Make problems easier - interface to debug * More problems, add python primitives * Back to one prompt * Remove dict for translation * Be consistent * Add prompts * New version of the agent * Evaluate new agents * New endpoints agents * Make all tools a dict variable * Typo * Add problems * Add to big prompt * Harmonize * Add tools * New evaluation * Add more tools * Build prompt with tools descriptions * Tools on the Hub * Let's chat! * Cleanup * Temporary bs4 safeguard * Cache agents and clean up * Blank init * Fix evaluation for agents * New format for tools on the Hub * Add method to reset state * Remove nestedness in tool config * Really do it * Use remote tools descriptions * Work * Clean up eval * Changes * Tools * Tools * tool * Fix everything * Use last result/assign for evaluation * Prompt * Remove hardcoded selection * Evaluation for chat agents * correct some spelling * Small fixes * Change summarization model (#23172) * Fix link displayed * Update description of the tool * Fixes in chat prompt * Custom tools, custom prompt * Tool clean up * save_pretrained and push_to_hub for tool * Fix init * Tests * Fix tests * Tool save/from_hub/push_to_hub and tool->load_tool * Clean push_to_hub and add app file * Custom inference API for endpoints too * Clean up * old remote tool and new remote tool * Make a requirements * return_code adds tool creation * Avoid redundancy between global variables * Remote tools can be loaded * Tests * Text summarization tests * Quality * Properly mark tests * Test the python interpreter * And the CI shall be green. 
* Work on RemoteTool and fix tests * fix loading of additional tools * General clean up * Guard imports * Fix tools * Get default endpoint from the Hub * Simplify tool config * Add guide * Docs * Some fixes * Docs * Docs * Fix code returned by agent * Try this * Docs * Match args with signature in remote tool * Should fix python interpreter for Python 3.8 * Fix push_to_hub for tools * Other fixes to push_to_hub * Add API doc page * Fixes * Doc fixes * Docs * Fix audio * Custom tools * Audio fix * Improve custom tools docstring * Docstrings * Trigger CI * Mode docstrings * More docstrings * Improve custom tools * Fix for remote tools * Style * Fix repo consistency * Quality * Tip * Cleanup on doc * Cleanup toc * Add disclaimer for starcoder vs openai * Remove disclaimer * Small fixed in the prompts * 4.29 * Update src/transformers/tools/agents.py Co-authored-by: Lysandre Debut * Complete documentation * Small fixes * Agent evaluation * Note about gradio-tools & LC * Clean up agents and prompt * Apply suggestions from code review Co-authored-by: Patrick von Platen * Apply suggestions from code review Co-authored-by: Patrick von Platen * Note about gradio-tools & LC * Add copyrights and address review comments * Quality * Add all language codes * Add remote tool tests * Move custom prompts to other docs * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * TTS tests * Quality --------- Co-authored-by: Lysandre Co-authored-by: Patrick von Platen Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Co-authored-by: Connor Henderson Co-authored-by: Lysandre Co-authored-by: Lysandre Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- conftest.py | 1 + docs/source/en/_toctree.yml | 6 + docs/source/en/custom_tools.mdx | 503 ++++++++++++ docs/source/en/main_classes/agent.mdx | 64 ++ docs/source/en/transformers_agents.mdx | 329 ++++++++ src/transformers/__init__.py | 13 + src/transformers/dynamic_module_utils.py | 33 +- src/transformers/image_utils.py | 4 + src/transformers/testing_utils.py | 16 + src/transformers/tools/__init__.py | 73 ++ src/transformers/tools/agents.py | 489 ++++++++++++ src/transformers/tools/base.py | 722 ++++++++++++++++++ .../tools/document_question_answering.py | 80 ++ src/transformers/tools/evaluate_agent.py | 692 +++++++++++++++++ src/transformers/tools/image_captioning.py | 51 ++ .../tools/image_question_answering.py | 57 ++ src/transformers/tools/image_segmentation.py | 60 ++ src/transformers/tools/prompts.py | 186 +++++ src/transformers/tools/python_interpreter.py | 238 ++++++ src/transformers/tools/speech_to_text.py | 41 + src/transformers/tools/text_classification.py | 70 ++ .../tools/text_question_answering.py | 52 ++ src/transformers/tools/text_summarization.py | 52 ++ src/transformers/tools/text_to_speech.py | 65 ++ src/transformers/tools/translation.py | 271 +++++++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/hub.py | 13 +- src/transformers/utils/import_utils.py | 15 + tests/tools/__init__.py | 0 .../tools/test_document_question_answering.py | 57 ++ tests/tools/test_image_captioning.py | 53 ++ tests/tools/test_image_question_answering.py | 53 ++ tests/tools/test_image_segmentation.py | 53 ++ tests/tools/test_python_interpreter.py | 124 +++ tests/tools/test_speech_to_text.py | 38 + tests/tools/test_text_classification.py | 43 ++ tests/tools/test_text_question_answering.py | 52 ++ tests/tools/test_text_summarization.py | 64 ++ 
tests/tools/test_text_to_speech.py | 54 ++ tests/tools/test_tools_common.py | 100 +++ tests/tools/test_translation.py | 53 ++ 41 files changed, 4933 insertions(+), 8 deletions(-) create mode 100644 docs/source/en/custom_tools.mdx create mode 100644 docs/source/en/main_classes/agent.mdx create mode 100644 docs/source/en/transformers_agents.mdx create mode 100644 src/transformers/tools/__init__.py create mode 100644 src/transformers/tools/agents.py create mode 100644 src/transformers/tools/base.py create mode 100644 src/transformers/tools/document_question_answering.py create mode 100644 src/transformers/tools/evaluate_agent.py create mode 100644 src/transformers/tools/image_captioning.py create mode 100644 src/transformers/tools/image_question_answering.py create mode 100644 src/transformers/tools/image_segmentation.py create mode 100644 src/transformers/tools/prompts.py create mode 100644 src/transformers/tools/python_interpreter.py create mode 100644 src/transformers/tools/speech_to_text.py create mode 100644 src/transformers/tools/text_classification.py create mode 100644 src/transformers/tools/text_question_answering.py create mode 100644 src/transformers/tools/text_summarization.py create mode 100644 src/transformers/tools/text_to_speech.py create mode 100644 src/transformers/tools/translation.py create mode 100644 tests/tools/__init__.py create mode 100644 tests/tools/test_document_question_answering.py create mode 100644 tests/tools/test_image_captioning.py create mode 100644 tests/tools/test_image_question_answering.py create mode 100644 tests/tools/test_image_segmentation.py create mode 100644 tests/tools/test_python_interpreter.py create mode 100644 tests/tools/test_speech_to_text.py create mode 100644 tests/tools/test_text_classification.py create mode 100644 tests/tools/test_text_question_answering.py create mode 100644 tests/tools/test_text_summarization.py create mode 100644 tests/tools/test_text_to_speech.py create mode 100644 tests/tools/test_tools_common.py create mode 100644 tests/tools/test_translation.py diff --git a/conftest.py b/conftest.py index 53efec7a6c2d..683b47705bf4 100644 --- a/conftest.py +++ b/conftest.py @@ -43,6 +43,7 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") + config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule") def pytest_addoption(parser): diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c92f21a93458..c2c50082274d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -21,6 +21,8 @@ title: Set up distributed training with 🤗 Accelerate - local: model_sharing title: Share your model + - local: transformers_agents + title: Agents title: Tutorials - sections: - sections: @@ -99,6 +101,8 @@ title: Notebooks with examples - local: community title: Community resources + - local: custom_tools + title: Custom Tools - local: troubleshooting title: Troubleshoot title: Developer guides @@ -179,6 +183,8 @@ title: Conceptual guides - sections: - sections: + - local: main_classes/agent + title: Agents and Tools - local: model_doc/auto title: Auto Classes - local: main_classes/callback diff --git a/docs/source/en/custom_tools.mdx b/docs/source/en/custom_tools.mdx new file mode 100644 index 000000000000..f69a2cde90d7 --- /dev/null +++ b/docs/source/en/custom_tools.mdx @@ -0,0 
+1,503 @@ + + +# Custom Tools and Prompts + + + +If you are not aware of what tools and agents are in the context of transformers, we recommend you read the +[Transformers Agents](transformers_agents) page first. + + + + + +Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks. +In this guide we'll take a look at: + +- How to customize the prompt +- How to use custom tools +- How to create custom tools + +## Customizing the prompt + +As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode. +Both the run and chat mode underlie the same logic. The language model powering the agent is conditioned on a long prompt +and simply asked to complete the prompt by generating next tokens until the stop token is reached. +The only difference between the `run` and `chat` mode is that during the `chat` mode the prompt is extended with +previous user inputs and model generations, which seemingly gives the agent a memory and allows it to refer to +past interactions. + +Let's take a closer look into how the prompt is structured to understand how it can be best customized. +The prompt is structured broadly into four parts. + +- 1. Introduction: how the agent should behave, explanation of the concept of tools. +- 2. Description of all the tools. This is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user. +- 3. A set of examples of tasks and their solution +- 4. Current example, and request for solution. + +To better understand each part, let's look at a shortened version of how such a prompt can look like in practice. + +``` +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about an document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to caption, and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. 
+
+Answer:
+```py
+answer = document_qa(document, question="What is the oldest person?")
+print(f"The answer is {answer}.")
+image = image_generator("A banner showing " + answer)
+```
+
+[...]
+
+Task: "Draw me a picture of rivers and lakes"
+
+I will use the following
+```
+
+The first part explains precisely how the model shall behave and what it should do. This part
+most likely does not need to be customized.
+
+TODO(PVP) - explain better how the .description and .name influence the prompt
+
+### Customizing the tool descriptions
+
+The performance of the agent is directly linked to the prompt itself. We structure the prompt so that it works well
+with what we intend for the agent to do, but for maximum customization we also offer the ability to specify a different prompt when instantiating the agent.
+
+### Customizing the single-execution prompt
+
+In order to specify a custom single-execution prompt, one would do the following:
+
+```py
+template = """ [...] """
+
+agent = HfAgent(your_endpoint, run_prompt_template=template)
+```
+
+
+
+Please make sure to have the `<>` string defined somewhere in the `template` so that the agent can be aware
+of the tools it has available to it.
+
+
+
+### Customizing the chat-execution prompt
+
+In order to specify a custom chat prompt, one would do the following:
+
+```
+template = """ [...] """
+
+agent = HfAgent(
+    url_endpoint=your_endpoint,
+    token=your_hf_token,
+    chat_prompt_template=template
+)
+```
+
+
+
+Please make sure to have the `<>` string defined somewhere in the `template` so that the agent can be
+aware of the tools it has available to it.
+
+
+
+## Using custom tools
+
+In this section, we'll be leveraging two existing custom tools that are specific to image generation:
+
+- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
+  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
+  to allow for more image modifications.
+- We add a new tool for image upscaling to the default toolbox:
+  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool), which complements the existing tools rather than replacing any of them.
+
+We'll start by loading the custom tools with the convenient [`load_tool`] function:
+
+```py
+from transformers import load_tool
+
+controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
+upscaler = load_tool("diffusers/latent-upscaler-tool")
+```
+
+Upon adding custom tools to an agent, the tools' descriptions and names are automatically
+included in the agent's prompt. Thus, it is imperative that custom tools have
+a well-written description and name in order for the agent to understand how to use them.
+Let's take a look at the description and name of `controlnet_transformer`:
+
+```py
+print(f"Description: '{controlnet_transformer.description}'")
+print(f"Name: '{controlnet_transformer.name}'")
+```
+
+which gives:
+```
+Description: 'This is a tool that transforms an image with ControlNet according to a prompt.
+It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
+Name: 'image_transformer'
+```
+
+The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools).
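+If a custom tool's wording ever needs adjusting for your use case, both attributes are plain strings and can be
+overridden on the loaded tool before it is handed to the agent. Here is a minimal sketch (the replacement wording
+below is only an illustration, not the tool's official description):
+
+```py
+# Hypothetical tweak: `description` (and `name`) are simple string attributes that the agent
+# reads when building its prompt, so they can be edited before the tool is passed along.
+controlnet_transformer.description = (
+    "This is a tool that transforms an image with ControlNet according to a prompt. "
+    "It takes two inputs: `image` and `prompt`, and returns the modified image."
+)
+```
+
+Keep in mind that the name and description are the only information the agent gets about a tool, so they should
+state clearly what the tool does, what inputs it expects, and what it returns.
+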
+Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`:
+
+```py
+tools = [controlnet_transformer, upscaler]
+agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
+```
+
+This command should give you the following info:
+
+```
+image_transformer has been replaced by as provided in `additional_tools`
+```
+
+The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool.
+
+
+
+Overwriting existing tools can be beneficial if we want to use a custom tool for exactly the same task as an existing tool,
+because the agent is already well-versed in that specific task. Beware that the custom tool should follow the exact same API
+as the overwritten tool in this case.
+
+
+
+The upscaler tool was given the name `image_upscaler`, which is not yet present in the default toolbox and is therefore simply added to the list of tools.
+You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute:
+
+```py
+print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
+```
+
+```
+- document_qa
+- image_captioner
+- image_qa
+- image_segmenter
+- transcriber
+- summarizer
+- text_classifier
+- text_qa
+- text_reader
+- translator
+- image_transformer
+- text_downloader
+- image_generator
+- video_generator
+- image_upscaler
+```
+
+Note how `image_upscaler` is now part of the agent's toolbox.
+
+Let's now try out the new tools! We will re-use the image we generated in the [Transformers Agents Quickstart](./transformers_agents#single-execution-run).
+
+```py
+from diffusers.utils import load_image
+
+image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
+)
+```
+
+
+
+Let's transform the image into a beautiful winter landscape:
+
+```py
+image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
+```
+
+```
+==Explanation from the agent==
+I will use the following tool: `image_transformer` to transform the image.
+
+
+==Code generated by the agent==
+image = image_transformer(image, prompt="A frozen lake and snowy forest")
+```
+
+
+
+The new image processing tool is based on ControlNet, which can make very strong modifications to the image.
+By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it.
+
+```py
+image = agent.run("Upscale the image", image)
+```
+
+```
+==Explanation from the agent==
+I will use the following tool: `image_upscaler` to upscale the image.
+
+
+==Code generated by the agent==
+upscaled_image = image_upscaler(image)
+```
+
+
+
+The agent automatically mapped our prompt "Upscale the image" to the newly added upscaler tool purely based on its
+description and name, and was able to run it correctly.
+
+Next, let's have a look at how you can create a new custom tool.
+
+### Adding new tools
+
+In this section we show how to create a new tool that can be added to the agent.
+
+#### Creating a new tool
+
+We'll start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model with the most
+downloads for a given task on the Hugging Face Hub.
+ +We can do that with the following code: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`. + +How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the +main attributes necessary. We'll create a class that inherits from it: + +```python +from transformers import Tool + + +class HFModelDownloadsTool(Tool): + pass +``` + +This class has a few needs: +- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a + performative name, we'll name it `model_download_counter`. +- An attribute `description`, which will be used to populate the prompt of the agent. +- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected + values, which can be `text`, `image`, or `audio`. +- A `__call__` method which contains the inference code. This is the code we've played with above! + +Here's what our class looks like now: + +```python +from transformers import Tool +from huggingface_hub import list_models + + +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " + "It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." + ) + + inputs = ["text"] + outputs = ["text"] + + def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +We now have our tool handy. Save it in a file and import it from your main script. Let's name this file +`model_downloads.py`, so the resulting import code looks like this: + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. To do so, just call `push_to_hub` on the `tool` variable: + +```python +tool.push_to_hub("lysandre/hf-model-downloads") +``` + +You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it. + +#### Having the agent use the tool + +We now have our tool that lives on the Hub which can be instantiated as such: + +```python +from transformers import load_tool + +tool = load_tool("lysandre/hf-model-downloads") +``` + +In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method: + +```python +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) + +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" 
+) +``` +which outputs the following: +``` +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") +audio_model = text_reader(model) + + +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. +``` + +and generates the following audio. + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|