More utils doc (#25457)

* Document and clean more utils. * More documentation and fixes * Switch to Lysandre's token * Address review comments * Actually put else
huggingface · Aug 17, 2023 · 2defb6b · 2defb6b
1 parent 36f183e
commit 2defb6b
Show file tree

Hide file tree

Showing 9 changed files with 412 additions and 85 deletions.
diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml
@@ -24,4 +24,4 @@ jobs:
 
       - name: Update metadata
         run: |
-          python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
+          python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }}
diff --git a/setup.py b/setup.py
@@ -17,33 +17,34 @@
 
 To create the package for pypi.
 
-1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
-   documentation.
+1. Create the release branch named: v<RELEASE>-release, for example v4.19-release. For a patch release checkout the
+   current release branch.
 
    If releasing on a special branch, copy the updated README.md on the main branch for the commit you will make
    for the post-release and run `make fix-copies` on the main branch as well.
 
-2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
+2. Run `make pre-release` (or `make pre-patch` for a patch release) and commit these changes with the message:
+   "Release: <VERSION>" and push.
 
-3. Unpin specific versions from setup.py that use a git install.
+3. Go back to the main branch and run `make post-release` then `make fix-copies`. Commit these changes with the
+   message "v<NEXT_VERSION>.dev.0" and push to main.
 
-4. Checkout the release branch (v<RELEASE>-release, for example v4.19-release), and commit these changes with the
-   message: "Release: <VERSION>" and push.
+# If you were just cutting the branch in preparation for a release, you can stop here for now.
 
-5. Wait for the tests on main to be completed and be green (otherwise revert and fix bugs)
+4. Wait for the tests on the release branch to be completed and be green (otherwise revert and fix bugs)
 
-6. Add a tag in git to mark the release: "git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi' "
+5. On the release branch, add a tag in git to mark the release: "git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi' "
    Push the tag to git: git push --tags origin v<RELEASE>-release
 
-7. Build both the sources and the wheel. Do not change anything in setup.py between
+6. Build both the sources and the wheel. Do not change anything in setup.py between
    creating the wheel and the source distribution (obviously).
 
    Run `make build-release`. This will build the release and do some sanity checks for you. If this ends with an error
    message, you need to fix things before going further.
 
    You should now have a /dist directory with both .whl and .tar.gz source versions.
 
-8. Check that everything looks correct by uploading the package to the pypi test server:
+7. Check that everything looks correct by uploading the package to the pypi test server:
 
    twine upload dist/* -r testpypi
    (pypi suggest using twine as other methods upload files via plaintext.)
@@ -60,13 +61,10 @@
 
    If making a patch release, double check the bug you are patching is indeed resolved.
 
-9. Upload the final version to actual pypi:
+8. Upload the final version to actual pypi:
    twine upload dist/* -r pypi
 
-10. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
-
-11. Run `make post-release` then run `make fix-copies`. If you were on a branch for the release,
-    you need to go back to main before executing this.
+9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
 """
 
 import os

diff --git a/utils/check_table.py b/utils/check_table.py
@@ -12,11 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that checks the big table in the file docs/source/en/index.md and potentially updates it.
 
+Use from the root of the repo with:
+
+```bash
+python utils/check_table.py
+```
+
+for a check that will error in case of inconsistencies (used by `make repo-consistency`).
+
+To auto-fix issues run:
+
+```bash
+python utils/check_table.py --fix_and_overwrite
+```
+
+which is used by `make fix-copies`.
+"""
 import argparse
 import collections
 import os
 import re
+from typing import List
 
 from transformers.utils import direct_transformers_import
 
@@ -28,19 +47,28 @@
 REPO_PATH = "."
 
 
-def _find_text_in_file(filename, start_prompt, end_prompt):
+def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> str:
     """
-    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
-    lines.
+    Find the text in filename between two prompts.
+
+    Args:
+        filename (`str`): The file to search into.
+        start_prompt (`str`): A string to look for at the start of the content searched.
+        end_prompt (`str`): A string that will mark the end of the content to look for.
+
+    Returns:
+        `Tuple[str, int, int, List[str]]`: The content between the prompts, its start and end line indices, and all lines.
     """
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
+
     # Find the start prompt.
     start_index = 0
     while not lines[start_index].startswith(start_prompt):
         start_index += 1
     start_index += 1
 
+    # Now go until the end prompt.
     end_index = start_index
     while not lines[end_index].startswith(end_prompt):
         end_index += 1
@@ -54,35 +82,60 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
     return "".join(lines[start_index:end_index]), start_index, end_index, lines
 
 
-# Add here suffixes that are used to identify models, separated by |
-ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration"
-# Regexes that match TF/Flax/PT model names.
+# Regexes that match TF/Flax/PT model names. Add here suffixes that are used to identify models, separated by |
 _re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
 _re_flax_models = re.compile(r"Flax(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
-# Will match any TF or Flax model too so need to be in an else branch afterthe two previous regexes.
+# Will match any TF or Flax model too so need to be in an else branch after the two previous regexes.
 _re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
 
 
 # This is to make sure the transformers module imported is the one in the repo.
 transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
 
 
-# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
-def camel_case_split(identifier):
-    "Split a camelcased `identifier` into words."
+def camel_case_split(identifier: str) -> List[str]:
+    """
+    Split a camel-cased name into words.
+
+    Args:
+        identifier (`str`): The camel-cased name to parse.
+
+    Returns:
+        `List[str]`: The list of words in the identifier (as separated by capital letters).
+
+    Example:
+
+    ```py
+    >>> camel_case_split("CamelCasedClass")
+    ["Camel", "Cased", "Class"]
+    ```
+    """
+    # Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
     matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return [m.group(0) for m in matches]
 
 
-def _center_text(text, width):
+def _center_text(text: str, width: int) -> str:
+    """
+    Utility that will add spaces on the left and right of a text to make it centered for a given width.
+
+    Args:
+        text (`str`): The text to center.
+        width (`int`): The desired length of the result.
+
+    Returns:
+        `str`: A text of length `width` with the original `text` in the middle.
+    """
     text_length = 2 if text == "✅" or text == "❌" else len(text)
     left_indent = (width - text_length) // 2
     right_indent = width - text_length - left_indent
     return " " * left_indent + text + " " * right_indent
 
 
-def get_model_table_from_auto_modules():
-    """Generates an up-to-date model table from the content of the auto modules."""
+def get_model_table_from_auto_modules() -> str:
+    """
+    Generates an up-to-date model table from the content of the auto modules.
+    """
     # Dictionary model names to config.
     config_maping_names = transformers_module.models.auto.configuration_auto.CONFIG_MAPPING_NAMES
     model_name_to_config = {
@@ -92,7 +145,7 @@ def get_model_table_from_auto_modules():
     }
     model_name_to_prefix = {name: config.replace("Config", "") for name, config in model_name_to_config.items()}
 
-    # Dictionaries flagging if each model prefix has a slow/fast tokenizer, backend in PT/TF/Flax.
+    # Dictionaries flagging if each model prefix has a backend in PT/TF/Flax.
     pt_models = collections.defaultdict(bool)
     tf_models = collections.defaultdict(bool)
     flax_models = collections.defaultdict(bool)
@@ -145,7 +198,13 @@ def get_model_table_from_auto_modules():
 
 
 def check_model_table(overwrite=False):
-    """Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`."""
+    """
+    Check the model table in the index.md is consistent with the state of the lib and potentially fix it.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the table when it's not up to date.
+    """
     current_table, start_index, end_index, lines = _find_text_in_file(
         filename=os.path.join(PATH_TO_DOCS, "index.md"),
         start_prompt="<!--This table is updated automatically from the auto modules",

diff --git a/utils/check_task_guides.py b/utils/check_task_guides.py
@@ -12,7 +12,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that checks the list of models in the tips in the task-specific pages of the doc is up to date and potentially
+fixes it.
 
+Use from the root of the repo with:
+
+```bash
+python utils/check_task_guides.py
+```
+
+for a check that will error in case of inconsistencies (used by `make repo-consistency`).
+
+To auto-fix issues run:
+
+```bash
+python utils/check_task_guides.py --fix_and_overwrite
+```
+
+which is used by `make fix-copies`.
+"""
 import argparse
 import os
 
@@ -25,10 +44,17 @@
 PATH_TO_TASK_GUIDES = "docs/source/en/tasks"
 
 
-def _find_text_in_file(filename, start_prompt, end_prompt):
+def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> str:
     """
-    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
-    lines.
+    Find the text in filename between two prompts.
+
+    Args:
+        filename (`str`): The file to search into.
+        start_prompt (`str`): A string to look for at the start of the content searched.
+        end_prompt (`str`): A string that will mark the end of the content to look for.
+
+    Returns:
+        `Tuple[str, int, int, List[str]]`: The content between the prompts, its start and end line indices, and all lines.
     """
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
@@ -38,6 +64,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
         start_index += 1
     start_index += 1
 
+    # Now go until the end prompt.
     end_index = start_index
     while not lines[end_index].startswith(end_prompt):
         end_index += 1
@@ -54,6 +81,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
 # This is to make sure the transformers module imported is the one in the repo.
 transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
 
+# Map between a task guide and the corresponding auto class.
 TASK_GUIDE_TO_MODELS = {
     "asr.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_CTC_MAPPING_NAMES,
     "audio_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@@ -81,9 +109,15 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
 }
 
 
-def get_model_list_for_task(task_guide):
+def get_model_list_for_task(task_guide: str) -> str:
     """
-    Return the list of models supporting given task.
+    Return the list of models supporting a given task.
+
+    Args:
+        task_guide (`str`): The name of the task guide to check.
+
+    Returns:
+        `str`: The list of models supporting this task, as links to their respective doc pages separated by commas.
     """
     model_maping_names = TASK_GUIDE_TO_MODELS[task_guide]
     special_model_types = SPECIAL_TASK_GUIDE_TO_MODEL_TYPES.get(task_guide, set())
@@ -95,9 +129,17 @@ def get_model_list_for_task(task_guide):
     return ", ".join([f"[{name}](../model_doc/{code})" for code, name in model_names.items()]) + "\n"
 
 
-def check_model_list_for_task(task_guide, overwrite=False):
-    """For a given task guide, checks the model list in the generated tip for consistency with the state of the lib and overwrites if needed."""
-
+def check_model_list_for_task(task_guide: str, overwrite: bool = False):
+    """
+    For a given task guide, checks the model list in the generated tip for consistency with the state of the lib and
+    updates it if needed.
+
+    Args:
+        task_guide (`str`):
+            The name of the task guide to check.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the model list when it's not up to date.
+    """
     current_list, start_index, end_index, lines = _find_text_in_file(
         filename=os.path.join(PATH_TO_TASK_GUIDES, task_guide),
         start_prompt="<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->",