diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml
index 78f99a10919e..60bda35169e3 100644
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@@ -24,4 +24,4 @@ jobs:
       - name: Update metadata
         run: |
-          python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
+          python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }}
diff --git a/setup.py b/setup.py
index 5047727141e5..4212e311d460 100644
--- a/setup.py
+++ b/setup.py
@@ -17,25 +17,26 @@
 To create the package for pypi.
 
-1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
-   documentation.
+1. Create the release branch named: v<RELEASE>-release, for example v4.19-release. For a patch release checkout the
+   current release branch.
 
    If releasing on a special branch, copy the updated README.md on the main branch for the commit you will make
   for the post-release and run `make fix-copies` on the main branch as well.
 
-2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
+2. Run `make pre-release` (or `make pre-patch` for a patch release) and commit these changes with the message:
+   "Release: <VERSION>" and push.
 
-3. Unpin specific versions from setup.py that use a git install.
+3. Go back to the main branch and run `make post-release` then `make fix-copies`. Commit these changes with the
+   message "v<NEXT_VERSION>.dev.0" and push to main.
 
-4. Checkout the release branch (v<RELEASE>-release, for example v4.19-release), and commit these changes with the
-   message: "Release: <VERSION>" and push.
+# If you were just cutting the branch in preparation for a release, you can stop here for now.
 
-5. Wait for the tests on main to be completed and be green (otherwise revert and fix bugs)
+4. Wait for the tests on the release branch to be completed and be green (otherwise revert and fix bugs)
 
-6. Add a tag in git to mark the release: "git tag v<RELEASE> -m 'Adds tag v<RELEASE> for pypi' "
+5. On the release branch, add a tag in git to mark the release: "git tag v<RELEASE> -m 'Adds tag v<RELEASE> for pypi' "
    Push the tag to git: git push --tags origin v<RELEASE>-release
 
-7. Build both the sources and the wheel. Do not change anything in setup.py between
+6. Build both the sources and the wheel. Do not change anything in setup.py between
    creating the wheel and the source distribution (obviously).
 
    Run `make build-release`. This will build the release and do some sanity checks for you. If this ends with an error
@@ -43,7 +44,7 @@
 
    You should now have a /dist directory with both .whl and .tar.gz source versions.
 
-8. Check that everything looks correct by uploading the package to the pypi test server:
+7. Check that everything looks correct by uploading the package to the pypi test server:
 
    twine upload dist/* -r testpypi
    (pypi suggests using twine as other methods upload files via plaintext.)
@@ -60,13 +61,10 @@
 
    If making a patch release, double check the bug you are patching is indeed resolved.
 
-9. Upload the final version to actual pypi:
+8. Upload the final version to actual pypi:
    twine upload dist/* -r pypi
 
-10. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
-
-11. Run `make post-release` then run `make fix-copies`. If you were on a branch for the release,
-    you need to go back to main before executing this.
+9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
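For reference, the version arithmetic implied by steps 2 and 3 (`make pre-release` / `make post-release`) is implemented in `utils/release.py`, which appears later in this diff. A rough sketch of the semantics, not the script's exact code:

```py
from packaging.version import parse

# Hypothetical walk-through of the version bumps around a release: on the
# release branch the dev suffix is dropped, and after the release the main
# branch moves to the next minor dev version.
current = parse("4.20.0.dev0")

release_version = current.base_version  # "4.20.0", what `make pre-release` sets
next_dev_version = f"{current.major}.{current.minor + 1}.0.dev0"  # "4.21.0.dev0", set by `make post-release`

print(release_version, next_dev_version)
```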
""" import os diff --git a/utils/check_table.py b/utils/check_table.py index a48310f7c16f..444036af99f1 100644 --- a/utils/check_table.py +++ b/utils/check_table.py @@ -12,11 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Utility that checks the big table in the file docs/source/en/index.md and potentially updates it. +Use from the root of the repo with: + +```bash +python utils/check_inits.py +``` + +for a check that will error in case of inconsistencies (used by `make repo-consistency`). + +To auto-fix issues run: + +```bash +python utils/check_inits.py --fix_and_overwrite +``` + +which is used by `make fix-copies`. +""" import argparse import collections import os import re +from typing import List from transformers.utils import direct_transformers_import @@ -28,19 +47,28 @@ REPO_PATH = "." -def _find_text_in_file(filename, start_prompt, end_prompt): +def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> str: """ - Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty - lines. + Find the text in filename between two prompts. + + Args: + filename (`str`): The file to search into. + start_prompt (`str`): A string to look for at the start of the content searched. + end_prompt (`str`): A string that will mark the end of the content to look for. + + Returns: + `str`: The content between the prompts. """ with open(filename, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() + # Find the start prompt. start_index = 0 while not lines[start_index].startswith(start_prompt): start_index += 1 start_index += 1 + # Now go until the end prompt. end_index = start_index while not lines[end_index].startswith(end_prompt): end_index += 1 @@ -54,12 +82,10 @@ def _find_text_in_file(filename, start_prompt, end_prompt): return "".join(lines[start_index:end_index]), start_index, end_index, lines -# Add here suffixes that are used to identify models, separated by | -ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration" -# Regexes that match TF/Flax/PT model names. +# Regexes that match TF/Flax/PT model names. Add here suffixes that are used to identify models, separated by | _re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)") _re_flax_models = re.compile(r"Flax(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)") -# Will match any TF or Flax model too so need to be in an else branch afterthe two previous regexes. +# Will match any TF or Flax model too so need to be in an else branch after the two previous regexes. _re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)") @@ -67,22 +93,49 @@ def _find_text_in_file(filename, start_prompt, end_prompt): transformers_module = direct_transformers_import(TRANSFORMERS_PATH) -# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python -def camel_case_split(identifier): - "Split a camelcased `identifier` into words." +def camel_case_split(identifier: str) -> List[str]: + """ + Split a camel-cased name into words. + + Args: + identifier (`str`): The camel-cased name to parse. + + Returns: + `List[str]`: The list of words in the identifier (as seprated by capital letters). 
@@ -67,22 +93,49 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
 transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
 
 
-# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
-def camel_case_split(identifier):
-    "Split a camelcased `identifier` into words."
+def camel_case_split(identifier: str) -> List[str]:
+    """
+    Split a camel-cased name into words.
+
+    Args:
+        identifier (`str`): The camel-cased name to parse.
+
+    Returns:
+        `List[str]`: The list of words in the identifier (as separated by capital letters).
+
+    Example:
+
+    ```py
+    >>> camel_case_split("CamelCasedClass")
+    ["Camel", "Cased", "Class"]
+    ```
+    """
+    # Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
     matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return [m.group(0) for m in matches]
 
 
-def _center_text(text, width):
+def _center_text(text: str, width: int) -> str:
+    """
+    Utility that will add spaces on the left and right of a text to make it centered for a given width.
+
+    Args:
+        text (`str`): The text to center.
+        width (`int`): The desired length of the result.
+
+    Returns:
+        `str`: A text of length `width` with the original `text` in the middle.
+    """
     text_length = 2 if text == "✅" or text == "❌" else len(text)
     left_indent = (width - text_length) // 2
     right_indent = width - text_length - left_indent
     return " " * left_indent + text + " " * right_indent
 
 
-def get_model_table_from_auto_modules():
-    """Generates an up-to-date model table from the content of the auto modules."""
+def get_model_table_from_auto_modules() -> str:
+    """
+    Generates an up-to-date model table from the content of the auto modules.
+    """
     # Dictionary model names to config.
     config_maping_names = transformers_module.models.auto.configuration_auto.CONFIG_MAPPING_NAMES
     model_name_to_config = {
@@ -92,7 +145,7 @@ def get_model_table_from_auto_modules():
     }
     model_name_to_prefix = {name: config.replace("Config", "") for name, config in model_name_to_config.items()}
 
-    # Dictionaries flagging if each model prefix has a slow/fast tokenizer, backend in PT/TF/Flax.
+    # Dictionaries flagging if each model prefix has a backend in PT/TF/Flax.
     pt_models = collections.defaultdict(bool)
     tf_models = collections.defaultdict(bool)
     flax_models = collections.defaultdict(bool)
@@ -145,7 +198,13 @@ def get_model_table_from_auto_modules():
 
 
 def check_model_table(overwrite=False):
-    """Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`."""
+    """
+    Check the model table in the index.md is consistent with the state of the lib and potentially fix it.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the table when it's not up to date.
+    """
     current_table, start_index, end_index, lines = _find_text_in_file(
         filename=os.path.join(PATH_TO_DOCS, "index.md"),
         start_prompt="<!--This table is updated automatically from the auto modules",
diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py
index d250ce7e6481..cdf2aaffdf4b 100644
--- a/utils/custom_init_isort.py
+++ b/utils/custom_init_isort.py
@@ -12,12 +12,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that sorts the imports in the custom inits of Transformers. Transformers uses init files that delay the
+import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
+make the line `import transformers` very slow when the user has all optional dependencies installed. The inits with
+delayed imports have two halves: one defining a dictionary `_import_structure` which maps modules to the name of the
+objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. `isort` or `ruff`
+properly sort the second half, which looks like traditional imports; the goal of this script is to sort the first half.
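To visualize what that docstring describes, here is a minimal sketch of such a delayed-import init (hypothetical module and class names; real inits are much larger and also guard optional dependencies):

```py
from typing import TYPE_CHECKING

from transformers.utils import _LazyModule

# First half: the dictionary this script sorts. Keys are submodules, values
# are the names of the objects each submodule defines.
_import_structure = {
    "configuration_foo": ["FooConfig"],
    "modeling_foo": ["FooModel"],
}

if TYPE_CHECKING:
    # Second half: regular imports that only type-checkers see; isort/ruff
    # already know how to sort these.
    from .configuration_foo import FooConfig
    from .modeling_foo import FooModel
else:
    import sys

    # Replace the module with a lazy proxy that imports submodules on access.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
```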
+
+Use from the root of the repo with:
+
+```bash
+python utils/custom_init_isort.py
+```
+
+which will auto-sort the imports (used in `make style`).
+
+For a check only (as used in `make quality`) run:
+
+```bash
+python utils/custom_init_isort.py --check_only
+```
+"""
 import argparse
 import os
 import re
+from typing import Any, Callable, List, Optional
 
 
+# Path is defined with the intent that you should run this script from the root of the repo.
 PATH_TO_TRANSFORMERS = "src/transformers"
 
 # Pattern that looks at the indentation in a line.
@@ -32,17 +55,30 @@
 _re_bracket_content = re.compile(r"\[([^\]]+)\]")
 
 
-def get_indent(line):
-    """Returns the indent in `line`."""
+def get_indent(line: str) -> str:
+    """Returns the indent in given line (as string)."""
     search = _re_indent.search(line)
     return "" if search is None else search.groups()[0]
 
 
-def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_prompt=None):
+def split_code_in_indented_blocks(
+    code: str, indent_level: str = "", start_prompt: Optional[str] = None, end_prompt: Optional[str] = None
+) -> List[str]:
     """
-    Split `code` into its indented blocks, starting at `indent_level`. If provided, begins splitting after
-    `start_prompt` and stops at `end_prompt` (but returns what's before `start_prompt` as a first block and what's
-    after `end_prompt` as a last block, so `code` is always the same as joining the result of this function).
+    Split some code into its indented blocks, starting at a given level.
+
+    Args:
+        code (`str`): The code to split.
+        indent_level (`str`): The indent level (as string) to use for identifying the blocks to split.
+        start_prompt (`str`, *optional*): If provided, only starts splitting at the line where this text is.
+        end_prompt (`str`, *optional*): If provided, stops splitting at a line where this text is.
+
+    Warning:
+        The text before `start_prompt` or after `end_prompt` (if provided) is not ignored, just not split. The input
+        `code` can thus be retrieved by joining the result.
+
+    Returns:
+        `List[str]`: The list of blocks.
     """
     # Let's split the code into lines and move to start_index.
     index = 0
@@ -54,12 +90,17 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
     else:
         blocks = []
 
-    # We split into blocks until we get to the `end_prompt` (or the end of the block).
+    # This variable contains the block treated at a given time.
     current_block = [lines[index]]
     index += 1
+    # We split into blocks until we get to the `end_prompt` (or the end of the file).
     while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)):
+        # We have a non-empty line with the proper indent -> start of a new block
         if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level:
+            # Store the current block in the result and reset it. There are two cases: the line is part of the block
+            # (like a closing parenthesis) or not.
             if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "):
+                # Line is part of the current block
                 current_block.append(lines[index])
                 blocks.append("\n".join(current_block))
                 if index < len(lines) - 1:
                     current_block = [lines[index + 1]]
                     index += 1
@@ -68,9 +109,11 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
                 else:
                     current_block = []
             else:
+                # Line is not part of the current block
                 blocks.append("\n".join(current_block))
                 current_block = [lines[index]]
         else:
+            # Just add the line to the current block
             current_block.append(lines[index])
         index += 1
@@ -85,8 +128,10 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
     return blocks
 
 
-def ignore_underscore(key):
-    "Wraps a `key` (that maps an object to string) to lower case and remove underscores."
+def ignore_underscore_and_lowercase(key: Callable[[Any], str]) -> Callable[[Any], str]:
+    """
+    Wraps a key function (as used in a sort) to lowercase and ignore underscores.
+    """
 
     def _inner(x):
         return key(x).lower().replace("_", "")
@@ -94,8 +139,21 @@ def _inner(x):
     return _inner
 
 
-def sort_objects(objects, key=None):
-    "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str."
+def sort_objects(objects: List[Any], key: Optional[Callable[[Any], str]] = None) -> List[Any]:
+    """
+    Sort a list of objects following the rules of isort (all uppercased first, camel-cased second and lower-cased
+    last).
+
+    Args:
+        objects (`List[Any]`):
+            The list of objects to sort.
+        key (`Callable[[Any], str]`, *optional*):
+            A function taking an object as input and returning a string, used to sort them by alphabetical order.
+            If not provided, will default to noop (so a `key` must be provided if the `objects` are not of type
+            string).
+
+    Returns:
+        `List[Any]`: The sorted list with the same elements as in the input.
+    """
     # If no key is provided, we use a noop.
     def noop(x):
@@ -110,18 +168,26 @@ def noop(x):
 
     # Functions begin with a lowercase, they go last.
     functions = [obj for obj in objects if not key(obj)[0].isupper()]
 
-    key1 = ignore_underscore(key)
+    # Then we sort each group.
+    key1 = ignore_underscore_and_lowercase(key)
     return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)
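To make that three-group ordering concrete, here is a hypothetical call (assuming the repo root is on `sys.path` so the module is importable):

```py
from utils.custom_init_isort import sort_objects

# Constants (fully uppercased) come first, classes (CamelCased) second, and
# functions (lowercased) last; each group is sorted ignoring case and underscores.
names = ["load_model", "BertModel", "CONFIG_NAME", "AutoConfig", "WEIGHTS_NAME"]
print(sort_objects(names))
# ['CONFIG_NAME', 'WEIGHTS_NAME', 'AutoConfig', 'BertModel', 'load_model']
```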
""" with open(file, encoding="utf-8") as f: code = f.read() + # If the file is not a custom init, there is nothing to do. if "_import_structure" not in code: return @@ -234,6 +305,12 @@ def sort_imports(file, check_only=True): def sort_imports_in_all_inits(check_only=True): + """ + Sort the imports defined in the `_import_structure` of all inits in the repo. + + Args: + check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init. + """ failures = [] for root, _, files in os.walk(PATH_TO_TRANSFORMERS): if "__init__.py" in files: diff --git a/utils/release.py b/utils/release.py index 3bb75f0bebf4..8ba0ead7c6ce 100644 --- a/utils/release.py +++ b/utils/release.py @@ -12,7 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Utility that prepares the repository for releases (or patches) by updating all versions in the relevant places. It +also performs some post-release cleanup, by updating the links in the main README to respective model doc pages (from +main to stable). +To prepare for a release, use from the root of the repo on the release branch with: + +```bash +python release.py +``` + +or use `make pre-release`. + +To prepare for a patch release, use from the root of the repo on the release branch with: + +```bash +python release.py --patch +``` + +or use `make pre-patch`. + +To do the post-release cleanup, use from the root of the repo on the main branch with: + +```bash +python release.py --post_release +``` + +or use `make post-release`. +""" import argparse import os import re @@ -20,13 +48,16 @@ import packaging.version +# All paths are defined with the intent that this script should be run from the root of the repo. PATH_TO_EXAMPLES = "examples/" +# This maps a type of file to the pattern to look for when searching where the version is defined, as well as the +# template to follow when replacing it with the new version. REPLACE_PATTERNS = { "examples": (re.compile(r'^check_min_version\("[^"]+"\)\s*$', re.MULTILINE), 'check_min_version("VERSION")\n'), "init": (re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), '__version__ = "VERSION"\n'), "setup": (re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), r'\1version="VERSION",'), - "doc": (re.compile(r'^(\s*)release\s*=\s*"[^"]+"$', re.MULTILINE), 'release = "VERSION"\n'), } +# This maps a type of file to its path in Transformers REPLACE_FILES = { "init": "src/transformers/__init__.py", "setup": "setup.py", @@ -34,19 +65,31 @@ README_FILE = "README.md" -def update_version_in_file(fname, version, pattern): - """Update the version in one file using a specific pattern.""" +def update_version_in_file(fname: str, version: str, file_type: str): + """ + Update the version of Transformers in one file. + + Args: + fname (`str`): The path to the file where we want to update the version. + version (`str`): The new version to set in the file. + file_type (`str`): The type of the file (should be a key in `REPLACE_PATTERNS`). 
 
 
-def update_version_in_examples(version):
-    """Update the version in all examples files."""
+def update_version_in_examples(version: str):
+    """
+    Update the version in all examples files.
+
+    Args:
+        version (`str`): The new version to set in the examples.
+    """
     for folder, directories, fnames in os.walk(PATH_TO_EXAMPLES):
         # Removing some of the folders with non-actively maintained examples from the walk
         if "research_projects" in directories:
@@ -55,19 +98,28 @@
             directories.remove("legacy")
         for fname in fnames:
             if fname.endswith(".py"):
-                update_version_in_file(os.path.join(folder, fname), version, pattern="examples")
+                update_version_in_file(os.path.join(folder, fname), version, file_type="examples")
+
+
+def global_version_update(version: str, patch: bool = False):
+    """
+    Update the version in all needed files.
 
-def global_version_update(version, patch=False):
-    """Update the version in all needed files."""
+    Args:
+        version (`str`): The new version to set everywhere.
+        patch (`bool`, *optional*, defaults to `False`): Whether or not this is a patch release.
+    """
     for pattern, fname in REPLACE_FILES.items():
         update_version_in_file(fname, version, pattern)
     if not patch:
+        # We don't update the version in the examples for patch releases.
         update_version_in_examples(version)
 
 
 def clean_main_ref_in_model_list():
-    """Replace the links from main doc tp stable doc in the model list of the README."""
+    """
+    Replace the links from main doc to stable doc in the model list of the README.
+    """
     # If the introduction or the conclusion of the list change, the prompts may need to be updated.
     _start_prompt = "🤗 Transformers currently provides the following architectures"
     _end_prompt = "1. Want to contribute a new model?"
@@ -94,16 +146,26 @@
     f.writelines(lines)
 
 
-def get_version():
-    """Reads the current version in the __init__."""
+def get_version() -> packaging.version.Version:
+    """
+    Reads the current version in the main __init__.
+    """
     with open(REPLACE_FILES["init"], "r") as f:
         code = f.read()
     default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
     return packaging.version.parse(default_version)
 
 
-def pre_release_work(patch=False):
-    """Do all the necessary pre-release steps."""
+def pre_release_work(patch: bool = False):
+    """
+    Do all the necessary pre-release steps:
+    - figure out the next minor release version and ask confirmation
+    - update the version everywhere
+    - clean-up the model list in the main README
+
+    Args:
+        patch (`bool`, *optional*, defaults to `False`): Whether or not this is a patch release.
+    """
     # First let's get the default version: base version if we are in dev, bump minor otherwise.
     default_version = get_version()
     if patch and default_version.is_devrelease:
@@ -115,7 +177,7 @@
     else:
         default_version = f"{default_version.major}.{default_version.minor + 1}.0"
 
-    # Now let's ask nicely if that's the right one.
+    # Now let's ask nicely if we have found the right version.
     version = input(f"Which version are you releasing? [{default_version}]")
     if len(version) == 0:
         version = default_version
@@ -128,7 +190,12 @@
 
 
 def post_release_work():
-    """Do all the necesarry post-release steps."""
+    """
+    Do all the necessary post-release steps:
+    - figure out the next dev version and ask confirmation
+    - update the version everywhere
+    - clean-up the model list in the main README
+    """
     # First let's get the current version
     current_version = get_version()
     dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"
diff --git a/utils/sort_auto_mappings.py b/utils/sort_auto_mappings.py
index 857af614df0b..8871b6807dfc 100644
--- a/utils/sort_auto_mappings.py
+++ b/utils/sort_auto_mappings.py
@@ -12,12 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that sorts the names in the auto mappings defined in the auto modules in alphabetical order.
+
+Use from the root of the repo with:
+
+```bash
+python utils/sort_auto_mappings.py
+```
+
+to auto-fix all the auto mappings (used in `make style`).
+
+To only check if the mappings are properly sorted (as used in `make quality`), do:
+
+```bash
+python utils/sort_auto_mappings.py --check_only
+```
+"""
 import argparse
 import os
 import re
+from typing import Optional
 
 
+# Paths are set with the intent that you should run this script from the root of the repo.
 PATH_TO_AUTO_MODULE = "src/transformers/models/auto"
 
 
@@ -28,7 +46,18 @@
 _re_identifier = re.compile(r'\s*\(\s*"(\S[^"]+)"')
 
 
-def sort_auto_mapping(fname, overwrite: bool = False):
+def sort_auto_mapping(fname: str, overwrite: bool = False) -> Optional[bool]:
+    """
+    Sort all auto mappings in a file.
+
+    Args:
+        fname (`str`): The name of the file where we want to sort auto-mappings.
+        overwrite (`bool`, *optional*, defaults to `False`): Whether or not to fix and overwrite the file.
+
+    Returns:
+        `Optional[bool]`: Returns `None` if `overwrite=True`. Otherwise returns `True` if the file has an auto-mapping
+        improperly sorted, `False` if the file is okay.
+    """
     with open(fname, "r", encoding="utf-8") as f:
         content = f.read()
 
@@ -37,8 +66,8 @@ def sort_auto_mapping(fname, overwrite: bool = False):
     line_idx = 0
     while line_idx < len(lines):
         if _re_intro_mapping.search(lines[line_idx]) is not None:
-            indent = len(re.search(r"^(\s*)\S", lines[line_idx]).groups()[0]) + 8
             # Start of a new mapping!
+            indent = len(re.search(r"^(\s*)\S", lines[line_idx]).groups()[0]) + 8
             while not lines[line_idx].startswith(" " * indent + "("):
                 new_lines.append(lines[line_idx])
                 line_idx += 1
@@ -65,11 +94,17 @@ def sort_auto_mapping(fname, overwrite: bool = False):
     if overwrite:
         with open(fname, "w", encoding="utf-8") as f:
             f.write("\n".join(new_lines))
-    elif "\n".join(new_lines) != content:
-        return True
+    else:
+        return "\n".join(new_lines) != content
 
 
 def sort_all_auto_mappings(overwrite: bool = False):
+    """
+    Sort all auto mappings in the library.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`): Whether or not to fix and overwrite the file.
+    """
     fnames = [os.path.join(PATH_TO_AUTO_MODULE, f) for f in os.listdir(PATH_TO_AUTO_MODULE) if f.endswith(".py")]
     diffs = [sort_auto_mapping(fname, overwrite=overwrite) for fname in fnames]
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index 3344ae960aae..c0b329263a33 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -15,6 +15,7 @@
 """
 Welcome to tests_fetcher V2.
+
 This util is designed to fetch tests to run on a PR so that only the tests impacted by the modifications are run, and
 when too many models are being impacted, only run the tests of a subset of core models. It works like this.
diff --git a/utils/update_metadata.py b/utils/update_metadata.py
index e760ce2fd075..637cdd959c71 100644
--- a/utils/update_metadata.py
+++ b/utils/update_metadata.py
@@ -12,12 +12,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that updates the metadata of the Transformers library in the repository `huggingface/transformers-metadata`.
+
+Usage for an update (as used by the GitHub action `update_metadata`):
+
+```bash
+python utils/update_metadata.py --token <token> --commit_sha <commit_sha>
+```
+
+Usage to check all pipelines are properly defined in the constant `PIPELINE_TAGS_AND_AUTO_MODELS` of this script, so
+that new pipelines are properly added as metadata (as used in `make repo-consistency`):
+
+```bash
+python utils/update_metadata.py --check-only
+```
+"""
 import argparse
 import collections
 import os
 import re
 import tempfile
+from typing import Dict, List, Tuple
 
 import pandas as pd
 from datasets import Dataset
@@ -102,14 +118,29 @@
 ]
 
 
-# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
-def camel_case_split(identifier):
-    "Split a camelcased `identifier` into words."
+def camel_case_split(identifier: str) -> List[str]:
+    """
+    Split a camel-cased name into words.
+
+    Args:
+        identifier (`str`): The camel-cased name to parse.
+
+    Returns:
+        `List[str]`: The list of words in the identifier (as separated by capital letters).
+
+    Example:
+
+    ```py
+    >>> camel_case_split("CamelCasedClass")
+    ["Camel", "Cased", "Class"]
+    ```
+    """
+    # Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
     matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return [m.group(0) for m in matches]
 
 
-def get_frameworks_table():
+def get_frameworks_table() -> pd.DataFrame:
     """
     Generates a dataframe containing the supported auto classes for each model type, using the content of the auto
     modules.
@@ -155,7 +186,8 @@
     data["tensorflow"] = [tf_models[t] for t in all_models]
     data["flax"] = [flax_models[t] for t in all_models]
 
-    # Now let's use the auto-mapping names to make sure
+    # Now let's find the right processing class for each model. In order we check if there is a Processor, then a
+    # Tokenizer, then a FeatureExtractor, then an ImageProcessor
     processors = {}
     for t in all_models:
         if t in transformers_module.models.auto.processing_auto.PROCESSOR_MAPPING_NAMES:
@@ -164,6 +196,8 @@
             processors[t] = "AutoTokenizer"
         elif t in transformers_module.models.auto.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES:
             processors[t] = "AutoFeatureExtractor"
+        elif t in transformers_module.models.auto.image_processing_auto.IMAGE_PROCESSOR_MAPPING_NAMES:
+            processors[t] = "AutoFeatureExtractor"
         else:
             # Default to AutoTokenizer if a model has nothing, for backward compatibility.
             processors[t] = "AutoTokenizer"
 
@@ -173,10 +207,17 @@
     return pd.DataFrame(data)
 
 
-def update_pipeline_and_auto_class_table(table):
+def update_pipeline_and_auto_class_table(table: Dict[str, Tuple[str, str]]) -> Dict[str, Tuple[str, str]]:
     """
-    Update the table of model class to (pipeline_tag, auto_class) without removing old keys if they don't exist
-    anymore.
+    Update the table mapping models to pipelines and auto classes without removing old keys if they don't exist
+    anymore.
+
+    Args:
+        table (`Dict[str, Tuple[str, str]]`):
+            The existing table mapping model names to a tuple containing the pipeline tag and the auto-class name with
+            which they should be used.
+
+    Returns:
+        `Dict[str, Tuple[str, str]]`: The updated table in the same format.
     """
     auto_modules = [
         transformers_module.models.auto.modeling_auto,
@@ -205,9 +246,13 @@
     return table
 
 
-def update_metadata(token, commit_sha):
+def update_metadata(token: str, commit_sha: str):
     """
-    Update the metadata for the Transformers repo.
+    Update the metadata for the Transformers repo in `huggingface/transformers-metadata`.
+
+    Args:
+        token (`str`): A valid token giving write access to `huggingface/transformers-metadata`.
+        commit_sha (`str`): The commit SHA on Transformers corresponding to this update.
     """
     frameworks_table = get_frameworks_table()
     frameworks_dataset = Dataset.from_pandas(frameworks_table)
@@ -255,6 +300,9 @@
 
 
 def check_pipeline_tags():
+    """
+    Check all pipeline tags are properly defined in the `PIPELINE_TAGS_AND_AUTO_MODELS` constant of this script.
+    """
     in_table = {tag: cls for tag, _, cls in PIPELINE_TAGS_AND_AUTO_MODELS}
     pipeline_tasks = transformers_module.pipelines.SUPPORTED_TASKS
     missing = []
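The merge semantics that `update_pipeline_and_auto_class_table` documents (update or add entries without ever deleting stale keys) come down to a plain `dict.update`, as in this toy sketch with hypothetical model class names:

```py
from typing import Dict, Tuple

# Existing metadata, possibly containing models that no longer exist in the library.
table: Dict[str, Tuple[str, str]] = {
    "OldModelForMaskedLM": ("fill-mask", "AutoModelForMaskedLM"),
}

# Entries discovered from the current auto modules.
discovered = {
    "NewModelForMaskedLM": ("fill-mask", "AutoModelForMaskedLM"),
}

# New entries are merged in; old keys that no longer appear are kept as-is,
# which is what keeps the published metadata backward compatible.
table.update(discovered)
print(sorted(table))  # ['NewModelForMaskedLM', 'OldModelForMaskedLM']
```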