fix: shorten non-URL dataset names that are more than 50 characters (fixes #654) (#712)

* fix: shorten non-URL dataset names that are more than 50 characters

* fix: add a lengthy dataset to default parameters

This ensures that we exercise the dataset-shortening code in CI. Updating to a newer mtdata is required to get a dataset name that is long enough.
bhearsum authored Jul 8, 2024
1 parent 794bdb2 commit 8f1a068
Showing 5 changed files with 94 additions and 30 deletions.
14 changes: 14 additions & 0 deletions pipeline/common/datasets.py
@@ -8,6 +8,10 @@
from typing import Iterable, Iterator, Optional
from urllib.parse import urlparse

# We keep this relatively short because these datasets end up in task labels,
# which end up in task cache routes, which need to be <= 256 characters.
DATASET_NAME_MAX_LENGTH = 50


class Dataset:
"""
@@ -56,6 +60,16 @@ def _escape(dataset: str) -> str:
hash = md5.hexdigest()[:6]

dataset = f"{hostname}_{file}_{hash}"
# Even non-URL datasets can be too long, for example:
# mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng
# We need to truncate and hash any that are over a certain length
elif len(dataset) > DATASET_NAME_MAX_LENGTH:
md5 = hashlib.md5()
md5.update(dataset.encode("utf-8"))
hash = md5.hexdigest()[:6]

truncated = dataset[:DATASET_NAME_MAX_LENGTH]
dataset = f"{truncated}_{hash}"

return (
dataset.replace("://", "_")
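For illustration, here is a minimal standalone sketch of the new truncation-and-hash branch shown above. The helper name `shorten_dataset_name` is hypothetical; in the commit the logic lives inside `Dataset._escape`, which also handles URL datasets and further character escaping.

```python
import hashlib

# Mirrors the new branch above: names over 50 characters are truncated and
# suffixed with a short md5 digest so distinct long names stay distinguishable.
DATASET_NAME_MAX_LENGTH = 50


def shorten_dataset_name(dataset: str) -> str:
    if len(dataset) <= DATASET_NAME_MAX_LENGTH:
        return dataset
    digest = hashlib.md5(dataset.encode("utf-8")).hexdigest()[:6]
    return f"{dataset[:DATASET_NAME_MAX_LENGTH]}_{digest}"


name = (
    "mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading"
    "_treatment_or_punishment_united_nations-1-ell-eng"
)
print(shorten_dataset_name(name))  # 50-character prefix, "_", then a 6-character digest
```

The digest suffix is what keeps two different over-length names from colliding after truncation.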
2 changes: 1 addition & 1 deletion pipeline/data/requirements/data.in
@@ -1,4 +1,4 @@
# use the latest main, switch to PyPi when released
git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
simalign==0.4
mtdata==0.3.2
mtdata==0.4.1
92 changes: 63 additions & 29 deletions pipeline/data/requirements/data.txt
@@ -1,37 +1,38 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile pipeline/data/requirements/data.in
# pip-compile requirements/data.in
#
blessed==1.20.0
# via enlighten
certifi==2024.2.2
certifi==2024.6.2
# via requests
charset-normalizer==2.0.12
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via sacremoses
enlighten==1.10.1
# via mtdata
filelock==3.13.1
filelock==3.15.4
# via
# huggingface-hub
# torch
# transformers
fsspec==2024.2.0
# triton
fsspec==2024.6.1
# via
# huggingface-hub
# torch
huggingface-hub==0.20.3
huggingface-hub==0.23.4
# via
# tokenizers
# transformers
idna==3.6
idna==3.7
# via requests
jinja2==3.1.3
jinja2==3.1.4
# via torch
joblib==1.3.2
joblib==1.4.2
# via
# sacremoses
# scikit-learn
@@ -41,9 +42,9 @@ markupsafe==2.1.5
# via jinja2
mpmath==1.3.0
# via sympy
mtdata==0.3.2
# via -r pipeline/data/requirements/data.in
networkx==3.2.1
mtdata==0.4.1
# via -r requirements/data.in
networkx==3.3
# via
# simalign
# torch
@@ -53,9 +54,40 @@ numpy==1.26.4
# scipy
# simalign
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.1.105
# via torch
nvidia-cuda-nvrtc-cu12==12.1.105
# via torch
nvidia-cuda-runtime-cu12==12.1.105
# via torch
nvidia-cudnn-cu12==8.9.2.26
# via torch
nvidia-cufft-cu12==11.0.2.54
# via torch
nvidia-curand-cu12==10.3.2.106
# via torch
nvidia-cusolver-cu12==11.4.5.107
# via torch
nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.20.5
# via torch
nvidia-nvjitlink-cu12==12.5.82
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
# via -r pipeline/data/requirements/data.in
packaging==23.2
# via -r requirements/data.in
packaging==24.1
# via
# huggingface-hub
# transformers
@@ -71,12 +103,12 @@ pyyaml==6.0.1
# opustrainer
# pybtex
# transformers
regex==2023.10.3
regex==2024.5.15
# via
# sacremoses
# simalign
# transformers
requests==2.26.0
requests==2.31.0
# via
# huggingface-hub
# mtdata
@@ -87,44 +119,46 @@ ruamel-yaml-clib==0.2.8
# via ruamel-yaml
sacremoses==0.1.1
# via opustrainer
safetensors==0.4.2
safetensors==0.4.3
# via transformers
scikit-learn==1.4.1.post1
scikit-learn==1.5.0
# via simalign
scipy==1.12.0
scipy==1.14.0
# via
# scikit-learn
# simalign
sentencepiece==0.1.99
# via opustrainer
simalign==0.4
# via -r pipeline/data/requirements/data.in
# via -r requirements/data.in
six==1.16.0
# via
# blessed
# pybtex
sympy==1.12
sympy==1.12.1
# via torch
threadpoolctl==3.3.0
threadpoolctl==3.5.0
# via scikit-learn
tokenizers==0.15.2
tokenizers==0.19.1
# via transformers
torch==2.2.0
torch==2.3.1
# via simalign
tqdm==4.66.1
tqdm==4.66.4
# via
# huggingface-hub
# sacremoses
# transformers
transformers==4.37.2
transformers==4.42.3
# via simalign
typing-extensions==4.9.0
triton==2.3.1
# via torch
typing-extensions==4.12.2
# via
# huggingface-hub
# torch
typo==0.1.5
# via opustrainer
urllib3==1.26.18
urllib3==2.2.2
# via requests
wcwidth==0.2.13
# via blessed
1 change: 1 addition & 0 deletions taskcluster/translations_taskgraph/parameters.py
@@ -82,6 +82,7 @@ def get_defaults(_):
"opus_ada83/v1",
"opus_ELRC-3075-wikipedia_health/v1",
"url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst",
"mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
],
"devtest": [
"flores_dev",
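The added default dataset matters because its name crosses the 50-character threshold, so the new truncation path in `Dataset._escape` / `sanitize_dataset_name` runs on every CI pipeline. A quick check, illustrative only and not part of the commit:

```python
# Illustrative only: confirm the new default dataset name exceeds the
# 50-character limit, with or without the "mtdata_" importer prefix.
name = "mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus"
assert len(name) > 50
assert len(name.removeprefix("mtdata_")) > 50
print(len(name), len(name.removeprefix("mtdata_")))
```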
15 changes: 15 additions & 0 deletions taskcluster/translations_taskgraph/util/dataset_helpers.py
@@ -3,6 +3,11 @@
import hashlib


# We keep this relatively short because these datasets end up in task labels,
# which end up in task cache routes, which need to be <= 256 characters.
DATASET_NAME_MAX_LENGTH = 50


# Important! Keep in sync with `Dataset._escape` in pipeline/common/datasets.py.
def sanitize_dataset_name(dataset: str) -> str:
# URLs can be too large when used as Taskcluster labels. Create a nice identifier for them.
@@ -24,6 +29,16 @@ def sanitize_dataset_name(dataset: str) -> str:
hash = md5.hexdigest()[:6]

dataset = f"{hostname}_{file}_{hash}"
# Even non-URL datasets can be too long, for example:
# mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng
# We need to truncate and hash any that are over a certain length
elif len(dataset) > DATASET_NAME_MAX_LENGTH:
md5 = hashlib.md5()
md5.update(dataset.encode("utf-8"))
hash = md5.hexdigest()[:6]

truncated = dataset[:DATASET_NAME_MAX_LENGTH]
dataset = f"{truncated}_{hash}"

return (
dataset.replace("://", "_")
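Since this helper intentionally duplicates `Dataset._escape`, a small parity test is one way to enforce the "Important! Keep in sync" comment. This is only a sketch: the import paths and the assumption that `_escape` is callable as a static method are guesses, not part of the commit.

```python
# Hypothetical pytest sketch guarding the "keep in sync" requirement.
# Module paths are assumptions based on the file locations in this commit.
import pytest

from pipeline.common.datasets import Dataset
from translations_taskgraph.util.dataset_helpers import sanitize_dataset_name


@pytest.mark.parametrize(
    "name",
    [
        "opus_ELRC-3075-wikipedia_health/v1",
        "mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
    ],
)
def test_escape_and_sanitize_agree(name):
    assert Dataset._escape(name) == sanitize_dataset_name(name)
```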
