fix: shorten non-URL dataset names that are more than 50 characters (fixes #654) (#712)

* fix: shorten non-URL dataset names that are more than 50 characters

* fix: add a lengthy dataset to default parameters

This ensures that we exercise the dataset-shortening code in CI. Updating to a newer mtdata is required to get a dataset name that is long enough.
bhearsum authored Jul 8, 2024
1 parent 794bdb2 commit 8f1a068
Showing 5 changed files with 94 additions and 30 deletions.
14 changes: 14 additions & 0 deletions pipeline/common/datasets.py
@@ -8,6 +8,10 @@
from typing import Iterable, Iterator, Optional
from urllib.parse import urlparse

# We keep this relatively short because these datasets end up in task labels,
# which end up in task cache routes, which need to be <= 256 characters.
DATASET_NAME_MAX_LENGTH = 50


class Dataset:
"""
@@ -56,6 +60,16 @@ def _escape(dataset: str) -> str:
hash = md5.hexdigest()[:6]

dataset = f"{hostname}_{file}_{hash}"
# Even non-URL datasets can be too long, for example:
# mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng
# We need to truncate and hash any that are over a certain length
elif len(dataset) > DATASET_NAME_MAX_LENGTH:
md5 = hashlib.md5()
md5.update(dataset.encode("utf-8"))
hash = md5.hexdigest()[:6]

truncated = dataset[:DATASET_NAME_MAX_LENGTH]
dataset = f"{truncated}_{hash}"

return (
dataset.replace("://", "_")
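For illustration, here is a minimal standalone sketch of the new truncation-and-hash branch shown above. The helper name `shorten_dataset_name` is hypothetical; in the commit the logic lives inside `Dataset._escape`, which also handles URL datasets and further character escaping.

```python
import hashlib

# Mirrors the new branch above: names over 50 characters are truncated and
# suffixed with a short md5 digest so distinct long names stay distinguishable.
DATASET_NAME_MAX_LENGTH = 50


def shorten_dataset_name(dataset: str) -> str:
    if len(dataset) <= DATASET_NAME_MAX_LENGTH:
        return dataset
    digest = hashlib.md5(dataset.encode("utf-8")).hexdigest()[:6]
    return f"{dataset[:DATASET_NAME_MAX_LENGTH]}_{digest}"


name = (
    "mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading"
    "_treatment_or_punishment_united_nations-1-ell-eng"
)
print(shorten_dataset_name(name))  # 50-character prefix, "_", then a 6-character digest
```

The digest suffix is what keeps two different over-length names from colliding after truncation.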
2 changes: 1 addition & 1 deletion pipeline/data/requirements/data.in
@@ -1,4 +1,4 @@
# use the latest main, switch to PyPi when released
git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
simalign==0.4
mtdata==0.3.2
mtdata==0.4.1
92 changes: 63 additions & 29 deletions pipeline/data/requirements/data.txt
@@ -1,37 +1,38 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile pipeline/data/requirements/data.in
# pip-compile requirements/data.in
#
blessed==1.20.0
# via enlighten
certifi==2024.2.2
certifi==2024.6.2
# via requests
charset-normalizer==2.0.12
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via sacremoses
enlighten==1.10.1
# via mtdata
filelock==3.13.1
filelock==3.15.4
# via
# huggingface-hub
# torch
# transformers
fsspec==2024.2.0
# triton
fsspec==2024.6.1
# via
# huggingface-hub
# torch
huggingface-hub==0.20.3
huggingface-hub==0.23.4
# via
# tokenizers
# transformers
idna==3.6
idna==3.7
# via requests
jinja2==3.1.3
jinja2==3.1.4
# via torch
joblib==1.3.2
joblib==1.4.2
# via
# sacremoses
# scikit-learn
@@ -41,9 +42,9 @@ markupsafe==2.1.5
# via jinja2
mpmath==1.3.0
# via sympy
mtdata==0.3.2
# via -r pipeline/data/requirements/data.in
networkx==3.2.1
mtdata==0.4.1
# via -r requirements/data.in
networkx==3.3
# via
# simalign
# torch
@@ -53,9 +54,40 @@ numpy==1.26.4
# scipy
# simalign
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.1.105
# via torch
nvidia-cuda-nvrtc-cu12==12.1.105
# via torch
nvidia-cuda-runtime-cu12==12.1.105
# via torch
nvidia-cudnn-cu12==8.9.2.26
# via torch
nvidia-cufft-cu12==11.0.2.54
# via torch
nvidia-curand-cu12==10.3.2.106
# via torch
nvidia-cusolver-cu12==11.4.5.107
# via torch
nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.20.5
# via torch
nvidia-nvjitlink-cu12==12.5.82
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
# via -r pipeline/data/requirements/data.in
packaging==23.2
# via -r requirements/data.in
packaging==24.1
# via
# huggingface-hub
# transformers
@@ -71,12 +103,12 @@ pyyaml==6.0.1
# opustrainer
# pybtex
# transformers
regex==2023.10.3
regex==2024.5.15
# via
# sacremoses
# simalign
# transformers
requests==2.26.0
requests==2.31.0
# via
# huggingface-hub
# mtdata
@@ -87,44 +119,46 @@ ruamel-yaml-clib==0.2.8
# via ruamel-yaml
sacremoses==0.1.1
# via opustrainer
safetensors==0.4.2
safetensors==0.4.3
# via transformers
scikit-learn==1.4.1.post1
scikit-learn==1.5.0
# via simalign
scipy==1.12.0
scipy==1.14.0
# via
# scikit-learn
# simalign
sentencepiece==0.1.99
# via opustrainer
simalign==0.4
# via -r pipeline/data/requirements/data.in
# via -r requirements/data.in
six==1.16.0
# via
# blessed
# pybtex
sympy==1.12
sympy==1.12.1
# via torch
threadpoolctl==3.3.0
threadpoolctl==3.5.0
# via scikit-learn
tokenizers==0.15.2
tokenizers==0.19.1
# via transformers
torch==2.2.0
torch==2.3.1
# via simalign
tqdm==4.66.1
tqdm==4.66.4
# via
# huggingface-hub
# sacremoses
# transformers
transformers==4.37.2
transformers==4.42.3
# via simalign
typing-extensions==4.9.0
triton==2.3.1
# via torch
typing-extensions==4.12.2
# via
# huggingface-hub
# torch
typo==0.1.5
# via opustrainer
urllib3==1.26.18
urllib3==2.2.2
# via requests
wcwidth==0.2.13
# via blessed
1 change: 1 addition & 0 deletions taskcluster/translations_taskgraph/parameters.py
@@ -82,6 +82,7 @@ def get_defaults(_):
"opus_ada83/v1",
"opus_ELRC-3075-wikipedia_health/v1",
"url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst",
"mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
],
"devtest": [
"flores_dev",
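The added default dataset matters because its name crosses the 50-character threshold, so the new truncation path in `Dataset._escape` / `sanitize_dataset_name` runs on every CI pipeline. A quick check, illustrative only and not part of the commit:

```python
# Illustrative only: confirm the new default dataset name exceeds the
# 50-character limit, with or without the "mtdata_" importer prefix.
name = "mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus"
assert len(name) > 50
assert len(name.removeprefix("mtdata_")) > 50
print(len(name), len(name.removeprefix("mtdata_")))
```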
15 changes: 15 additions & 0 deletions taskcluster/translations_taskgraph/util/dataset_helpers.py
@@ -3,6 +3,11 @@
import hashlib


# We keep this relatively short because these datasets end up in task labels,
# which end up in task cache routes, which need to be <= 256 characters.
DATASET_NAME_MAX_LENGTH = 50


# Important! Keep in sync with `Dataset._escape` in pipeline/common/datasets.py.
def sanitize_dataset_name(dataset: str) -> str:
# URLs can be too large when used as Taskcluster labels. Create a nice identifier for them.
@@ -24,6 +29,16 @@ def sanitize_dataset_name(dataset: str) -> str:
hash = md5.hexdigest()[:6]

dataset = f"{hostname}_{file}_{hash}"
# Even non-URL datasets can be too long, for example:
# mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng
# We need to truncate and hash any that are over a certain length
elif len(dataset) > DATASET_NAME_MAX_LENGTH:
md5 = hashlib.md5()
md5.update(dataset.encode("utf-8"))
hash = md5.hexdigest()[:6]

truncated = dataset[:DATASET_NAME_MAX_LENGTH]
dataset = f"{truncated}_{hash}"

return (
dataset.replace("://", "_")
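Since this helper intentionally duplicates `Dataset._escape`, a small parity test is one way to enforce the "Important! Keep in sync" comment. This is only a sketch: the import paths and the assumption that `_escape` is callable as a static method are guesses, not part of the commit.

```python
# Hypothetical pytest sketch guarding the "keep in sync" requirement.
# Module paths are assumptions based on the file locations in this commit.
import pytest

from pipeline.common.datasets import Dataset
from translations_taskgraph.util.dataset_helpers import sanitize_dataset_name


@pytest.mark.parametrize(
    "name",
    [
        "opus_ELRC-3075-wikipedia_health/v1",
        "mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
    ],
)
def test_escape_and_sanitize_agree(name):
    assert Dataset._escape(name) == sanitize_dataset_name(name)
```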
