diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md index 8d2ba49d0d8965..66ac8fb0d4b56a 100644 --- a/docs/source/es/tasks/language_modeling.md +++ b/docs/source/es/tasks/language_modeling.md @@ -122,7 +122,7 @@ Así es como puedes crear una función de preprocesamiento para convertir la lis ... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) ``` -Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: +Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/process#map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: ```py >>> tokenized_eli5 = eli5.map( diff --git a/docs/source/pt/tasks/sequence_classification.md b/docs/source/pt/tasks/sequence_classification.md index cc04f5dbaece86..6469ac4d45534c 100644 --- a/docs/source/pt/tasks/sequence_classification.md +++ b/docs/source/pt/tasks/sequence_classification.md @@ -70,7 +70,7 @@ Crie uma função de pré-processamento para tokenizar o campo `text` e truncar ... return tokenizer(examples["text"], truncation=True) ``` -Use a função [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) do 🤗 Datasets para aplicar a função de pré-processamento em todo o conjunto de dados. Você pode acelerar a função `map` definindo `batched=True` para processar vários elementos do conjunto de dados de uma só vez: +Use a função [`map`](https://huggingface.co/docs/datasets/process#map) do 🤗 Datasets para aplicar a função de pré-processamento em todo o conjunto de dados. Você pode acelerar a função `map` definindo `batched=True` para processar vários elementos do conjunto de dados de uma só vez: ```py tokenized_imdb = imdb.map(preprocess_function, batched=True) diff --git a/docs/source/pt/tasks/token_classification.md b/docs/source/pt/tasks/token_classification.md index 1de82f4a509c24..ba8298e9f581d9 100644 --- a/docs/source/pt/tasks/token_classification.md +++ b/docs/source/pt/tasks/token_classification.md @@ -128,7 +128,7 @@ Aqui está como você pode criar uma função para realinhar os tokens e rótulo ... return tokenized_inputs ``` -Use a função [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) do 🤗 Datasets para tokenizar e alinhar os rótulos em todo o conjunto de dados. Você pode acelerar a função `map` configurando `batched=True` para processar vários elementos do conjunto de dados de uma só vez: +Use a função [`map`](https://huggingface.co/docs/datasets/process#map) do 🤗 Datasets para tokenizar e alinhar os rótulos em todo o conjunto de dados. Você pode acelerar a função `map` configurando `batched=True` para processar vários elementos do conjunto de dados de uma só vez: ```py >>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py index 259f67f0b17dfa..a24ad1cff008c0 100644 --- a/examples/flax/language-modeling/run_bart_dlm_flax.py +++ b/examples/flax/language-modeling/run_bart_dlm_flax.py @@ -684,7 +684,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py index 1a296a4fa992fb..7d242bb147dce5 100755 --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -607,7 +607,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map lm_datasets = tokenized_datasets.map( group_texts, diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index 0c49a2cff7b025..2c1e0a667305f9 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -625,7 +625,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index c3afc58207b4d2..5b30ba899f0adb 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -715,7 +715,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 92bb89c4dc858a..6430f81efe76b9 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -533,7 +533,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 6efbfd96e4ddd8..968ad293486870 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -473,7 +473,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with accelerator.main_process_first(): lm_datasets = tokenized_datasets.map( diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 58e7be1ea16c6a..8161701831d4b0 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -547,7 +547,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 4d62a61911e37d..a7cbe0e229345c 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -504,7 +504,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with accelerator.main_process_first(): tokenized_datasets = tokenized_datasets.map( diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 70e913eb219ab5..167c324ead8496 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -478,7 +478,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): tokenized_datasets = tokenized_datasets.map( diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py index 7103b5a28111ff..43466f733084b9 100644 --- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py +++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py @@ -395,7 +395,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map lm_datasets = tokenized_datasets.map( group_texts, diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 1614bbd4b124c9..0710928c96953b 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -459,7 +459,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map lm_datasets = tokenized_datasets.map( group_texts, diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 671331745de7f4..7de9de8d612cbb 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -474,7 +474,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map tokenized_datasets = tokenized_datasets.map( group_texts,