diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index 8603482218b422..f5369299a6d4c9 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -558,9 +558,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 48d924f9bb3948..2145b242bb1c18 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -449,9 +449,10 @@ def main():
         dataset_args = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index 39fc5e7836376e..d0aa82c6b8fed9 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -485,9 +485,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index 45d3fe32bcf9f1..fa6a5742236ca5 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -599,9 +599,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 15d513b0c928b6..d2855a514a82a6 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -345,9 +345,10 @@ def main():
         dataset_args = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index d9b8120a98e883..68253c33a35f3b 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -351,9 +351,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files)
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index 66451247e06a7f..1a744083b18a94 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -328,9 +328,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index a1cfcfdddafa4a..2eaf97b70335b9 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -311,9 +311,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 9ad72548329184..529ce2eae3cbcc 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -357,9 +357,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 905189d0d41a88..4103dd0014ec53 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -362,11 +362,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 1a58f6ce442f0e..0c6794b16f2198 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -410,11 +410,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 96ccb552ed164f..826c8b64a4ec90 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -404,9 +404,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index 8d7a67cd1571a6..13ce7d98771d25 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -311,11 +311,13 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
+            extension = data_args.validation_file.split(".")[-1]
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 02bbd12d22ba50..439730e77d261e 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -339,9 +339,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index c4764b5ee4a7d7..87f7ba6bcfad4c 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -384,9 +384,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
index 4ff4bd559d8ced..a72e5cff861c8b 100644
--- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
+++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
@@ -297,9 +297,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py
index e03c665e4ec2cd..cac487b059d71f 100644
--- a/examples/research_projects/luke/run_luke_ner_no_trainer.py
+++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py
@@ -285,9 +285,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:
diff --git a/examples/research_projects/mlm_wwm/run_mlm_wwm.py b/examples/research_projects/mlm_wwm/run_mlm_wwm.py
index 3a7326d38219c0..629026bdb20a63 100644
--- a/examples/research_projects/mlm_wwm/run_mlm_wwm.py
+++ b/examples/research_projects/mlm_wwm/run_mlm_wwm.py
@@ -271,9 +271,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
diff --git a/examples/research_projects/performer/run_mlm_performer.py b/examples/research_projects/performer/run_mlm_performer.py
index 7c1f418815bed8..4261d9c184b7a7 100644
--- a/examples/research_projects/performer/run_mlm_performer.py
+++ b/examples/research_projects/performer/run_mlm_performer.py
@@ -517,9 +517,10 @@ def generate_batch_splits(samples_idx: np.ndarray, batch_size: int) -> np.ndarra
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
index 5be9e0219b7182..38bdc71d984d91 100755
--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -341,9 +341,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(
diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py
index 8572ec98e1ae96..4706fbc7e6dd9b 100644
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -320,9 +320,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py
index 84b2ab702a17ab..c6921f9f9aa39c 100644
--- a/examples/tensorflow/token-classification/run_ner.py
+++ b/examples/tensorflow/token-classification/run_ner.py
@@ -260,9 +260,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
index f01283ae08fc7c..2ed4a4d8af909a 100755
--- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
@@ -730,9 +730,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.