Skip to content

Commit

Permalink
Fix input data file extension in examples (#28741)
Browse files Browse the repository at this point in the history
  • Loading branch information
khipp authored Jan 29, 2024
1 parent 5649c0c commit 39fa400
Show file tree
Hide file tree
Showing 23 changed files with 49 additions and 23 deletions.
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_bart_dlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,9 +558,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(
Expand Down
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_clm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,9 +449,10 @@ def main():
dataset_args = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
Expand Down
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,9 +485,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(
Expand Down
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_t5_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,9 +599,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_clm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,9 +345,10 @@ def main():
dataset_args = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_mlm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_plm.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,9 +328,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/multiple-choice/run_swag.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
raw_datasets = load_dataset(
extension,
data_files=data_files,
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/multiple-choice/run_swag_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,9 +357,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# Trim a number of training examples
if args.debug:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -362,11 +362,13 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.validation_file.split(".")[-1]
if args.test_file is not None:
data_files["test"] = args.test_file
extension = args.train_file.split(".")[-1]
extension = args.test_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down
4 changes: 3 additions & 1 deletion examples/pytorch/question-answering/run_qa_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,13 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.validation_file.split(".")[-1]
if args.test_file is not None:
data_files["test"] = args.test_file
extension = args.train_file.split(".")[-1]
extension = args.test_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down
4 changes: 3 additions & 1 deletion examples/pytorch/token-classification/run_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,11 +311,13 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
if data_args.test_file is not None:
data_files["test"] = data_args.test_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/token-classification/run_ner_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# Trim a number of training examples
if args.debug:
Expand Down
3 changes: 2 additions & 1 deletion examples/pytorch/translation/run_translation_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
Expand Down
3 changes: 2 additions & 1 deletion examples/research_projects/luke/run_luke_ner_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# Trim a number of training examples
if args.debug:
Expand Down
3 changes: 2 additions & 1 deletion examples/research_projects/mlm_wwm/run_mlm_wwm.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(extension, data_files=data_files)
Expand Down
3 changes: 2 additions & 1 deletion examples/research_projects/performer/run_mlm_performer.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,10 @@ def generate_batch_splits(samples_idx: np.ndarray, batch_size: int) -> np.ndarra
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(extension, data_files=data_files)
Expand Down
3 changes: 2 additions & 1 deletion examples/tensorflow/language-modeling/run_mlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(
Expand Down
3 changes: 2 additions & 1 deletion examples/tensorflow/multiple-choice/run_swag.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
raw_datasets = load_dataset(
extension,
data_files=data_files,
Expand Down
3 changes: 2 additions & 1 deletion examples/tensorflow/token-classification/run_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,10 @@ def main():
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = data_args.validation_file.split(".")[-1]
raw_datasets = load_dataset(
extension,
data_files=data_files,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -730,9 +730,10 @@ def main():
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
extension = args.train_file.split(".")[-1]
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
extension = args.validation_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.
Expand Down

0 comments on commit 39fa400

Please sign in to comment.