From 006f251f6d2ef5859545728045d7c7de1fd29c59 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 17 Jul 2024 23:08:35 -0700 Subject: [PATCH] Fix hf dataset hang on small dataset (#1370) --- llmfoundry/data/finetuning/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 0adad8af4e..78bfb9c74c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -913,6 +913,8 @@ def dataset_mapper(example: Dict): detected_cpu_count = os.cpu_count() or 1 detected_cpus_with_margin = detected_cpu_count - 8 num_cpus_to_use = max(1, detected_cpus_with_margin) + if len(dataset) < num_cpus_to_use: + num_cpus_to_use = 1 columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map(