From aa1a85d7e33d6aaa353c4023551df3533987753b Mon Sep 17 00:00:00 2001 From: vnoroozi Date: Sat, 18 Nov 2023 18:28:47 -0800 Subject: [PATCH] added sampling for pretraining data --- scripts/speechlm_sft/sampling_pretraining_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/speechlm_sft/sampling_pretraining_data.py b/scripts/speechlm_sft/sampling_pretraining_data.py index 0aafbd304791..eada091f9d50 100644 --- a/scripts/speechlm_sft/sampling_pretraining_data.py +++ b/scripts/speechlm_sft/sampling_pretraining_data.py @@ -303,6 +303,8 @@ dataset_handler = open(dataset_path, 'r', encoding='utf-8') dataset_handlers[dataset] = dataset_handler +print("Blend to be used:", DATA_BLEND) + with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf: datasets = list(DATA_BLEND.keys()) weights = list(DATA_BLEND.values())