From dd1c3cbfccfc0b7dc0f9b2e98dedb333ba2efaac Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Mon, 7 Mar 2022 17:34:31 +0000
Subject: [PATCH] Fixed soft prompt eval loading bug

Signed-off-by: Virginia Adams
---
 .../conf/megatron_prompt_tuning_gpt.yaml            | 12 ++++++------
 .../models/language_modeling/megatron_gpt_model.py  |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml b/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml
index a89f0b07e99e..912e10c9a444 100755
--- a/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml
@@ -5,14 +5,14 @@ trainer:
   gpus: 1
   num_nodes: 1
   accelerator: ddp
-  precision: 32
+  precision: 16
   logger: False # logger provided by exp_manager
   checkpoint_callback: False
   replace_sampler_ddp: False
   max_epochs: null
-  max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  max_steps: 3000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
-  val_check_interval: 50
+  val_check_interval: 250
   limit_val_batches: 50
   limit_test_batches: 500
   accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
@@ -43,7 +43,7 @@ model:
   # specify micro_batch_size, global_batch_size, and model parallelism
   # gradient accumulation will be done automatically based on data_parallel_size
   micro_batch_size: 4 # limited by GPU memory
-  global_batch_size: 16 # will use more micro batches to reach global batch size
+  global_batch_size: 8 # will use more micro batches to reach global batch size
   tensor_model_parallel_size: 1 # intra-layer model parallelism
   pipeline_model_parallel_size: 1 # inter-layer model parallelism
 
@@ -117,7 +117,7 @@ model:
 
   optim:
     name: fused_adam
-    lr: 2e-4
+    lr: 1e-5
     weight_decay: 0.01
     betas:
     - 0.9
@@ -126,4 +126,4 @@
       name: CosineAnnealing
       warmup_steps: 50
       constant_steps: 10
-      min_lr: 2e-5
+      min_lr: 1e-6
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 14847b3d9b24..a10bc3c2e4f8 100755
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -708,13 +708,13 @@ def setup(self, stage=None):
             init_consumed_samples = 0
         self.init_consumed_samples = init_consumed_samples
 
-        # Initalize soft prompts before loading datasets and training
-        if self.use_soft_prompts:
-            self.init_new_prompts()
-
         if stage == 'predict':
             return
         else:
+            # Initialize soft prompts before loading datasets and training
+            if self.use_soft_prompts:
+                self.init_new_prompts()
+
             # TODO: consider adding a ModelPT guard to check if model is being restored.
             # allowing restored models to optionally setup datasets
             self.build_train_valid_test_datasets()
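
For reviewers, the gist of the fix: before this patch, setup() called init_new_prompts() unconditionally, so a prompt-tuned checkpoint restored for evaluation/prediction had its trained soft prompt embeddings re-initialized; moving the call onto the training branch leaves restored prompts intact when stage == 'predict'. Below is a minimal, self-contained sketch of the post-patch ordering, not the real MegatronGPTModel code; the class name, prompt_table contents, and method bodies are illustrative stand-ins.

# sketch_soft_prompt_setup.py -- illustrative simplification, assumed names
class PromptTunedModelSketch:
    def __init__(self, use_soft_prompts: bool):
        self.use_soft_prompts = use_soft_prompts
        # Pretend these embeddings were restored from a trained checkpoint.
        self.prompt_table = {"sentiment-task": "trained prompt embeddings"}

    def init_new_prompts(self):
        # Creates fresh, randomly initialized prompt embeddings; calling this
        # on the eval path would clobber the restored ones (the original bug).
        self.prompt_table["sentiment-task"] = "random prompt embeddings"

    def build_train_valid_test_datasets(self):
        print("building train/valid/test datasets")

    def setup(self, stage=None):
        if stage == 'predict':
            # Eval/predict path: return early so restored soft prompts
            # are left untouched.
            return
        else:
            # Training path: initialize new prompts, then build datasets.
            if self.use_soft_prompts:
                self.init_new_prompts()
            self.build_train_valid_test_datasets()

if __name__ == "__main__":
    model = PromptTunedModelSketch(use_soft_prompts=True)
    model.setup(stage='predict')  # restored prompts are preserved
    assert model.prompt_table["sentiment-task"] == "trained prompt embeddings"
    model.setup(stage='fit')      # training: prompts re-initialized, datasets built

Running the sketch prints the dataset message only for the training call, and the assert confirms the predict path leaves the restored prompt embeddings untouched.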