Merge branch 'main' into t5_lm_adaptation
ericharper authored Feb 14, 2022
2 parents 377342c + 6a517f0 commit a826ac7
Showing 39 changed files with 2,906 additions and 472 deletions.
164 changes: 117 additions & 47 deletions Jenkinsfile
@@ -1920,7 +1920,7 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/ptune_results"
}
}
stage('L2: Megatron GPT Pretraining and Resume Training') {
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
anyOf {
branch 'main'
@@ -1934,7 +1934,7 @@
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
@@ -1960,8 +1960,8 @@
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=2 \
trainer.limit_val_batches=1 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
@@ -1987,6 +1987,75 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}
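
The second `megatron_gpt_pretraining.py` call in the stage above reuses the same `exp_manager.exp_dir` with `exp_manager.resume_if_exists=True` and `trainer.max_steps=20`, so it is expected to pick up the checkpoint written at step 10 rather than start from scratch. A minimal sketch of the lookup that resume path implies — assuming checkpoints land under `<exp_dir>/<run_name>/checkpoints/` and that "most recently written `*.ckpt`" is an acceptable stand-in for NeMo's actual selection logic — could look like this; the helper name and the run name are hypothetical:

```python
from pathlib import Path
from typing import Optional


def find_resume_checkpoint(exp_dir: str, run_name: str) -> Optional[Path]:
    """Return the newest checkpoint of a previous run, or None to start fresh.

    Hypothetical helper: NeMo's exp_manager implements its own resume logic;
    this only illustrates what resume_if_exists=True relies on.
    """
    ckpt_dir = Path(exp_dir) / run_name / "checkpoints"
    if not ckpt_dir.is_dir():
        return None  # no earlier run, train from step 0
    ckpts = sorted(ckpt_dir.glob("*.ckpt"), key=lambda p: p.stat().st_mtime)
    return ckpts[-1] if ckpts else None


# For the CI stage above, the search root would be the exp_dir passed on the CLI:
resume_from = find_resume_checkpoint(
    "examples/nlp/language_modeling/gpt_pretrain_results", "megatron_gpt"  # run name assumed
)
print(resume_from or "no checkpoint found")
```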
stage('L2: Megatron GPT Pretraining and Resume Training PP=2') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]"
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]"
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}
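
The two pretraining-and-resume stages above exercise the two model-parallel axes separately on the same 2 GPUs: TP=2/PP=1 in the first stage and TP=1/PP=2 in the new one. A small sketch of the arithmetic (my own illustration, not NeMo code) shows why both layouts leave no data parallelism on a 2-GPU runner:

```python
def parallel_layout(world_size: int, tp: int, pp: int) -> dict:
    """Split world_size GPUs into model-parallel and data-parallel groups.

    Illustrative only; the names mirror the config keys used above
    (tensor_model_parallel_size, pipeline_model_parallel_size).
    """
    model_parallel_size = tp * pp
    assert world_size % model_parallel_size == 0, "world_size must be divisible by TP*PP"
    return {
        "model_parallel_size": model_parallel_size,
        "data_parallel_size": world_size // model_parallel_size,
    }


print(parallel_layout(2, tp=2, pp=1))  # {'model_parallel_size': 2, 'data_parallel_size': 1}
print(parallel_layout(2, tp=1, pp=2))  # {'model_parallel_size': 2, 'data_parallel_size': 1}
```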
stage('L2: Megatron GPT Eval') {
when {
anyOf {
@@ -2011,49 +2080,50 @@ pipeline {
16"
}
}
stage('L2: Megatron GPT Prompt Tuning and Inference') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python tests/collections/nlp/test_prompt_tuning.py"
sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
--config-name=megatron_gpt_config \
trainer.gpus=1 \
trainer.max_steps=10 \
trainer.val_check_interval=1 \
exp_manager.name='megatron_gpt125M_prompt_tuning' \
exp_manager.checkpoint_callback_params.save_top_k=2 \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
+model.use_soft_prompts=True \
+model.num_prompt_tokens=10 \
+model.new_prompt_tags=['Winogrande, BoolQ'] \
+model.new_prompt_init_text=['logic choose person name, None'] \
+model.new_prompt_init_methods=['text, random'] \
model.data.data_prefix=None \
+model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
+model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.batch_size=8 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
--use_soft_prompts \
--model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
--tokens_to_generate=3 \
--prompt_tag='Winogrande' \
--prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
sh "rm -rf nemo_experiments"
}
}
// # TODO uncomment once prompt tuning works with apex fwd/bwd functions
// stage('L2: Megatron GPT Prompt Tuning and Inference') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// steps {
// sh "python tests/collections/nlp/test_prompt_tuning.py"
// sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
// --config-name=megatron_gpt_config \
// trainer.gpus=1 \
// trainer.max_steps=10 \
// trainer.val_check_interval=1 \
// exp_manager.name='megatron_gpt125M_prompt_tuning' \
// exp_manager.checkpoint_callback_params.save_top_k=2 \
// exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
// restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
// +model.use_soft_prompts=True \
// +model.num_prompt_tokens=10 \
// +model.new_prompt_tags=['Winogrande, BoolQ'] \
// +model.new_prompt_init_text=['logic choose person name, None'] \
// +model.new_prompt_init_methods=['text, random'] \
// model.data.data_prefix=None \
// +model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
// +model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
// +model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
// +model.data.batch_size=8 \
// model.optim.lr=2e-2 \
// model.optim.sched.min_lr=2e-3 \
// model.optim.sched.warmup_steps=2 \
// model.optim.sched.constant_steps=8 \
// model.encoder_seq_length=2048"
// sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
// --use_soft_prompts \
// --model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
// --tokens_to_generate=3 \
// --prompt_tag='Winogrande' \
// --prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
// sh "rm -rf nemo_experiments"
// }
// }


stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') {
5 changes: 3 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -35,13 +35,14 @@ exp_manager:
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 2
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

# model architecture
encoder_seq_length: 512
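The `model_parallel_size` change above (repeated in the GPT and T5 configs below) switches from plain `${model.tensor_model_parallel_size}` to a `${multiply:...}` interpolation so checkpoint handling sees the full model-parallel degree, TP × PP. `multiply` is not a built-in OmegaConf resolver, so it has to be registered somewhere in the training code; a minimal sketch of such a registration, inferred from the interpolation syntax rather than quoted from NeMo, might be:

```python
from omegaconf import OmegaConf

# Assumed registration: lets configs write
#   model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.create({
    "model": {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 2},
    "exp_manager": {
        "model_parallel_size":
            "${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}",
    },
})
print(cfg.exp_manager.model_parallel_size)  # 2
```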
18 changes: 9 additions & 9 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -15,7 +15,7 @@ trainer:
val_check_interval: 100
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0

exp_manager:
@@ -36,13 +36,16 @@ exp_manager:
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 1
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
micro_batch_size: 4 # limited by GPU memory
global_batch_size: 8 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

# model architecture
encoder_seq_length: 512
@@ -78,16 +81,13 @@ model:

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using master parameters
fp32_grad_accum: False # Execute gradient accumulation and all-reduce in fp32
contiguous_grad_bucket: False # Allocate master gradients in the contiguous memory space
async_grad_allreduce: False # Asynchronous master gradient all-reduce with a training step

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

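The new `micro_batch_size` / `global_batch_size` comments above state that gradient accumulation is now derived from the data-parallel size rather than from `trainer.accumulate_grad_batches` (which is pinned to 1). A back-of-the-envelope sketch of the arithmetic those comments imply — my own illustration, not the library's implementation:

```python
def grad_accumulation_steps(global_batch_size: int,
                            micro_batch_size: int,
                            data_parallel_size: int) -> int:
    """Number of micro-batches each data-parallel rank accumulates per optimizer step."""
    per_step = micro_batch_size * data_parallel_size
    assert global_batch_size % per_step == 0, (
        "global_batch_size must be divisible by micro_batch_size * data_parallel_size"
    )
    return global_batch_size // per_step


# With the defaults above (micro=4, global=8): one data-parallel rank accumulates
# 2 micro-batches per step; two data-parallel ranks need only 1 each.
print(grad_accumulation_steps(8, 4, 1))  # 2
print(grad_accumulation_steps(8, 4, 2))  # 1
```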
110 changes: 110 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml
@@ -0,0 +1,110 @@
name: megatron_ptune_gpt

trainer:
gpus: 2
num_nodes: 1
precision: 16
logger: False # logger provided by exp_manager
checkpoint_callback: False
replace_sampler_ddp: False
max_epochs: 3
max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 300
accumulate_grad_batches: 1
gradient_clip_val: 1.0
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.


exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_ptune_gpt
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_acc
save_top_k: 2
mode: max
always_save_nemo: False # TODO: add support
filename: 'megatron_gpt--{val_acc:.3f}-{step}'
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True

model:
tensor_model_parallel_size: 1
seed: 1234
nemo_path: null # filename to save the model and associated artifacts to .nemo file
use_lm_finetune: False # whether fine tune the language model
pseudo_token: '[PROMPT]' # pseudo prompt tokens
max_decode_length: null # max number of tokens length to decode, if set null, it will be inferred from data

language_model:
nemo_file: null

prompt_encoder:
template: [3, 3, 3]
dropout: 0.0
num_layers: 2
task_dependent: true # whether the prompt encoder conditions on the task name or not, useful for multiple tasks

task_processors:
- taskname: qa-task
template: "{v0} Context: {context}{v1} Question: {question}?{v2} Answer:"
limit_length_field: 'context'
- taskname: sentiment-task
template: "{v0}{v1} Sentence: {sentence}{v2} Sentiment:"
limit_length_field: 'sentence'

data:
train_ds:
file_path: ???
batch_size: 32
shuffle: True
num_workers: 8
pin_memory: True

validation_ds:
file_path: ???
batch_size: 32
shuffle: False
num_workers: 8
pin_memory: True

test_ds:
file_path: ???
batch_size: 32
shuffle: False
num_workers: 8
pin_memory: True

optim:
name: adam
lr: 1e-5
# optimizer arguments
betas: [0.9, 0.999]
weight_decay: 0.0005
# scheduler setup
sched:
name: WarmupAnnealing
# Scheduler params
warmup_steps: null
warmup_ratio: 0.1
last_epoch: -1
# pytorch lightning args
monitor: val_loss
reduce_on_plateau: false

# List of some sample queries for inference after training is done
infer_samples:
- {"prompt_tag": "boolq-full-text", "sentence": "Slave states and free states -- In the 1770s, blacks throughout New England began sending petitions to northern legislatures demanding freedom. Five of the Northern self-declared states adopted policies to at least gradually abolish slavery: Pennsylvania (1780), New Hampshire and Massachusetts (1783), Connecticut and Rhode Island (1784). Vermont had abolished slavery in 1777, while it was still independent, and when it joined the United States as the 14th state in 1791, it was the first state to join untainted by slavery. These state jurisdictions thus enacted the first abolition laws in the Americas. By 1804 (including, New York (1799), New Jersey (1804)), all of the northern states had abolished slavery or set measures in place to gradually abolish it.", "question": "were the new england states free states in 1854"}
- {"prompt_tag": "boolq-full-text", "sentence": "Titius\u2013Bode law -- Recent astronomical research suggests that planetary systems around some other stars may follow Titius--Bode-like laws. Bovaird and Lineweaver applied a generalized Titius--Bode relation to 68 exoplanet systems that contain four or more planets. They showed that 96% of these exoplanet systems adhere to a generalized Titius--Bode relation to a similar or greater extent than the Solar System does. The locations of potentially undetected exoplanets are predicted in each system.", "question": "do exoplanetary systems follow the titus bode rule"}
- {"prompt_tag": "boolq-full-text", "sentence": "Bipolar disorder -- Bipolar disorder, previously known as manic depression, is a mental disorder that causes periods of depression and periods of abnormally elevated mood. The elevated mood is significant and is known as mania or hypomania, depending on its severity, or whether symptoms of psychosis are present. During mania, an individual behaves or feels abnormally energetic, happy, or irritable. Individuals often make poorly thought out decisions with little regard to the consequences. The need for sleep is usually reduced during manic phases. During periods of depression, there may be crying, a negative outlook on life, and poor eye contact with others. The risk of suicide among those with the illness is high at greater than 6 percent over 20 years, while self-harm occurs in 30--40 percent. Other mental health issues such as anxiety disorders and substance use disorder are commonly associated.", "question": "is manic depression the same as bi polar"}
- {"prompt_tag": "boolq-full-text", "sentence": "SS Politician -- SS Politician was an 8000-ton cargo ship owned by T & J Harrison of Liverpool. It left Liverpool on 3 February 1941, bound for Kingston, Jamaica and New Orleans with a cargo including 28,000 cases of malt whisky. The ship sank off the north coast of Eriskay in the Outer Hebrides, off the west coast of Scotland, and much of the wreck's cargo was salvaged by the island's inhabitants. The story of the wreck and looting was the basis for the book and film Whisky Galore!.", "question": "was whiskey galore based on a true story"}
- {"prompt_tag": "boolq-full-text", "sentence": "Plants in space -- Plant research continued on the International Space Station. Biomass Production System was used on the ISS Expedition 4. The Vegetable Production System (Veggie) system was later used aboard ISS. Plants tested in Veggie before going into space included lettuce, Swiss chard, radishes, Chinese cabbage and peas. Red Romaine lettuce was grown in space on Expedition 40 which were harvested when mature, frozen and tested back on Earth. Expedition 44 members became the first American astronauts to eat plants grown in space on 10 August 2015, when their crop of Red Romaine was harvested. Since 2003 Russian cosmonauts have been eating half of their crop while the other half goes towards further research. In 2012, a sunflower bloomed aboard the ISS under the care of NASA astronaut Donald Pettit. In January 2016, US astronauts announced that a zinnia had blossomed aboard the ISS.", "question": "are there plants on the international space station"}
- {"prompt_tag": "boolq-full-text", "sentence": "Goal (ice hockey) -- In ice hockey, a goal is scored when the puck entirely crosses the goal line between the two goal posts and below the goal crossbar. A goal awards one point to the team attacking the goal scored upon, regardless of which team the player who actually deflected the puck into the goal belongs to (see also own goal). Typically, a player on the team attempting to score shoots the puck with his/her stick towards the goal net opening, and a player on the opposing team called a goaltender tries to block the shot to prevent a goal from being scored against his/her team.", "question": "does the hockey puck have to cross the line to be a goal"}
4 changes: 3 additions & 1 deletion examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -35,12 +35,14 @@ exp_manager:
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
