Merge branch 'main' into t5_lm_adaptation
ericharper authored Feb 14, 2022
2 parents 377342c + 6a517f0 commit a826ac7
Showing 39 changed files with 2,906 additions and 472 deletions.
164 changes: 117 additions & 47 deletions Jenkinsfile
@@ -1920,7 +1920,7 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/ptune_results"
}
}
stage('L2: Megatron GPT Pretraining and Resume Training') {
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
anyOf {
branch 'main'
@@ -1934,7 +1934,7 @@
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
@@ -1960,8 +1960,8 @@
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=2 \
trainer.limit_val_batches=1 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
@@ -1987,6 +1987,75 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}
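
The second `megatron_gpt_pretraining.py` call in the stage above reuses the same `exp_manager.exp_dir` with `exp_manager.resume_if_exists=True` and `trainer.max_steps=20`, so it is expected to pick up the checkpoint written at step 10 rather than start from scratch. A minimal sketch of the lookup that resume path implies — assuming checkpoints land under `<exp_dir>/<run_name>/checkpoints/` and that "most recently written `*.ckpt`" is an acceptable stand-in for NeMo's actual selection logic — could look like this; the helper name and the run name are hypothetical:

```python
from pathlib import Path
from typing import Optional


def find_resume_checkpoint(exp_dir: str, run_name: str) -> Optional[Path]:
    """Return the newest checkpoint of a previous run, or None to start fresh.

    Hypothetical helper: NeMo's exp_manager implements its own resume logic;
    this only illustrates what resume_if_exists=True relies on.
    """
    ckpt_dir = Path(exp_dir) / run_name / "checkpoints"
    if not ckpt_dir.is_dir():
        return None  # no earlier run, train from step 0
    ckpts = sorted(ckpt_dir.glob("*.ckpt"), key=lambda p: p.stat().st_mtime)
    return ckpts[-1] if ckpts else None


# For the CI stage above, the search root would be the exp_dir passed on the CLI:
resume_from = find_resume_checkpoint(
    "examples/nlp/language_modeling/gpt_pretrain_results", "megatron_gpt"  # run name assumed
)
print(resume_from or "no checkpoint found")
```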
stage('L2: Megatron GPT Pretraining and Resume Training PP=2') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]"
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.gpus=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]"
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}
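
The two pretraining-and-resume stages above exercise the two model-parallel axes separately on the same 2 GPUs: TP=2/PP=1 in the first stage and TP=1/PP=2 in the new one. A small sketch of the arithmetic (my own illustration, not NeMo code) shows why both layouts leave no data parallelism on a 2-GPU runner:

```python
def parallel_layout(world_size: int, tp: int, pp: int) -> dict:
    """Split world_size GPUs into model-parallel and data-parallel groups.

    Illustrative only; the names mirror the config keys used above
    (tensor_model_parallel_size, pipeline_model_parallel_size).
    """
    model_parallel_size = tp * pp
    assert world_size % model_parallel_size == 0, "world_size must be divisible by TP*PP"
    return {
        "model_parallel_size": model_parallel_size,
        "data_parallel_size": world_size // model_parallel_size,
    }


print(parallel_layout(2, tp=2, pp=1))  # {'model_parallel_size': 2, 'data_parallel_size': 1}
print(parallel_layout(2, tp=1, pp=2))  # {'model_parallel_size': 2, 'data_parallel_size': 1}
```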
stage('L2: Megatron GPT Eval') {
when {
anyOf {
@@ -2011,49 +2080,50 @@ pipeline {
16"
}
}
stage('L2: Megatron GPT Prompt Tuning and Inference') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python tests/collections/nlp/test_prompt_tuning.py"
sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
--config-name=megatron_gpt_config \
trainer.gpus=1 \
trainer.max_steps=10 \
trainer.val_check_interval=1 \
exp_manager.name='megatron_gpt125M_prompt_tuning' \
exp_manager.checkpoint_callback_params.save_top_k=2 \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
+model.use_soft_prompts=True \
+model.num_prompt_tokens=10 \
+model.new_prompt_tags=['Winogrande, BoolQ'] \
+model.new_prompt_init_text=['logic choose person name, None'] \
+model.new_prompt_init_methods=['text, random'] \
model.data.data_prefix=None \
+model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
+model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.batch_size=8 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
--use_soft_prompts \
--model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
--tokens_to_generate=3 \
--prompt_tag='Winogrande' \
--prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
sh "rm -rf nemo_experiments"
}
}
// # TODO uncomment once prompt tuning works with apex fwd/bwd functions
// stage('L2: Megatron GPT Prompt Tuning and Inference') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// steps {
// sh "python tests/collections/nlp/test_prompt_tuning.py"
// sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
// --config-name=megatron_gpt_config \
// trainer.gpus=1 \
// trainer.max_steps=10 \
// trainer.val_check_interval=1 \
// exp_manager.name='megatron_gpt125M_prompt_tuning' \
// exp_manager.checkpoint_callback_params.save_top_k=2 \
// exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
// restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
// +model.use_soft_prompts=True \
// +model.num_prompt_tokens=10 \
// +model.new_prompt_tags=['Winogrande, BoolQ'] \
// +model.new_prompt_init_text=['logic choose person name, None'] \
// +model.new_prompt_init_methods=['text, random'] \
// model.data.data_prefix=None \
// +model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
// +model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
// +model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
// +model.data.batch_size=8 \
// model.optim.lr=2e-2 \
// model.optim.sched.min_lr=2e-3 \
// model.optim.sched.warmup_steps=2 \
// model.optim.sched.constant_steps=8 \
// model.encoder_seq_length=2048"
// sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
// --use_soft_prompts \
// --model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
// --tokens_to_generate=3 \
// --prompt_tag='Winogrande' \
// --prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
// sh "rm -rf nemo_experiments"
// }
// }


stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') {
5 changes: 3 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -35,13 +35,14 @@ exp_manager:
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 2
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

# model architecture
encoder_seq_length: 512
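The `model_parallel_size` change above (repeated in the GPT and T5 configs below) switches from plain `${model.tensor_model_parallel_size}` to a `${multiply:...}` interpolation so checkpoint handling sees the full model-parallel degree, TP × PP. `multiply` is not a built-in OmegaConf resolver, so it has to be registered somewhere in the training code; a minimal sketch of such a registration, inferred from the interpolation syntax rather than quoted from NeMo, might be:

```python
from omegaconf import OmegaConf

# Assumed registration: lets configs write
#   model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.create({
    "model": {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 2},
    "exp_manager": {
        "model_parallel_size":
            "${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}",
    },
})
print(cfg.exp_manager.model_parallel_size)  # 2
```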
18 changes: 9 additions & 9 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -15,7 +15,7 @@ trainer:
val_check_interval: 100
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0

exp_manager:
@@ -36,13 +36,16 @@ exp_manager:
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 1
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
micro_batch_size: 4 # limited by GPU memory
global_batch_size: 8 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

# model architecture
encoder_seq_length: 512
@@ -78,16 +81,13 @@ model:

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using master parameters
fp32_grad_accum: False # Execute gradient accumulation and all-reduce in fp32
contiguous_grad_bucket: False # Allocate master gradients in the contiguous memory space
async_grad_allreduce: False # Asynchronous master gradient all-reduce with a training step

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

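The new `micro_batch_size` / `global_batch_size` comments above state that gradient accumulation is now derived from the data-parallel size rather than from `trainer.accumulate_grad_batches` (which is pinned to 1). A back-of-the-envelope sketch of the arithmetic those comments imply — my own illustration, not the library's implementation:

```python
def grad_accumulation_steps(global_batch_size: int,
                            micro_batch_size: int,
                            data_parallel_size: int) -> int:
    """Number of micro-batches each data-parallel rank accumulates per optimizer step."""
    per_step = micro_batch_size * data_parallel_size
    assert global_batch_size % per_step == 0, (
        "global_batch_size must be divisible by micro_batch_size * data_parallel_size"
    )
    return global_batch_size // per_step


# With the defaults above (micro=4, global=8): one data-parallel rank accumulates
# 2 micro-batches per step; two data-parallel ranks need only 1 each.
print(grad_accumulation_steps(8, 4, 1))  # 2
print(grad_accumulation_steps(8, 4, 2))  # 1
```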
110 changes: 110 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml
@@ -0,0 +1,110 @@
name: megatron_ptune_gpt

trainer:
gpus: 2
num_nodes: 1
precision: 16
logger: False # logger provided by exp_manager
checkpoint_callback: False
replace_sampler_ddp: False
max_epochs: 3
max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 300
accumulate_grad_batches: 1
gradient_clip_val: 1.0
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.


exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_ptune_gpt
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_acc
save_top_k: 2
mode: max
always_save_nemo: False # TODO: add support
filename: 'megatron_gpt--{val_acc:.3f}-{step}'
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True

model:
tensor_model_parallel_size: 1
seed: 1234
nemo_path: null # filename to save the model and associated artifacts to .nemo file
use_lm_finetune: False # whether fine tune the language model
pseudo_token: '[PROMPT]' # pseudo prompt tokens
max_decode_length: null # max number of tokens length to decode, if set null, it will be inferred from data

language_model:
nemo_file: null

prompt_encoder:
template: [3, 3, 3]
dropout: 0.0
num_layers: 2
task_dependent: true # whether the prompt encoder conditions on the task name or not, useful for multiple tasks

task_processors:
- taskname: qa-task
template: "{v0} Context: {context}{v1} Question: {question}?{v2} Answer:"
limit_length_field: 'context'
- taskname: sentiment-task
template: "{v0}{v1} Sentence: {sentence}{v2} Sentiment:"
limit_length_field: 'sentence'

data:
train_ds:
file_path: ???
batch_size: 32
shuffle: True
num_workers: 8
pin_memory: True

validation_ds:
file_path: ???
batch_size: 32
shuffle: False
num_workers: 8
pin_memory: True

test_ds:
file_path: ???
batch_size: 32
shuffle: False
num_workers: 8
pin_memory: True

optim:
name: adam
lr: 1e-5
# optimizer arguments
betas: [0.9, 0.999]
weight_decay: 0.0005
# scheduler setup
sched:
name: WarmupAnnealing
# Scheduler params
warmup_steps: null
warmup_ratio: 0.1
last_epoch: -1
# pytorch lightning args
monitor: val_loss
reduce_on_plateau: false

# List of some sample queries for inference after training is done
infer_samples:
- {"prompt_tag": "boolq-full-text", "sentence": "Slave states and free states -- In the 1770s, blacks throughout New England began sending petitions to northern legislatures demanding freedom. Five of the Northern self-declared states adopted policies to at least gradually abolish slavery: Pennsylvania (1780), New Hampshire and Massachusetts (1783), Connecticut and Rhode Island (1784). Vermont had abolished slavery in 1777, while it was still independent, and when it joined the United States as the 14th state in 1791, it was the first state to join untainted by slavery. These state jurisdictions thus enacted the first abolition laws in the Americas. By 1804 (including, New York (1799), New Jersey (1804)), all of the northern states had abolished slavery or set measures in place to gradually abolish it.", "question": "were the new england states free states in 1854"}
- {"prompt_tag": "boolq-full-text", "sentence": "Titius\u2013Bode law -- Recent astronomical research suggests that planetary systems around some other stars may follow Titius--Bode-like laws. Bovaird and Lineweaver applied a generalized Titius--Bode relation to 68 exoplanet systems that contain four or more planets. They showed that 96% of these exoplanet systems adhere to a generalized Titius--Bode relation to a similar or greater extent than the Solar System does. The locations of potentially undetected exoplanets are predicted in each system.", "question": "do exoplanetary systems follow the titus bode rule"}
- {"prompt_tag": "boolq-full-text", "sentence": "Bipolar disorder -- Bipolar disorder, previously known as manic depression, is a mental disorder that causes periods of depression and periods of abnormally elevated mood. The elevated mood is significant and is known as mania or hypomania, depending on its severity, or whether symptoms of psychosis are present. During mania, an individual behaves or feels abnormally energetic, happy, or irritable. Individuals often make poorly thought out decisions with little regard to the consequences. The need for sleep is usually reduced during manic phases. During periods of depression, there may be crying, a negative outlook on life, and poor eye contact with others. The risk of suicide among those with the illness is high at greater than 6 percent over 20 years, while self-harm occurs in 30--40 percent. Other mental health issues such as anxiety disorders and substance use disorder are commonly associated.", "question": "is manic depression the same as bi polar"}
- {"prompt_tag": "boolq-full-text", "sentence": "SS Politician -- SS Politician was an 8000-ton cargo ship owned by T & J Harrison of Liverpool. It left Liverpool on 3 February 1941, bound for Kingston, Jamaica and New Orleans with a cargo including 28,000 cases of malt whisky. The ship sank off the north coast of Eriskay in the Outer Hebrides, off the west coast of Scotland, and much of the wreck's cargo was salvaged by the island's inhabitants. The story of the wreck and looting was the basis for the book and film Whisky Galore!.", "question": "was whiskey galore based on a true story"}
- {"prompt_tag": "boolq-full-text", "sentence": "Plants in space -- Plant research continued on the International Space Station. Biomass Production System was used on the ISS Expedition 4. The Vegetable Production System (Veggie) system was later used aboard ISS. Plants tested in Veggie before going into space included lettuce, Swiss chard, radishes, Chinese cabbage and peas. Red Romaine lettuce was grown in space on Expedition 40 which were harvested when mature, frozen and tested back on Earth. Expedition 44 members became the first American astronauts to eat plants grown in space on 10 August 2015, when their crop of Red Romaine was harvested. Since 2003 Russian cosmonauts have been eating half of their crop while the other half goes towards further research. In 2012, a sunflower bloomed aboard the ISS under the care of NASA astronaut Donald Pettit. In January 2016, US astronauts announced that a zinnia had blossomed aboard the ISS.", "question": "are there plants on the international space station"}
- {"prompt_tag": "boolq-full-text", "sentence": "Goal (ice hockey) -- In ice hockey, a goal is scored when the puck entirely crosses the goal line between the two goal posts and below the goal crossbar. A goal awards one point to the team attacking the goal scored upon, regardless of which team the player who actually deflected the puck into the goal belongs to (see also own goal). Typically, a player on the team attempting to score shoots the puck with his/her stick towards the goal net opening, and a player on the opposing team called a goaltender tries to block the shot to prevent a goal from being scored against his/her team.", "question": "does the hockey puck have to cross the line to be a goal"}
4 changes: 3 additions & 1 deletion examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -35,12 +35,14 @@ exp_manager:
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# model parallelism
micro_batch_size: 4
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
