From 5766b5d81da6e989ecbca5816437e2cc2806049c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:06:41 +0100
Subject: [PATCH 1/5] Some update to tr10 config

---
 train/tr10-13B-ml/tr10-13B.slurm | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 0e6ecdd0..09e1ba56 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -46,7 +46,7 @@ GLOBAL_BATCH_SIZE=2048
 
 NLAYERS=40
 NHIDDEN=5120
-NHEADS=32
+NHEADS=40
 SEQ_LEN=2048
 VOCAB_SIZE=150000
 
@@ -57,13 +57,14 @@ OPTIMIZER_ARGS=" \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
     --adam-eps 1e-8 \
-    --lr 6e-5 \
+    --lr 6e-4 \
     --min-lr 6e-6 \
     --lr-decay-style cosine \
-    --lr-decay-samples 126_953_125 \
     --lr-warmup-samples 216_320 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
+    --hidden-dropout 0.0 \
+    --attention-dropout 0.0 \
     "
 
 EXIT_OPTS=" \
@@ -80,7 +81,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples 300_000_000 \
+    --train-samples $((3000000000 / $SEQ_LEN + 1)) \
     --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \

From 11433bdff39b196ff9e5c03587fabb1c1c330792 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:11:56 +0100
Subject: [PATCH 2/5] Woops

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 09e1ba56..6a012aad 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -57,7 +57,7 @@ OPTIMIZER_ARGS=" \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
     --adam-eps 1e-8 \
-    --lr 6e-4 \
+    --lr 1e-4 \
     --min-lr 6e-6 \
     --lr-decay-style cosine \
     --lr-warmup-samples 216_320 \

From d50b0673593f36a74ed7b81f77e8052b7e27c59a Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 23 Nov 2021 09:21:38 -0800
Subject: [PATCH 3/5] restore the split

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 6a012aad..0309f342 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -166,7 +166,7 @@ export CMD=" \
     --load $CHECKPOINT_PATH \
     --data-path $DATA_PATH \
     --data-impl mmap \
-    --split 900,100,0 \
+    --split 950,50,0 \
     --distributed-backend nccl \
     $DEEPSPEED_ARGS \
     "

From 13b93c6390931ce93e7fc0674033c6e697a6d0ac Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 24 Nov 2021 10:41:53 +0100
Subject: [PATCH 4/5] Update the formula for computing the number of samples

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 0309f342..5cb2eda1 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -81,7 +81,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples $((3000000000 / $SEQ_LEN + 1)) \
+    --train-samples $((300_000_000_000 / $SEQ_LEN + 1)) \
     --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \
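Note on PATCH 4/5 and 5/5: bash arithmetic does not understand underscore digit
separators, so the expression $((300_000_000_000 / $SEQ_LEN + 1)) introduced
above aborts with an arithmetic error instead of yielding a sample count, while
Python (3.6+) accepts underscores in integer literals. That is presumably why
the final patch below moves the division into a python -c call. A minimal
standalone bash sketch of the difference (illustration only, not part of the
patches; the variable names simply mirror the script):

    #!/usr/bin/env bash
    SEQ_LEN=2048

    # Fails in bash: underscores are not valid inside arithmetic constants.
    # echo $((300_000_000_000 / $SEQ_LEN + 1))

    # Works: the underscored literal is only ever parsed by Python.
    TRAIN_TOKENS=300_000_000_000
    TRAIN_SAMPLES=$(python -c "print($TRAIN_TOKENS // $SEQ_LEN)")
    echo "$TRAIN_SAMPLES"   # 146484375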
From 673e80189520d5b5b4a2767ee7e91e07e8a9984a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 24 Nov 2021 22:07:07 +0100
Subject: [PATCH 5/5] Woops

---
 train/tr10-13B-ml/tr10-13B.slurm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 5cb2eda1..5b02c437 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -50,6 +50,9 @@ NHEADS=40
 SEQ_LEN=2048
 VOCAB_SIZE=150000
 
+TRAIN_TOKENS=300_000_000_000
+TRAIN_SAMPLES=$(python -c "print($TRAIN_TOKENS // $SEQ_LEN)")
+
 SAVE_INTERVAL=300
 
 OPTIMIZER_ARGS=" \
@@ -81,7 +84,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples $((300_000_000_000 / $SEQ_LEN + 1)) \
+    --train-samples $TRAIN_SAMPLES \
    --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \
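Taken together, the series leaves the run at lr 1e-4 with cosine decay to
min-lr 6e-6, zero hidden/attention dropout, a 950,50,0 data split (a 95%/5%/0%
train/validation/test weighting under Megatron-LM's --split convention), and a
300B-token budget computed in Python. A quick standalone check of the sample
arithmetic implied by the final config (illustration only; values copied from
the script above):

    #!/usr/bin/env bash
    SEQ_LEN=2048
    TRAIN_TOKENS=300_000_000_000

    # 300B tokens at 2048 tokens per sample -> 146_484_375 training samples.
    python -c "print($TRAIN_TOKENS // $SEQ_LEN)"             # 146484375

    # The 216_320 warmup samples are therefore about 0.15% of the budget.
    python -c "print(216_320 / ($TRAIN_TOKENS // $SEQ_LEN))"  # ~0.001477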