From 5766b5d81da6e989ecbca5816437e2cc2806049c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:06:41 +0100
Subject: [PATCH 1/5] Some update to tr10 config

---
 train/tr10-13B-ml/tr10-13B.slurm | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 0e6ecdd0..09e1ba56 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -46,7 +46,7 @@ GLOBAL_BATCH_SIZE=2048
 
 NLAYERS=40
 NHIDDEN=5120
-NHEADS=32
+NHEADS=40
 SEQ_LEN=2048
 VOCAB_SIZE=150000
 
@@ -57,13 +57,14 @@ OPTIMIZER_ARGS=" \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
     --adam-eps 1e-8 \
-    --lr 6e-5 \
+    --lr 6e-4 \
     --min-lr 6e-6 \
     --lr-decay-style cosine \
-    --lr-decay-samples 126_953_125 \
     --lr-warmup-samples 216_320 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
+    --hidden-dropout 0.0 \
+    --attention-dropout 0.0 \
     "
 
 EXIT_OPTS=" \
@@ -80,7 +81,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples 300_000_000 \
+    --train-samples $((3000000000 / $SEQ_LEN + 1)) \
     --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \

From 11433bdff39b196ff9e5c03587fabb1c1c330792 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:11:56 +0100
Subject: [PATCH 2/5] Woops

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 09e1ba56..6a012aad 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -57,7 +57,7 @@ OPTIMIZER_ARGS=" \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
     --adam-eps 1e-8 \
-    --lr 6e-4 \
+    --lr 1e-4 \
     --min-lr 6e-6 \
     --lr-decay-style cosine \
     --lr-warmup-samples 216_320 \

From d50b0673593f36a74ed7b81f77e8052b7e27c59a Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 23 Nov 2021 09:21:38 -0800
Subject: [PATCH 3/5] restore the split

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 6a012aad..0309f342 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -166,7 +166,7 @@ export CMD=" \
     --load $CHECKPOINT_PATH \
     --data-path $DATA_PATH \
     --data-impl mmap \
-    --split 900,100,0 \
+    --split 950,50,0 \
     --distributed-backend nccl \
     $DEEPSPEED_ARGS \
     "

From 13b93c6390931ce93e7fc0674033c6e697a6d0ac Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 24 Nov 2021 10:41:53 +0100
Subject: [PATCH 4/5] Update the formula for computing the number of samples

---
 train/tr10-13B-ml/tr10-13B.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 0309f342..5cb2eda1 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -81,7 +81,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples $((3000000000 / $SEQ_LEN + 1)) \
+    --train-samples $((300_000_000_000 / $SEQ_LEN + 1)) \
     --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \
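Note on PATCH 4/5 and 5/5: bash arithmetic does not understand underscore digit
separators, so the expression $((300_000_000_000 / $SEQ_LEN + 1)) introduced
above aborts with an arithmetic error instead of yielding a sample count, while
Python (3.6+) accepts underscores in integer literals. That is presumably why
the final patch below moves the division into a python -c call. A minimal
standalone bash sketch of the difference (illustration only, not part of the
patches; the variable names simply mirror the script):

    #!/usr/bin/env bash
    SEQ_LEN=2048

    # Fails in bash: underscores are not valid inside arithmetic constants.
    # echo $((300_000_000_000 / $SEQ_LEN + 1))

    # Works: the underscored literal is only ever parsed by Python.
    TRAIN_TOKENS=300_000_000_000
    TRAIN_SAMPLES=$(python -c "print($TRAIN_TOKENS // $SEQ_LEN)")
    echo "$TRAIN_SAMPLES"   # 146484375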
From 673e80189520d5b5b4a2767ee7e91e07e8a9984a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 24 Nov 2021 22:07:07 +0100
Subject: [PATCH 5/5] Woops

---
 train/tr10-13B-ml/tr10-13B.slurm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 5cb2eda1..5b02c437 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -50,6 +50,9 @@ NHEADS=40
 SEQ_LEN=2048
 VOCAB_SIZE=150000
 
+TRAIN_TOKENS=300_000_000_000
+TRAIN_SAMPLES=$(python -c "print($TRAIN_TOKENS // $SEQ_LEN)")
+
 SAVE_INTERVAL=300
 
 OPTIMIZER_ARGS=" \
@@ -81,7 +84,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples $((300_000_000_000 / $SEQ_LEN + 1)) \
+    --train-samples $TRAIN_SAMPLES \
    --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \
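Taken together, the series leaves the run at lr 1e-4 with cosine decay to
min-lr 6e-6, zero hidden/attention dropout, a 950,50,0 data split (a 95%/5%/0%
train/validation/test weighting under Megatron-LM's --split convention), and a
300B-token budget computed in Python. A quick standalone check of the sample
arithmetic implied by the final config (illustration only; values copied from
the script above):

    #!/usr/bin/env bash
    SEQ_LEN=2048
    TRAIN_TOKENS=300_000_000_000

    # 300B tokens at 2048 tokens per sample -> 146_484_375 training samples.
    python -c "print($TRAIN_TOKENS // $SEQ_LEN)"             # 146484375

    # The 216_320 warmup samples are therefore about 0.15% of the budget.
    python -c "print(216_320 / ($TRAIN_TOKENS // $SEQ_LEN))"  # ~0.001477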