From a1607235309602aff203080066989b278bbfa896 Mon Sep 17 00:00:00 2001
From: cdliang11 <1404056823@qq.com>
Date: Fri, 29 Nov 2024 14:54:32 +0800
Subject: [PATCH 1/2] [train] support multi-node training

---
 examples/cnceleb/v2/run.sh               | 11 ++++++++++-
 examples/cnceleb/v3_finetune/run.sh      |  8 +++++++-
 examples/sre/v2/run.sh                   |  8 +++++++-
 examples/sre/v3/run.sh                   | 10 +++++++---
 examples/voxceleb/v1/Whisper-PMFA/run.sh | 12 ++++++++++--
 examples/voxceleb/v2/run.sh              | 11 ++++++++++-
 examples/voxceleb/v2/run_wavlm.sh        |  8 +++++++-
 examples/voxceleb/v2_deprecated/run.sh   |  8 +++++++-
 examples/voxceleb/v3/dino/run.sh         |  8 +++++++-
 examples/voxceleb/v3/moco/run.sh         |  8 +++++++-
 examples/voxceleb/v3/simclr/run.sh       |  8 +++++++-
 wespeaker/bin/train.py                   |  5 +++--
 12 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/examples/cnceleb/v2/run.sh b/examples/cnceleb/v2/run.sh
index d813c60c..c1874977 100755
--- a/examples/cnceleb/v2/run.sh
+++ b/examples/cnceleb/v2/run.sh
@@ -7,9 +7,16 @@
 
 . ./path.sh || exit 1
 
+# Multi-node, multi-GPU training (run the same command on every node):
+#   bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -57,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/cnceleb/v3_finetune/run.sh b/examples/cnceleb/v3_finetune/run.sh
index 3a15f57a..21d0d4bf 100755
--- a/examples/cnceleb/v3_finetune/run.sh
+++ b/examples/cnceleb/v3_finetune/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -60,7 +64,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/sre/v2/run.sh b/examples/sre/v2/run.sh
index 4fcffad6..227be95e 100755
--- a/examples/sre/v2/run.sh
+++ b/examples/sre/v2/run.sh
@@ -9,6 +9,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 # the sre data should be prepared in kaldi format and stored in the following directory
 # only wav.scp, utt2spk and spk2utt files are needed
 sre_data_dir=sre_data_dir
@@ -65,7 +69,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/sre/v3/run.sh b/examples/sre/v3/run.sh
index ccb39d21..aaa77466 100755
--- a/examples/sre/v3/run.sh
+++ b/examples/sre/v3/run.sh
@@ -22,6 +22,10 @@
 stage=1
 stop_stage=1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -194,9 +198,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   else
       num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   fi
-  echo "Using $num_gpus_train GPUs: $gpus"
-  #torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus_train \  # The below is to prevent problems if many jobs run on the same machine
-  torchrun --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$((RANDOM)) --nnodes=1 --nproc_per_node=$num_gpus_train \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
       wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v1/Whisper-PMFA/run.sh b/examples/voxceleb/v1/Whisper-PMFA/run.sh
index 577423f9..133164df 100644
--- a/examples/voxceleb/v1/Whisper-PMFA/run.sh
+++ b/examples/voxceleb/v1/Whisper-PMFA/run.sh
@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="raw"  # shard/raw
 model=whisper_PMFA_large_v2
@@ -57,7 +61,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training with frozen whisper parameter..."
   config=conf/whisper_PMFA_stage0.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
@@ -84,7 +90,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   config=conf/whisper_PMFA_stage1.yaml
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
   checkpoint=${exp_dir}/models/model_4.pt
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2/run.sh b/examples/voxceleb/v2/run.sh
index 955755a8..5b076bf7 100755
--- a/examples/voxceleb/v2/run.sh
+++ b/examples/voxceleb/v2/run.sh
@@ -5,9 +5,16 @@
 
 . ./path.sh || exit 1
 
+# Multi-node, multi-GPU training (run the same command on every node):
+#   bash run.sh --stage 3 --stop-stage 3 --HOST_NODE_ADDR "xxx.xxx.xxx.xxx:port" --num_nodes num_node
+
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -55,7 +62,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2/run_wavlm.sh b/examples/voxceleb/v2/run_wavlm.sh
index 4eb35a4e..8be42312 100755
--- a/examples/voxceleb/v2/run_wavlm.sh
+++ b/examples/voxceleb/v2/run_wavlm.sh
@@ -7,6 +7,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -57,7 +61,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=29401 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v2_deprecated/run.sh b/examples/voxceleb/v2_deprecated/run.sh
index f4817d32..64ced135 100755
--- a/examples/voxceleb/v2_deprecated/run.sh
+++ b/examples/voxceleb/v2_deprecated/run.sh
@@ -8,6 +8,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 config=conf/resnet.yaml
 exp_dir=exp/ResNet34-TSTP-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch150
 gpus="[0,1]"
@@ -28,7 +32,9 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/bin/train_deprecated.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/dino/run.sh b/examples/voxceleb/v3/dino/run.sh
index 27e1bb39..cb479170 100755
--- a/examples/voxceleb/v3/dino/run.sh
+++ b/examples/voxceleb/v3/dino/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_dino.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/moco/run.sh b/examples/voxceleb/v3/moco/run.sh
index a70e7475..0fd64e9d 100755
--- a/examples/voxceleb/v3/moco/run.sh
+++ b/examples/voxceleb/v3/moco/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/examples/voxceleb/v3/simclr/run.sh b/examples/voxceleb/v3/simclr/run.sh
index 9c44a420..7da61550 100755
--- a/examples/voxceleb/v3/simclr/run.sh
+++ b/examples/voxceleb/v3/simclr/run.sh
@@ -10,6 +10,10 @@
 stage=-1
 stop_stage=-1
 
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+job_id=2024
+
 data=data
 data_type="shard"  # shard/raw
 
@@ -54,7 +58,9 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Start training ..."
   num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
-  torchrun --master_addr=localhost --master_port=16888 --nnodes=1 --nproc_per_node=$num_gpus \
+  echo "$0: num_nodes is $num_nodes, proc_per_node is $num_gpus"
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
+           --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \
     wespeaker/ssl/bin/train_contrastive.py --config $config \
       --exp_dir ${exp_dir} \
       --gpus $gpus \
diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py
index 63fb99bc..54ca0053 100644
--- a/wespeaker/bin/train.py
+++ b/wespeaker/bin/train.py
@@ -46,9 +46,10 @@ def train(config='conf/config.yaml', **kwargs):
     configs = parse_config_or_kwargs(config, **kwargs)
     checkpoint = configs.get('checkpoint', None)
     # dist configs
-    rank = int(os.environ['RANK'])
+    local_rank = int(os.environ.get('LOCAL_RANK', 0))
+    rank = int(os.environ.get('RANK', 0))
     world_size = int(os.environ['WORLD_SIZE'])
-    gpu = int(configs['gpus'][rank])
+    gpu = int(configs['gpus'][local_rank])
     torch.cuda.set_device(gpu)
     dist.init_process_group(backend='nccl')
 

From fb1c02e545b6fc327d4ee82fb1d20878ea3011c4 Mon Sep 17 00:00:00 2001
From: cdliang11 <1404056823@qq.com>
Date: Mon, 2 Dec 2024 13:49:16 +0800
Subject: [PATCH 2/2] [train] change default HOST_NODE_ADDR from localhost:0 to localhost:29400

---
 examples/cnceleb/v2/run.sh               | 2 +-
 examples/cnceleb/v3_finetune/run.sh      | 2 +-
 examples/sre/v2/run.sh                   | 2 +-
 examples/sre/v3/run.sh                   | 2 +-
 examples/voxceleb/v1/Whisper-PMFA/run.sh | 2 +-
 examples/voxceleb/v2/run.sh              | 2 +-
 examples/voxceleb/v2/run_wavlm.sh        | 2 +-
 examples/voxceleb/v2_deprecated/run.sh   | 2 +-
 examples/voxceleb/v3/dino/run.sh         | 2 +-
 examples/voxceleb/v3/moco/run.sh         | 2 +-
 examples/voxceleb/v3/simclr/run.sh       | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/cnceleb/v2/run.sh b/examples/cnceleb/v2/run.sh
index c1874977..f0f7f216 100755
--- a/examples/cnceleb/v2/run.sh
+++ b/examples/cnceleb/v2/run.sh
@@ -13,7 +13,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/cnceleb/v3_finetune/run.sh b/examples/cnceleb/v3_finetune/run.sh
index 21d0d4bf..79e9328f 100755
--- a/examples/cnceleb/v3_finetune/run.sh
+++ b/examples/cnceleb/v3_finetune/run.sh
@@ -10,7 +10,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/sre/v2/run.sh b/examples/sre/v2/run.sh
index 227be95e..16f41211 100755
--- a/examples/sre/v2/run.sh
+++ b/examples/sre/v2/run.sh
@@ -9,7 +9,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/sre/v3/run.sh b/examples/sre/v3/run.sh
index aaa77466..fe6a82e2 100755
--- a/examples/sre/v3/run.sh
+++ b/examples/sre/v3/run.sh
@@ -22,7 +22,7 @@
 stage=1
 stop_stage=1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v1/Whisper-PMFA/run.sh b/examples/voxceleb/v1/Whisper-PMFA/run.sh
index 133164df..0b12d006 100644
--- a/examples/voxceleb/v1/Whisper-PMFA/run.sh
+++ b/examples/voxceleb/v1/Whisper-PMFA/run.sh
@@ -8,7 +8,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v2/run.sh b/examples/voxceleb/v2/run.sh
index 5b076bf7..272996c9 100755
--- a/examples/voxceleb/v2/run.sh
+++ b/examples/voxceleb/v2/run.sh
@@ -11,7 +11,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v2/run_wavlm.sh b/examples/voxceleb/v2/run_wavlm.sh
index 8be42312..40eed073 100755
--- a/examples/voxceleb/v2/run_wavlm.sh
+++ b/examples/voxceleb/v2/run_wavlm.sh
@@ -7,7 +7,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v2_deprecated/run.sh b/examples/voxceleb/v2_deprecated/run.sh
index 64ced135..26342abf 100755
--- a/examples/voxceleb/v2_deprecated/run.sh
+++ b/examples/voxceleb/v2_deprecated/run.sh
@@ -8,7 +8,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v3/dino/run.sh b/examples/voxceleb/v3/dino/run.sh
index cb479170..2dd6b8bd 100755
--- a/examples/voxceleb/v3/dino/run.sh
+++ b/examples/voxceleb/v3/dino/run.sh
@@ -10,7 +10,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v3/moco/run.sh b/examples/voxceleb/v3/moco/run.sh
index 0fd64e9d..f59b95c3 100755
--- a/examples/voxceleb/v3/moco/run.sh
+++ b/examples/voxceleb/v3/moco/run.sh
@@ -10,7 +10,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024
 
diff --git a/examples/voxceleb/v3/simclr/run.sh b/examples/voxceleb/v3/simclr/run.sh
index 7da61550..3c3b1c8a 100755
--- a/examples/voxceleb/v3/simclr/run.sh
+++ b/examples/voxceleb/v3/simclr/run.sh
@@ -10,7 +10,7 @@
 stage=-1
 stop_stage=-1
 
-HOST_NODE_ADDR="localhost:0"
+HOST_NODE_ADDR="localhost:29400"
 num_nodes=1
 job_id=2024