From 9c7270d00dbdd0599b6b6bf816c3ff2dd17d4878 Mon Sep 17 00:00:00 2001
From: ChaimZhu
Date: Tue, 15 Mar 2022 20:18:57 +0800
Subject: [PATCH] add multi-machine dist_train (#1303)

---
 docs/en/1_exist_data_model.md    | 21 +++++++--------------
 docs/zh_cn/1_exist_data_model.md | 16 +++++++++++++++-
 tools/dist_test.sh               | 16 ++++++++++++++--
 tools/dist_train.sh              | 15 +++++++++++++--
 tools/multinode_train.sh         | 17 -----------------
 5 files changed, 49 insertions(+), 36 deletions(-)
 delete mode 100644 tools/multinode_train.sh

diff --git a/docs/en/1_exist_data_model.md b/docs/en/1_exist_data_model.md
index 0ecb0c9167..0fe7c89fce 100644
--- a/docs/en/1_exist_data_model.md
+++ b/docs/en/1_exist_data_model.md
@@ -201,30 +201,23 @@ GPUS=16 ./tools/slurm_train.sh dev pp_kitti_3class hv_pointpillars_secfpn_6x8_16
 
 You can check [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) for full arguments and environment variables.
 
-You can also use pytorch original DDP with script `multinode_train.sh`. (This script also supports single machine training.)
+If you launch with multiple machines simply connected with ethernet, you can run the following commands:
 
-For each machine, run
-```shell
-./tools/sh_train.sh ${CONFIG_FILE} ${NODE_NUM} ${NODE_RANK} ${MASTER_NODE_IP}
-```
-
-Here is an example of using 16 GPUs (2 nodes), the IP=10.10.10.10:
+On the first machine:
 
-run in node0:
 ```shell
-./tools/sh_train.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py 2 0 10.10.10.10
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
 ```
 
-run in node1:
+On the second machine:
+
 ```shell
-./tools/sh_train.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py 2 1 10.10.10.10
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
 ```
-
-If you have just multiple machines connected within ethernet, you can refer to
-PyTorch [launch utility](https://pytorch.org/docs/stable/distributed.html).
 
 Usually it is slow if you do not have high speed networking like InfiniBand.
+
 
 ### Launch multiple jobs on a single machine
 
 If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,

diff --git a/docs/zh_cn/1_exist_data_model.md b/docs/zh_cn/1_exist_data_model.md
index 8858fd89a0..35f1a9b43f 100644
--- a/docs/zh_cn/1_exist_data_model.md
+++ b/docs/zh_cn/1_exist_data_model.md
@@ -198,7 +198,21 @@ GPUS=16 ./tools/slurm_train.sh dev pp_kitti_3class hv_pointpillars_secfpn_6x8_16
 
 你可以查看 [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) 来获取所有的参数和环境变量。
 
-如果你有多个机器连接到以太网，可以参考 PyTorch 的 [launch utility](https://pytorch.org/docs/stable/distributed.html)，如果你没有像 InfiniBand 一样的高速率网络，通常会很慢。
+如果您想使用由 ethernet 连接起来的多台机器, 您可以使用以下命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+但是，如果您不使用高速网络连接这几台机器的话，训练将会非常慢。
 
 ### 在单个机器上启动多个任务
 
diff --git a/tools/dist_test.sh b/tools/dist_test.sh
index 3c74ec6ecd..dea131b43e 100755
--- a/tools/dist_test.sh
+++ b/tools/dist_test.sh
@@ -3,8 +3,20 @@
 CONFIG=$1
 CHECKPOINT=$2
 GPUS=$3
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
 PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
 
 PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
-python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
-    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/test.py \
+    $CONFIG \
+    $CHECKPOINT \
+    --launcher pytorch \
+    ${@:4}
diff --git a/tools/dist_train.sh b/tools/dist_train.sh
index 5b43fffbf2..aa71bf4ae9 100755
--- a/tools/dist_train.sh
+++ b/tools/dist_train.sh
@@ -2,8 +2,19 @@
 
 CONFIG=$1
 GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
 PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
 
 PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
-python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
-    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/train.py \
+    $CONFIG \
+    --seed 0 \
+    --launcher pytorch ${@:3}
diff --git a/tools/multinode_train.sh b/tools/multinode_train.sh
deleted file mode 100644
index 8c891134f7..0000000000
--- a/tools/multinode_train.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-set -x
-
-CONFIG=$1
-NODE_NUM=$2
-NODE_RANK=$3
-MASTER_ADDR=$4
-
-
-PORT=${PORT:-29500}
-
-PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
-python -m torch.distributed.launch --nproc_per_node=8 --master_port=$PORT \
-    --nnodes=$NODE_NUM --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR \
-    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:5}
\ No newline at end of file
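As a usage note on the environment-variable interface this patch introduces, here is a minimal sketch of a two-node, 16-GPU launch with the updated `tools/dist_train.sh`. The values are placeholders: the master IP reuses the 10.10.10.10 example from the removed docs, 29500 matches the script's default `PORT`, the GPU count mirrors the old `multinode_train.sh` (8 per node), and the config name is only illustrative. With `NNODES`, `NODE_RANK`, and `MASTER_ADDR` left unset, the new defaults (1 node, rank 0, 127.0.0.1) keep single-machine launches working as before.

```shell
# Hypothetical 2-node x 8-GPU training launch with the updated tools/dist_train.sh.
# 10.10.10.10 stands in for the first node's IP; 29500 is the script's default PORT.

# on node 0 (hosts the master process):
NNODES=2 NODE_RANK=0 PORT=29500 MASTER_ADDR=10.10.10.10 \
  ./tools/dist_train.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py 8

# on node 1:
NNODES=2 NODE_RANK=1 PORT=29500 MASTER_ADDR=10.10.10.10 \
  ./tools/dist_train.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py 8
```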
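The same variables drive the updated `tools/dist_test.sh`, so multi-node evaluation could look like the sketch below. The checkpoint path and the trailing `--eval mAP` are placeholder extras; anything after the GPU count is simply forwarded to `tools/test.py` through `${@:4}`.

```shell
# Hypothetical 2-node x 8-GPU evaluation with the updated tools/dist_test.sh.
# Checkpoint path and --eval mAP are placeholders forwarded to tools/test.py.

# on node 0:
NNODES=2 NODE_RANK=0 PORT=29500 MASTER_ADDR=10.10.10.10 \
  ./tools/dist_test.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py \
  work_dirs/pp_kitti_3class/latest.pth 8 --eval mAP

# on node 1:
NNODES=2 NODE_RANK=1 PORT=29500 MASTER_ADDR=10.10.10.10 \
  ./tools/dist_test.sh hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py \
  work_dirs/pp_kitti_3class/latest.pth 8 --eval mAP
```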