Launch DDP with script (#350)
* first working version

* wip

* fix bugs - now working with n_proc_node

* update recipe with nproc_per

* set logging level and small updates

* trying torchrun (breaking)

* wip

* wip

* launch_elastic working

* clean version that does it all - not tested yet

* cleaning code

* clean branch

* improve doc in run.py

* add version_base to hydra.main and add comment

* renaming script and removing import

* flake8

* wip

* working version

* remove unused code

* update function names and comments

* small update

* rename nproc into n_gpus

* finish renaming of gpu and small fixes

* update readme

* add logging when launching DDP with job info, and moving functions to training.utils.distributed...

* remove train_from_recipe_no_ddp.py because it's not needed anymore

* flake8
Louis-Dupont authored Sep 11, 2022
1 parent 269a286 commit c1404b9
Showing 25 changed files with 215 additions and 92 deletions.
52 changes: 24 additions & 28 deletions README.md
@@ -487,10 +487,6 @@ The basic basic syntax is as follow:
```
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=<CONFIG-NAME> dataset_params.data_dir=<PATH-TO-DATASET>
```
But in most cases you will want to train on multiple GPU's using this syntax:
```
python -m torch.distributed.launch --nproc_per_node=<N-NODES> src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=<CONFIG-NAME> dataset_params.data_dir=<PATH-TO-DATASET>
```
*Note: this script needs to be launched from the root folder of super_gradients*
*Note: if you stored your dataset in the path specified by the recipe you can drop "dataset_params.data_dir=<PATH-TO-DATASET>".*

@@ -501,7 +497,7 @@ You will find information about the performance of a recipe as well as the comma

*Example: [Training of YoloX Small on Coco 2017](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/coco2017_yolox.yaml), using 8 GPU*
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s dataset_params.data_dir=/home/coco2017
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s dataset_params.data_dir=/home/coco2017
```


@@ -526,19 +522,19 @@ python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.

efficientnet
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_efficientnet
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_efficientnet
```
mobilenetv2
```
python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv2
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv2
```
mobilenetv3 small
```
python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_small
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_small
```
mobilenetv3 large
```
python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_large
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_large
```
regnetY200
```
@@ -558,23 +554,23 @@ python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.
```
repvgg
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_repvgg
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_repvgg
```
resnet50
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_resnet50
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_resnet50
```
resnet50_kd
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py --config-name=imagenet_resnet50_kd
python src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py --config-name=imagenet_resnet50_kd
```
vit_base
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_vit_base
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_vit_base
```
vit_large
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_vit_large
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_vit_large
```
</details>

@@ -585,31 +581,31 @@ python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/exampl

ssd_lite_mobilenet_v2
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2
```
yolox_n
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n
```
yolox_t
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_t
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_t
```
yolox_s
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s
```
yolox_m
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_m
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_m
```
yolox_l
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_l
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_l
```
yolox_x
```
python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_x
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_x
```

</details>
@@ -622,31 +618,31 @@ python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/exampl

DDRNet23
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet
```
DDRNet23-Slim
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim
```
RegSeg48
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48
```
STDC1-Seg50
```
python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50
```
STDC2-Seg50
```
python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 architecture=stdc2_seg
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 architecture=stdc2_seg
```
STDC1-Seg75
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75
```
STDC2-Seg75
```
python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 external_checkpoint_path=<stdc2-backbone-pretrained-path> architecture=stdc2_seg
python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 external_checkpoint_path=<stdc2-backbone-pretrained-path> architecture=stdc2_seg
```

</details>
53 changes: 39 additions & 14 deletions src/super_gradients/common/environment/env_helpers.py
@@ -1,6 +1,7 @@
import argparse
import os
import sys
import socket
from functools import wraps

from omegaconf import OmegaConf
@@ -59,24 +60,37 @@ def hydra_output_dir_resolver(ckpt_root_dir, experiment_name):

def init_trainer():
"""
a function to initialize the super_gradients environment. This function should be the first thing to be called
by any code running super_gradients. It resolves conflicts between the different tools, packages and environments used
and prepares the super_gradients environment.
Initialize the super_gradients environment.
This function should be the first thing to be called by any code running super_gradients.
It resolves conflicts between the different tools, packages and environments used and prepares the super_gradients environment.
"""
OmegaConf.register_new_resolver(
"hydra_output_dir", hydra_output_dir_resolver, replace=True
)

register_hydra_resolvers()

# We pop local_rank if it was specified in the args, because it would break
args_local_rank = pop_arg("local_rank", default_value=-1)

# Set local_rank with priority order (env variable > args.local_rank > args.default_value)
environment_config.DDP_LOCAL_RANK = int(os.getenv("LOCAL_RANK", default=args_local_rank))


def register_hydra_resolvers():
"""Register all the hydra resolvers required for the super-gradients recipes."""
OmegaConf.register_new_resolver("hydra_output_dir", hydra_output_dir_resolver, replace=True)


def pop_arg(arg_name: str, default_value: int = None) -> argparse.Namespace:
"""Get the specified args and remove them from argv"""

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1) # used by DDP
parser.add_argument(f"--{arg_name}", default=default_value)
args, _ = parser.parse_known_args()

# remove any flags starting with --local_rank from the argv list
to_remove = list(filter(lambda x: x.startswith('--local_rank'), sys.argv))
if len(to_remove) > 0:
for val in to_remove:
sys.argv.remove(val)

environment_config.DDP_LOCAL_RANK = args.local_rank
# Remove the ddp args to not have a conflict with the use of hydra
for val in filter(lambda x: x.startswith(f"--{arg_name}"), sys.argv):
sys.argv.remove(val)
return vars(args)[arg_name]


def is_distributed() -> bool:
@@ -102,3 +116,14 @@ def wrapper(*args, **kwargs):
return do_nothing(*args, **kwargs)

return wrapper


def find_free_port() -> int:
"""Find an available port of current machine/node.
Note: there is still a chance the port could be taken by other processes."""

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
# Binding to port 0 will cause the OS to find an available port for us
sock.bind(("", 0))
_ip, port = sock.getsockname()
return port
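
For orientation (this snippet is not part of the diff): `pop_arg` strips a flag such as `--local_rank` out of `sys.argv` before Hydra gets to parse the command line, and `find_free_port` picks a rendezvous port. A minimal, self-contained sketch, with a made-up `--local_rank=2` value:

```
# Standalone illustration of pop_arg / find_free_port (hypothetical values).
import os
import sys

from super_gradients.common.environment.env_helpers import find_free_port, pop_arg

if __name__ == "__main__":
    # Pretend a DDP wrapper injected --local_rank into the command line.
    sys.argv += ["--local_rank=2"]

    local_rank = pop_arg("local_rank", default_value=-1)  # strips the flag so Hydra never sees it
    assert not any(arg.startswith("--local_rank") for arg in sys.argv)

    # Mirrors init_trainer(): the LOCAL_RANK env variable takes priority over the CLI flag.
    ddp_local_rank = int(os.getenv("LOCAL_RANK", default=local_rank))

    # Reserve a port for the DDP rendezvous; as the docstring notes, another
    # process could still grab it before it is actually used.
    os.environ.setdefault("MASTER_PORT", str(find_free_port()))
    print(ddp_local_rank, os.environ["MASTER_PORT"])
```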
@@ -23,6 +23,7 @@

# Controlling the default logging level via environment variable
DEFAULT_LOGGING_LEVEL = environ.get("LOG_LEVEL", "INFO").upper()

logging.basicConfig(
level=DEFAULT_LOGGING_LEVEL
) # Set the default level for all libraries - including 3rd party packages
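
Since `logging.basicConfig` runs at import time, the variable only takes effect if it is set before the first `super_gradients` import. A quick sketch (the `DEBUG` value is just an example, and this assumes the module above is imported together with the package):

```
# Set the log level before importing super_gradients, otherwise basicConfig has already run.
import os

os.environ["LOG_LEVEL"] = "DEBUG"

import super_gradients  # noqa: E402  - deliberately imported after setting the variable

super_gradients.init_trainer()
```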
@@ -12,7 +12,7 @@
from super_gradients.training.kd_trainer import KDTrainer


@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""))
@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2")
def main(cfg: DictConfig) -> None:
KDTrainer.train_from_config(cfg)

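Both example scripts now pin `version_base` in `@hydra.main`. For reference, the same pattern in a minimal standalone Hydra app (config-free, purely illustrative) looks like this:

```
# Minimal Hydra app pinning version_base, mirroring the change above (illustrative only).
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path=None, config_name=None, version_base="1.2")
def main(cfg: DictConfig) -> None:
    # Passing version_base silences Hydra's compatibility-default warning
    # and opts into the 1.2 behaviour explicitly.
    print(OmegaConf.to_yaml(cfg))


if __name__ == "__main__":
    main()
```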
@@ -5,19 +5,22 @@
For recipe's specific instructions and details refer to the recipe's configuration file in the recipes directory.
"""

import super_gradients
from omegaconf import DictConfig
import hydra
import pkg_resources

from super_gradients import Trainer
from super_gradients import Trainer, init_trainer


@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""))
@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2")
def main(cfg: DictConfig) -> None:
Trainer.train_from_config(cfg)


if __name__ == "__main__":
super_gradients.init_trainer()
def run():
init_trainer()
main()


if __name__ == "__main__":
run()
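
Per the commit messages, the DDP workers are now spawned from inside the entry point through helpers moved to `training.utils.distributed`, which are not among the hunks shown on this page. Purely as an illustration of the idea (not the actual implementation), a single-node launcher built on PyTorch's elastic launch API could look roughly like this; `launch_ddp` and its arguments are hypothetical:

```
# Hypothetical sketch of an in-script DDP launcher; the real one lives in
# super_gradients.training.utils.distributed and is not shown in this diff.
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

from super_gradients.common.environment.env_helpers import find_free_port


def launch_ddp(entrypoint, num_gpus: int) -> None:
    """Run `entrypoint` in `num_gpus` worker processes on the current node."""
    config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=num_gpus,
        run_id="sg_ddp_example",
        rdzv_backend="c10d",
        # A free port for the rendezvous avoids clashing with other jobs on the machine.
        rdzv_endpoint=f"127.0.0.1:{find_free_port()}",
        start_method="spawn",
    )
    # Each worker gets LOCAL_RANK / RANK / WORLD_SIZE in its environment,
    # which init_trainer() then picks up (see env_helpers.py above).
    elastic_launch(config, entrypoint)()
```

With such a helper, `launch_ddp(run, num_gpus=4)` would presumably re-enter `run()` once per GPU, with Hydra re-parsing the same command line in every worker.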
5 changes: 3 additions & 2 deletions src/super_gradients/recipes/cityscapes_ddrnet.yaml
@@ -9,8 +9,8 @@
# 0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
# 1. Move to the project root (where you will find the ReadMe and src folder)
# 2. Run the command:
# DDRNet23: python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet
# DDRNet23-Slim: python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim
# DDRNet23: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet
# DDRNet23-Slim: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim
# Note: add "checkpoint_params.external_checkpoint_path=<ddrnet23-backbone-pretrained-path>" to use pretrained backbone
#
# Validation mIoU - Cityscapes, training time:
@@ -82,6 +82,7 @@ model_checkpoints_location: local
ckpt_root_dir:

multi_gpu: DDP
num_gpus: 4

hydra:
run:
3 changes: 2 additions & 1 deletion src/super_gradients/recipes/cityscapes_regseg48.yaml
@@ -6,7 +6,7 @@
# 0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
# 1. Move to the project root (where you will find the ReadMe and src folder)
# 2. Run the command:
# python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48
# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48
#
#
# Validation mIoU - Cityscapes, training time:
@@ -88,6 +88,7 @@ project_name: RegSeg
experiment_name: ${architecture}_cityscapes

multi_gpu: AUTO
num_gpus: 4

hydra:
searchpath:
5 changes: 3 additions & 2 deletions src/super_gradients/recipes/cityscapes_stdc_seg50.yaml
@@ -6,8 +6,8 @@
# 0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
# 1. Move to the project root (where you will find the ReadMe and src folder)
# 2. Run the command:
# STDC1-Seg50: python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50
# STDC2-Seg50: python -m torch.distributed.launch --nproc_per_node=2 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 architecture=stdc2_seg
# STDC1-Seg50: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50
# STDC2-Seg50: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 architecture=stdc2_seg
# Note: add "checkpoint_params.external_checkpoint_path=<stdc1-backbone-pretrained-path>" to use pretrained backbone
#
#
@@ -70,6 +70,7 @@ training_hyperparams:
edge_kernel: 3

multi_gpu: DDP
num_gpus: 2

experiment_name: ${architecture}50_cityscapes
ckpt_root_dir:
5 changes: 3 additions & 2 deletions src/super_gradients/recipes/cityscapes_stdc_seg75.yaml
@@ -6,8 +6,8 @@
# 0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
# 1. Move to the project root (where you will find the ReadMe and src folder)
# 2. Run the command:
# STDC1-Seg75: python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75
# STDC2-Seg75: python -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 architecture=stdc2_seg
# STDC1-Seg75: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75
# STDC2-Seg75: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 architecture=stdc2_seg
# Note: add "external_checkpoint_path=<stdc1-backbone-pretrained-path>" to use pretrained backbone
#
#
@@ -74,6 +74,7 @@ training_hyperparams:
weights: [ 1., 0.6, 0.4, 1. ]

multi_gpu: DDP
num_gpus: 4

experiment_name: ${architecture}75_cityscapes
ckpt_root_dir:
@@ -11,14 +11,14 @@
# 0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
# 1. Move to the project root (where you will find the ReadMe and src folder)
# 2. Run the command:
# python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2
# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2


# NOTE:
# Anchors will be selected based on validation resolution and anchors_name
# To switch between anchors, set anchors_name to something else defined in the anchors section
# e.g.
# python -m torch.distributed.launch --nproc_per_node=8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 anchors_name=stride_16_plus
# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 anchors_name=stride_16_plus


defaults:
@@ -54,3 +54,4 @@ training_hyperparams:
dboxes: ${dboxes}

multi_gpu: DDP
num_gpus: 8
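
All of the recipe hunks above follow the same pattern: `multi_gpu` names the distributed strategy and `num_gpus` the worker count, replacing the `--nproc_per_node` flag from the old commands. How the trainer consumes these fields is not part of this page; a minimal, hypothetical sketch of the branching logic could look like:

```
# Hypothetical consumer of the new recipe fields (not SuperGradients' actual code).
from typing import Tuple

from omegaconf import DictConfig, OmegaConf


def resolve_gpu_setup(cfg: DictConfig) -> Tuple[str, int]:
    """Decide strategy and world size from the multi_gpu / num_gpus recipe fields."""
    multi_gpu = cfg.get("multi_gpu", "OFF")  # recipes above also use AUTO; its semantics are not shown here
    num_gpus = int(cfg.get("num_gpus", 1))
    if multi_gpu == "DDP" and num_gpus > 1:
        return "DDP", num_gpus  # one worker process per GPU
    return "OFF", 1  # plain single-process training


# Values taken from the coco2017_ssd_lite_mobilenet_v2.yaml hunk above:
cfg = OmegaConf.create({"multi_gpu": "DDP", "num_gpus": 8})
print(resolve_gpu_setup(cfg))  # ('DDP', 8)
```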

