Skip to content

Commit

Permalink
Discover variables for NCCL backend on AML without mpi4py (deepspeeda…
Browse files Browse the repository at this point in the history
…i#542)

* Use AML method to set env vars instead of using mpi4py.

Co-authored-by: Jeff Rasley <[email protected]>
  • Loading branch information
awan-10 and jeffra authored Nov 19, 2020
1 parent d81cb26 commit 1b45917
Showing 1 changed file with 47 additions and 1 deletion.
48 changes: 47 additions & 1 deletion deepspeed/runtime/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,10 @@ def __init__(self,
if dist_init_required is None:
dist_init_required = not dist.is_initialized()

self._mpi_check(args, dist_init_required)
if self._in_aml():
self._set_environment_variables_for_nccl_backend(args)
else:
self._mpi_check(args, dist_init_required)

self.dist_backend = "nccl"
if dist_init_required:
Expand Down Expand Up @@ -199,6 +202,49 @@ def __init__(self,
self.flatten = util_ops.flatten
self.unflatten = util_ops.unflatten

def _in_aml(self):
# read and environment variable to detect if we are using an Azure ML environment
if 'AZUREML_EXPERIMENT_ID' in os.environ:
return True
else:
return False

def _set_environment_variables_for_nccl_backend(self,
args,
master_port=6105,
verbose=True):
"""Helper routine to get and set environment variables.
This is adapted from Azure ML's documentation available from:
https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi
"""
os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(
os.environ["WORLD_SIZE"])
if not single_node:
master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":")
os.environ["MASTER_ADDR"] = master_node_params[0]
# Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE
if "MASTER_PORT" not in os.environ:
os.environ["MASTER_PORT"] = str(master_port)
else:
os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
os.environ["MASTER_PORT"] = "54965"
print("NCCL_SOCKET_IFNAME original value = {}".format(
os.environ["NCCL_SOCKET_IFNAME"]))

os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
args.local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])

if verbose:
logger.info(
"Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
.format(os.environ['RANK'],
args.local_rank,
os.environ['WORLD_SIZE'],
os.environ['MASTER_ADDR'],
os.environ['MASTER_PORT']))

def _mpi_check(self, args, dist_init_required):
if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
from mpi4py import MPI
Expand Down

0 comments on commit 1b45917

Please sign in to comment.