Skip to content

Commit

Permalink
Reverted -x FI_EFA_USE_DEVICE_RDMA=1 to fix a crash on PyTorch Dataloaders for Distributed training (aws#106)
Browse files Browse the repository at this point in the history
  • Loading branch information
piyushghai authored Apr 23, 2021
1 parent 0efc75f commit 2936f22
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 5 deletions.
3 changes: 0 additions & 3 deletions src/sagemaker_training/smdataparallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,6 @@ def _get_mpirun_command(
mpirun_command.extend(additional_options)

instance_type = self._get_instance_type()
# Use EFA's RDMA functionality for one-sided and two-sided transfer
if instance_type in ["ml.p4d.24xlarge"]:
mpirun_command.extend(["-x", "FI_EFA_USE_DEVICE_RDMA=1"])

if smdataparallel_server_addr and smdataparallel_server_port:
# in case of multi-node [distributed] training, smdataparallel_server_addr,
Expand Down
2 changes: 0 additions & 2 deletions test/unit/test_smdataparallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,6 @@ def test_smdataparallel_run_single_node_python(
"-x",
"LD_PRELOAD=%s" % inspect.getfile(gethostname),
"--verbose",
"-x",
"FI_EFA_USE_DEVICE_RDMA=1",
"smddprun",
"usr/bin/python3",
"-m",
Expand Down

0 comments on commit 2936f22

Please sign in to comment.