From 2936f229f1dbfda9e9a869df77e87888bf7181e1 Mon Sep 17 00:00:00 2001
From: Piyush Ghai
Date: Fri, 23 Apr 2021 14:43:12 -0700
Subject: [PATCH] Reverted -x FI_EFA_USE_DEVICE_RDMA=1 to fix a crash on PyTorch Dataloaders for Distributed training (#106)

---
 src/sagemaker_training/smdataparallel.py | 3 ---
 test/unit/test_smdataparallel.py         | 2 --
 2 files changed, 5 deletions(-)

diff --git a/src/sagemaker_training/smdataparallel.py b/src/sagemaker_training/smdataparallel.py
index fa8b171a..62763583 100644
--- a/src/sagemaker_training/smdataparallel.py
+++ b/src/sagemaker_training/smdataparallel.py
@@ -168,9 +168,6 @@ def _get_mpirun_command(
             mpirun_command.extend(additional_options)
 
         instance_type = self._get_instance_type()
-        # Use EFA's RDMA functionality for one-sided and two-sided transfer
-        if instance_type in ["ml.p4d.24xlarge"]:
-            mpirun_command.extend(["-x", "FI_EFA_USE_DEVICE_RDMA=1"])
 
         if smdataparallel_server_addr and smdataparallel_server_port:
             # in case of multi-node [distributed] training, smdataparallel_server_addr,
diff --git a/test/unit/test_smdataparallel.py b/test/unit/test_smdataparallel.py
index 9917dacb..081fe4b9 100644
--- a/test/unit/test_smdataparallel.py
+++ b/test/unit/test_smdataparallel.py
@@ -221,8 +221,6 @@ def test_smdataparallel_run_single_node_python(
                 "-x",
                 "LD_PRELOAD=%s" % inspect.getfile(gethostname),
                 "--verbose",
-                "-x",
-                "FI_EFA_USE_DEVICE_RDMA=1",
                 "smddprun",
                 "usr/bin/python3",
                 "-m",