Fix: adding EFA-specific setup to distributed training runner for PT-XLA #143

Merged 2 commits on Aug 15, 2022
13 changes: 13 additions & 0 deletions src/sagemaker_training/pytorch_xla.py
@@ -72,6 +72,19 @@ def _setup(self): # type: () -> None
logger.info("Starting distributed training through PT-XLA Runtime.")
self._check_compatibility()

# Set NCCL logging to info to debug customer issues
os.environ["NCCL_DEBUG"] = "info"
Member:
This may potentially conflict with the settings in the Dockerfiles. We currently add this in the deep learning containers like so:
echo NCCL_DEBUG=INFO >> /etc/nccl.conf

Reply:

Overriding the default is the intent: we want to explicitly enable debug logging for NCCL operations. This lets customers share more detailed logs with us when they hit failures with Training Compiler in distributed training scenarios, which helps us debug issues faster. In our internal benchmarks we observed competitive performance even with this debug logging turned on. SageMaker data parallel likewise enables NCCL debug logging explicitly when it is enabled.


# Use `simple` protocol to handle the out-of-order data delivery from EFA
os.environ["NCCL_PROTO"] = "simple"

# Use GPU RDMA when available (available only in p4d.24xlarge)
os.environ["FI_EFA_USE_DEVICE_RDMA"] = "1"

# Use multiple connections per GPU to better saturate the EFA bandwidth
os.environ["OFI_NCCL_NIC_DUP_CONNS"] = str(self._num_gpus)

# Set cluster configuration for XLA runtime
os.environ["XRT_HOST_ORDINAL"] = str(self._rank)
os.environ["XRT_SHARD_WORLD_SIZE"] = str(self._num_hosts)
address = "localservice:{};{}:" + str(self.WORKER_PORT)
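For context on the NCCL_DEBUG discussion above: NCCL loads /etc/nccl.conf but, to our understanding, does not overwrite variables that are already present in the process environment, so a value exported by the runner before the workers start takes precedence over the container default. A minimal sketch of that override pattern; the helper name and the extra NCCL_DEBUG_SUBSYS value are illustrative assumptions, not part of this PR:

import os

def configure_nccl_debug_logging(level="info"):
    # Illustrative sketch only. Export NCCL settings in the parent process
    # before any worker initializes NCCL; values already in the environment
    # take precedence over the defaults written to /etc/nccl.conf.
    os.environ["NCCL_DEBUG"] = level
    # Hypothetical extra detail for init/network issues; not set by this PR.
    os.environ.setdefault("NCCL_DEBUG_SUBSYS", "INIT,NET")

configure_nccl_debug_logging()
# ...launch the distributed workers after this point so they inherit the settings...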
4 changes: 4 additions & 0 deletions test/unit/test_pytorch_xla.py
@@ -85,6 +85,10 @@ def test_setup(self, cluster, cluster_size, master, instance_type, num_gpus, *pa
)
runner._check_compatibility = lambda: None
runner._setup()
assert os.environ["NCCL_DEBUG"] == "info"
assert os.environ["NCCL_PROTO"] == "simple"
assert os.environ["FI_EFA_USE_DEVICE_RDMA"] == "1"
assert os.environ["OFI_NCCL_NIC_DUP_CONNS"] == str(num_gpus)
assert os.environ["XRT_HOST_ORDINAL"] == str(rank)
assert os.environ["XRT_SHARD_WORLD_SIZE"] == str(cluster_size)
assert os.environ["XRT_WORKERS"] == "|".join(
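For illustration, a rough sketch of how the XRT cluster variables and the pipe-joined XRT_WORKERS string asserted above might resolve; the hostnames, rank, and port below are assumed placeholder values, not taken from the PR:

import os

# Hypothetical example values -- not taken from the PR.
hosts = ["algo-1", "algo-2"]   # assumed hostnames of the training cluster
rank = 0                       # ordinal of the current host
worker_port = 7680             # assumed stand-in for WORKER_PORT

os.environ["XRT_HOST_ORDINAL"] = str(rank)            # "0"
os.environ["XRT_SHARD_WORLD_SIZE"] = str(len(hosts))  # "2"

# Each worker entry follows the localservice:<ordinal>;<host>:<port>
# template from the diff above; entries for all hosts are joined with "|".
address = "localservice:{};{}:" + str(worker_port)
os.environ["XRT_WORKERS"] = "|".join(
    address.format(i, host) for i, host in enumerate(hosts)
)
# Resulting value: "localservice:0;algo-1:7680|localservice:1;algo-2:7680"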