diff --git a/src/sagemaker_training/smdataparallel.py b/src/sagemaker_training/smdataparallel.py index 9fcfe492..f74df134 100644 --- a/src/sagemaker_training/smdataparallel.py +++ b/src/sagemaker_training/smdataparallel.py @@ -129,6 +129,9 @@ def _get_mpirun_command( "orte_abort_on_non_zero_status", "1", "-mca", + "btl_vader_single_copy_mechanism", + "none", + "-mca", "plm_rsh_num_concurrent", str(num_hosts), "-x", diff --git a/test/unit/test_smdataparallel.py b/test/unit/test_smdataparallel.py index edd3a794..a55b1d63 100644 --- a/test/unit/test_smdataparallel.py +++ b/test/unit/test_smdataparallel.py @@ -90,6 +90,9 @@ def test_smdataparallel_run_multi_node_python( "orte_abort_on_non_zero_status", "1", "-mca", + "btl_vader_single_copy_mechanism", + "none", + "-mca", "plm_rsh_num_concurrent", str(num_hosts), "-x", @@ -189,6 +192,9 @@ def test_smdataparallel_run_single_node_python( "orte_abort_on_non_zero_status", "1", "-mca", + "btl_vader_single_copy_mechanism", + "none", + "-mca", "plm_rsh_num_concurrent", str(num_hosts), "-x",