From a7d7d7840577ebb26232989de9aeb48f94c87706 Mon Sep 17 00:00:00 2001 From: "Peng Wang(AI FWK)" Date: Sun, 29 Jan 2023 10:10:57 +0000 Subject: [PATCH 1/4] only link mpi when needed --- cmake/CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b315b346f7b05..98927896442bf 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1365,15 +1365,17 @@ if (onnxruntime_ENABLE_TRAINING) find_package(MPI) - if (MPI_CXX_FOUND) - message( STATUS "MPI Version: ${MPI_CXX_VERSION}") - message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) - mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) - else () - set(onnxruntime_USE_NCCL OFF) - set(onnxruntime_USE_MPI OFF) - message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." ) + if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL) + if (MPI_CXX_FOUND) + message( STATUS "MPI Version: ${MPI_CXX_VERSION}") + message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) + mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) + else () + set(onnxruntime_USE_NCCL OFF) + set(onnxruntime_USE_MPI OFF) + message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." ) + endif() endif() # Find NCCL and MPI From 169684f9b45b57ad2e44b1d12bf3b1c8aa803729 Mon Sep 17 00:00:00 2001 From: "Peng Wang(AI FWK)" Date: Tue, 31 Jan 2023 08:37:03 +0000 Subject: [PATCH 2/4] make mpi not found fatal error when it is really needed --- cmake/CMakeLists.txt | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 98927896442bf..ed63731ecb046 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1372,14 +1372,15 @@ if (onnxruntime_ENABLE_TRAINING) mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) else () - set(onnxruntime_USE_NCCL OFF) - set(onnxruntime_USE_MPI OFF) - message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." ) + message( + FATAL_ERROR + "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." + ) endif() endif() # Find NCCL and MPI - if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND) + if (onnxruntime_USE_NCCL) if (onnxruntime_USE_CUDA) set(NCCL_LIBNAME "nccl") elseif (onnxruntime_USE_ROCM) @@ -1437,13 +1438,15 @@ if (onnxruntime_ENABLE_TRAINING) add_definitions(-DORT_USE_NCCL=1) message( STATUS "NCCL is enabled in Linux GPU Build." ) else () - set(onnxruntime_USE_NCCL OFF) - message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." ) + message( + FATAL_ERROR + "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." + ) endif() endif() endif() - if (onnxruntime_USE_MPI AND MPI_CXX_FOUND) + if (onnxruntime_USE_MPI) add_definitions(-DUSE_MPI=1) endif() From 0c0776d7e46548f5c3159d685b7568786cd5e845 Mon Sep 17 00:00:00 2001 From: "Peng Wang(AI FWK)" Date: Tue, 31 Jan 2023 10:24:48 +0000 Subject: [PATCH 3/4] disable MPI and NCCL by default --- tools/ci_build/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 432c93599668c..74d9d774bfdd2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -192,10 +192,10 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.") parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.") - parser.add_argument("--disable_nccl", action="store_true", help="Disable Nccl.") + parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.") parser.add_argument("--mpi_home", help="Path to MPI installation dir") parser.add_argument("--nccl_home", help="Path to NCCL installation dir") - parser.add_argument("--use_mpi", nargs="?", default=True, const=True, type=_str_to_bool) + parser.add_argument("--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="By default MPI is disabled.") # enable ONNX tests parser.add_argument( From f6af9ce9d320f32186a890d9113b53ab5c106ab5 Mon Sep 17 00:00:00 2001 From: "Peng Wang(AI FWK)" Date: Thu, 2 Feb 2023 08:56:27 +0000 Subject: [PATCH 4/4] fix format --- tools/ci_build/build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 74d9d774bfdd2..a68045d14de92 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -195,7 +195,9 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.") parser.add_argument("--mpi_home", help="Path to MPI installation dir") parser.add_argument("--nccl_home", help="Path to NCCL installation dir") - parser.add_argument("--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="By default MPI is disabled.") + parser.add_argument( + "--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default." + ) # enable ONNX tests parser.add_argument(