Skip to content

Commit

Permalink
link mpi when either use_mpi or use_nccl enabled (#14467)
Browse files Browse the repository at this point in the history
### Only link MPI when either use_mpi or use_nccl is enabled

This fixes issue #14278.

After talking with @askhade, we think that if users want to enable NCCL/MPI
but MPI is not found, the build should fail instead of emitting a warning.
So this PR makes that change. As a result, to make the CIs pass, we would
need to disable NCCL/MPI explicitly in the build command. This PR takes an
alternative approach instead: since NCCL and MPI are not used by customers,
NCCL is disabled by default if "--disable_nccl" is not specified, and
MPI is disabled by default if "--use_mpi" is not specified.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
  • Loading branch information
pengwa authored Feb 3, 2023
1 parent c6c1103 commit 7eca424
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 15 deletions.
31 changes: 18 additions & 13 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1347,19 +1347,22 @@ if (onnxruntime_ENABLE_TRAINING)

find_package(MPI)

if (MPI_CXX_FOUND)
message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
set(onnxruntime_USE_NCCL OFF)
set(onnxruntime_USE_MPI OFF)
message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." )
if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL)
if (MPI_CXX_FOUND)
message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
message(
FATAL_ERROR
"MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled."
)
endif()
endif()

# Find NCCL and MPI
if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND)
if (onnxruntime_USE_NCCL)
if (onnxruntime_USE_CUDA)
set(NCCL_LIBNAME "nccl")
elseif (onnxruntime_USE_ROCM)
Expand Down Expand Up @@ -1417,13 +1420,15 @@ if (onnxruntime_ENABLE_TRAINING)
add_definitions(-DORT_USE_NCCL=1)
message( STATUS "NCCL is enabled in Linux GPU Build." )
else ()
set(onnxruntime_USE_NCCL OFF)
message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
message(
FATAL_ERROR
"NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled."
)
endif()
endif()
endif()

if (onnxruntime_USE_MPI AND MPI_CXX_FOUND)
if (onnxruntime_USE_MPI)
add_definitions(-DUSE_MPI=1)
endif()

Expand Down
6 changes: 4 additions & 2 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,12 @@ def convert_arg_line_to_args(self, arg_line):
parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.")
parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.")

parser.add_argument("--disable_nccl", action="store_true", help="Disable Nccl.")
parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.")
parser.add_argument("--mpi_home", help="Path to MPI installation dir")
parser.add_argument("--nccl_home", help="Path to NCCL installation dir")
parser.add_argument("--use_mpi", nargs="?", default=True, const=True, type=_str_to_bool)
parser.add_argument(
"--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default."
)

# enable ONNX tests
parser.add_argument(
Expand Down

0 comments on commit 7eca424

Please sign in to comment.