Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/nccl dso #8240

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions cmake/external/nccl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,32 @@ include(ExternalProject)

set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)

include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
# https://github.com/PaddlePaddle/Paddle/issues/8195
# Note: nccl2.1.4 seems to work well on cuda9, but is not compatible with cuda8
# TODO(dzhwinter): disable the NCCL DSO temporarily, should be removed
# also the commented out code in nccl.h
set(WITH_DSO OFF)

# Choose how NCCL is provided: either locate an existing nccl.h (DSO mode) or
# build the downloaded sources and install them under THIRD_PARTY_PATH.
# The NCCL_BUILD_COMMAND / NCCL_INSTALL_COMMAND / NCCL_INSTALL_DIR variables set
# here are consumed by the ExternalProject_Add() call that follows.
# NOTE(review): this text is a rendered diff without +/- markers, so the quoted
# and unquoted set(NCCL_BUILD_COMMAND ...) / set(NCCL_INSTALL_COMMAND ...) pairs
# in the else() branch are presumably the before/after of the same two lines --
# confirm against the real file. Read literally, the later (unquoted)
# assignments are the ones that take effect.
if(WITH_DSO)
# If we use DSO, we use system default nccl.h
set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT")
# Search only the listed locations (NO_DEFAULT_PATH disables CMake's default
# search paths): the NCCL_ROOT cache variable, the NCCL_ROOT environment
# variable, and CUDA_TOOLKIT_INCLUDE.
find_path(NCCL_INCLUDE_DIR nccl.h
PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include
$ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
NO_DEFAULT_PATH
)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
# Note: cuda 8.0 is needed to make nccl
# When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
# Quoted form: each variable holds one single string (e.g. "make -j 8").
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}")
# Unquoted form: each variable holds a list (e.g. make;-j;8), one element per
# argument.
set(NCCL_BUILD_COMMAND make -j 8)
set(NCCL_INSTALL_COMMAND make install PREFIX=${NCCL_INSTALL_DIR})
endif()

ExternalProject_Add(
Expand All @@ -44,6 +56,7 @@ ExternalProject_Add(
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If BUILD_IN_SOURCE is set, then NCCL_INSTALL_DIR is not needed.

BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
Expand Down
1 change: 1 addition & 0 deletions paddle/platform/dynload/nccl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ void *nccl_dso_handle;
// Defines the wrapper object `__name` of type `DynLoad__<__name>`; this is the
// definition matching the `extern DynLoad__##__name __name` declarations
// produced by the DECLARE_DYNAMIC_LOAD_NCCL_WRAP* macros in nccl.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

// Define one wrapper object per routine listed in NCCL_RAND_ROUTINE_EACH.
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
// ncclCommDestroy gets its own explicit definition in addition to the list
// expansion above.
DEFINE_WRAP(ncclCommDestroy);

void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
Expand Down
17 changes: 13 additions & 4 deletions paddle/platform/dynload/nccl.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;

/*
#ifdef PADDLE_USE_DSO
extern void LoadNCCLDSO();

Expand All @@ -42,6 +43,8 @@ extern void LoadNCCLDSO();
}; \
extern DynLoad__##__name __name
#else
#endif
*/
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
Expand All @@ -50,26 +53,32 @@ extern void LoadNCCLDSO();
} \
}; \
extern DynLoad__##__name __name
#endif
// Declares a functor type `DynLoad__<__name>` plus an `extern` object named
// `__name` of that type. The functor's variadic operator() forwards its
// arguments (by value) to the function `__name` and discards the result, i.e.
// the call operator itself returns void.
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP_NO_RETURN(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
void operator()(Args... args) { \
__name(args...); \
} \
}; \
extern DynLoad__##__name __name

// X-macro: invokes `__macro` once for every NCCL routine name listed below, so
// the same list can drive both declarations and definitions of the wrappers.
// NOTE(review): ncclCommDestroy appears in this list and is also handled
// separately (via DECLARE_DYNAMIC_LOAD_NCCL_WRAP_NO_RETURN / DEFINE_WRAP).
// Because this text is a rendered diff without +/- markers, the entry here is
// presumably the removed line -- confirm against the real header.
#define NCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \
__macro(ncclGetErrorString);

// Declare a wrapper functor and extern object for every routine in the list.
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)

// ncclCommDestroy is declared with the variant whose call operator returns
// void instead of forwarding the callee's result.
DECLARE_DYNAMIC_LOAD_NCCL_WRAP_NO_RETURN(ncclCommDestroy);

} // namespace dynload
} // namespace platform
} // namespace paddle