Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[gpuCI] Auto-merge branch-0.18 to branch-0.19 [skip ci] #136

Merged
merged 23 commits into from
Feb 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
# RAFT 0.18.0 (Date TBD)
# 0.18.0

## New Features

## Improvements

## Bug Fixes
Please see https://github.com/rapidsai/raft/releases/tag/branch-0.18-latest for the latest changes to this development branch.

# RAFT 0.17.0 (Date TBD)
# RAFT 0.17.0 (10 Dec 2020)

## New Features
- PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects
Expand Down
8 changes: 7 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ ARGS=$*
# script, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest"
VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest --buildfaiss"
HELP="$0 [<target> ...] [<flag> ...]
where <target> is:
clean - remove all existing build artifacts and configuration (start over)
Expand All @@ -29,6 +29,7 @@ HELP="$0 [<target> ...] [<flag> ...]
-v - verbose build mode
-g - build for debug
--allgpuarch - build for all supported GPU architectures
--buildfaiss - build faiss statically into raft
--nvtx - Enable nvtx for profiling support
--show_depr_warn - show cmake deprecation warnings
-h - print this text
Expand All @@ -44,6 +45,7 @@ BUILD_DIRS="${CPP_RAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PYTHON_DEPS_CLONE}"
VERBOSE=""
BUILD_ALL_GPU_ARCH=0
BUILD_GTEST=OFF
BUILD_STATIC_FAISS=OFF
SINGLEGPU=""
NVTX=OFF
CLEAN=0
Expand Down Expand Up @@ -89,6 +91,9 @@ fi
if hasArg --buildgtest; then
BUILD_GTEST=ON
fi
if hasArg --buildfaiss; then
BUILD_STATIC_FAISS=ON
fi
if hasArg --singlegpu; then
SINGLEGPU="--singlegpu"
fi
Expand Down Expand Up @@ -140,6 +145,7 @@ if (( ${NUMARGS} == 0 )) || hasArg cppraft; then
-DNCCL_PATH=${INSTALL_PREFIX} \
-DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
-DBUILD_GTEST=${BUILD_GTEST} \
-DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \
..

fi
Expand Down
18 changes: 18 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

option(BUILD_GTEST "Build the GTEST library for running raft test executables" OFF)

option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF)

option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON)

option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" ON)
Expand Down Expand Up @@ -175,6 +177,18 @@ endif()
include(cmake/Dependencies.cmake)
include(cmake/comms.cmake)

###################################################################################################
# - FAISS -------------------------------------------------------------------------------------------

if(NOT BUILD_STATIC_FAISS)
find_path(FAISS_INCLUDE_DIRS "faiss"
HINTS
"$ENV{FAISS_ROOT}/include"
"$ENV{CONDA_PREFIX}/include/faiss"
"$ENV{CONDA_PREFIX}/include")
endif(NOT BUILD_STATIC_FAISS)
message(STATUS "FAISS: FAISS_INCLUDE_DIRS set to ${FAISS_INCLUDE_DIRS}")

###################################################################################################
# - RMM -------------------------------------------------------------------------------------------

Expand All @@ -196,6 +210,7 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING
set(RAFT_INCLUDE_DIRECTORIES
${RAFT_INCLUDE_DIR}
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
${FAISS_INCLUDE_DIRS}
${RMM_INCLUDE_DIRS})

if(NOT CUB_IS_PART_OF_CTK)
Expand All @@ -218,6 +233,7 @@ set(RAFT_LINK_LIBRARIES
${CUDA_curand_LIBRARY})

set(RAFT_LINK_DIRECTORIES
${FAISS_INCLUDE_DIRS}
${RMM_INCLUDE_DIRS})

if(DEFINED ENV{CONDA_PREFIX})
Expand Down Expand Up @@ -261,6 +277,7 @@ if(BUILD_RAFT_TESTS)
test/random/rng.cu
test/random/rng_int.cu
test/random/sample_without_replacement.cu
test/spatial/knn.cu
test/stats/mean.cu
test/stats/mean_center.cu
test/stats/stddev.cu
Expand All @@ -283,6 +300,7 @@ if(BUILD_RAFT_TESTS)
target_link_libraries(test_raft
PRIVATE
${RAFT_LINK_LIBRARIES}
FAISS::FAISS
GTest::GTest
GTest::Main
OpenMP::OpenMP_CXX
Expand Down
45 changes: 44 additions & 1 deletion cpp/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,48 @@ if(NOT CUB_IS_PART_OF_CTK)
INSTALL_COMMAND "")
endif(NOT CUB_IS_PART_OF_CTK)

##############################################################################
# - faiss --------------------------------------------------------------------

if(BUILD_STATIC_FAISS)
set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING
"Path to FAISS source directory")
ExternalProject_Add(faiss
GIT_REPOSITORY https://github.com/facebookresearch/faiss.git
GIT_TAG a5b850dec6f1cd6c88ab467bfd5e87b0cac2e41d
CONFIGURE_COMMAND LIBS=-pthread
CPPFLAGS=-w
LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib
${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure
--prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss
--with-blas=${BLAS_LIBRARIES}
--with-cuda=${CUDA_TOOLKIT_ROOT_DIR}
--with-cuda-arch=${FAISS_GPU_ARCHS}
-v
PREFIX ${FAISS_DIR}
BUILD_COMMAND make -j${PARALLEL_LEVEL} VERBOSE=1
BUILD_BYPRODUCTS ${FAISS_DIR}/lib/libfaiss.a
BUILD_ALWAYS 1
INSTALL_COMMAND make -s install > /dev/null
UPDATE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/faiss_cuda11.patch || true)

ExternalProject_Get_Property(faiss install_dir)
add_library(FAISS::FAISS STATIC IMPORTED)
set_property(TARGET FAISS::FAISS PROPERTY
IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a)
# to account for the FAISS file reorg that happened recently after the current
# pinned commit, just change the following line to
# set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss")
set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src")
else()
add_library(FAISS::FAISS SHARED IMPORTED)
set_property(TARGET FAISS::FAISS PROPERTY
IMPORTED_LOCATION $ENV{CONDA_PREFIX}/lib/libfaiss.so)
message(STATUS "Found FAISS: $ENV{CONDA_PREFIX}/lib/libfaiss.so")
endif(BUILD_STATIC_FAISS)

##############################################################################
# - googletest ---------------------------------------------------------------

Expand Down Expand Up @@ -65,4 +107,5 @@ endif(BUILD_GTEST)

if(NOT CUB_IS_PART_OF_CTK)
add_dependencies(GTest::GTest cub)
endif(NOT CUB_IS_PART_OF_CTK)
endif(NOT CUB_IS_PART_OF_CTK)
add_dependencies(FAISS::FAISS faiss)
40 changes: 40 additions & 0 deletions cpp/cmake/faiss_cuda11.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
diff --git a/configure b/configure
index ed40dae..f88ed0a 100755
--- a/configure
+++ b/configure
@@ -2970,7 +2970,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu


- ax_cxx_compile_alternatives="11 0x" ax_cxx_compile_cxx11_required=true
+ ax_cxx_compile_alternatives="14 11 0x" ax_cxx_compile_cxx11_required=true
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
index 89d3dda..bc0f9b5 100644
--- a/gpu/utils/DeviceDefs.cuh
+++ b/gpu/utils/DeviceDefs.cuh
@@ -13,7 +13,7 @@
namespace faiss { namespace gpu {

#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ <= 750
+#if __CUDA_ARCH__ <= 800
constexpr int kWarpSize = 32;
#else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability
diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh
index ede225e..4f7eb44 100644
--- a/gpu/utils/MatrixMult-inl.cuh
+++ b/gpu/utils/MatrixMult-inl.cuh
@@ -51,6 +51,9 @@ rawGemm(cublasHandle_t handle,
auto cBT = GetCudaType<BT>::Type;

// Always accumulate in f32
+# if __CUDACC_VER_MAJOR__ >= 11
+ cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+# endif
return cublasSgemmEx(handle, transa, transb, m, n, k,
&fAlpha, A, cAT, lda,
B, cBT, ldb,
48 changes: 48 additions & 0 deletions cpp/include/raft/comms/comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,15 @@ class comms_iface {
const size_t* recvcounts, const size_t* displs,
datatype_t datatype, cudaStream_t stream) const = 0;

virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
datatype_t datatype, int root,
cudaStream_t stream) const = 0;

virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount,
const size_t* recvcounts, const size_t* displs,
datatype_t datatype, int root,
cudaStream_t stream) const = 0;

virtual void reducescatter(const void* sendbuff, void* recvbuff,
size_t recvcount, datatype_t datatype, op_t op,
cudaStream_t stream) const = 0;
Expand Down Expand Up @@ -316,6 +325,45 @@ class comms_t {
get_type<value_t>(), stream);
}

/**
* Gathers data from each rank onto all ranks
* @tparam value_t datatype of underlying buffers
* @param sendbuff buffer containing data to gather
* @param recvbuff buffer containing gathered data from all ranks
* @param sendcount number of elements in send buffer
* @param root rank to store the results
* @param stream CUDA stream to synchronize operation
*/
template <typename value_t>
void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
int root, cudaStream_t stream) const {
impl_->gather(static_cast<const void*>(sendbuff),
static_cast<void*>(recvbuff), sendcount, get_type<value_t>(),
root, stream);
}

/**
* Gathers data from all ranks and delivers to combined data to all ranks
* @param value_t datatype of underlying buffers
* @param sendbuff buffer containing data to send
* @param recvbuff buffer containing data to receive
* @param sendcount number of elements in send buffer
* @param recvcounts pointer to an array (of length num_ranks size) containing the number of
* elements that are to be received from each rank
* @param displs pointer to an array (of length num_ranks size) to specify the displacement
* (relative to recvbuf) at which to place the incoming data from each rank
* @param root rank to store the results
* @param stream CUDA stream to synchronize operation
*/
template <typename value_t>
void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount,
const size_t* recvcounts, const size_t* displs, int root,
cudaStream_t stream) const {
impl_->gatherv(static_cast<const void*>(sendbuf),
static_cast<void*>(recvbuf), sendcount, recvcounts, displs,
get_type<value_t>(), root, stream);
}

/**
* Reduces data from all ranks then scatters the result across ranks
* @tparam value_t datatype of underlying buffers
Expand Down
33 changes: 33 additions & 0 deletions cpp/include/raft/comms/mpi_comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,39 @@ class mpi_comms : public comms_iface {
}
}

void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
datatype_t datatype, int root, cudaStream_t stream) const {
size_t dtype_size = get_datatype_size(datatype);
NCCL_TRY(ncclGroupStart());
if (get_rank() == root) {
for (int r = 0; r < get_size(); ++r) {
NCCL_TRY(ncclRecv(
static_cast<char*>(recvbuff) + sendcount * r * dtype_size, sendcount,
get_nccl_datatype(datatype), r, nccl_comm_, stream));
}
}
NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
NCCL_TRY(ncclGroupEnd());
}

void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount,
const size_t* recvcounts, const size_t* displs,
datatype_t datatype, int root, cudaStream_t stream) const {
size_t dtype_size = get_datatype_size(datatype);
NCCL_TRY(ncclGroupStart());
if (get_rank() == root) {
for (int r = 0; r < get_size(); ++r) {
NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
recvcounts[r], get_nccl_datatype(datatype), r,
nccl_comm_, stream));
}
}
NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
NCCL_TRY(ncclGroupEnd());
}

void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
Expand Down
33 changes: 33 additions & 0 deletions cpp/include/raft/comms/std_comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,39 @@ class std_comms : public comms_iface {
}
}

void gather(const void *sendbuff, void *recvbuff, size_t sendcount,
datatype_t datatype, int root, cudaStream_t stream) const {
size_t dtype_size = get_datatype_size(datatype);
NCCL_TRY(ncclGroupStart());
if (get_rank() == root) {
for (int r = 0; r < get_size(); ++r) {
NCCL_TRY(ncclRecv(
static_cast<char *>(recvbuff) + sendcount * r * dtype_size, sendcount,
get_nccl_datatype(datatype), r, nccl_comm_, stream));
}
}
NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
NCCL_TRY(ncclGroupEnd());
}

void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount,
const size_t *recvcounts, const size_t *displs,
datatype_t datatype, int root, cudaStream_t stream) const {
size_t dtype_size = get_datatype_size(datatype);
NCCL_TRY(ncclGroupStart());
if (get_rank() == root) {
for (int r = 0; r < get_size(); ++r) {
NCCL_TRY(ncclRecv(
static_cast<char *>(recvbuff) + displs[r] * dtype_size, recvcounts[r],
get_nccl_datatype(datatype), r, nccl_comm_, stream));
}
}
NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
NCCL_TRY(ncclGroupEnd());
}

void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
Expand Down
Loading