Merge branch-21.10
viclafargue committed Aug 3, 2021
2 parents 192882a + 947e22f commit 7725a80
Showing 44 changed files with 1,739 additions and 204 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+# raft 21.10.00 (Date TBD)
+
+Please see https://github.com/rapidsai/raft/releases/tag/v21.10.00a for the latest changes to this development branch.
+
# raft 21.08.00 (Date TBD)

Please see https://github.com/rapidsai/raft/releases/tag/v21.08.00a for the latest changes to this development branch.
8 changes: 4 additions & 4 deletions ci/gpu/build.sh
@@ -39,13 +39,13 @@ env
gpuci_logger "Check GPU usage"
nvidia-smi

-# temporary usage of gpuci_conda_retry install with packages listed here, looking into
+# temporary usage of gpuci_mamba_retry install with packages listed here, looking into
# using the repos yaml files for this
gpuci_logger "Activate conda env"
. /opt/conda/etc/profile.d/conda.sh
conda activate rapids
gpuci_logger "Installing packages needed for RAFT"
-gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \
+gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \
"cudatoolkit=${CUDA_REL}" \
"cudf=${MINOR_VERSION}" \
"rmm=${MINOR_VERSION}" \
@@ -59,8 +59,8 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid
# Install the master version of dask, distributed, and dask-ml
gpuci_logger "Install the master version of dask and distributed"
set -x
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
set +x


4 changes: 2 additions & 2 deletions ci/local/old-gpubuild.sh
@@ -81,8 +81,8 @@ fi

# Install the master version of dask, distributed, and dask-ml
set -x
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
set +x


12 changes: 4 additions & 8 deletions cpp/CMakeLists.txt
@@ -15,13 +15,9 @@
#=============================================================================

cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
-include(FetchContent)
-FetchContent_Declare(
-rapids-cmake
-GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git
-GIT_TAG origin/branch-21.08
-)
-FetchContent_MakeAvailable(rapids-cmake)
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake
+${CMAKE_BINARY_DIR}/RAPIDS.cmake)
+include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-cuda)
@@ -30,7 +26,7 @@ include(rapids-find)

rapids_cuda_init_architectures(RAFT)

-project(RAFT VERSION 21.08.00 LANGUAGES CXX CUDA)
+project(RAFT VERSION 21.10.00 LANGUAGES CXX CUDA)

##############################################################################
# - build type ---------------------------------------------------------------
4 changes: 2 additions & 2 deletions cpp/cmake/modules/ConfigureCUDA.cmake
@@ -26,8 +26,8 @@ endif()
list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

# set warnings as errors
-# list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call)
-# list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call)
+list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)

# Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking
if(CUDA_ENABLE_LINEINFO)
10 changes: 3 additions & 7 deletions cpp/cmake/thirdparty/get_cuco.cmake
@@ -21,21 +21,17 @@ function(find_and_configure_cuco VERSION)
endif()

rapids_cpm_find(cuco ${VERSION}
-GLOBAL_TARGETS cuco cuco::cuco
+GLOBAL_TARGETS cuco::cuco
BUILD_EXPORT_SET raft-exports
INSTALL_EXPORT_SET raft-exports
CPM_ARGS
-GIT_REPOSITORY https://github.com/trxcllnt/cuCollections.git
-GIT_TAG dev
+GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
+GIT_TAG b1fea0cbe4c384160740af00f7c8760846539abb
OPTIONS "BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
"BUILD_EXAMPLES OFF"
)

-if(NOT TARGET cuco::cuco)
-add_library(cuco::cuco ALIAS cuco)
-endif()

endfunction()

find_and_configure_cuco(0.0.1)
5 changes: 4 additions & 1 deletion cpp/cmake/thirdparty/get_faiss.cmake
@@ -40,7 +40,10 @@ function(find_and_configure_faiss)

if(FAISS_ADDED)
set(FAISS_GPU_HEADERS ${FAISS_SOURCE_DIR} PARENT_SCOPE)
-add_library(FAISS::FAISS ALIAS faiss)
endif()

+if(TARGET faiss AND NOT TARGET FAISS::FAISS)
+add_library(FAISS::FAISS ALIAS faiss)
+endif()

endfunction()
161 changes: 161 additions & 0 deletions cpp/include/raft/distance/canberra.cuh
@@ -0,0 +1,161 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
#include <raft/distance/pairwise_distance_base.cuh>

namespace raft {
namespace distance {

/**
* @brief the Canberra distance matrix calculation implementer
* It computes the following equation:
*   C(i, j) = sum_k |x(i, k) - y(j, k)| / (|x(i, k)| + |y(j, k)|)
* @tparam DataT input data-type (for A and B matrices)
* @tparam AccT accumulation data-type
* @tparam OutT output data-type (for C and D matrices)
* @tparam IdxT index data-type
* @tparam VecLen number of k-elements loaded by each thread
for every LDG call. details in contractions.cuh
* @tparam FinalLambda final lambda called on final distance value
* @tparam isRowMajor true if input/output is row major,
false for column major
* @param[in] x input matrix
* @param[in] y input matrix
* @param[in] m number of rows of A and C/D
* @param[in] n number of rows of B and cols of C/D
* @param[in] k number of cols of A and B
* @param[in] lda leading dimension of A
* @param[in] ldb leading dimension of B
* @param[in] ldd leading dimension of C/D
* @param[out] dOutput output matrix
* @param fin_op the final gemm epilogue lambda
* @param stream cuda stream to launch work
*/
template <typename DataT, typename AccT, typename OutT, typename IdxT,
int VecLen, typename FinalLambda, bool isRowMajor>
static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
FinalLambda fin_op, cudaStream_t stream) {
typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;

typedef
typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;

dim3 blk(KPolicy::Nthreads);

// Accumulation operation lambda
auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
const auto add = raft::myAbs(x) + raft::myAbs(y);
// deal with potential for 0 in denominator by
// forcing 1/0 instead
acc += ((add != 0) * diff / (add + (add == 0)));
};

// epilogue operation lambda for final value calculation
auto epilog_lambda = [] __device__(
AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
DataT * regxn, DataT * regyn, IdxT gridStrideX,
IdxT gridStrideY) { return; };

if (isRowMajor) {
auto canberraRowMajor =
pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda, true>;
dim3 grid =
launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);

canberraRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
epilog_lambda, fin_op);
} else {
auto canberraColMajor =
pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda, false>;
dim3 grid =
launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
canberraColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
epilog_lambda, fin_op);
}

CUDA_CHECK(cudaGetLastError());
}
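
// A minimal scalar sketch (illustration only, assuming float inputs and that
// fabsf is in scope via the CUDA math headers) of the per-element term
// accumulated by the core lambda above: (add != 0) zeroes a 0/0 term and
// (add == 0) bumps the denominator from 0 to 1 so the division never sees 0/0.
__host__ __device__ inline float canberra_term_sketch(float x, float y) {
  const float diff = fabsf(x - y);        // |x - y|
  const float add = fabsf(x) + fabsf(y);  // |x| + |y|
  return (add != 0.f) * diff / (add + (add == 0.f));
}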

template <typename DataT, typename AccT, typename OutT, typename IdxT,
typename FinalLambda, bool isRowMajor>
void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op,
cudaStream_t stream) {
size_t bytesA = sizeof(DataT) * lda;
size_t bytesB = sizeof(DataT) * ldb;
if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
stream);
} else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
stream);
} else {
canberraImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
}
}
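
// A small worked sketch of the vector-length selection above (a hypothetical
// helper, not used by the dispatch): 16-byte aligned float rows give
// 16 / 4 = 4 elements per LDG, 8-byte aligned float rows give 2, and anything
// else falls back to scalar (VecLen = 1) loads.
template <typename DataT>
constexpr int pick_veclen_sketch(size_t bytesA, size_t bytesB) {
  return (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0)
           ? int(16 / sizeof(DataT))
           : (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0)
               ? int(8 / sizeof(DataT))
               : 1;
}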

/**
* @brief the Canberra distance matrix calculation
* It computes the following equation:
*   C(i, j) = sum_k |A(i, k) - B(j, k)| / (|A(i, k)| + |B(j, k)|)
* @tparam InType input data-type (for A and B matrices)
* @tparam AccType accumulation data-type
* @tparam OutType output data-type (for C and D matrices)
* @tparam FinalLambda user-defined epilogue lambda
* @tparam Index_ Index type
* @param[in] m number of rows of A and C/D
* @param[in] n number of rows of B and cols of C/D
* @param[in] k number of cols of A and B
* @param[in] pA input matrix
* @param[in] pB input matrix
* @param[out] pD output matrix
* @param[in] fin_op the final element-wise epilogue lambda
* @param[in] stream cuda stream to launch work
* @param[in] isRowMajor whether the input and output matrices are row major
*/
template <typename InType, typename AccType, typename OutType,
typename FinalLambda, typename Index_ = int>
void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB,
OutType *pD, FinalLambda fin_op, cudaStream_t stream,
bool isRowMajor) {
typedef std::is_same<OutType, bool> is_bool;
typedef typename std::conditional<is_bool::value, OutType, AccType>::type
canberraOutType;
Index_ lda, ldb, ldd;
canberraOutType *pDcast = reinterpret_cast<canberraOutType *>(pD);
if (isRowMajor) {
lda = k, ldb = k, ldd = n;
canberra<InType, AccType, canberraOutType, Index_, FinalLambda, true>(
m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
} else {
lda = n, ldb = m, ldd = m;
canberra<InType, AccType, canberraOutType, Index_, FinalLambda, false>(
n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
}
}
} // namespace distance
} // namespace raft
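
// A plain C++ reference sketch (host-only illustration, assuming float inputs
// and an identity fin_op) of what the row-major path above computes: x is
// m x k, y is n x k, and the m x n output holds
// D[i][j] = sum_l |x[i][l] - y[j][l]| / (|x[i][l]| + |y[j][l]|),
// with 0/0 terms contributing 0.
#include <cmath>
#include <cstddef>

void canberra_reference_sketch(const float* x, const float* y, float* d,
                               std::size_t m, std::size_t n, std::size_t k) {
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      float acc = 0.f;
      for (std::size_t l = 0; l < k; ++l) {
        const float diff = std::fabs(x[i * k + l] - y[j * k + l]);
        const float add = std::fabs(x[i * k + l]) + std::fabs(y[j * k + l]);
        if (add != 0.f) acc += diff / add;  // skip 0/0 terms
      }
      d[i * n + j] = acc;  // ldd == n in the row-major path
    }
  }
}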