Commit

Merge branch 'branch-22.08' into fea-knn-ivf-flat

achirkin committed Jul 26, 2022
2 parents 196b83f + af4f35c commit e6a815b
Showing 16 changed files with 481 additions and 423 deletions.
3 changes: 1 addition & 2 deletions ci/release/update-version.sh
@@ -34,8 +34,7 @@ function sed_runner() {
sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
sed_runner 's/'"pylibraft_version .*)"'/'"pylibraft_version ${NEXT_FULL_TAG})"'/g' python/pylibraft/CMakeLists.txt
sed_runner 's/'"pyraft_version .*)"'/'"pyraft_version ${NEXT_FULL_TAG})"'/g' python/raft/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' python/pylibraft/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake

# Docs update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
4 changes: 1 addition & 3 deletions cpp/CMakeLists.txt
@@ -17,9 +17,7 @@ set(RAPIDS_VERSION "22.06")
set(RAFT_VERSION "${RAPIDS_VERSION}.00")

cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake
${CMAKE_BINARY_DIR}/RAPIDS.cmake)
include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
include(../fetch_rapids.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-cuda)
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/make_blobs.cu
bench/random/permute.cu
bench/random/rng.cu
bench/sparse/convert_csr.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
139 changes: 139 additions & 0 deletions cpp/bench/sparse/convert_csr.cu
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
index_t num_cols;
index_t num_rows;
index_t divisor;
};

template <typename index_t>
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
index_t r = blockDim.y * blockIdx.y + threadIdx.y;

// Grid-stride loops; the column index is re-initialized for each row so that
// rows visited in later passes over the grid are also filled.
for (; r < num_rows; r += gridDim.y * blockDim.y) {
for (index_t c = blockDim.x * blockIdx.x + threadIdx.x; c < num_cols; c += gridDim.x * blockDim.x) {
adj[r * num_cols + c] = c % divisor == 0;
}
}
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
// adj matrix: element a_ij is set to one if j is divisible by divisor.
dim3 block(32, 32);
const index_t max_y_grid_dim = 65535;
dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
bench_base(const bench_param<index_t>& p)
: params(p),
handle(stream),
adj(p.num_rows * p.num_cols, stream),
row_ind(p.num_rows, stream),
row_ind_host(p.num_rows),
row_counters(p.num_rows, stream),
// col_ind is over-dimensioned because nnz is unknown at this point
col_ind(p.num_rows * p.num_cols, stream)
{
init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

std::vector<index_t> row_ind_host(p.num_rows);
for (size_t i = 0; i < row_ind_host.size(); ++i) {
size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor);
row_ind_host[i] = nnz_per_row * i;
}
raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream);
}

void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::sparse::convert::adj_to_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.num_rows,
params.num_cols,
row_counters.data(),
col_ind.data());
});

// Estimate bandwidth:
index_t num_entries = params.num_rows * params.num_cols;
index_t bytes_read = num_entries * sizeof(bool);
index_t bytes_write = num_entries / params.divisor * sizeof(index_t);

state.counters["BW"] = benchmark::Counter(bytes_read + bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);
state.counters["BW read"] = benchmark::Counter(
bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
state.counters["BW write"] = benchmark::Counter(bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);

state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor));
state.counters["Columns"] = benchmark::Counter(params.num_cols);
state.counters["Rows"] = benchmark::Counter(params.num_rows);
}

protected:
raft::handle_t handle;
bench_param<index_t> params;
rmm::device_uvector<bool> adj;
rmm::device_uvector<index_t> row_ind;
std::vector<index_t> row_ind_host;
rmm::device_uvector<index_t> row_counters;
rmm::device_uvector<index_t> col_ind;
}; // struct bench_base

const int64_t num_cols = 1 << 30;

const std::vector<bench_param<int64_t>> bench_params = {
{num_cols, 1, 8},
{num_cols >> 3, 1 << 3, 8},
{num_cols >> 6, 1 << 6, 8},

{num_cols, 1, 64},
{num_cols >> 3, 1 << 3, 64},
{num_cols >> 6, 1 << 6, 64},

{num_cols, 1, 2048},
{num_cols >> 3, 1 << 3, 2048},
{num_cols >> 6, 1 << 6, 2048},
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);

} // namespace raft::bench::sparse
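
For a sense of scale, here is a small host-side sketch of the same bandwidth arithmetic for the first {num_cols, 1, 8} configuration with int64_t indices. It is illustrative only and simply mirrors the counter definitions in run_benchmark above:

#include <cstdint>
#include <cstdio>

int main()
{
  // Mirrors the counter arithmetic for {num_cols = 1 << 30, num_rows = 1, divisor = 8}.
  const std::int64_t num_rows    = 1;
  const std::int64_t num_cols    = std::int64_t{1} << 30;
  const std::int64_t divisor     = 8;
  const std::int64_t num_entries = num_rows * num_cols;                           // 2^30 adjacency entries
  const std::int64_t bytes_read  = num_entries * sizeof(bool);                    // 1 GiB of bool reads
  const std::int64_t bytes_write = num_entries / divisor * sizeof(std::int64_t);  // 1 GiB of index writes
  std::printf("per iteration: read %lld B, write %lld B, total %lld B\n",
              (long long)bytes_read,
              (long long)bytes_write,
              (long long)(bytes_read + bytes_write));
  return 0;
}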
5 changes: 2 additions & 3 deletions cpp/doxygen/Doxyfile.in
@@ -844,9 +844,8 @@ EXCLUDE_PATTERNS = */detail/* \
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = detail \
csr_adj_graph \
csr_adj_graph_batched
EXCLUDE_SYMBOLS = detail


# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
14 changes: 4 additions & 10 deletions cpp/include/raft/comms/detail/util.hpp
@@ -65,9 +65,7 @@ namespace raft {
namespace comms {
namespace detail {

constexpr size_t

get_datatype_size(const datatype_t datatype)
constexpr size_t get_datatype_size(const datatype_t datatype)
{
switch (datatype) {
case datatype_t::CHAR: return sizeof(char);
@@ -82,9 +80,7 @@ get_datatype_size(const datatype_t datatype)
}
}

constexpr ncclDataType_t

get_nccl_datatype(const datatype_t datatype)
constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype)
{
switch (datatype) {
case datatype_t::CHAR: return ncclChar;
@@ -99,9 +95,7 @@ get_nccl_datatype(const datatype_t datatype)
}
}

constexpr ncclRedOp_t

get_nccl_op(const op_t op)
constexpr ncclRedOp_t get_nccl_op(const op_t op)
{
switch (op) {
case op_t::SUM: return ncclSum;
@@ -112,7 +106,7 @@ get_nccl_op(const op_t op)
}
}

status_t nccl_sync_stream(ncclComm_t comm, cudaStream_t stream)
inline status_t nccl_sync_stream(ncclComm_t comm, cudaStream_t stream)
{
cudaError_t cudaErr;
ncclResult_t ncclErr, ncclAsyncErr;
32 changes: 4 additions & 28 deletions cpp/include/raft/sparse/convert/coo.hpp
@@ -18,34 +18,10 @@
* Please use the cuh version instead.
*/

#ifndef __COO_H
#define __COO_H

#pragma once

#include <raft/sparse/convert/detail/coo.cuh>

namespace raft {
namespace sparse {
namespace convert {

/**
* @brief Convert a CSR row_ind array to a COO rows array
* @param row_ind: Input CSR row_ind array
* @param m: size of row_ind array
* @param coo_rows: Output COO row array
* @param nnz: size of output COO row array
* @param stream: cuda stream to use
*/
template <typename value_idx = int>
void csr_to_coo(
const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
{
detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
#pragma message(__FILE__ \
" is deprecated and will be removed in a future release." \
" Please use the cuh version instead.")

#endif
#include "coo.cuh"
105 changes: 36 additions & 69 deletions cpp/include/raft/sparse/convert/csr.cuh
@@ -18,6 +18,7 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/csr.hpp>

@@ -39,74 +40,6 @@ void coo_to_csr(const raft::handle_t& handle,
detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @tparam Lambda function for fused operation in the adj_graph construction
* @param row_ind the input CSR row_ind array
* @param total_rows number of vertices in graph
* @param nnz number of non-zeros
* @param batchSize number of vertices in current batch
* @param adj an adjacency array (size batchSize x total_rows)
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op: the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op);
}

template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from a
* row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @param row_ind the input CSR row_ind array
* @param total_rows number of total vertices in graph
* @param nnz number of non-zeros
* @param adj an adjacency array
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph<Index_, 32, Lambda>(
row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op);
}

/**
* @brief Generate the row indices array for a sorted COO matrix
*
@@ -135,8 +68,42 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param tmp A pre-allocated array of size num_rows.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft

#endif
#endif
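
As a complement to the documentation above, here is a minimal host-side sketch of calling the adj_to_csr wrapper. The helper name is hypothetical and the include paths are assumed for the 22.08-era layout; buffer sizes follow the documented requirements (tmp holds num_rows counters, out_col_ind holds one entry per non-zero):

#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

// Hypothetical helper: converts a device-resident boolean adjacency matrix into
// unsorted CSR column indices, given precomputed row offsets and the total nnz.
void adjacency_to_csr_example(const raft::handle_t& handle,
                              const bool* d_adj,     // row-major num_rows x num_cols matrix
                              const int* d_row_ind,  // exclusive scan of per-row non-zero counts
                              int num_rows,
                              int num_cols,
                              int nnz)
{
  auto stream = handle.get_stream();
  rmm::device_uvector<int> tmp(num_rows, stream);  // per-row counters written by the conversion
  rmm::device_uvector<int> col_ind(nnz, stream);   // output column indices (unsorted within a row)
  raft::sparse::convert::adj_to_csr<int>(
    handle, d_adj, d_row_ind, num_rows, num_cols, tmp.data(), col_ind.data());
}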