Replace csr_adj_graph functions with faster equivalent (#746)
The `csr_adj_graph` functions are a performance bottleneck in the DBSCAN implementation in cuML. They are not used anywhere else.

This PR replaces the `csr_adj_graph` functions with the faster `adj_to_csr` function. It has the same functionality, *but* (see the caller-side sketch below the list):

1. It requires the input adjacency matrix to be in row-major order (rather than column-major).
2. The output column indices are not guaranteed to be in ascending order within a row, i.e. the resulting CSR is unsorted.
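
For callers, the switch looks roughly like the sketch below. This is a hedged illustration rather than part of the diff: the wrapper function, buffer names, and the assumption that `row_ind` and a sufficiently large `col_ind` already exist are hypothetical; only the `adj_to_csr` signature comes from this PR.

```cpp
// Hypothetical caller-side sketch (not part of this commit). Assumes a valid
// raft::handle_t, a row-major boolean adjacency matrix on the device, and a
// precomputed row_ind (the exclusive scan of per-row non-zero counts).
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

void convert_adjacency_example(const raft::handle_t& handle,
                               const bool* adj,     // n_rows x n_cols, row-major
                               const int* row_ind,  // length n_rows
                               int n_rows,
                               int n_cols,
                               int* col_ind)        // sized for the total nnz
{
  auto stream = handle.get_stream();
  // Scratch buffer of per-row atomic counters required by adj_to_csr.
  rmm::device_uvector<int> tmp(n_rows, stream);

  // Previously, csr_adj_graph took a column-major adj and produced sorted
  // column indices:
  //   raft::sparse::convert::csr_adj_graph<int>(row_ind, n_rows, nnz, adj,
  //                                             col_ind, stream, fused_op);
  // Now: adj must be row-major and col_ind is unsorted within each row.
  raft::sparse::convert::adj_to_csr<int>(
    handle, adj, row_ind, n_rows, n_cols, tmp.data(), col_ind);
}
```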

Authors:
  - Allard Hendriksen (https://github.com/ahendriksen)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #746
Allard Hendriksen authored Jul 22, 2022
1 parent fd2595c commit 362f91c
Showing 10 changed files with 456 additions and 402 deletions.
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/make_blobs.cu
bench/random/permute.cu
bench/random/rng.cu
bench/sparse/convert_csr.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
139 changes: 139 additions & 0 deletions cpp/bench/sparse/convert_csr.cu
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
  index_t num_cols;
  index_t num_rows;
  index_t divisor;
};

template <typename index_t>
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
  index_t r  = blockDim.y * blockIdx.y + threadIdx.y;
  index_t c0 = blockDim.x * blockIdx.x + threadIdx.x;

  for (; r < num_rows; r += gridDim.y * blockDim.y) {
    // Restart the column grid-stride loop for every row handled by this thread.
    for (index_t c = c0; c < num_cols; c += gridDim.x * blockDim.x) {
      adj[r * num_cols + c] = c % divisor == 0;
    }
  }
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
  // adj matrix: element a_ij is set to one if j is divisible by divisor.
  dim3 block(32, 32);
  const index_t max_y_grid_dim = 65535;
  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
  init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
  RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
  bench_base(const bench_param<index_t>& p)
    : params(p),
      handle(stream),
      adj(p.num_rows * p.num_cols, stream),
      row_ind(p.num_rows, stream),
      row_ind_host(p.num_rows),
      row_counters(p.num_rows, stream),
      // col_ind is over-dimensioned because nnz is unknown at this point
      col_ind(p.num_rows * p.num_cols, stream)
  {
    init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

    std::vector<index_t> row_ind_host(p.num_rows);
    for (size_t i = 0; i < row_ind_host.size(); ++i) {
      size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor);
      row_ind_host[i]    = nnz_per_row * i;
    }
    raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream);
  }

  void run_benchmark(::benchmark::State& state) override
  {
    loop_on_state(state, [this]() {
      raft::sparse::convert::adj_to_csr<index_t>(handle,
                                                 adj.data(),
                                                 row_ind.data(),
                                                 params.num_rows,
                                                 params.num_cols,
                                                 row_counters.data(),
                                                 col_ind.data());
    });

    // Estimate bandwidth:
    index_t num_entries = params.num_rows * params.num_cols;
    index_t bytes_read  = num_entries * sizeof(bool);
    index_t bytes_write = num_entries / params.divisor * sizeof(index_t);

    state.counters["BW"]       = benchmark::Counter(bytes_read + bytes_write,
                                              benchmark::Counter::kIsIterationInvariantRate,
                                              benchmark::Counter::OneK::kIs1024);
    state.counters["BW read"]  = benchmark::Counter(
      bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
    state.counters["BW write"] = benchmark::Counter(bytes_write,
                                                    benchmark::Counter::kIsIterationInvariantRate,
                                                    benchmark::Counter::OneK::kIs1024);

    state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor));
    state.counters["Columns"]     = benchmark::Counter(params.num_cols);
    state.counters["Rows"]        = benchmark::Counter(params.num_rows);
  }

 protected:
  raft::handle_t handle;
  bench_param<index_t> params;
  rmm::device_uvector<bool> adj;
  rmm::device_uvector<index_t> row_ind;
  std::vector<index_t> row_ind_host;
  rmm::device_uvector<index_t> row_counters;
  rmm::device_uvector<index_t> col_ind;
}; // struct bench_base

const int64_t num_cols = 1 << 30;

const std::vector<bench_param<int64_t>> bench_params = {
  {num_cols, 1, 8},
  {num_cols >> 3, 1 << 3, 8},
  {num_cols >> 6, 1 << 6, 8},

  {num_cols, 1, 64},
  {num_cols >> 3, 1 << 3, 64},
  {num_cols >> 6, 1 << 6, 64},

  {num_cols, 1, 2048},
  {num_cols >> 3, 1 << 3, 2048},
  {num_cols >> 6, 1 << 6, 2048},
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);

} // namespace raft::bench::sparse
5 changes: 2 additions & 3 deletions cpp/doxygen/Doxyfile.in
@@ -844,9 +844,8 @@ EXCLUDE_PATTERNS = */detail/* \
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = detail \
csr_adj_graph \
csr_adj_graph_batched
EXCLUDE_SYMBOLS = detail


# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
32 changes: 4 additions & 28 deletions cpp/include/raft/sparse/convert/coo.hpp
@@ -18,34 +18,10 @@
* Please use the cuh version instead.
*/

#ifndef __COO_H
#define __COO_H

#pragma once

#include <raft/sparse/convert/detail/coo.cuh>

namespace raft {
namespace sparse {
namespace convert {

/**
* @brief Convert a CSR row_ind array to a COO rows array
* @param row_ind: Input CSR row_ind array
* @param m: size of row_ind array
* @param coo_rows: Output COO row array
* @param nnz: size of output COO row array
* @param stream: cuda stream to use
*/
template <typename value_idx = int>
void csr_to_coo(
const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
{
detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
#pragma message(__FILE__ \
" is deprecated and will be removed in a future release." \
" Please use the cuh version instead.")

#endif
#include "coo.cuh"
105 changes: 36 additions & 69 deletions cpp/include/raft/sparse/convert/csr.cuh
@@ -18,6 +18,7 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/csr.hpp>

@@ -39,74 +40,6 @@ void coo_to_csr(const raft::handle_t& handle,
detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @tparam Lambda function for fused operation in the adj_graph construction
* @param row_ind the input CSR row_ind array
* @param total_rows number of vertices in graph
* @param nnz number of non-zeros
* @param batchSize number of vertices in current batch
* @param adj an adjacency array (size batchSize x total_rows)
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op: the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op);
}

template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from a
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @param row_ind the input CSR row_ind array
* @param total_rows number of total vertices in graph
* @param nnz number of non-zeros
* @param adj an adjacency array
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph<Index_, 32, Lambda>(
row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op);
}

/**
* @brief Generate the row indices array for a sorted COO matrix
*
@@ -135,8 +68,42 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param tmp A pre-allocated array of size num_rows.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
  detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft

#endif
#endif
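
To make the `row_ind` contract above concrete, here is a minimal sketch (not part of the commit) that derives the offsets with a Thrust exclusive scan from per-row non-zero counts before calling `adj_to_csr`; the helper name and the `row_nnz` input are assumptions.

```cpp
// Hypothetical helper (not part of this commit). Given the number of
// non-zeros in each row of a row-major boolean adjacency matrix, build the
// CSR row offsets expected by adj_to_csr and fill the (unsorted) column
// indices.
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

void adj_to_unsorted_csr(const raft::handle_t& handle,
                         const bool* adj,     // num_rows x num_cols, row-major
                         const int* row_nnz,  // non-zeros per row, length num_rows
                         int num_rows,
                         int num_cols,
                         int* row_ind,        // out: row offsets, length num_rows
                         int* col_ind)        // out: column indices, length total nnz
{
  auto stream = handle.get_stream();

  // row_ind[i] = row_nnz[0] + ... + row_nnz[i-1], i.e. the exclusive scan.
  thrust::exclusive_scan(
    thrust::cuda::par.on(stream), row_nnz, row_nnz + num_rows, row_ind);

  // Pre-allocated atomic counters, one per row (the `tmp` argument).
  rmm::device_uvector<int> tmp(num_rows, stream);

  raft::sparse::convert::adj_to_csr<int>(
    handle, adj, row_ind, num_rows, num_cols, tmp.data(), col_ind);
}
```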
(Diffs for the remaining 5 changed files are not shown here.)