From 0b4142af77b8793aabebad8593cb08c529beff11 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 25 Jul 2022 11:48:21 +0200
Subject: [PATCH 1/2] dbscan: Remove duplicate adj_to_csr implementation

This functionality has been moved to RAFT.
---
 cpp/src/dbscan/adjgraph/algo.cuh | 107 +------------------------------
 1 file changed, 3 insertions(+), 104 deletions(-)
diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh
index a1978b429a..16858e55f9 100644
--- a/cpp/src/dbscan/adjgraph/algo.cuh
+++ b/cpp/src/dbscan/adjgraph/algo.cuh
@@ -24,9 +24,8 @@
 #include "pack.h"
 
 #include <raft/cuda_utils.cuh>
-#include <raft/device_atomics.cuh>
 #include <raft/handle.hpp>
-#include <raft/vectorized.cuh>
+#include <raft/sparse/convert/csr.cuh>
 #include <rmm/device_uvector.hpp>
 
 namespace ML {
@@ -34,85 +33,6 @@ namespace Dbscan {
 namespace AdjGraph {
 namespace Algo {
 
-/**
- * @brief Convert a boolean adjacency matrix into CSR format.
- *
- * The adj_to_csr kernel converts a boolean adjacency matrix into CSR format.
- * High performance comes at the cost of non-deterministic output: the column
- * indices are not guaranteed to be stored in order.
- *
- * The kernel has been optimized to handle matrices that are non-square, for
- * instance subsets of a full adjacency matrix. In practice, these matrices can
- * be very wide and not very tall. In principle, each row is assigned to one
- * block. If there are more SMs than rows, multiple blocks operate on a single
- * row. To enable cooperation between these blocks, each row is provided a
- * counter where the current output index can be cooperatively (atomically)
- * incremented. As a result, the order of the output indices is not guaranteed
- * to be in order.
- *
- * @param[in] adj: a num_rows x num_cols boolean matrix in contiguous row-major
- *                 format.
- *
- * @param[in] row_ind: an array of length num_rows that indicates at which index
- *                     a row starts in out_col_ind. Equivalently, it is the
- *                     exclusive scan of the number of non-zeros in each row of
- *                     `adj`.
- *
- * @param[in] num_rows: number of rows of adj.
- * @param[in] num_cols: number of columns of adj.
- *
- * @param[in,out] row_counters: a temporary zero-initialized array of length num_rows.
- *
- * @param[out] out_col_ind: an array containing the column indices of the
- *                          non-zero values in `adj`. Size should be at least
- *                          the number of non-zeros in `adj`.
- */
-template <typename index_t>
-__global__ void adj_to_csr(const bool* adj,         // row-major adjacency matrix
-                           const index_t* row_ind,  // precomputed row indices
-                           index_t num_rows,        // # rows of adj
-                           index_t num_cols,        // # cols of adj
-                           index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-                           index_t* out_col_ind     // output column indices
-)
-{
-  typedef raft::TxN_t<bool, 16> bool16;
-
-  for (index_t i = blockIdx.y; i < num_rows; i += gridDim.y) {
-    // Load row information
-    index_t row_base   = row_ind[i];
-    index_t* row_count = row_counters + i;
-    const bool* row    = adj + i * num_cols;
-
-    // Peeling: process the first j0 elements that are not aligned to a 16-byte
-    // boundary.
-    index_t j0 = (16 - (((uintptr_t)(const void*)row) % 16)) % 16;
-    j0         = min(j0, num_cols);
-    if (threadIdx.x < j0 && blockIdx.x == 0) {
-      if (row[threadIdx.x]) { out_col_ind[row_base + atomicIncWarp(row_count)] = threadIdx.x; }
-    }
-
-    // Process the rest of the row in 16 byte chunks starting at j0.
-    // This is a grid-stride loop.
-    index_t j = j0 + 16 * (blockIdx.x * blockDim.x + threadIdx.x);
-    for (; j + 15 < num_cols; j += 16 * (blockDim.x * gridDim.x)) {
-      bool16 chunk;
-      chunk.load(row, j);
-
-      for (int k = 0; k < 16; ++k) {
-        if (chunk.val.data[k]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j + k; }
-      }
-    }
-
-    // Remainder: process the last j1 bools in the row individually.
-    index_t j1 = (num_cols - j0) % 16;
-    if (threadIdx.x < j1 && blockIdx.x == 0) {
-      int j = num_cols - j1 + threadIdx.x;
-      if (row[j]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j; }
-    }
-  }
-}
-
 /**
  * @brief Converts a boolean adjacency matrix into CSR format.
  *
@@ -144,29 +64,8 @@ void launcher(const raft::handle_t& handle,
   device_ptr<Index_> dev_ex_scan = device_pointer_cast(data.ex_scan);
   thrust::exclusive_scan(handle.get_thrust_policy(), dev_vd, dev_vd + batch_size, dev_ex_scan);
 
-  // Zero-fill a temporary vector that can be used by the adj_to_csr kernel to
-  // keep track of the number of entries added to a row.
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_counters, 0, batch_size * sizeof(Index_), stream));
-
-  // Split the grid in the row direction (since each row can be processed
-  // independently). If the maximum number of active blocks (num_sms *
-  // occupancy) exceeds the number of rows, assign multiple blocks to a single
-  // row.
-  int threads_per_block = 1024;
-  int dev_id, sm_count, blocks_per_sm;
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, adj_to_csr<Index_>, threads_per_block, 0);
-
-  Index_ max_active_blocks = sm_count * blocks_per_sm;
-  Index_ blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
-  Index_ grid_rows         = raft::ceildiv(max_active_blocks, blocks_per_row);
-  dim3 block(threads_per_block, 1);
-  dim3 grid(blocks_per_row, grid_rows);
-
-  adj_to_csr<Index_><<<grid, block, 0, stream>>>(
-    adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
+  raft::sparse::convert::adj_to_csr(
+    handle, adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 

From 41405d5f103cd920a9091f976462e321b0edec0b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 25 Jul 2022 12:34:52 +0200
Subject: [PATCH 2/2] dbscan: Remove naive versions of algorithms

This fixes issue #3414.
---
 cpp/src/dbscan/adjgraph/algo.cuh         |  1 -
 cpp/src/dbscan/adjgraph/naive.cuh        | 65 -----------------
 cpp/src/dbscan/adjgraph/runner.cuh       |  6 +-
 cpp/src/dbscan/common.cuh                | 43 -----------
 cpp/src/dbscan/dbscan.cuh                |  2 +
 cpp/src/dbscan/vertexdeg/naive.cuh       | 93 ------------------------
 cpp/src/dbscan/vertexdeg/precomputed.cuh |  9 ---
 cpp/src/dbscan/vertexdeg/runner.cuh      |  6 +-
 8 files changed, 8 insertions(+), 217 deletions(-)
 delete mode 100644 cpp/src/dbscan/adjgraph/naive.cuh
 delete mode 100644 cpp/src/dbscan/common.cuh
 delete mode 100644 cpp/src/dbscan/vertexdeg/naive.cuh

diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh
index 16858e55f9..0e49a8fab1 100644
--- a/cpp/src/dbscan/adjgraph/algo.cuh
+++ b/cpp/src/dbscan/adjgraph/algo.cuh
@@ -20,7 +20,6 @@
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
-#include "../common.cuh"
 #include "pack.h"
 
 #include <raft/cuda_utils.cuh>
diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh
deleted file mode 100644
index 6ce3994610..0000000000
--- a/cpp/src/dbscan/adjgraph/naive.cuh
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../common.cuh"
-#include "pack.h"
-#include <cuml/common/pinned_host_vector.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <vector>
-
-namespace ML {
-namespace Dbscan {
-namespace AdjGraph {
-namespace Naive {
-
-template <typename Index_ = int>
-void launcher(const raft::handle_t& handle,
-              Pack<Index_> data,
-              Index_ batch_size,
-              cudaStream_t stream)
-{
-  Index_ k = 0;
-  Index_ N = data.N;
-  ML::pinned_host_vector<Index_> host_vd(batch_size + 1);
-  ML::pinned_host_vector<char> host_adj(((batch_size * N) / 8) + 1);
-  ML::pinned_host_vector<Index_> host_ex_scan(batch_size);
-  raft::update_host((bool*)host_adj.data(), data.adj, batch_size * N, stream);
-  raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream);
-  handle.sync_stream(stream);
-  size_t adjgraph_size = size_t(host_vd[batch_size]);
-  ML::pinned_host_vector<Index_> host_adj_graph(adjgraph_size);
-  for (Index_ i = 0; i < batch_size; i++) {
-    for (Index_ j = 0; j < N; j++) {
-      /// TODO: change layout or remove; cf #3414
-      if (host_adj[i * N + j]) {
-        host_adj_graph[k] = j;
-        k                 = k + 1;
-      }
-    }
-  }
-  host_ex_scan[0] = Index_(0);
-  for (Index_ i = 1; i < batch_size; i++)
-    host_ex_scan[i] = host_ex_scan[i - 1] + host_vd[i - 1];
-  raft::update_device(data.adj_graph, host_adj_graph.data(), adjgraph_size, stream);
-  raft::update_device(data.ex_scan, host_ex_scan.data(), batch_size, stream);
-}
-}  // namespace Naive
-}  // namespace AdjGraph
-}  // namespace Dbscan
-}  // namespace ML
diff --git a/cpp/src/dbscan/adjgraph/runner.cuh b/cpp/src/dbscan/adjgraph/runner.cuh
index 978f3ed14b..2b1c9d0f89 100644
--- a/cpp/src/dbscan/adjgraph/runner.cuh
+++ b/cpp/src/dbscan/adjgraph/runner.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include "algo.cuh"
-#include "naive.cuh"
 #include "pack.h"
 
 namespace ML {
@@ -39,8 +38,9 @@ void run(const raft::handle_t& handle,
 {
   Pack<Index_> data = {vd, adj, adj_graph, adjnnz, ex_scan, N};
   switch (algo) {
-    // TODO: deprecate naive runner. cf #3414
-    case 0: Naive::launcher<Index_>(handle, data, batch_size, stream); break;
+    case 0:
+      ASSERT(
+        false, "Incorrect algo '%d' passed! Naive version of adjgraph has been removed.", algo);
     case 1: Algo::launcher<Index_>(handle, data, batch_size, row_counters, stream); break;
     default: ASSERT(false, "Incorrect algo passed! '%d'", algo);
   }
diff --git a/cpp/src/dbscan/common.cuh b/cpp/src/dbscan/common.cuh
deleted file mode 100644
index 02da48ed1b..0000000000
--- a/cpp/src/dbscan/common.cuh
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-namespace Dbscan {
-
-/// Default "ds1" diff-squared-accumulate traits specialization for value_t->accum_t
-/// Currently only supported for float/double
-template <typename value_t, typename accum_t>
-struct ds_accummulate {
-  /// Single-component "ds1" diff-squared vector type
-  typedef value_t dp_vector_t;
-
-  /// Compute "ds1" float->float
-  inline __device__ static void mad(float& d, const float& a, const float& b, const float& c)
-  {
-    float diff = a - b;
-    asm volatile("fma.rn.f32 %0, %1, %1, %2;\n" : "=f"(d) : "f"(diff), "f"(c));
-  }
-
-  /// Compute "ds1" double->double
-  inline __device__ static void mad(double& d, const double& a, const double& b, const double& c)
-  {
-    double diff = a - b;
-    asm volatile("fma.rn.f64 %0, %1, %1, %2;\n" : "=d"(d) : "d"(diff), "d"(c));
-  }
-};
-
-}  // namespace Dbscan
diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh
index 24595ff931..6b8c38435f 100644
--- a/cpp/src/dbscan/dbscan.cuh
+++ b/cpp/src/dbscan/dbscan.cuh
@@ -110,6 +110,8 @@ void dbscanFitImpl(const raft::handle_t& handle,
 {
   raft::common::nvtx::range fun_scope("ML::Dbscan::Fit");
   ML::Logger::get().setLevel(verbosity);
+  // XXX: for algo_vd and algo_adj, 0 (naive) is no longer an option and has
+  // been removed.
   int algo_vd  = (metric == raft::distance::Precomputed) ? 2 : 1;
   int algo_adj = 1;
   int algo_ccl = 2;
diff --git a/cpp/src/dbscan/vertexdeg/naive.cuh b/cpp/src/dbscan/vertexdeg/naive.cuh
deleted file mode 100644
index f332367ca9..0000000000
--- a/cpp/src/dbscan/vertexdeg/naive.cuh
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "pack.h"
-#include <raft/cuda_utils.cuh>
-
-namespace ML {
-namespace Dbscan {
-namespace VertexDeg {
-namespace Naive {
-
-using namespace MLCommon;
-
-/** number of threads in a CTA along X dim */
-static const int TPB_X = 32;
-/** number of threads in a CTA along Y dim */
-static const int TPB_Y = 8;
-
-/**
- * @brief Naive distance matrix evaluation and epsilon neighborhood construction
- * @param data input struct containing vertex degree computation params
- * @param start_vertex_id which vertex to begin the computations from
- * @param batch_size number of vertices in this batch
- */
-template <typename Type, typename Index_ = int>
-__global__ void vertex_degree_kernel(Pack<Type, Index_> data,
-                                     Index_ start_vertex_id,
-                                     Index_ batch_size)
-{
-  const Type Zero = (Type)0;
-  Index_ row      = (blockIdx.y * TPB_Y) + threadIdx.y;
-  Index_ col      = (blockIdx.x * TPB_X) + threadIdx.x;
-  Index_ N        = data.N;
-  if ((row >= batch_size) || (col >= N)) return;
-  Type eps      = data.eps;
-  Type eps2     = eps * eps;
-  Type sum      = Zero;
-  Index_ D      = data.D;
-  const Type* x = data.x;
-  bool* adj     = data.adj;
-  Index_* vd    = data.vd;
-  for (Index_ d = 0; d < D; ++d) {
-    Type a    = __ldg(x + (row + start_vertex_id) * D + d);
-    Type b    = __ldg(x + col * D + d);
-    Type diff = a - b;
-    sum += (diff * diff);
-  }
-  Index_ res         = (sum <= eps2);
-  adj[row * N + col] = res;
-  /// TODO: change layout or remove; cf #3414
-
-  if (sizeof(Index_) == 4) {
-    raft::myAtomicAdd((int*)(vd + row), (int)res);
-    raft::myAtomicAdd((int*)(vd + batch_size), (int)res);
-  } else if (sizeof(Index_) == 8) {
-    raft::myAtomicAdd<unsigned long long>((unsigned long long*)(vd + row), res);
-    raft::myAtomicAdd<unsigned long long>((unsigned long long*)(vd + batch_size), res);
-  }
-}
-
-template <typename Type, typename Index_ = int>
-void launcher(Pack<Type, Index_> data,
-              Index_ start_vertex_id,
-              Index_ batch_size,
-              cudaStream_t stream)
-{
-  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "index_t should be 4 or 8 bytes");
-
-  dim3 grid(raft::ceildiv(data.N, (Index_)TPB_X), raft::ceildiv(batch_size, (Index_)TPB_Y), 1);
-  dim3 blk(TPB_X, TPB_Y, 1);
-  data.resetArray(stream, batch_size + 1);
-  vertex_degree_kernel<<<grid, blk, 0, stream>>>(data, start_vertex_id, batch_size);
-}
-
-}  // namespace Naive
-}  // namespace VertexDeg
-}  // namespace Dbscan
-}  // namespace ML
diff --git a/cpp/src/dbscan/vertexdeg/precomputed.cuh b/cpp/src/dbscan/vertexdeg/precomputed.cuh
index 3fa3a828bc..e9fd3cf345 100644
--- a/cpp/src/dbscan/vertexdeg/precomputed.cuh
+++ b/cpp/src/dbscan/vertexdeg/precomputed.cuh
@@ -32,15 +32,6 @@ namespace Dbscan {
 namespace VertexDeg {
 namespace Precomputed {
 
-template <typename value_t, typename index_t>
-__global__ void dist_to_adj_kernel(
-  const value_t* X, bool* adj, index_t N, index_t start_vertex_id, index_t batch_size, value_t eps)
-{
-  for (index_t i = threadIdx.x; i < batch_size; i += blockDim.x) {
-    adj[batch_size * blockIdx.x + i] = X[N * blockIdx.x + start_vertex_id + i] <= eps;
-  }
-}
-
 /**
  * Calculates the vertex degree array and the epsilon neighborhood adjacency matrix for the batch.
  */
diff --git a/cpp/src/dbscan/vertexdeg/runner.cuh b/cpp/src/dbscan/vertexdeg/runner.cuh
index ea1da4f4e0..deded16783 100644
--- a/cpp/src/dbscan/vertexdeg/runner.cuh
+++ b/cpp/src/dbscan/vertexdeg/runner.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include "algo.cuh"
-#include "naive.cuh"
 #include "pack.h"
 #include "precomputed.cuh"
 
@@ -41,8 +40,9 @@ void run(const raft::handle_t& handle,
 {
   Pack<Type_f, Index_> data = {vd, adj, x, eps, N, D};
   switch (algo) {
-    // TODO: deprecate naive runner. cf #3414
-    case 0: Naive::launcher<Type_f, Index_>(data, start_vertex_id, batch_size, stream); break;
+    case 0:
+      ASSERT(
+        false, "Incorrect algo '%d' passed! Naive version of vertexdeg has been removed.", algo);
     case 1:
       Algo::launcher<Type_f, Index_>(handle, data, start_vertex_id, batch_size, stream, metric);
       break;