From 0b4142af77b8793aabebad8593cb08c529beff11 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 25 Jul 2022 11:48:21 +0200 Subject: [PATCH 1/2] dbscan: Remove duplicate adj_to_csr implementation This functionality has been moved to RAFT. --- cpp/src/dbscan/adjgraph/algo.cuh | 107 +------------------------------ 1 file changed, 3 insertions(+), 104 deletions(-) diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh index a1978b429a..16858e55f9 100644 --- a/cpp/src/dbscan/adjgraph/algo.cuh +++ b/cpp/src/dbscan/adjgraph/algo.cuh @@ -24,9 +24,8 @@ #include "pack.h" #include -#include #include -#include +#include #include namespace ML { @@ -34,85 +33,6 @@ namespace Dbscan { namespace AdjGraph { namespace Algo { -/** - * @brief Convert a boolean adjacency matrix into CSR format. - * - * The adj_to_csr kernel converts a boolean adjacency matrix into CSR format. - * High performance comes at the cost of non-deterministic output: the column - * indices are not guaranteed to be stored in order. - * - * The kernel has been optimized to handle matrices that are non-square, for - * instance subsets of a full adjacency matrix. In practice, these matrices can - * be very wide and not very tall. In principle, each row is assigned to one - * block. If there are more SMs than rows, multiple blocks operate on a single - * row. To enable cooperation between these blocks, each row is provided a - * counter where the current output index can be cooperatively (atomically) - * incremented. As a result, the order of the output indices is not guaranteed - * to be in order. - * - * @param[in] adj: a num_rows x num_cols boolean matrix in contiguous row-major - * format. - * - * @param[in] row_ind: an array of length num_rows that indicates at which index - * a row starts in out_col_ind. Equivalently, it is the - * exclusive scan of the number of non-zeros in each row of - * `adj`. - * - * @param[in] num_rows: number of rows of adj. - * @param[in] num_cols: number of columns of adj. - * - * @param[in,out] row_counters: a temporary zero-initialized array of length num_rows. - * - * @param[out] out_col_ind: an array containing the column indices of the - * non-zero values in `adj`. Size should be at least - * the number of non-zeros in `adj`. - */ -template -__global__ void adj_to_csr(const bool* adj, // row-major adjacency matrix - const index_t* row_ind, // precomputed row indices - index_t num_rows, // # rows of adj - index_t num_cols, // # cols of adj - index_t* row_counters, // pre-allocated (zeroed) atomic counters - index_t* out_col_ind // output column indices -) -{ - typedef raft::TxN_t bool16; - - for (index_t i = blockIdx.y; i < num_rows; i += gridDim.y) { - // Load row information - index_t row_base = row_ind[i]; - index_t* row_count = row_counters + i; - const bool* row = adj + i * num_cols; - - // Peeling: process the first j0 elements that are not aligned to a 16-byte - // boundary. - index_t j0 = (16 - (((uintptr_t)(const void*)row) % 16)) % 16; - j0 = min(j0, num_cols); - if (threadIdx.x < j0 && blockIdx.x == 0) { - if (row[threadIdx.x]) { out_col_ind[row_base + atomicIncWarp(row_count)] = threadIdx.x; } - } - - // Process the rest of the row in 16 byte chunks starting at j0. - // This is a grid-stride loop. - index_t j = j0 + 16 * (blockIdx.x * blockDim.x + threadIdx.x); - for (; j + 15 < num_cols; j += 16 * (blockDim.x * gridDim.x)) { - bool16 chunk; - chunk.load(row, j); - - for (int k = 0; k < 16; ++k) { - if (chunk.val.data[k]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j + k; } - } - } - - // Remainder: process the last j1 bools in the row individually. - index_t j1 = (num_cols - j0) % 16; - if (threadIdx.x < j1 && blockIdx.x == 0) { - int j = num_cols - j1 + threadIdx.x; - if (row[j]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j; } - } - } -} - /** * @brief Converts a boolean adjacency matrix into CSR format. * @@ -144,29 +64,8 @@ void launcher(const raft::handle_t& handle, device_ptr dev_ex_scan = device_pointer_cast(data.ex_scan); thrust::exclusive_scan(handle.get_thrust_policy(), dev_vd, dev_vd + batch_size, dev_ex_scan); - // Zero-fill a temporary vector that can be used by the adj_to_csr kernel to - // keep track of the number of entries added to a row. - RAFT_CUDA_TRY(cudaMemsetAsync(row_counters, 0, batch_size * sizeof(Index_), stream)); - - // Split the grid in the row direction (since each row can be processed - // independently). If the maximum number of active blocks (num_sms * - // occupancy) exceeds the number of rows, assign multiple blocks to a single - // row. - int threads_per_block = 1024; - int dev_id, sm_count, blocks_per_sm; - cudaGetDevice(&dev_id); - cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, adj_to_csr, threads_per_block, 0); - - Index_ max_active_blocks = sm_count * blocks_per_sm; - Index_ blocks_per_row = raft::ceildiv(max_active_blocks, num_rows); - Index_ grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row); - dim3 block(threads_per_block, 1); - dim3 grid(blocks_per_row, grid_rows); - - adj_to_csr<<>>( - adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph); + raft::sparse::convert::adj_to_csr( + handle, adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph); RAFT_CUDA_TRY(cudaPeekAtLastError()); } From 41405d5f103cd920a9091f976462e321b0edec0b Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 25 Jul 2022 12:34:52 +0200 Subject: [PATCH 2/2] dbscan: Remove naive versions of algorithms This fixes issue #3414. --- cpp/src/dbscan/adjgraph/algo.cuh | 1 - cpp/src/dbscan/adjgraph/naive.cuh | 65 ----------------- cpp/src/dbscan/adjgraph/runner.cuh | 6 +- cpp/src/dbscan/common.cuh | 43 ----------- cpp/src/dbscan/dbscan.cuh | 2 + cpp/src/dbscan/vertexdeg/naive.cuh | 93 ------------------------ cpp/src/dbscan/vertexdeg/precomputed.cuh | 9 --- cpp/src/dbscan/vertexdeg/runner.cuh | 6 +- 8 files changed, 8 insertions(+), 217 deletions(-) delete mode 100644 cpp/src/dbscan/adjgraph/naive.cuh delete mode 100644 cpp/src/dbscan/common.cuh delete mode 100644 cpp/src/dbscan/vertexdeg/naive.cuh diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh index 16858e55f9..0e49a8fab1 100644 --- a/cpp/src/dbscan/adjgraph/algo.cuh +++ b/cpp/src/dbscan/adjgraph/algo.cuh @@ -20,7 +20,6 @@ #include #include -#include "../common.cuh" #include "pack.h" #include diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh deleted file mode 100644 index 6ce3994610..0000000000 --- a/cpp/src/dbscan/adjgraph/naive.cuh +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../common.cuh" -#include "pack.h" -#include -#include -#include -#include - -namespace ML { -namespace Dbscan { -namespace AdjGraph { -namespace Naive { - -template -void launcher(const raft::handle_t& handle, - Pack data, - Index_ batch_size, - cudaStream_t stream) -{ - Index_ k = 0; - Index_ N = data.N; - ML::pinned_host_vector host_vd(batch_size + 1); - ML::pinned_host_vector host_adj(((batch_size * N) / 8) + 1); - ML::pinned_host_vector host_ex_scan(batch_size); - raft::update_host((bool*)host_adj.data(), data.adj, batch_size * N, stream); - raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream); - handle.sync_stream(stream); - size_t adjgraph_size = size_t(host_vd[batch_size]); - ML::pinned_host_vector host_adj_graph(adjgraph_size); - for (Index_ i = 0; i < batch_size; i++) { - for (Index_ j = 0; j < N; j++) { - /// TODO: change layout or remove; cf #3414 - if (host_adj[i * N + j]) { - host_adj_graph[k] = j; - k = k + 1; - } - } - } - host_ex_scan[0] = Index_(0); - for (Index_ i = 1; i < batch_size; i++) - host_ex_scan[i] = host_ex_scan[i - 1] + host_vd[i - 1]; - raft::update_device(data.adj_graph, host_adj_graph.data(), adjgraph_size, stream); - raft::update_device(data.ex_scan, host_ex_scan.data(), batch_size, stream); -} -} // namespace Naive -} // namespace AdjGraph -} // namespace Dbscan -} // namespace ML diff --git a/cpp/src/dbscan/adjgraph/runner.cuh b/cpp/src/dbscan/adjgraph/runner.cuh index 978f3ed14b..2b1c9d0f89 100644 --- a/cpp/src/dbscan/adjgraph/runner.cuh +++ b/cpp/src/dbscan/adjgraph/runner.cuh @@ -17,7 +17,6 @@ #pragma once #include "algo.cuh" -#include "naive.cuh" #include "pack.h" namespace ML { @@ -39,8 +38,9 @@ void run(const raft::handle_t& handle, { Pack data = {vd, adj, adj_graph, adjnnz, ex_scan, N}; switch (algo) { - // TODO: deprecate naive runner. cf #3414 - case 0: Naive::launcher(handle, data, batch_size, stream); break; + case 0: + ASSERT( + false, "Incorrect algo '%d' passed! Naive version of adjgraph has been removed.", algo); case 1: Algo::launcher(handle, data, batch_size, row_counters, stream); break; default: ASSERT(false, "Incorrect algo passed! '%d'", algo); } diff --git a/cpp/src/dbscan/common.cuh b/cpp/src/dbscan/common.cuh deleted file mode 100644 index 02da48ed1b..0000000000 --- a/cpp/src/dbscan/common.cuh +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace Dbscan { - -/// Default "ds1" diff-squared-accumulate traits specialization for value_t->accum_t -/// Currently only supported for float/double -template -struct ds_accummulate { - /// Single-component "ds1" diff-squared vector type - typedef value_t dp_vector_t; - - /// Compute "ds1" float->float - inline __device__ static void mad(float& d, const float& a, const float& b, const float& c) - { - float diff = a - b; - asm volatile("fma.rn.f32 %0, %1, %1, %2;\n" : "=f"(d) : "f"(diff), "f"(c)); - } - - /// Compute "ds1" double->double - inline __device__ static void mad(double& d, const double& a, const double& b, const double& c) - { - double diff = a - b; - asm volatile("fma.rn.f64 %0, %1, %1, %2;\n" : "=d"(d) : "d"(diff), "d"(c)); - } -}; - -} // namespace Dbscan diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh index 24595ff931..6b8c38435f 100644 --- a/cpp/src/dbscan/dbscan.cuh +++ b/cpp/src/dbscan/dbscan.cuh @@ -110,6 +110,8 @@ void dbscanFitImpl(const raft::handle_t& handle, { raft::common::nvtx::range fun_scope("ML::Dbscan::Fit"); ML::Logger::get().setLevel(verbosity); + // XXX: for algo_vd and algo_adj, 0 (naive) is no longer an option and has + // been removed. int algo_vd = (metric == raft::distance::Precomputed) ? 2 : 1; int algo_adj = 1; int algo_ccl = 2; diff --git a/cpp/src/dbscan/vertexdeg/naive.cuh b/cpp/src/dbscan/vertexdeg/naive.cuh deleted file mode 100644 index f332367ca9..0000000000 --- a/cpp/src/dbscan/vertexdeg/naive.cuh +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "pack.h" -#include - -namespace ML { -namespace Dbscan { -namespace VertexDeg { -namespace Naive { - -using namespace MLCommon; - -/** number of threads in a CTA along X dim */ -static const int TPB_X = 32; -/** number of threads in a CTA along Y dim */ -static const int TPB_Y = 8; - -/** - * @brief Naive distance matrix evaluation and epsilon neighborhood construction - * @param data input struct containing vertex degree computation params - * @param start_vertex_id which vertex to begin the computations from - * @param batch_size number of vertices in this batch - */ -template -__global__ void vertex_degree_kernel(Pack data, - Index_ start_vertex_id, - Index_ batch_size) -{ - const Type Zero = (Type)0; - Index_ row = (blockIdx.y * TPB_Y) + threadIdx.y; - Index_ col = (blockIdx.x * TPB_X) + threadIdx.x; - Index_ N = data.N; - if ((row >= batch_size) || (col >= N)) return; - Type eps = data.eps; - Type eps2 = eps * eps; - Type sum = Zero; - Index_ D = data.D; - const Type* x = data.x; - bool* adj = data.adj; - Index_* vd = data.vd; - for (Index_ d = 0; d < D; ++d) { - Type a = __ldg(x + (row + start_vertex_id) * D + d); - Type b = __ldg(x + col * D + d); - Type diff = a - b; - sum += (diff * diff); - } - Index_ res = (sum <= eps2); - adj[row * N + col] = res; - /// TODO: change layout or remove; cf #3414 - - if (sizeof(Index_) == 4) { - raft::myAtomicAdd((int*)(vd + row), (int)res); - raft::myAtomicAdd((int*)(vd + batch_size), (int)res); - } else if (sizeof(Index_) == 8) { - raft::myAtomicAdd((unsigned long long*)(vd + row), res); - raft::myAtomicAdd((unsigned long long*)(vd + batch_size), res); - } -} - -template -void launcher(Pack data, - Index_ start_vertex_id, - Index_ batch_size, - cudaStream_t stream) -{ - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "index_t should be 4 or 8 bytes"); - - dim3 grid(raft::ceildiv(data.N, (Index_)TPB_X), raft::ceildiv(batch_size, (Index_)TPB_Y), 1); - dim3 blk(TPB_X, TPB_Y, 1); - data.resetArray(stream, batch_size + 1); - vertex_degree_kernel<<>>(data, start_vertex_id, batch_size); -} - -} // namespace Naive -} // namespace VertexDeg -} // namespace Dbscan -} // namespace ML diff --git a/cpp/src/dbscan/vertexdeg/precomputed.cuh b/cpp/src/dbscan/vertexdeg/precomputed.cuh index 3fa3a828bc..e9fd3cf345 100644 --- a/cpp/src/dbscan/vertexdeg/precomputed.cuh +++ b/cpp/src/dbscan/vertexdeg/precomputed.cuh @@ -32,15 +32,6 @@ namespace Dbscan { namespace VertexDeg { namespace Precomputed { -template -__global__ void dist_to_adj_kernel( - const value_t* X, bool* adj, index_t N, index_t start_vertex_id, index_t batch_size, value_t eps) -{ - for (index_t i = threadIdx.x; i < batch_size; i += blockDim.x) { - adj[batch_size * blockIdx.x + i] = X[N * blockIdx.x + start_vertex_id + i] <= eps; - } -} - /** * Calculates the vertex degree array and the epsilon neighborhood adjacency matrix for the batch. */ diff --git a/cpp/src/dbscan/vertexdeg/runner.cuh b/cpp/src/dbscan/vertexdeg/runner.cuh index ea1da4f4e0..deded16783 100644 --- a/cpp/src/dbscan/vertexdeg/runner.cuh +++ b/cpp/src/dbscan/vertexdeg/runner.cuh @@ -17,7 +17,6 @@ #pragma once #include "algo.cuh" -#include "naive.cuh" #include "pack.h" #include "precomputed.cuh" @@ -41,8 +40,9 @@ void run(const raft::handle_t& handle, { Pack data = {vd, adj, x, eps, N, D}; switch (algo) { - // TODO: deprecate naive runner. cf #3414 - case 0: Naive::launcher(data, start_vertex_id, batch_size, stream); break; + case 0: + ASSERT( + false, "Incorrect algo '%d' passed! Naive version of vertexdeg has been removed.", algo); case 1: Algo::launcher(handle, data, start_vertex_id, batch_size, stream, metric); break;