From 5dfe8d5ab2d8db51cf81b1f681cc35aeed9f93a1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Feb 2022 16:09:49 -0500 Subject: [PATCH 1/8] Hiding impl details for lap, spectral, and label --- cpp/include/raft/cluster/detail/kmeans.hpp | 986 ++++++++++++++++++ cpp/include/raft/cluster/kmeans.hpp | 66 ++ cpp/include/raft/label/classlabels.hpp | 115 ++ .../raft/label/{ => detail}/classlabels.cuh | 3 + .../raft/label/{ => detail}/merge_labels.cuh | 0 cpp/include/raft/label/merge_labels.hpp | 68 ++ cpp/include/raft/lap/{ => detail}/d_structs.h | 0 .../raft/lap/{ => detail}/lap_functions.cuh | 0 .../raft/lap/{ => detail}/lap_kernels.cuh | 0 cpp/include/raft/lap/{lap.cuh => lap.hpp} | 4 +- .../raft/sparse/linalg/detail/spectral.cuh | 1 - cpp/include/raft/spectral/cluster_solvers.hpp | 8 +- .../raft/spectral/{ => detail}/lapack.hpp | 0 .../spectral/{ => detail}/matrix_wrappers.hpp | 0 .../detail/modularity_maximization.hpp | 188 ++++ .../raft/spectral/detail/partition.hpp | 180 ++++ .../spectral/{ => detail}/spectral_util.hpp | 0 .../raft/spectral/{ => detail}/warn_dbg.hpp | 0 cpp/include/raft/spectral/eigen_solvers.hpp | 3 + cpp/include/raft/spectral/kmeans.hpp | 986 ------------------ .../raft/spectral/modularity_maximization.hpp | 127 +-- cpp/include/raft/spectral/partition.hpp | 101 +- 22 files changed, 1637 insertions(+), 1199 deletions(-) create mode 100644 cpp/include/raft/cluster/detail/kmeans.hpp create mode 100644 cpp/include/raft/cluster/kmeans.hpp create mode 100644 cpp/include/raft/label/classlabels.hpp rename cpp/include/raft/label/{ => detail}/classlabels.cuh (99%) rename cpp/include/raft/label/{ => detail}/merge_labels.cuh (100%) create mode 100644 cpp/include/raft/label/merge_labels.hpp rename cpp/include/raft/lap/{ => detail}/d_structs.h (100%) rename cpp/include/raft/lap/{ => detail}/lap_functions.cuh (100%) rename cpp/include/raft/lap/{ => detail}/lap_kernels.cuh (100%) rename cpp/include/raft/lap/{lap.cuh => lap.hpp} (99%) rename cpp/include/raft/spectral/{ => detail}/lapack.hpp (100%) rename cpp/include/raft/spectral/{ => detail}/matrix_wrappers.hpp (100%) create mode 100644 cpp/include/raft/spectral/detail/modularity_maximization.hpp create mode 100644 cpp/include/raft/spectral/detail/partition.hpp rename cpp/include/raft/spectral/{ => detail}/spectral_util.hpp (100%) rename cpp/include/raft/spectral/{ => detail}/warn_dbg.hpp (100%) delete mode 100644 cpp/include/raft/spectral/kmeans.hpp diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.hpp new file mode 100644 index 0000000000..cf7255c22e --- /dev/null +++ b/cpp/include/raft/cluster/detail/kmeans.hpp @@ -0,0 +1,986 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace cluster { +namespace detail { +// ========================================================= +// Useful grid settings +// ========================================================= + + constexpr unsigned int BLOCK_SIZE = 1024; + constexpr unsigned int WARP_SIZE = 32; + constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); + +// ========================================================= +// CUDA kernels +// ========================================================= + +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. + */ + template + static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) + { + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } + } + +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ + template + static __global__ void minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) + { + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. 
On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ + template + static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) + { + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ + template + static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) + { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). 
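+ *
+ * For example, with d = 2 and k = 2 (column-major): if the sum-matrix
+ * columns are {4, 6} and {9, 3} and clusterSizes = {2, 3}, the resulting
+ * centroid columns are {2, 3} and {3, 1}.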
+ */
+template <typename index_type_t, typename value_type_t>
+static __global__ void divideCentroids(index_type_t d,
+                                       index_type_t k,
+                                       const index_type_t* __restrict__ clusterSizes,
+                                       value_type_t* __restrict__ centroids)
+{
+  // Global indices
+  index_type_t gidx, gidy;
+
+  // Current cluster size
+  index_type_t clusterSize_private;
+
+  // Observation vector is determined by global y-index
+  gidy = threadIdx.y + blockIdx.y * blockDim.y;
+  while (gidy < k) {
+    // Get cluster size from global memory
+    clusterSize_private = clusterSizes[gidy];
+
+    // Add vector entries to centroid matrix
+    // vector entries are determined by global x-index
+    gidx = threadIdx.x + blockIdx.x * blockDim.x;
+    while (gidx < d) {
+      centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
+      gidx += blockDim.x * gridDim.x;
+    }
+
+    // Move to another centroid
+    gidy += blockDim.y * gridDim.y;
+  }
+}
+
+// =========================================================
+// Helper functions
+// =========================================================
+
+/**
+ * @brief Randomly choose a new centroid.
+ *   The centroid is chosen with the k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param rand Random number drawn uniformly from [0,1).
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param dists (Input, device memory, 2*n entries) Workspace. The
+ *   first n entries should be the distance between observation
+ *   vectors and the closest centroid.
+ * @param centroid (Output, device memory, d entries) Centroid
+ *   coordinates.
+ * @return Zero if successful. Otherwise non-zero.
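+ *
+ * Selection follows the k-means++ rule: observation i is drawn with
+ * probability proportional to dists[i] (the squared distance to its
+ * closest existing centroid), so the routine assumes 'dists' holds
+ * those squared distances on entry.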
+ */
+template <typename index_type_t, typename value_type_t>
+static int chooseNewCentroid(handle_t const& handle,
+                             index_type_t n,
+                             index_type_t d,
+                             value_type_t rand,
+                             const value_type_t* __restrict__ obs,
+                             value_type_t* __restrict__ dists,
+                             value_type_t* __restrict__ centroid)
+{
+  // Cumulative sum of distances
+  value_type_t* distsCumSum = dists + n;
+  // Residual sum of squares
+  value_type_t distsSum{0};
+  // Observation vector that is chosen as new centroid
+  index_type_t obsIndex;
+
+  auto stream             = handle.get_stream();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Compute cumulative sum of distances
+  thrust::inclusive_scan(thrust_exec_policy,
+                         thrust::device_pointer_cast(dists),
+                         thrust::device_pointer_cast(dists + n),
+                         thrust::device_pointer_cast(distsCumSum));
+  RAFT_CHECK_CUDA(stream);
+  RAFT_CUDA_TRY(cudaMemcpyAsync(
+    &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+
+  // Randomly choose observation vector
+  // Probabilities are proportional to square of distance to closest
+  // centroid (see k-means++ algorithm)
+  //
+  // seg-faults due to Thrust bug
+  // on binary-search-like algorithms
+  // when run with stream dependent
+  // execution policies; fixed on Thrust GitHub
+  // hence replace w/ linear interpolation,
+  // until the Thrust issue gets resolved:
+  //
+  // obsIndex = (thrust::lower_bound(
+  //              thrust_exec_policy, thrust::device_pointer_cast(distsCumSum),
+  //              thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) -
+  //             thrust::device_pointer_cast(distsCumSum));
+  //
+  // linear interpolation logic:
+  //{
+  value_type_t minSum{0};
+  RAFT_CUDA_TRY(
+    cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+  RAFT_CHECK_CUDA(stream);
+
+  if (distsSum > minSum) {
+    value_type_t vIndex = static_cast<value_type_t>(n - 1);
+    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) / (distsSum - minSum));
+  } else {
+    obsIndex = 0;
+  }
+  //}
+
+  RAFT_CHECK_CUDA(stream);
+  obsIndex = max(obsIndex, 0);
+  obsIndex = min(obsIndex, n - 1);
+
+  // Record new centroid position
+  RAFT_CUDA_TRY(cudaMemcpyAsync(centroid,
+                                obs + IDX(0, obsIndex, d),
+                                d * sizeof(value_type_t),
+                                cudaMemcpyDeviceToDevice,
+                                stream));
+
+  return 0;
+}
+
+/**
+ * @brief Choose initial cluster centroids for k-means algorithm.
+ *   Centroids are randomly chosen with the k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param centroids (Output, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param clusterSizes (Output, device memory, k entries) Number of
+ *   points in each cluster.
+ * @param dists (Output, device memory, 2*n entries) Workspace. On
+ *   exit, the first n entries give the square of the Euclidean
+ *   distance between observation vectors and the closest centroid.
+ * @param seed Random seed used for the k-means++ draws.
+ * @return Zero if successful. Otherwise non-zero.
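+ *
+ * The initialization below follows the k-means++ recipe: the first
+ * centroid is drawn uniformly (all distances are seeded to 1), and each
+ * subsequent centroid i is drawn with probability proportional to the
+ * current squared distances, after which per-point minimum distances and
+ * codes are refreshed via minDistances2.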
+ */
+template <typename index_type_t, typename value_type_t>
+static int initializeCentroids(handle_t const& handle,
+                               index_type_t n,
+                               index_type_t d,
+                               index_type_t k,
+                               const value_type_t* __restrict__ obs,
+                               value_type_t* __restrict__ centroids,
+                               index_type_t* __restrict__ codes,
+                               index_type_t* __restrict__ clusterSizes,
+                               value_type_t* __restrict__ dists,
+                               unsigned long long seed)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Loop index
+  index_type_t i;
+
+  // Random number generator
+  thrust::default_random_engine rng(seed);
+  thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
+
+  auto stream             = handle.get_stream();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  // -------------------------------------------------------
+  // Implementation
+  // -------------------------------------------------------
+
+  // Initialize block dimensions
+  dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
+
+  // CUDA grid dimensions
+  dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+                    1,
+                    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+
+  // CUDA grid dimensions
+  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
+
+  // Assign observation vectors to code 0
+  RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
+
+  // Choose first centroid
+  thrust::fill(thrust_exec_policy,
+               thrust::device_pointer_cast(dists),
+               thrust::device_pointer_cast(dists + n),
+               1);
+  RAFT_CHECK_CUDA(stream);
+  if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids))
+    WARNING("error in k-means++ (could not pick centroid)");
+
+  // Compute distances from first centroid
+  RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream));
+  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(n, d, 1, obs, centroids, dists);
+  RAFT_CHECK_CUDA(stream);
+
+  // Choose remaining centroids
+  for (i = 1; i < k; ++i) {
+    // Choose ith centroid
+    if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d)))
+      WARNING("error in k-means++ (could not pick centroid)");
+
+    // Compute distances from ith centroid
+    RAFT_CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream));
+    computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(
+      n, d, 1, obs, centroids + IDX(0, i, d), dists + n);
+    RAFT_CHECK_CUDA(stream);
+
+    // Recompute minimum distances
+    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
+    RAFT_CHECK_CUDA(stream);
+  }
+
+  // Compute cluster sizes
+  RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream));
+  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, codes, clusterSizes);
+  RAFT_CHECK_CUDA(stream);
+
+  return 0;
+}
+
+/**
+ * @brief Find cluster centroids closest to observation vectors.
+ *   Distance is measured with Euclidean norm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param centroids (Input, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ + template + static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) + { + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute distance between centroids and observation vectors + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; + } + +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. 
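+ *
+ * Conceptually, for each cluster c this computes
+ *   centroids[:, c] = (sum of obs[:, i] with codes[i] == c) / clusterSizes[c],
+ * realized below as a transpose, a stable sort by cluster code, a
+ * segmented reduction (thrust::reduce_by_key), and the divideCentroids
+ * kernel.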
+ */
+template <typename index_type_t, typename value_type_t>
+static int updateCentroids(handle_t const& handle,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
+                           const value_type_t* __restrict__ obs,
+                           const index_type_t* __restrict__ codes,
+                           const index_type_t* __restrict__ clusterSizes,
+                           value_type_t* __restrict__ centroids,
+                           value_type_t* __restrict__ work,
+                           index_type_t* __restrict__ work_int)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Useful constants
+  const value_type_t one  = 1;
+  const value_type_t zero = 0;
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Device memory
+  thrust::device_ptr<value_type_t> obs_copy(work);
+  thrust::device_ptr<index_type_t> codes_copy(work_int);
+  thrust::device_ptr<index_type_t> rows(work_int + d * n);
+
+  // Take transpose of observation matrix
+  RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h,
+                                     CUBLAS_OP_T,
+                                     CUBLAS_OP_N,
+                                     n,
+                                     d,
+                                     &one,
+                                     obs,
+                                     d,
+                                     &zero,
+                                     (value_type_t*)NULL,
+                                     n,
+                                     thrust::raw_pointer_cast(obs_copy),
+                                     n,
+                                     stream));
+
+  // Cluster assigned to each observation matrix entry
+  thrust::sequence(thrust_exec_policy, rows, rows + d * n);
+  RAFT_CHECK_CUDA(stream);
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator(n),
+                    rows,
+                    thrust::modulus<index_type_t>());
+  RAFT_CHECK_CUDA(stream);
+  thrust::gather(
+    thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy);
+  RAFT_CHECK_CUDA(stream);
+
+  // Row associated with each observation matrix entry
+  thrust::sequence(thrust_exec_policy, rows, rows + d * n);
+  RAFT_CHECK_CUDA(stream);
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator(n),
+                    rows,
+                    thrust::divides<index_type_t>());
+  RAFT_CHECK_CUDA(stream);
+
+  // Sort and reduce to add observation vectors in same cluster
+  thrust::stable_sort_by_key(thrust_exec_policy,
+                             codes_copy,
+                             codes_copy + d * n,
+                             make_zip_iterator(make_tuple(obs_copy, rows)));
+  RAFT_CHECK_CUDA(stream);
+  thrust::reduce_by_key(thrust_exec_policy,
+                        rows,
+                        rows + d * n,
+                        obs_copy,
+                        codes_copy,  // Output to codes_copy is ignored
+                        thrust::device_pointer_cast(centroids));
+  RAFT_CHECK_CUDA(stream);
+
+  // Divide sums by cluster size to get centroid matrix
+  //
+  // CUDA block dimensions
+  dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
+
+  // CUDA grid dimensions
+  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+               1};
+
+  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
+  RAFT_CHECK_CUDA(stream);
+
+  return 0;
+}
+
+// =========================================================
+// k-means algorithm
+// =========================================================
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param tol Tolerance for convergence.
+ *   k-means stops when the change in residual divided by n is less
+ *   than tol.
+ * @param maxiter Maximum number of k-means iterations.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param clusterSizes (Output, device memory, k entries) Number of
+ *   points in each cluster.
+ * @param centroids (Output, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param work (Output, device memory, n*max(k,d) entries)
+ *   Workspace.
+ * @param work_int (Output, device memory, 2*d*n entries)
+ *   Workspace.
+ * @param residual_host (Output, host memory, 1 entry) Residual sum
+ *   of squares (sum of squares of distances between observation
+ *   vectors and centroids).
+ * @param iters_host (Output, host memory, 1 entry) Number of
+ *   k-means iterations.
+ * @param seed random seed to be used.
+ * @return error flag.
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           index_type_t* __restrict__ clusterSizes,
+           value_type_t* __restrict__ centroids,
+           value_type_t* __restrict__ work,
+           index_type_t* __restrict__ work_int,
+           value_type_t* residual_host,
+           index_type_t* iters_host,
+           unsigned long long seed)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Current iteration
+  index_type_t iter;
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  // Residual sum of squares at previous iteration
+  value_type_t residualPrev = 0;
+
+  // Random number generator
+  thrust::default_random_engine rng(seed);
+  thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
+
+  // -------------------------------------------------------
+  // Initialization
+  // -------------------------------------------------------
+
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Trivial cases
+  if (k == 1) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
+    RAFT_CUDA_TRY(
+      cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream));
+    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
+      WARNING("could not compute k-means centroids");
+
+    dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
+
+    dim3 gridDim{
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+      1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
+    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
+    RAFT_CHECK_CUDA(stream);
+    *residual_host = thrust::reduce(
+      thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n));
+    RAFT_CHECK_CUDA(stream);
+    return 0;
+  }
+  if (n <= k) {
+    thrust::sequence(thrust_exec_policy,
+                     thrust::device_pointer_cast(codes),
+                     thrust::device_pointer_cast(codes + n));
+    RAFT_CHECK_CUDA(stream);
+    thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1);
+    RAFT_CHECK_CUDA(stream);
+
+    if (n < k)
RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + RAFT_CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; + } + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. 
+ * @param residual On exit, residual sum of squares (sum of squares
+ *   of distances between observation vectors and centroids).
+ * @param iters On exit, number of k-means iterations.
+ * @param seed random seed to be used.
+ * @return error flag
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
+  using namespace matrix;
+
+  // Check that parameters are valid
+  RAFT_EXPECTS(n > 0, "invalid parameter (n<1)");
+  RAFT_EXPECTS(d > 0, "invalid parameter (d<1)");
+  RAFT_EXPECTS(k > 0, "invalid parameter (k<1)");
+  RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)");
+  RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)");
+
+  // Allocate memory
+  vector_t<index_type_t> clusterSizes(handle, k);
+  vector_t<value_type_t> centroids(handle, d * k);
+  vector_t<value_type_t> work(handle, n * max(k, d));
+  vector_t<index_type_t> work_int(handle, 2 * d * n);
+
+  // Perform k-means
+  return kmeans(handle,
+                n,
+                d,
+                k,
+                tol,
+                maxiter,
+                obs,
+                codes,
+                clusterSizes.raw(),
+                centroids.raw(),
+                work.raw(),
+                work_int.raw(),
+                &residual,
+                &iters,
+                seed);
+}
+
+}  // namespace detail
+}  // namespace cluster
+}  // namespace raft
diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp
new file mode 100644
index 0000000000..fe4dd4793a
--- /dev/null
+++ b/cpp/include/raft/cluster/kmeans.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/cluster/detail/kmeans.hpp>
+
+namespace raft {
+namespace cluster {
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param tol Tolerance for convergence. k-means stops when the
+ *   change in residual divided by n is less than tol.
+ * @param maxiter Maximum number of k-means iterations.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param residual On exit, residual sum of squares (sum of squares
+ *   of distances between observation vectors and centroids).
+ * @param iters On exit, number of k-means iterations.
+ * @param seed random seed to be used.
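+ *
+ * Usage sketch (illustrative only; assumes 'obs' and 'codes' are device
+ * buffers sized as documented above):
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * int n = 10000, d = 16, k = 8;
+ * float residual = 0;
+ * int   iters    = 0;
+ * raft::cluster::kmeans(handle, n, d, k, 1e-4f, 100, obs, codes, residual, iters);
+ * @endcode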
+ * @return error flag
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
+  return detail::kmeans(handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed);
+}
+}  // namespace cluster
+}  // namespace raft
diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
new file mode 100644
index 0000000000..a4d0cae359
--- /dev/null
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/label/detail/classlabels.cuh>
+
+namespace raft {
+namespace label {
+
+/**
+ * Get unique class labels.
+ *
+ * The y array is assumed to store class labels. The unique values are selected
+ * from this array.
+ *
+ * \tparam value_t numeric type of the arrays with class labels
+ * \param [out] unique device array of unique labels, unallocated on entry,
+ *   on exit it has size [n_unique]
+ * \param [in] y device array of labels, size [n]
+ * \param [in] n number of labels
+ * \param [in] stream cuda stream
+ * \return the number of unique labels
+ */
+template <typename value_t>
+int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
+{
+  return detail::getUniqueLabels(unique, y, n, stream);
+}
+
+/**
+ * Assign one versus rest labels.
+ *
+ * The output labels will have values +/-1:
+ *   y_out = (y == y_unique[idx]) ? +1 : -1;
+ *
+ * The output type is currently set to value_t, but in principle we are free
+ * to choose another type for y_out (it should represent +/-1, and it is used
+ * in floating point arithmetics).
+ *
+ * \param [in] y device array of input labels, size [n]
+ * \param [in] n number of labels
+ * \param [in] y_unique device array of unique labels, size [n_classes]
+ * \param [in] n_classes number of unique labels
+ * \param [out] y_out device array of output labels
+ * \param [in] idx index of unique label that should be labeled as 1
+ * \param [in] stream cuda stream
+ */
+template <typename value_t>
+void getOvrlabels(
+  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
+{
+  detail::getOvrLabels(y, n, y_unique, n_classes, y_out, idx, stream);
+}
+
+/**
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels are used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices need to be labeled in a monotonically increasing
+ * order.
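+ *
+ * For example, assuming labels are remapped in sorted order, the input
+ * {2, 7, 7, 3} becomes {1, 3, 3, 2} (or {0, 2, 2, 1} with zero_based).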
+ * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ +template +void make_monotonic( + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, filter_op, zero_based); +} + +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ +template +void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, zero_based); +} +}; // namespace label +}; // end namespace raft diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh similarity index 99% rename from cpp/include/raft/label/classlabels.cuh rename to cpp/include/raft/label/detail/classlabels.cuh index dce732b06b..b7582be1a3 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -26,6 +26,7 @@ namespace raft { namespace label { +namespace detail { /** * Get unique class labels. @@ -194,5 +195,7 @@ void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zer make_monotonic( out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } + +}; // namespace detail }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh similarity index 100% rename from cpp/include/raft/label/merge_labels.cuh rename to cpp/include/raft/label/detail/merge_labels.cuh diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp new file mode 100644 index 0000000000..b21502a06a --- /dev/null +++ b/cpp/include/raft/label/merge_labels.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/label/detail/merge_labels.cuh>
+
+namespace raft {
+namespace label {
+
+/**
+ * @brief Merge two labellings in-place, according to a core mask
+ *
+ * A labelling is a representation of disjoint sets (groups) where points that
+ * belong to the same group have the same label. It is assumed that group
+ * labels take values between 1 and N. Labels relate to points, i.e. a label
+ * i+1 means that you belong to the same group as the point i.
+ * The special value MAX_LABEL is used to mark points that are not labelled.
+ *
+ * The two label arrays A and B induce two sets of groups over points 0..N-1.
+ * If a point is labelled i in A and j in B and the mask is true for this
+ * point, then i and j are equivalent labels and their groups are merged by
+ * relabeling the elements of both groups to have the same label. The new label
+ * is the smaller one from the original labels.
+ * It is required that if the mask is true for a point, this point is labelled
+ * (i.e. its label is different than the special value MAX_LABEL).
+ *
+ * One use case is finding connected components: the two input label arrays can
+ * represent the connected components of graphs G_A and G_B, and the output
+ * would be the connected components labels of the union of G_A and G_B.
+ *
+ * @param[inout] labels_a First input, and output label array (in-place)
+ * @param[in] labels_b Second input label array
+ * @param[in] mask Core point mask
+ * @param[out] R label equivalence map
+ * @param[in] m Working flag
+ * @param[in] N Number of points in the dataset
+ * @param[in] stream CUDA stream
+ */
+template <typename value_idx>
+void merge_labels(value_idx* labels_a,
+                  const value_idx* labels_b,
+                  const bool* mask,
+                  value_idx* R,
+                  bool* m,
+                  value_idx N,
+                  cudaStream_t stream)
+{
+  detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream);
+}
+
+};  // namespace label
+};  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/detail/d_structs.h
similarity index 100%
rename from cpp/include/raft/lap/d_structs.h
rename to cpp/include/raft/lap/detail/d_structs.h
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/detail/lap_functions.cuh
similarity index 100%
rename from cpp/include/raft/lap/lap_functions.cuh
rename to cpp/include/raft/lap/detail/lap_functions.cuh
diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/detail/lap_kernels.cuh
similarity index 100%
rename from cpp/include/raft/lap/lap_kernels.cuh
rename to cpp/include/raft/lap/detail/lap_kernels.cuh
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.hpp
similarity index 99%
rename from cpp/include/raft/lap/lap.cuh
rename to cpp/include/raft/lap/lap.hpp
index 42b898ebff..2350ebcddf 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.hpp
@@ -27,8 +27,8 @@
 #include
 #include
 
-#include "d_structs.h"
-#include "lap_functions.cuh"
+#include "detail/d_structs.h"
+#include "detail/lap_functions.cuh"
 
 namespace raft {
 namespace lap {
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index de62f25ffa..2519c1914e 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -17,7 +17,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git
a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 221a9679d4..092e51a17d 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -15,10 +15,11 @@ */ #pragma once -#include +#include #include // for std::pair namespace raft { +namespace spectral { using namespace matrix; @@ -52,7 +53,8 @@ struct kmeans_solver_t { RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, + + raft::cluster::kmeans(handle, n_obs_vecs, dim, config_.n_clusters, @@ -71,4 +73,6 @@ struct kmeans_solver_t { private: cluster_solver_config_t config_; }; + +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/detail/lapack.hpp similarity index 100% rename from cpp/include/raft/spectral/lapack.hpp rename to cpp/include/raft/spectral/detail/lapack.hpp diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp similarity index 100% rename from cpp/include/raft/spectral/matrix_wrappers.hpp rename to cpp/include/raft/spectral/detail/matrix_wrappers.hpp diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp new file mode 100644 index 0000000000..ec0e23dfe0 --- /dev/null +++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) +{ + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace raft { +namespace spectral { +namespace detail { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. 
+ * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + scale_obs(nEigVecs, n, eigVecs); + RAFT_CHECK_CUDA(stream); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + vector_t part_i(handle, n); + vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp new file mode 100644 index 0000000000..82313f84ee --- /dev/null +++ b/cpp/include/raft/spectral/detail/partition.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft { +namespace spectral { +namespace detail { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . 
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
+                                                   sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                                                   EigenSolver const& eigen_solver,
+                                                   ClusterSolver const& cluster_solver,
+                                                   vertex_t* __restrict__ clusters,
+                                                   weight_t* eigVals,
+                                                   weight_t* eigVecs) {
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+  RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+  RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+
+  auto stream   = handle.get_stream();
+  auto cublas_h = handle.get_cublas_handle();
+
+  // # iters eigen solver, cluster solver residual, # iters cluster solver
+  std::tuple<vertex_t, weight_t, vertex_t> stats;
+
+  vertex_t n = csr_m.nrows_;
+
+  // -------------------------------------------------------
+  // Spectral partitioner
+  // -------------------------------------------------------
+
+  // Compute eigenvectors of Laplacian
+
+  // Initialize Laplacian
+  laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+
+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs     = eigen_config.n_eigVecs;
+
+  // Compute smallest eigenvalues and eigenvectors
+  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+
+  // Whiten eigenvector matrix
+  transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
+
+  // Find partition clustering
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+
+  std::get<1>(stats) = pair_cluster.first;
+  std::get<2>(stats) = pair_cluster.second;
+
+  return stats;
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ * function:
+ *   Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+ * Graph is assumed to be weighted and undirected.
+ *
+ * @param handle raft handle.
+ * @param csr_m Weighted graph in CSR format.
+ * @param nClusters Number of partitions.
+ * @param clusters (Input, device memory, n entries) Partition
+ * assignments.
+ * @param edgeCut On exit, weight of edges cut by partition.
+ * @param cost On exit, partition cost function.
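+ *
+ * A minimal sketch of the expected call (assuming clusters was produced
+ * by partition() above):
+ * @code{.cpp}
+ *   weight_t edgeCut{0};
+ *   weight_t cost{0};
+ *   analyzePartition(handle, csr_m, nClusters, clusters, edgeCut, cost);
+ *   // edgeCut: total weight of edges cut by the partition
+ *   // cost: sum over clusters of (cut weight) / (cluster size)
+ * @endcode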
+ */ +template +void analyzePartition(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + + weight_t partEdgesCut, clustersize; + + // Device memory + vector_t part_i(handle, n); + vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/detail/spectral_util.hpp similarity index 100% rename from cpp/include/raft/spectral/spectral_util.hpp rename to cpp/include/raft/spectral/detail/spectral_util.hpp diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/detail/warn_dbg.hpp similarity index 100% rename from cpp/include/raft/spectral/warn_dbg.hpp rename to cpp/include/raft/spectral/detail/warn_dbg.hpp diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 156b996586..192dc15a6b 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -18,6 +18,7 @@ #include namespace raft { +namespace spectral { using namespace matrix; @@ -95,4 +96,6 @@ struct lanczos_solver_t { private: eigen_solver_config_t config_; }; + +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp deleted file mode 100644 index bcb28bfb1e..0000000000 --- a/cpp/include/raft/spectral/kmeans.hpp +++ /dev/null @@ -1,986 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using namespace raft; -using namespace raft::linalg; -// ========================================================= -// Useful grid settings -// ========================================================= - -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; -constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); - -// ========================================================= -// CUDA kernels -// ========================================================= - -/** - * @brief Compute distances between observation vectors and centroids - * Block dimensions should be (warpSize, 1, - * blockSize/warpSize). Ideally, the grid is large enough so there - * are d threads in the x-direction, k threads in the y-direction, - * and n threads in the z-direction. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, d*n entries) Observation matrix. Matrix is - * stored column-major and each column is an observation - * vector. Matrix dimensions are d x n. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Output, n*k entries) Distance matrix. Matrix is - * stored column-major and the (i,j)-entry is the square of the - * Euclidean distance between the ith observation vector and jth - * centroid. Matrix dimensions are n x k. Entries must be - * initialized to zero. - */ -template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) -{ - // Loop index - index_type_t i; - - // Block indices - index_type_t bidx; - // Global indices - index_type_t gidx, gidy, gidz; - - // Private memory - value_type_t centroid_private, dist_private; - - // Global x-index indicates index of vector entry - bidx = blockIdx.x; - while (bidx * blockDim.x < d) { - gidx = threadIdx.x + bidx * blockDim.x; - - // Global y-index indicates centroid - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Load centroid coordinate from global memory - centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; - - // Global z-index indicates observation vector - gidz = threadIdx.z + blockIdx.z * blockDim.z; - while (gidz < n) { - // Load observation vector coordinate from global memory - dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; - - // Compute contribution of current entry to distance - dist_private = centroid_private - dist_private; - dist_private = dist_private * dist_private; - - // Perform reduction on warp - for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); - - // Write result to global memory - if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); - - // Move to another observation vector - gidz += blockDim.z * gridDim.z; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } - - // Move to another vector entry - bidx += gridDim.x; - } -} - -/** - * @brief Find closest centroid to observation vectors. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Input/output, n*k entries) Distance matrix. Matrix - * is stored column-major and the (i,j)-entry is the square of - * the Euclidean distance between the ith observation vector and - * jth centroid. Matrix dimensions are n x k. On exit, the first - * n entries give the square of the Euclidean distance between - * observation vectors and closest centroids. - * @param codes (Output, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ - // Loop index - index_type_t i, j; - - // Current matrix entry - value_type_t dist_curr; - - // Smallest entry in row - value_type_t dist_min; - index_type_t code_min; - - // Each row in observation matrix is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Find minimum entry in row - code_min = 0; - dist_min = dists[IDX(i, 0, n)]; - for (j = 1; j < k; ++j) { - dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; - } - - // Transfer result to global memory - dists[i] = dist_min; - codes[i] = code_min; - - // Increment cluster sizes - atomicAdd(clusterSizes + code_min, 1); - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Check if newly computed distances are smaller than old distances. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param dists_old (Input/output, n entries) Distances between - * observation vectors and closest centroids. On exit, entries - * are replaced by entries in 'dists_new' if the corresponding - * observation vectors are closest to the new centroid. - * @param dists_new (Input, n entries) Distance between observation - * vectors and new centroid. - * @param codes_old (Input/output, n entries) Cluster - * assignments. 
On exit, entries are replaced with 'code_new' if - * the corresponding observation vectors are closest to the new - * centroid. - * @param code_new Index associated with new centroid. - */ -template -static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) -{ - // Loop index - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - - // Distances - value_type_t dist_old_private; - value_type_t dist_new_private; - - // Each row is processed by a thread - while (i < n) { - // Get old and new distances - dist_old_private = dists_old[i]; - dist_new_private = dists_new[i]; - - // Update if new distance is smaller than old distance - if (dist_new_private < dist_old_private) { - dists_old[i] = dist_new_private; - codes_old[i] = code_new; - } - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Compute size of k-means clusters. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param codes (Input, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - atomicAdd(clusterSizes + codes[i], 1); - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Divide rows of centroid matrix by cluster sizes. - * Divides the ith column of the sum matrix by the size of the ith - * cluster. If the sum matrix has been initialized so that the ith - * row is the sum of all observation vectors in the ith cluster, - * this kernel produces cluster centroids. The grid and block - * dimensions should be 2-dimensional. Ideally the grid is large - * enough so there are d threads in the x-direction and k threads - * in the y-direction. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param clusterSizes (Input, k entries) Number of points in each - * cluster. - * @param centroids (Input/output, d*k entries) Sum matrix. Matrix - * is stored column-major and matrix dimensions are d x k. The - * ith column is the sum of all observation vectors in the ith - * cluster. On exit, the matrix is the centroid matrix (each - * column is the mean position of a cluster). 
- */ -template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) -{ - // Global indices - index_type_t gidx, gidy; - - // Current cluster size - index_type_t clusterSize_private; - - // Observation vector is determined by global y-index - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Get cluster size from global memory - clusterSize_private = clusterSizes[gidy]; - - // Add vector entries to centroid matrix - // vector entris are determined by global x-index - gidx = threadIdx.x + blockIdx.x * blockDim.x; - while (gidx < d) { - centroids[IDX(gidx, gidy, d)] /= clusterSize_private; - gidx += blockDim.x * gridDim.x; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } -} - -// ========================================================= -// Helper functions -// ========================================================= - -/** - * @brief Randomly choose new centroids. - * Centroid is randomly chosen with k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param rand Random number drawn uniformly from [0,1). - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are n x d. - * @param dists (Input, device memory, 2*n entries) Workspace. The - * first n entries should be the distance between observation - * vectors and the closest centroid. - * @param centroid (Output, device memory, d entries) Centroid - * coordinates. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int chooseNewCentroid(handle_t const& handle, - index_type_t n, - index_type_t d, - value_type_t rand, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) -{ - // Cumulative sum of distances - value_type_t* distsCumSum = dists + n; - // Residual sum of squares - value_type_t distsSum{0}; - // Observation vector that is chosen as new centroid - index_type_t obsIndex; - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( - &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - - // Randomly choose observation vector - // Probabilities are proportional to square of distance to closest - // centroid (see k-means++ algorithm) - // - // seg-faults due to Thrust bug - // on binary-search-like algorithms - // when run with stream dependent - // execution policies; fixed on Thrust GitHub - // hence replace w/ linear interpolation, - // until the Thrust issue gets resolved: - // - // obsIndex = (thrust::lower_bound( - // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - // thrust::device_pointer_cast(distsCumSum)); - // - // linear interpolation logic: - //{ - value_type_t minSum{0}; - RAFT_CUDA_TRY( - cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - RAFT_CHECK_CUDA(stream); - - if (distsSum > minSum) { - value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); - } else { - obsIndex = 0; - } - //} - - RAFT_CHECK_CUDA(stream); - obsIndex = max(obsIndex, 0); - obsIndex = min(obsIndex, n - 1); - - // Record new centroid position - RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - return 0; -} - -/** - * @brief Choose initial cluster centroids for k-means algorithm. - * Centroids are randomly chosen with k-means++ algorithm - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param dists (Output, device memory, 2*n entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int initializeCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, - unsigned long long seed) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Loop index - index_type_t i; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - constexpr index_type_t grid_lower_bound{65535}; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Initialize grid dimensions - dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; - - // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; - - // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; - - // Assign observation vectors to code 0 - RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - - // Choose first centroid - thrust::fill(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - 1); - RAFT_CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from first centroid - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Choose remaining centroids - for (i = 1; i < k; ++i) { - // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - RAFT_CHECK_CUDA(stream); - - // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - RAFT_CHECK_CUDA(stream); - } - - // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, clusterSizes); - RAFT_CHECK_CUDA(stream); - - return 0; -} - -/** - * @brief Find cluster centroids closest to observation vectors. - * Distance is measured with Euclidean norm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Input, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. 
- * @param dists (Output, device memory, n*k entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares of assignment. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int assignCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) -{ - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute distance between centroids and observation vectors - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); - - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim; - constexpr index_type_t grid_lower_bound{65535}; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); - gridDim.y = min(k, grid_lower_bound); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - - computeDistances<<>>(n, d, k, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); - - // Compute residual sum of squares - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); - - return 0; -} - -/** - * @brief Update cluster centroids for k-means algorithm. - * All clusters are assumed to be non-empty. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Input, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Input, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*d entries) Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int updateCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const index_type_t* __restrict__ codes, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; - - constexpr index_type_t grid_lower_bound{65535}; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); - - // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); - - // Cluster assigned to each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::modulus()); - RAFT_CHECK_CUDA(stream); - thrust::gather( - thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - RAFT_CHECK_CUDA(stream); - - // Row associated with each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::divides()); - RAFT_CHECK_CUDA(stream); - - // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, - codes_copy, - codes_copy + d * n, - make_zip_iterator(make_tuple(obs_copy, rows))); - RAFT_CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, - rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - RAFT_CHECK_CUDA(stream); - - // Divide sums by cluster size to get centroid matrix - // - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; - - // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), - 1}; - - divideCentroids<<>>(d, k, clusterSizes, centroids); - RAFT_CHECK_CUDA(stream); - - return 0; -} - -} // namespace - -namespace raft { - -// ========================================================= -// k-means algorithm -// ========================================================= - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. 
k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*max(k,d) entries) - * Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. - * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares (sum of squares of distances between observation - * vectors and centroids). - * @param iters_host (Output, host memory, 1 entry) Number of - * k-means iterations. - * @param seed random seed to be used. - * @return error flag. - */ -template -int kmeans(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int, - value_type_t* residual_host, - index_type_t* iters_host, - unsigned long long seed) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Current iteration - index_type_t iter; - - constexpr index_type_t grid_lower_bound{65535}; - - // Residual sum of squares at previous iteration - value_type_t residualPrev = 0; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Initialization - // ------------------------------------------------------- - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Trivial cases - if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( - cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not compute k-means centroids"); - - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - RAFT_CHECK_CUDA(stream); - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - RAFT_CHECK_CUDA(stream); - return 0; - } - if (n <= k) { - thrust::sequence(thrust_exec_policy, - thrust::device_pointer_cast(codes), - thrust::device_pointer_cast(codes + n)); - RAFT_CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - RAFT_CHECK_CUDA(stream); - - if (n < k) - 
RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - *residual_host = 0; - return 0; - } - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // k-means++ algorithm - // ------------------------------------------------------- - - // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) - WARNING("could not initialize k-means centroids"); - - // Apply k-means iteration until convergence - for (iter = 0; iter < maxiter; ++iter) { - // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not update k-means centroids"); - - // Determine centroid closest to each observation - residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - - // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - - // FIXME: emptyCentroid never reaches k (infinite loop) under certain - // conditions, such as if obs is corrupt (as seen as a result of a - // DataFrame column of NULL edge vals used to create the Graph) - while (emptyCentroid < k) { - if (chooseNewCentroid( - handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) - WARNING("could not replace empty centroid"); - if (assignCentroids( - handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - RAFT_CHECK_CUDA(stream); - } - - // Check for convergence - if (std::fabs(residualPrev - (*residual_host)) / n < tol) { - ++iter; - break; - } - } - - // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); - - *iters_host = iter; - return 0; -} - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. 
- * @param residual On exit, residual sum of squares (sum of squares - * of distances between observation vectors and centroids). - * @param iters on exit, number of k-means iterations. - * @param seed random seed to be used. - * @return error flag - */ -template -int kmeans(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - value_type_t& residual, - index_type_t& iters, - unsigned long long seed = 123456) -{ - using namespace matrix; - - // Check that parameters are valid - RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); - - // Allocate memory - vector_t clusterSizes(handle, k); - vector_t centroids(handle, d * k); - vector_t work(handle, n * max(k, d)); - vector_t work_int(handle, 2 * d * n); - - // Perform k-means - return kmeans(handle, - n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters, - seed); -} - -} // namespace raft diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index c61b5f1458..1f8857200f 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -16,44 +16,13 @@ #pragma once -#include -#include - -#include -#include -#include -#include - #include -#include -#include -#include - -#ifdef COLLECT_TIME_STATISTICS -#include -#include -#include -#include -#include -#endif - -#ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ - struct timeval tv; - cudaDeviceSynchronize(); - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; -} -#endif +#include namespace raft { namespace spectral { -using namespace matrix; -using namespace linalg; - // ========================================================= // Spectral modularity_maximization // ========================================================= @@ -81,52 +50,15 @@ using namespace linalg; */ template std::tuple modularity_maximization( - handle_t const& handle, - sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - std::tuple - stats; // # iters eigen solver, cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // Compute eigenvectors of Modularity Matrix - - // Initialize Modularity Matrix - modularity_matrix_t B{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // notice that at this point the matrix has already been transposed, so we are scaling - // columns - 
scale_obs(nEigVecs, n, eigVecs); - RAFT_CHECK_CUDA(stream); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) { + return detail::modularity_maximization(handle, csr_m, + eigen_solver, cluster_solver, clusters, eigVals, eigVecs); } //=================================================== // Analysis of graph partition @@ -144,42 +76,9 @@ void analyzeModularity(handle_t const& handle, sparse_matrix_t const& csr_m, vertex_t nClusters, vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - weight_t partModularity, clustersize; - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // Device memory - vector_t part_i(handle, n); - vector_t Bx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Modularity - modularity_matrix_t B{handle, csr_m}; - - // Initialize output - modularity = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { - WARNING("empty partition"); - continue; - } - - // Record results - modularity += partModularity; - } - - modularity = modularity / B.diagonal_.nrm1(); + weight_t& modularity) { + detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity); +} } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 5b1478baa9..ca2b71ae49 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -15,26 +15,13 @@ */ #pragma once -#include -#include - -#include -#include -#include -#include - #include -#include -#include -#include +#include namespace raft { namespace spectral { -using namespace matrix; -using namespace linalg; - // ========================================================= // Spectral partitioner // ========================================================= @@ -68,47 +55,10 @@ std::tuple partition(handle_t const& handle, ClusterSolver const& cluster_solver, vertex_t* __restrict__ clusters, weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, - // cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // ------------------------------------------------------- - // Spectral partitioner - // ------------------------------------------------------- - - // Compute eigenvectors of Laplacian - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute smallest eigenvalues and 
eigenvectors - std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; + weight_t* eigVecs) { + return detail::partition(handle, csr_m, eigen_solver, + cluster_solver, clusters, + eigVals, eigVecs); } // ========================================================= @@ -135,45 +85,8 @@ void analyzePartition(handle_t const& handle, vertex_t nClusters, const vertex_t* __restrict__ clusters, weight_t& edgeCut, - weight_t& cost) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - weight_t partEdgesCut, clustersize; - - // Device memory - vector_t part_i(handle, n); - vector_t Lx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, csr_m}; - - // Initialize output - cost = 0; - edgeCut = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - // Construct indicator vector for ith partition - if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { - WARNING("empty partition"); - continue; - } - - // Record results - cost += partEdgesCut / clustersize; - edgeCut += partEdgesCut / 2; - } + weight_t& cost) { + detail::analyzePartition(handle, csr_m, nClusters, clusters, edgeCut, cost); } } // namespace spectral From cca8c839b57b63924b2566ffa31815e3be0500b0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Feb 2022 17:08:20 -0500 Subject: [PATCH 2/8] Updating necessary headers to remove device/host buffer from raft, update lap, spectral, cluster, and label packages --- cpp/include/raft/cluster/detail/kmeans.hpp | 1323 ++++++++--------- cpp/include/raft/cluster/kmeans.hpp | 21 +- cpp/include/raft/comms/helper.hpp | 1 - cpp/include/raft/comms/std_comms.hpp | 2 - cpp/include/raft/label/classlabels.hpp | 18 +- .../raft/label/detail/merge_labels.cuh | 2 + cpp/include/raft/label/merge_labels.hpp | 20 +- cpp/include/raft/lap/detail/lap_functions.cuh | 2 +- cpp/include/raft/lap/detail/lap_kernels.cuh | 1 - cpp/include/raft/linalg/lanczos.hpp | 6 +- cpp/include/raft/mr/buffer_base.hpp | 211 --- cpp/include/raft/mr/device/buffer.hpp | 70 - cpp/include/raft/mr/host/buffer.hpp | 85 -- .../raft/sparse/distance/detail/coo_spmv.cuh | 1 - .../raft/sparse/distance/detail/utils.cuh | 1 - cpp/include/raft/sparse/distance/distance.hpp | 1 - .../sparse/hierarchy/detail/agglomerative.cuh | 1 - .../hierarchy/detail/connectivities.cuh | 1 - .../raft/sparse/hierarchy/detail/mst.cuh | 1 - .../raft/sparse/linalg/detail/spectral.cuh | 2 + cpp/include/raft/sparse/op/detail/reduce.cuh | 1 - .../selection/detail/connect_components.cuh | 3 +- .../raft/sparse/selection/detail/knn.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 2 - cpp/include/raft/spatial/knn/knn.hpp | 4 - cpp/include/raft/spectral/cluster_solvers.hpp | 20 +- .../detail/modularity_maximization.hpp | 2 +- .../raft/spectral/detail/partition.hpp | 8 +- .../raft/spectral/modularity_maximization.hpp | 25 +- cpp/include/raft/spectral/partition.hpp | 13 +- cpp/test/CMakeLists.txt | 2 - cpp/test/cluster_solvers.cu | 10 +- cpp/test/eigen_solvers.cu | 17 +- cpp/test/label/label.cu | 2 +- cpp/test/label/merge_labels.cu | 2 +- cpp/test/lap/lap.cu | 2 +- cpp/test/mr/device/buffer.cpp | 92 -- cpp/test/mr/host/buffer.cpp | 71 - cpp/test/spectral_matrix.cu | 2 +- 39 files changed, 762 insertions(+), 1287 deletions(-) delete mode 100644 cpp/include/raft/mr/buffer_base.hpp delete mode 100644 cpp/include/raft/mr/device/buffer.hpp delete mode 100644 cpp/include/raft/mr/host/buffer.hpp delete mode 100644 cpp/test/mr/device/buffer.cpp delete mode 100644 cpp/test/mr/host/buffer.cpp diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.hpp index cf7255c22e..e5911bc7d9 100644 --- a/cpp/include/raft/cluster/detail/kmeans.hpp +++ b/cpp/include/raft/cluster/detail/kmeans.hpp @@ -32,8 +32,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace cluster { @@ -42,9 +42,9 @@ namespace detail { // Useful grid settings // ========================================================= - constexpr unsigned int BLOCK_SIZE = 1024; - constexpr unsigned int WARP_SIZE = 32; - constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels @@ -73,66 +73,66 @@ namespace detail { * centroid. Matrix dimensions are n x k. Entries must be * initialized to zero. 
*/ - template - static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) - { - // Loop index - index_type_t i; - - // Block indices - index_type_t bidx; - // Global indices - index_type_t gidx, gidy, gidz; - - // Private memory - value_type_t centroid_private, dist_private; - - // Global x-index indicates index of vector entry - bidx = blockIdx.x; - while (bidx * blockDim.x < d) { - gidx = threadIdx.x + bidx * blockDim.x; - - // Global y-index indicates centroid - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Load centroid coordinate from global memory - centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; - - // Global z-index indicates observation vector - gidz = threadIdx.z + blockIdx.z * blockDim.z; - while (gidz < n) { - // Load observation vector coordinate from global memory - dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; - - // Compute contribution of current entry to distance - dist_private = centroid_private - dist_private; - dist_private = dist_private * dist_private; - - // Perform reduction on warp - for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); - - // Write result to global memory - if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); - - // Move to another observation vector - gidz += blockDim.z * gridDim.z; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } - - // Move to another vector entry - bidx += gridDim.x; - } +template +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; } + // Move to another vector entry + bidx += gridDim.x; + } +} + /** * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. 
Ideally the @@ -154,47 +154,47 @@ namespace detail { * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ - template - static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) - { - // Loop index - index_type_t i, j; - - // Current matrix entry - value_type_t dist_curr; - - // Smallest entry in row - value_type_t dist_min; - index_type_t code_min; - - // Each row in observation matrix is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Find minimum entry in row - code_min = 0; - dist_min = dists[IDX(i, 0, n)]; - for (j = 1; j < k; ++j) { - dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; - } - - // Transfer result to global memory - dists[i] = dist_min; - codes[i] = code_min; - - // Increment cluster sizes - atomicAdd(clusterSizes + code_min, 1); - - // Move to another row - i += blockDim.x * gridDim.x; - } +template +static __global__ void minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + /** * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the @@ -214,37 +214,37 @@ namespace detail { * centroid. * @param code_new Index associated with new centroid. 
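 *
 * Usage sketch (one possible launch, mirroring the k-means++ loop in
 * initializeCentroids below, where dists + n holds distances to the
 * newly chosen centroid i):
 * @code{.cpp}
 *   dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
 *   minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
 * @endcode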
*/ - template - static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) - { - // Loop index - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - - // Distances - value_type_t dist_old_private; - value_type_t dist_new_private; - - // Each row is processed by a thread - while (i < n) { - // Get old and new distances - dist_old_private = dists_old[i]; - dist_new_private = dists_new[i]; - - // Update if new distance is smaller than old distance - if (dist_new_private < dist_old_private) { - dists_old[i] = dist_new_private; - codes_old[i] = code_new; - } - - // Move to another row - i += blockDim.x * gridDim.x; - } +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) +{ + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; } + // Move to another row + i += blockDim.x * gridDim.x; + } +} + /** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the @@ -256,17 +256,17 @@ namespace detail { * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ - template - static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) - { - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - atomicAdd(clusterSizes + codes[i], 1); - i += blockDim.x * gridDim.x; - } - } +template +static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} /** * @brief Divide rows of centroid matrix by cluster sizes. @@ -289,37 +289,37 @@ namespace detail { * cluster. On exit, the matrix is the centroid matrix (each * column is the mean position of a cluster). 
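 *
 * Launch sketch (mirrors the configuration used by updateCentroids later
 * in this file):
 * @code{.cpp}
 *   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 *   dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
 *                min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
 *                1};
 *   divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
 * @endcode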
 */
- template
- static __global__ void divideCentroids(index_type_t d,
- index_type_t k,
- const index_type_t* __restrict__ clusterSizes,
- value_type_t* __restrict__ centroids)
- {
- // Global indices
- index_type_t gidx, gidy;
-
- // Current cluster size
- index_type_t clusterSize_private;
-
- // Observation vector is determined by global y-index
- gidy = threadIdx.y + blockIdx.y * blockDim.y;
- while (gidy < k) {
- // Get cluster size from global memory
- clusterSize_private = clusterSizes[gidy];
-
- // Add vector entries to centroid matrix
- // vector entris are determined by global x-index
- gidx = threadIdx.x + blockIdx.x * blockDim.x;
- while (gidx < d) {
- centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
- gidx += blockDim.x * gridDim.x;
- }
-
- // Move to another centroid
- gidy += blockDim.y * gridDim.y;
- }
+template
+static __global__ void divideCentroids(index_type_t d,
+ index_type_t k,
+ const index_type_t* __restrict__ clusterSizes,
+ value_type_t* __restrict__ centroids)
+{
+ // Global indices
+ index_type_t gidx, gidy;
+
+ // Current cluster size
+ index_type_t clusterSize_private;
+
+ // Observation vector is determined by global y-index
+ gidy = threadIdx.y + blockIdx.y * blockDim.y;
+ while (gidy < k) {
+ // Get cluster size from global memory
+ clusterSize_private = clusterSizes[gidy];
+
+ // Add vector entries to centroid matrix
+ // vector entries are determined by global x-index
+ gidx = threadIdx.x + blockIdx.x * blockDim.x;
+ while (gidx < d) {
+ centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
+ gidx += blockDim.x * gridDim.x;
 }
+ // Move to another centroid
+ gidy += blockDim.y * gridDim.y;
+ }
+}
+
 // =========================================================
 // Helper functions
 // =========================================================
@@ -344,78 +344,78 @@ namespace detail {
 * coordinates.
 * @return Zero if successful. Otherwise non-zero.
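 *
 * @note In exact k-means++ sampling, observation i would be chosen with
 * probability dists[i] / sum_j dists[j], i.e. proportionally to its squared
 * distance from the closest existing centroid. As the workaround comment in
 * the body below explains, this implementation instead inverts the
 * cumulative sum by linear interpolation,
 * obsIndex = (n - 1) * (rand * distsSum - distsCumSum[0]) / (distsSum - distsCumSum[0]),
 * which only approximates the exact inverse-CDF draw until the Thrust
 * lower_bound issue is resolved.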
*/ - template - static int chooseNewCentroid(handle_t const& handle, - index_type_t n, - index_type_t d, - value_type_t rand, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) - { - // Cumulative sum of distances - value_type_t* distsCumSum = dists + n; - // Residual sum of squares - value_type_t distsSum{0}; - // Observation vector that is chosen as new centroid - index_type_t obsIndex; - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( - &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - - // Randomly choose observation vector - // Probabilities are proportional to square of distance to closest - // centroid (see k-means++ algorithm) - // - // seg-faults due to Thrust bug - // on binary-search-like algorithms - // when run with stream dependent - // execution policies; fixed on Thrust GitHub - // hence replace w/ linear interpolation, - // until the Thrust issue gets resolved: - // - // obsIndex = (thrust::lower_bound( - // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - // thrust::device_pointer_cast(distsCumSum)); - // - // linear interpolation logic: - //{ - value_type_t minSum{0}; - RAFT_CUDA_TRY( - cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - RAFT_CHECK_CUDA(stream); - - if (distsSum > minSum) { - value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); - } else { - obsIndex = 0; - } - //} - - RAFT_CHECK_CUDA(stream); - obsIndex = max(obsIndex, 0); - obsIndex = min(obsIndex, n - 1); - - // Record new centroid position - RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - return 0; - } +template +static int chooseNewCentroid(handle_t const& handle, + index_type_t n, + index_type_t d, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) +{ + // Cumulative sum of distances + value_type_t* distsCumSum = dists + n; + // Residual sum of squares + value_type_t distsSum{0}; + // Observation vector that is chosen as new centroid + index_type_t obsIndex; + + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute cumulative sum of distances + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CHECK_CUDA(stream); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + // + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: + // + // obsIndex = 
(thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + // linear interpolation logic: + //{ + value_type_t minSum{0}; + RAFT_CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + RAFT_CHECK_CUDA(stream); + + if (distsSum > minSum) { + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + + RAFT_CHECK_CUDA(stream); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + return 0; +} /** * @brief Choose initial cluster centroids for k-means algorithm. @@ -441,90 +441,90 @@ namespace detail { * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ - template - static int initializeCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, - unsigned long long seed) - { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Loop index - index_type_t i; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - constexpr index_type_t grid_lower_bound{65535}; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Initialize grid dimensions - dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; - - // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; - - // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; - - // Assign observation vectors to code 0 - RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - - // Choose first centroid - thrust::fill(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - 1); - RAFT_CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from first centroid - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Choose remaining centroids - for (i = 1; i < k; ++i) { - // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - 
RAFT_CHECK_CUDA(stream); - - // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - RAFT_CHECK_CUDA(stream); - } - - // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, clusterSizes); - RAFT_CHECK_CUDA(stream); - - return 0; - } +template +static int initializeCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + index_type_t i; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + constexpr index_type_t grid_lower_bound{65535}; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; + + // Assign observation vectors to code 0 + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + + // Choose first centroid + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); + RAFT_CHECK_CUDA(stream); + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Choose remaining centroids + for (i = 1; i < k; ++i) { + // Choose ith centroid + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + RAFT_CHECK_CUDA(stream); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, codes, i); + RAFT_CHECK_CUDA(stream); + } + + // Compute cluster sizes + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + computeClusterSizes<<>>(n, codes, clusterSizes); + RAFT_CHECK_CUDA(stream); + + return 0; +} /** * @brief Find cluster centroids closest to observation vectors. @@ -552,53 +552,53 @@ namespace detail { * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
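 *
 * @note On exit, the first n entries of dists hold each observation's
 * squared distance to its nearest centroid, so the residual below is
 * computed as a single thrust::reduce over that prefix.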
*/ - template - static int assignCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) - { - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute distance between centroids and observation vectors - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); - - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim; - constexpr index_type_t grid_lower_bound{65535}; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); - gridDim.y = min(k, grid_lower_bound); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - - computeDistances<<>>(n, d, k, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); - - // Compute residual sum of squares - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); - - return 0; - } +template +static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) +{ + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute distance between centroids and observation vectors + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; +} /** * @brief Update cluster centroids for k-means algorithm. @@ -624,111 +624,107 @@ namespace detail { * Workspace. * @return Zero if successful. Otherwise non-zero. 
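 *
 * @note Sketch of the strategy used below: the observation matrix is
 * transposed, every matrix entry is tagged with the cluster of its
 * observation, entries are stably sorted by cluster, and
 * thrust::reduce_by_key then emits one sum per (cluster, dimension)
 * segment directly into the centroid matrix; divideCentroids finally
 * divides each sum by its cluster size to obtain the means. This avoids
 * atomics in the accumulation at the cost of O(d * n) workspace.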
*/ - template - static int updateCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const index_type_t* __restrict__ codes, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) - { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; - - constexpr index_type_t grid_lower_bound{65535}; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); - - // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); - - // Cluster assigned to each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::modulus()); - RAFT_CHECK_CUDA(stream); - thrust::gather( - thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - RAFT_CHECK_CUDA(stream); - - // Row associated with each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::divides()); - RAFT_CHECK_CUDA(stream); - - // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, - codes_copy, - codes_copy + d * n, - make_zip_iterator(make_tuple(obs_copy, rows))); - RAFT_CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, - rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - RAFT_CHECK_CUDA(stream); - - // Divide sums by cluster size to get centroid matrix - // - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; - - // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), - 1}; - - divideCentroids<<>>(d, k, clusterSizes, centroids); - RAFT_CHECK_CUDA(stream); - - return 0; - } - -} // namespace - -namespace raft { +template +static int updateCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const value_type_t one = 1; + const value_type_t zero = 0; + + constexpr index_type_t grid_lower_bound{65535}; + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto 
thrust_exec_policy = handle.get_thrust_policy(); + + // Device memory + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); + + // Take transpose of observation matrix + RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); + + // Cluster assigned to each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + RAFT_CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, + thrust::modulus()); + RAFT_CHECK_CUDA(stream); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); + RAFT_CHECK_CUDA(stream); + + // Row associated with each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + RAFT_CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, + thrust::divides()); + RAFT_CHECK_CUDA(stream); + + // Sort and reduce to add observation vectors in same cluster + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + RAFT_CHECK_CUDA(stream); + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + RAFT_CHECK_CUDA(stream); + + // Divide sums by cluster size to get centroid matrix + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; + + divideCentroids<<>>(d, k, clusterSizes, centroids); + RAFT_CHECK_CUDA(stream); + + return 0; +} // ========================================================= // k-means algorithm @@ -770,146 +766,146 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
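 *
 * @note Convergence is declared once the per-observation change in the
 * residual sum of squares satisfies |residualPrev - residual| / n < tol;
 * otherwise the loop exits at maxiter with a warning.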
*/ - template - int kmeans(handle_t const &handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - index_type_t *__restrict__ clusterSizes, - value_type_t *__restrict__ centroids, - value_type_t *__restrict__ work, - index_type_t *__restrict__ work_int, - value_type_t *residual_host, - index_type_t *iters_host, - unsigned long long seed) { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Current iteration - index_type_t iter; - - constexpr - index_type_t grid_lower_bound{65535}; - - // Residual sum of squares at previous iteration - value_type_t residualPrev = 0; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Initialization - // ------------------------------------------------------- - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Trivial cases - if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( - cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not compute k-means centroids"); - - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - RAFT_CHECK_CUDA(stream); - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - RAFT_CHECK_CUDA(stream); - return 0; - } - if (n <= k) { - thrust::sequence(thrust_exec_policy, - thrust::device_pointer_cast(codes), - thrust::device_pointer_cast(codes + n)); - RAFT_CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - RAFT_CHECK_CUDA(stream); - - if (n < k) - RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - *residual_host = 0; - return 0; - } - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // k-means++ algorithm - // ------------------------------------------------------- - - // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) - WARNING("could not initialize k-means centroids"); - - // Apply k-means iteration until convergence - for (iter = 0; iter < maxiter; ++iter) { - // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not update k-means centroids"); - - // Determine centroid closest to each observation - residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, 
clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - - // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - - // FIXME: emptyCentroid never reaches k (infinite loop) under certain - // conditions, such as if obs is corrupt (as seen as a result of a - // DataFrame column of NULL edge vals used to create the Graph) - while (emptyCentroid < k) { - if (chooseNewCentroid( - handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) - WARNING("could not replace empty centroid"); - if (assignCentroids( - handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - RAFT_CHECK_CUDA(stream); - } - - // Check for convergence - if (std::fabs(residualPrev - (*residual_host)) / n < tol) { - ++iter; - break; - } - } - - // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); - - *iters_host = iter; - return 0; - } +template +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + index_type_t iter; + + constexpr index_type_t grid_lower_bound{65535}; + + // Residual sum of squares at previous iteration + value_type_t residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Trivial cases + if (k == 1) { + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; + + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, work); + RAFT_CHECK_CUDA(stream); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + 
n)); + RAFT_CHECK_CUDA(stream); + return 0; + } + if (n <= k) { + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + RAFT_CHECK_CUDA(stream); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); + RAFT_CHECK_CUDA(stream); + + if (n < k) + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + RAFT_CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; +} /** * @brief Find clusters with k-means algorithm. @@ -936,50 +932,51 @@ namespace raft { * @param seed random seed to be used. 
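 *
 * @note This overload allocates the workspaces internally (clusterSizes:
 * k entries, centroids: d * k, work: n * max(k, d), work_int: 2 * d * n)
 * and forwards to the workspace-taking overload above.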
* @return error flag */ - template - int kmeans(handle_t const &handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - value_type_t &residual, - index_type_t &iters, - unsigned long long seed = 123456) { - using namespace matrix; - - // Check that parameters are valid - RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); - - // Allocate memory - vector_t clusterSizes(handle, k); - vector_t centroids(handle, d * k); - vector_t work(handle, n * max(k, d)); - vector_t work_int(handle, 2 * d * n); - - // Perform k-means - return kmeans(handle, - n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters, - seed); - } +template +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ + using namespace matrix; + + // Check that parameters are valid + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); + + // Allocate memory + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); + + // Perform k-means + return kmeans(handle, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); +} } // namespace detail } // namespace cluster diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp index fe4dd4793a..a6c824afe4 100644 --- a/cpp/include/raft/cluster/kmeans.hpp +++ b/cpp/include/raft/cluster/kmeans.hpp @@ -45,22 +45,21 @@ namespace cluster { * @param seed random seed to be used. 
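 *
 * A minimal usage sketch (names and sizes here are hypothetical; obs and
 * codes are assumed to be caller-allocated device buffers holding the
 * d * n column-major observations and the n output labels):
 * @code{.cpp}
 * raft::handle_t handle;
 * int n = 10000, d = 16, k = 8;
 * float residual{};
 * int iters{};
 * raft::cluster::kmeans(handle, n, d, k, 1e-4f, 100, obs, codes, residual, iters);
 * @endcode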
* @return error flag */ -template -int kmeans(handle_t const &handle, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, value_type_t tol, index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - value_type_t &residual, - index_type_t &iters, - unsigned long long seed = 123456) { - - return detail::kmeans(handle, n, d, k, tol, - maxiter, obs, codes, - residual, iters, seed); + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ + return detail::kmeans( + handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed); } } // namespace cluster } // namespace raft diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 09a767bea7..d83e8f4d4f 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index f54535a88c..b4aa72d53e 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -21,8 +21,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp index a4d0cae359..f254c8f228 100644 --- a/cpp/include/raft/label/classlabels.hpp +++ b/cpp/include/raft/label/classlabels.hpp @@ -19,7 +19,7 @@ #include namespace raft { - namespace label { +namespace label { /** * Get unique class labels. @@ -36,8 +36,9 @@ namespace raft { * \param [in] stream cuda stream */ template -int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) { - detail::getUniqueLabels(unique, y, n, stream); +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ + return detail::getUniquelabels(unique, y, n, stream); } /** @@ -60,8 +61,9 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, */ template void getOvrlabels( - value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) { - detail::getOvrLabels(y, n, y_unique, n_classes, y_out, idx, stream); + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ + detail::getOvrlabels(y, n, y_unique, n_classes, y_out, idx, stream); } /** * Maps an input array containing a series of numbers into a new array @@ -84,9 +86,9 @@ void getOvrlabels( */ template void make_monotonic( - Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) { - detail::make_monotonic(out, in, N, stream, filter_op, zero_based); + detail::make_monotonic(out, in, N, stream, filter_op, zero_based); } /** @@ -109,7 +111,7 @@ void make_monotonic( template void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) { - detail::make_monotonic(out, in, N, stream, zero_based); + detail::make_monotonic(out, in, N, stream, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index a3f2411102..3602a29e59 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ 
-25,6 +25,7 @@ namespace raft { namespace label { +namespace detail { /** Note: this is one possible implementation where we represent the label * equivalence graph implicitly using labels_a, labels_b and mask. @@ -153,5 +154,6 @@ void merge_labels(value_idx* labels_a, RAFT_CUDA_TRY(cudaPeekAtLastError()); } +} // namespace detail }; // namespace label }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp index b21502a06a..5ba8fe8a27 100644 --- a/cpp/include/raft/label/merge_labels.hpp +++ b/cpp/include/raft/label/merge_labels.hpp @@ -20,7 +20,6 @@ namespace raft { namespace label { -namespace detail { /** * @brief Merge two labellings in-place, according to a core mask @@ -51,18 +50,17 @@ namespace detail { * @param[in] N Number of points in the dataset * @param[in] stream CUDA stream */ -template -void merge_labels(value_idx *labels_a, - const value_idx *labels_b, - const bool *mask, - value_idx *R, - bool *m, +template +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, value_idx N, - cudaStream_t stream) { - - detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); + cudaStream_t stream) +{ + detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); } -}; // namespace detail }; // namespace label }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/lap/detail/lap_functions.cuh b/cpp/include/raft/lap/detail/lap_functions.cuh index ab4aa2df59..6c6b09e5d8 100644 --- a/cpp/include/raft/lap/detail/lap_functions.cuh +++ b/cpp/include/raft/lap/detail/lap_functions.cuh @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/lap/detail/lap_kernels.cuh b/cpp/include/raft/lap/detail/lap_kernels.cuh index 328cbf3e74..b61d0bd269 100644 --- a/cpp/include/raft/lap/detail/lap_kernels.cuh +++ b/cpp/include/raft/lap/detail/lap_kernels.cuh @@ -28,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 9376994742..7b7aa0decf 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -28,9 +28,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include namespace raft { diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp deleted file mode 100644 index 6998c1f186..0000000000 --- a/cpp/include/raft/mr/buffer_base.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include - -namespace raft { -namespace mr { - -/** - * @brief Base for all RAII-based owning of temporary memory allocations. This - * class should ideally not be used by users directly, but instead via - * the child classes `device_buffer` and `host_buffer`. 
- * - * @tparam T data type - * @tparam AllocatorT The underly allocator object - */ -template -class buffer_base { - public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; - using const_reference = const T&; - - buffer_base() = delete; - - buffer_base(const buffer_base& other) = delete; - - buffer_base& operator=(const buffer_base& other) = delete; - - /** - * @brief Main ctor - * - * @param[in] allocator asynchronous allocator used for managing buffer life - * @param[in] stream cuda stream where this allocation operations are async - * @param[in] n size of the buffer (in number of elements) - */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) - : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) - { - if (capacity_ > 0) { - data_ = - static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); - } - } - - ~buffer_base() { release(); } - - value_type* data() { return data_; } - - const value_type* data() const { return data_; } - - size_type size() const { return size_; } - - void clear() { size_ = 0; } - - iterator begin() { return data_; } - - const_iterator begin() const { return data_; } - - iterator end() { return data_ + size_; } - - const_iterator end() const { return data_ + size_; } - - /** - * @brief Reserve new memory size for this buffer. - * - * It re-allocates a fresh buffer if the new requested capacity is more than - * the current one, copies the old buffer contents to this new buffer and - * removes the old one. - * - * @param[in] new_capacity new capacity (in number of elements) - * @{ - */ - void reserve(size_type new_capacity) - { - if (new_capacity > capacity_) { - auto* new_data = - static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } - // Only deallocate if we have allocated a pointer - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = new_data; - capacity_ = new_capacity; - } - } - - void reserve(size_type new_capacity, cudaStream_t stream) - { - set_stream(stream); - reserve(new_capacity); - } - /** @} */ - - /** - * @brief Resize the underlying buffer (uses `reserve` method internally) - * - * @param[in] new_size new buffer size - * @{ - */ - void resize(const size_type new_size) - { - reserve(new_size); - size_ = new_size; - } - - void resize(const size_type new_size, cudaStream_t stream) - { - set_stream(stream); - resize(new_size); - } - /** @} */ - - /** - * @brief Deletes the underlying buffer - * - * If this method is not explicitly called, it will be during the destructor - * @{ - */ - void release() - { - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = nullptr; - capacity_ = 0; - size_ = 0; - } - - void release(cudaStream_t stream) - { - set_stream(stream); - release(); - } - /** @} */ - - /** - * @brief returns the underlying allocator used - * - * @return the allocator pointer - */ - std::shared_ptr get_allocator() const { return allocator_; } - - /** - * @brief returns the underlying stream used - * - * @return the cuda stream - */ - cudaStream_t get_stream() const { return stream_; } - - protected: - value_type* data_; - - private: - size_type size_; - size_type capacity_; - cudaStream_t stream_; 
- std::shared_ptr allocator_; - - /** - * @brief Sets a new cuda stream where the future operations will be queued - * - * This method makes sure that the inter-stream dependencies are met and taken - * care of, before setting the input stream as a new stream for this buffer. - * Ideally, the same cuda stream passed during constructor is expected to be - * used throughout this buffer's lifetime, for performance. - * - * @param[in] stream new cuda stream to be set. If it is the same as the - * current one, then this method will be a no-op. - */ - void set_stream(cudaStream_t stream) - { - if (stream_ != stream) { - cudaEvent_t event; - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - RAFT_CUDA_TRY(cudaEventRecord(event, stream_)); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); - stream_ = stream; - RAFT_CUDA_TRY(cudaEventDestroy(event)); - } - } -}; // class buffer_base - -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp deleted file mode 100644 index 9b5ff11c50..0000000000 --- a/cpp/include/raft/mr/device/buffer.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief RAII object owning a contiguous typed device buffer. The passed in - * allocator supports asynchronous allocation and deallocation so this - * can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(..., cudaStream_t stream) { - * ... - * raft::mr::device::buffer temp(stream, 0); - * ... - * temp.resize(n); - * kernelA<<>>(...,temp.data(),...); - * kernelB<<>>(...,temp.data(),...); - * temp.release(); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } -}; // class buffer - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp deleted file mode 100644 index 204b384719..0000000000 --- a/cpp/include/raft/mr/host/buffer.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include -#include - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief RAII object owning a contigous typed host buffer (aka pinned memory). - * The passed in allocator supports asynchronus allocation and - * deallocation so this can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { - * ... - * raft::mr::host::buffer temp(stream, 0); - * ... - * temp.resize(n); - * raft::copy(temp.data(), in_d, temp.size()); - * ... - * raft::copy(out_d, temp.data(), temp.size()); - * temp.release(stream); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) - { - if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } - } - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } - - reference operator[](size_type pos) { return data_[pos]; } - - const_reference operator[](size_type pos) const { return data_[pos]; } - - private: - using buffer_base::data_; -}; - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index c23a2b1537..cb9e67dd6b 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include "../../csr.hpp" diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 8c01b33c1e..06c034ad9f 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 2f121dce33..aee53ac37a 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 4e78494e6b..593702b4a7 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -19,7 +19,6 @@ #include #include #include 
-#include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 5d4640f4a6..548ca00c93 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 7173c76c08..10e9d04c0d 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 2519c1914e..33e62d1374 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -17,6 +17,8 @@ #include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index 074a139ba9..715bbe6deb 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 817b9782f2..5ef7371492 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -17,9 +17,8 @@ #include #include -#include +#include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index de0a15c029..b8dc25053b 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 6ce9463e43..5f64a8d1b5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b29c4cc51c..59df75ba36 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -19,14 +19,10 @@ #include "detail/knn_brute_force_faiss.cuh" #include "detail/selection_faiss.cuh" -#include - namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * Performs a k-select across row partitioned index/distance * matrices formatted like the following: diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 092e51a17d..cc25e66cae 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -55,16 +55,16 @@ struct kmeans_solver_t { index_type_t iters{}; raft::cluster::kmeans(handle, - n_obs_vecs, - dim, - config_.n_clusters, - config_.tol, - config_.maxIter, - obs, - codes, - residual, - iters, - config_.seed); + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + 
residual,
+ iters,
+ config_.seed);
 return std::make_pair(residual, iters);
 }
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index ec0e23dfe0..08e104dea6 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -27,8 +27,8 @@
 #include
 #include
+#include
 #include
-#include
 #ifdef COLLECT_TIME_STATISTICS
 #include
diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
index 82313f84ee..99d6f3e419 100644
--- a/cpp/include/raft/spectral/detail/partition.hpp
+++ b/cpp/include/raft/spectral/detail/partition.hpp
@@ -26,8 +26,8 @@
 #include
 #include
-#include
 #include
+#include
 namespace raft {
 namespace spectral {
@@ -69,7 +69,8 @@ std::tuple partition(handle_t const& handle,
 ClusterSolver const& cluster_solver,
 vertex_t* __restrict__ clusters,
 weight_t* eigVals,
- weight_t* eigVecs) {
+ weight_t* eigVecs)
+{
 RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
 RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
@@ -135,7 +136,8 @@ void analyzePartition(handle_t const& handle,
 vertex_t nClusters,
 const vertex_t* __restrict__ clusters,
 weight_t& edgeCut,
- weight_t& cost) {
+ weight_t& cost)
+{
 RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 vertex_t i;
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index 1f8857200f..466851c74f 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -50,15 +50,16 @@ namespace spectral {
 */
 template
 std::tuple modularity_maximization(
- handle_t const& handle,
- sparse_matrix_t const& csr_m,
- EigenSolver const& eigen_solver,
- ClusterSolver const& cluster_solver,
- vertex_t* __restrict__ clusters,
- weight_t* eigVals,
- weight_t* eigVecs) {
- return detail::modularity_maximization(handle, csr_m,
- eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+ handle_t const& handle,
+ sparse_matrix_t const& csr_m,
+ EigenSolver const& eigen_solver,
+ ClusterSolver const& cluster_solver,
+ vertex_t* __restrict__ clusters,
+ weight_t* eigVals,
+ weight_t* eigVecs)
+{
+ return detail::modularity_maximization(
+ handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
 }
 //===================================================
 // Analysis of graph partition
@@ -76,9 +77,10 @@ void analyzeModularity(handle_t const& handle,
 sparse_matrix_t const& csr_m,
 vertex_t nClusters,
 vertex_t const* __restrict__ clusters,
- weight_t& modularity) {
- detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity);
-}
+ weight_t& modularity)
+{
+ detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity);
+}
 } // namespace spectral
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index ca2b71ae49..597ef530a2 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -55,10 +55,10 @@ std::tuple partition(handle_t const& handle,
 ClusterSolver const& cluster_solver,
 vertex_t* __restrict__ clusters,
 weight_t* eigVals,
- weight_t* eigVecs) {
- return detail::partition(handle, csr_m, eigen_solver,
- cluster_solver, clusters,
- eigVals, eigVecs);
+ weight_t* eigVecs)
+{
+ return detail::partition(
+ handle, csr_m,
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 07f04ad2ab..ac216d0a2c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -64,8 +64,6 @@ add_executable(test_raft
   test/matrix/math.cu
   test/matrix/matrix.cu
   test/matrix/linewise_op.cu
-  test/mr/device/buffer.cpp
-  test/mr/host/buffer.cpp
   test/mst.cu
   test/random/rng.cu
   test/random/rng_int.cu
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu
index 2c7996514a..0030596e21 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster_solvers.cu
@@ -19,9 +19,11 @@
 #include
 #include
 
+#include
 #include
 
 namespace raft {
+namespace spectral {
 
 TEST(Raft, ClusterSolvers)
 {
@@ -60,7 +62,12 @@ TEST(Raft, ModularitySolvers)
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type neigvs{10};
   index_type maxiter{100};
@@ -95,4 +102,5 @@
   EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity));
 }
 
+}  // namespace spectral
 }  // namespace raft
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu
index f898d11d2e..541d4dccc8 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/eigen_solvers.cu
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -25,6 +26,7 @@
 #include
 
 namespace raft {
+namespace spectral {
 
 TEST(Raft, EigenSolvers)
 {
@@ -34,7 +36,12 @@
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type* ro{nullptr};
   index_type* ci{nullptr};
@@ -75,7 +82,12 @@ TEST(Raft, SpectralSolvers)
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type neigvs{10};
   index_type maxiter{100};
@@ -109,4 +121,5 @@
   EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
 }
 
+}  // namespace spectral
 }  // namespace raft
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index d441bf95a8..b19accc3b4 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -16,7 +16,7 @@
 
 #include
 
-#include
+#include
 #include "../test_utils.h"
 
 #include
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index 5d30af795f..db6b34bbd6 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include
+#include
 #include "../test_utils.h"
 
 #include
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index afdebae1f8..24e1c6be4f 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -28,7 +28,7 @@
 #include
 #include
 
-#include
+#include
 #include
 
 #define PROBLEMSIZE 1000  // Number of rows/columns
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp
deleted file mode 100644
index 4861a4ca1f..0000000000
--- a/cpp/test/mr/device/buffer.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace raft {
-namespace mr {
-namespace device {
-
-TEST(Raft, DeviceBufferAlloc)
-{
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  rmm::device_uvector buff(0, stream);
-  ASSERT_EQ(0, buff.size());
-  // explicit allocation after construction
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit deallocation
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  // use these methods without the explicit stream parameter
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-TEST(Raft, DeviceBufferZeroResize)
-{
-  // Create a limiting_resource_adaptor to track allocations
-  auto curr_mr =
-    dynamic_cast(rmm::mr::get_current_device_resource());
-  auto limit_mr =
-    std::make_shared>(curr_mr, 1000);
-
-  rmm::mr::set_current_device_resource(limit_mr.get());
-
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  rmm::device_uvector buff(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit allocation after construction
-  buff.resize(0, stream);
-  ASSERT_EQ(0, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // explicit deallocation
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-
-  // Now check that there is no memory left. (Used to not be true)
-  ASSERT_EQ(0, limit_mr->get_allocated_bytes());
-
-  rmm::mr::set_current_device_resource(curr_mr);
-
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-}  // namespace device
-}  // namespace mr
-}  // namespace raft
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp
deleted file mode 100644
index d645ffa0e0..0000000000
--- a/cpp/test/mr/host/buffer.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-
-namespace raft {
-namespace mr {
-namespace host {
-
-TEST(Raft, HostBuffer)
-{
-  auto alloc = std::make_shared();
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  buffer buff(alloc, stream);
-  ASSERT_EQ(0, buff.size());
-  // explicit allocation after construction
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit deallocation
-  buff.release(stream);
-  ASSERT_EQ(0, buff.size());
-  // use these methods without the explicit stream parameter
-  buff.resize(20);
-  ASSERT_EQ(20, buff.size());
-  buff.resize(10);
-  ASSERT_EQ(10, buff.size());
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-TEST(Raft, DeviceToHostBuffer)
-{
-  auto d_alloc = std::make_shared();
-  auto h_alloc = std::make_shared();
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  device::buffer d_buff(d_alloc, stream, 32);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
-  buffer h_buff(h_alloc, d_buff);
-  ASSERT_EQ(d_buff.size(), h_buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-}  // namespace host
-}  // namespace mr
-}  // namespace raft
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index fa54b04cda..6a5a2c9f59 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -19,7 +19,7 @@
 #include
 #include
 
-#include
+#include
 
 namespace raft {
 namespace {
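Both deleted test files exercised the raft::mr buffer wrappers this series retires; the device-side test had already been ported to the rmm API it duplicated. The surviving pattern, as a short sketch lifted from the deleted test (the header paths are assumptions):

    #include <raft/cudart_utils.h>
    #include <rmm/device_uvector.hpp>

    // rmm::device_uvector allocates from the current device resource and is
    // stream-ordered, so no explicit allocator object is passed around.
    cudaStream_t stream;
    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
    rmm::device_uvector<char> buff(0, stream);  // no allocation at construction
    buff.resize(20, stream);                    // stream-ordered allocation
    buff.release();                             // explicit deallocation
    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
    RAFT_CUDA_TRY(cudaStreamDestroy(stream));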
Nolet" Date: Tue, 1 Feb 2022 18:47:22 -0500 Subject: [PATCH 3/8] Updates --- .../cluster/detail/{kmeans.hpp => kmeans.cuh} | 2 +- cpp/include/raft/cluster/kmeans.hpp | 2 +- cpp/include/raft/linalg/lanczos.hpp | 2 +- .../raft/sparse/linalg/detail/spectral.cuh | 5 ++- ...atrix_wrappers.hpp => matrix_wrappers.cuh} | 0 .../detail/modularity_maximization.hpp | 2 +- .../raft/spectral/detail/partition.hpp | 2 +- .../{spectral_util.hpp => spectral_util.cuh} | 40 ++++++++++--------- cpp/test/spectral_matrix.cu | 2 +- 9 files changed, 30 insertions(+), 27 deletions(-) rename cpp/include/raft/cluster/detail/{kmeans.hpp => kmeans.cuh} (99%) rename cpp/include/raft/spectral/detail/{matrix_wrappers.hpp => matrix_wrappers.cuh} (100%) rename cpp/include/raft/spectral/detail/{spectral_util.hpp => spectral_util.cuh} (86%) diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.cuh similarity index 99% rename from cpp/include/raft/cluster/detail/kmeans.hpp rename to cpp/include/raft/cluster/detail/kmeans.cuh index e5911bc7d9..009d1a0642 100644 --- a/cpp/include/raft/cluster/detail/kmeans.hpp +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp index a6c824afe4..ab0fbb04c7 100644 --- a/cpp/include/raft/cluster/kmeans.hpp +++ b/cpp/include/raft/cluster/kmeans.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace raft { namespace cluster { diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 7b7aa0decf..b8da623418 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 33e62d1374..956ecb53a4 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -71,11 +71,12 @@ void fit_embedding(const raft::handle_t& handle, value_type tol = 0.01; index_type restart_iter = 15 + neigvs; // what cugraph is using - raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; + raft::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; - raft::lanczos_solver_t eig_solver{cfg}; + raft::spectral::lanczos_solver_t eig_solver{cfg}; // cluster computation here is irrelevant, // hence define a no-op such solver to diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.cuh similarity index 100% rename from cpp/include/raft/spectral/detail/matrix_wrappers.hpp rename to cpp/include/raft/spectral/detail/matrix_wrappers.cuh diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp index 08e104dea6..a55dfbe67f 100644 --- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp @@ -27,7 +27,7 @@ #include #include -#include +#include #include #ifdef COLLECT_TIME_STATISTICS diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp index 99d6f3e419..b7c811d5a5 100644 --- a/cpp/include/raft/spectral/detail/partition.hpp +++ 
@@ -26,7 +26,7 @@
 #include
 #include
 
-#include
+#include
 #include
 
 namespace raft {
diff --git a/cpp/include/raft/spectral/detail/spectral_util.hpp b/cpp/include/raft/spectral/detail/spectral_util.cuh
similarity index 86%
rename from cpp/include/raft/spectral/detail/spectral_util.hpp
rename to cpp/include/raft/spectral/detail/spectral_util.cuh
index a30906de10..df26676f99 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.hpp
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -132,7 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
                      thrust::minus());
     RAFT_CHECK_CUDA(stream);
 
-    RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast(n));
 
@@ -149,22 +149,22 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
   // TODO: in-place transpose
   {
     vector_t work(handle, nEigVecs * n);
-    RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    RAFT_CUBLAS_TRY(cublasgeam(cublas_h,
-                               CUBLAS_OP_T,
-                               CUBLAS_OP_N,
-                               nEigVecs,
-                               n,
-                               &one,
-                               eigVecs,
-                               n,
-                               &zero,
-                               (weight_t*)NULL,
-                               nEigVecs,
-                               work.raw(),
-                               nEigVecs,
-                               stream));
+    RAFT_CUBLAS_TRY(raft::linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h,
+                                             CUBLAS_OP_T,
+                                             CUBLAS_OP_N,
+                                             nEigVecs,
+                                             n,
+                                             &one,
+                                             eigVecs,
+                                             n,
+                                             &zero,
+                                             (weight_t*)NULL,
+                                             nEigVecs,
+                                             work.raw(),
+                                             nEigVecs,
+                                             stream));
 
     RAFT_CUDA_TRY(cudaMemcpyAsync(
       eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
@@ -216,14 +216,16 @@ bool construct_indicator(handle_t const& handle,
   RAFT_CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
+  RAFT_CUBLAS_TRY(
+    raft::linalg::cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
 
   clustersize = round(clustersize);
   if (clustersize < 0.5) { return false; }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  RAFT_CUBLAS_TRY(
+    raft::linalg::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
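The cublasgeam calls being qualified with raft::linalg:: above are cuBLAS's standard out-of-place transpose idiom: geam computes C = alpha*op(A) + beta*op(B), so op(A) = transpose with beta = 0 writes A^T into C, and B may be NULL. A standalone sketch against the raw cuBLAS API (handle and pointer names are illustrative):

    // Transpose d_in (d x n, column-major) into d_out (n x d).
    const float one = 1.0f, zero = 0.0f;
    cublasSgeam(cublas_h,
                CUBLAS_OP_T,    // op(A) = A^T
                CUBLAS_OP_N,    // B unused since beta == 0
                n, d,           // rows, cols of the result C
                &one, d_in, d,  // A: d x n, leading dimension d
                &zero, nullptr, n,
                d_out, n);      // C: n x d, leading dimension n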
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index 6a5a2c9f59..652aa61451 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -19,7 +19,7 @@
 #include
 #include
 
-#include
+#include
 
 namespace raft {
 namespace {

From 71710b9ee12c0233259a4bdee0610d3629b2b5c5 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Tue, 1 Feb 2022 19:55:12 -0500
Subject: [PATCH 4/8] Fixing classlabels doc

---
 cpp/include/raft/label/classlabels.hpp | 27 +++++++++++++-------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index f254c8f228..8cd394facc 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -27,13 +27,12 @@ namespace label {
  * The y array is assumed to store class labels. The unique values are selected
  * from this array.
  *
- * \tparam value_t numeric type of the arrays with class labels
- * \param [in] y device array of labels, size [n]
- * \param [in] n number of labels
- * \param [out] unique device array of unique labels, unallocated on entry,
- *              on exit it has size [n_unique]
- * \param [out] n_unique number of unique labels
- * \param [in] stream cuda stream
+ * @tparam value_t numeric type of the arrays with class labels
+ * @param [in] y device array of labels, size [n]
+ * @param [in] n number of labels
+ * @param [in] stream cuda stream
+ * @returns unique device array of unique labels, unallocated on entry,
+ *          on exit it has size
  */
 template
 int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream)
@@ -51,13 +50,13 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStre
  * free to choose other type for y_out (it should represent +/-1, and it is used
  * in floating point arithmetics).
  *
- * \param [in] y device array if input labels, size [n]
- * \param [in] n number of labels
- * \param [in] y_unique device array of unique labels, size [n_classes]
- * \param [in] n_classes number of unique labels
- * \param [out] y_out device array of output labels
- * \param [in] idx index of unique label that should be labeled as 1
- * \param [in] stream cuda stream
+ * @param [in] y device array if input labels, size [n]
+ * @param [in] n number of labels
+ * @param [in] y_unique device array of unique labels, size [n_classes]
+ * @param [in] n_classes number of unique labels
+ * @param [out] y_out device array of output labels
+ * @param [in] idx index of unique label that should be labeled as 1
+ * @param [in] stream cuda stream
  */
 template
 void getOvrlabels(

From 15943918dc31c56e49e98a6156958bad59023f2e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Fri, 4 Feb 2022 18:56:11 -0500
Subject: [PATCH 5/8] Fixing doc

---
 cpp/include/raft/label/classlabels.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index 8cd394facc..cc7afd9780 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -30,6 +30,7 @@ namespace label {
  * @tparam value_t numeric type of the arrays with class labels
  * @param [in] y device array of labels, size [n]
  * @param [in] n number of labels
+ * @param [inout] unique output unique labels
  * @param [in] stream cuda stream
  * @returns unique device array of unique labels, unallocated on entry,
  *          on exit it has size

From 0f1d128ed077c29267a268a1278788ec118cb48c Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Fri, 4 Feb 2022 18:56:27 -0500
Subject: [PATCH 6/8] Moving unique where it belongs

---
 cpp/include/raft/label/classlabels.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index cc7afd9780..54e00e1f4f 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -28,9 +28,9 @@ namespace label {
  * from this array.
  *
  * @tparam value_t numeric type of the arrays with class labels
+ * @param [inout] unique output unique labels
  * @param [in] y device array of labels, size [n]
  * @param [in] n number of labels
- * @param [inout] unique output unique labels
  * @param [in] stream cuda stream
  * @returns unique device array of unique labels, unallocated on entry,
  *          on exit it has size
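For reference, a small usage sketch of the getUniquelabels() signature shown in the hunks above (variable names are illustrative); the return value is the count of distinct labels and the vector is grown to hold them:

    // y: device array of n labels, possibly with repeats.
    rmm::device_uvector<float> unique(0, stream);  // unallocated on entry
    int n_unique = raft::label::getUniquelabels(unique, y, n, stream);
    // unique now holds the n_unique distinct values found in y.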
Nolet" Date: Mon, 7 Feb 2022 14:11:46 -0500 Subject: [PATCH 7/8] Adding missing doxygen argument --- cpp/include/raft/label/classlabels.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp index 54e00e1f4f..de9f60518d 100644 --- a/cpp/include/raft/label/classlabels.hpp +++ b/cpp/include/raft/label/classlabels.hpp @@ -77,12 +77,13 @@ void getOvrlabels( * @tparam Type the numeric type of the input and output arrays * @tparam Lambda the type of an optional filter function, which determines * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values + * @param[out] out the output monotonic array + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] filter_op an optional function for specifying which values * should have monotonically increasing labels applied to them. + * @param[in] zero_based force monotonic set to start at 0? */ template void make_monotonic( @@ -101,12 +102,11 @@ void make_monotonic( * where a set of vertices need to be labeled in a monotonically increasing * order. * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use + * @param[out] out output label array with labels assigned monotonically + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] zero_based force monotonic label set to start at 0? */ template void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) From 36948fd552adac2faba427c09fe8cd9d54e312c5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 8 Feb 2022 18:32:10 -0500 Subject: [PATCH 8/8] Updating w/ branch-22.04 --- cpp/include/raft/cluster/detail/kmeans.cuh | 19 +------------------ cpp/include/raft/linalg/detail/lanczos.hpp | 6 +++--- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 79337011bf..5f1a0e137d 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -31,9 +31,9 @@ #include #include #include +#include #include #include -#include namespace raft { namespace cluster { @@ -656,22 +656,6 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix -<<<<<<< HEAD:cpp/include/raft/cluster/detail/kmeans.cuh - RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); -======= // #TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, CUBLAS_OP_T, @@ -687,7 +671,6 @@ static int updateCentroids(handle_t const& handle, thrust::raw_pointer_cast(obs_copy), n, stream)); ->>>>>>> rapidsai/branch-22.04:cpp/include/raft/spectral/kmeans.hpp // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index 3d8fde7e68..a2b7751a05 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -28,9 +28,9 @@ #include "cublas_wrappers.hpp" #include #include -#include -#include -#include +#include +#include +#include namespace raft {