diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58477e868f..0e376f5f76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # RAFT 0.15.0 (Date TBD)
 
 ## New Features
+- PR #12: Spectral clustering.
 - PR #7: Migrating cuml comms -> raft comms_t
 - PR #15: add exception based error handling macros
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7bff25da45..b1a2284285 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -204,6 +204,7 @@ set(RAFT_LINK_LIBRARIES
   ${CUDA_cusolver_LIBRARY}
   ${CUDA_CUDART_LIBRARY}
   ${CUDA_cusparse_LIBRARY}
+  ${CUDA_curand_LIBRARY}
   rmm)
 
 set(RAFT_LINK_DIRECTORIES "${RMM_LIBRARY}")
@@ -224,7 +225,10 @@ if(BUILD_RAFT_TESTS)
     test/handle.cpp
     test/mr/device/buffer.cpp
     test/mr/host/buffer.cpp
-    test/test.cpp)
+    test/test.cpp
+    test/spectral_matrix.cu
+    test/eigen_solvers.cu
+    test/cluster_solvers.cu)
 
   target_include_directories(test_raft
     PRIVATE
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 7e8a52196a..40caff228e 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -578,5 +578,51 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
 }
 /** @} */
 
+/**
+ * @defgroup setpointermode cublas set pointer mode method
+ * @{
+ */
+// no T dependency...
+// template <typename T>
+// cublasStatus_t cublassetpointermode(  // NOLINT
+//   cublasHandle_t handle,
+//   cublasPointerMode_t mode,
+//   cudaStream_t stream);
+
+// template<>
+inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
+                                           cublasPointerMode_t mode,
+                                           cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasSetPointerMode(handle, mode);
+}
+/** @} */
+
+/**
+ * @defgroup scal cublas scal calls
+ * @{
+ */
+template <typename T>
+cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
+                          int incx, cudaStream_t stream);
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const float *alpha, float *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasSscal(handle, n, alpha, x, incx);
+}
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const double *alpha, double *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasDscal(handle, n, alpha, x, incx);
+}
+
+/** @} */
+
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
new file mode 100644
index 0000000000..b775a1f696
--- /dev/null
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +//for cmath: +#define _USE_MATH_DEFINES + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft { + +using namespace matrix; +using namespace linalg; + +namespace spectral { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + float *outputPtr, size_t n, + float mean, float stddev) { + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + double *outputPtr, size_t n, + double mean, double stddev) { + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. 
+ */ +template +int performLanczosIteration( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t *iter, index_type_t maxIter, value_type_t shift, + value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; + value_type_t alpha; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); + + index_type_t n = A->nrows_; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, + stream)); + + alpha = -alpha_host[0]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, + beta_host, stream)); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + alpha = 1 / beta_host[0]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), + 1, stream)); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, + lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), + sizeof(value_type_t), cudaMemcpyDeviceToHost, + stream)); + + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + // Orthogonalization with 3-term recurrence relation + else { + CUBLAS_CHECK(cublasdot(cublas_h, n, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, + alpha_host + (*iter - 1), stream)); + + auto alpha = -alpha_host[*iter - 
1]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + alpha = -beta_host[*iter - 2]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + // Compute residual + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, + beta_host + *iter - 1, stream)); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + + // Normalize Lanczos vector + alpha = 1 / beta_host[*iter - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + CUDA_TRY(cudaStreamSynchronize(stream)); + + return 0; +} + +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(value_type_t *v, value_type_t *Pv, + value_type_t *P) { + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + value_type_t normHouseholder = + std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + index_type_t i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; +} + +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
+ */ +template +static void applyHouseholder3(const value_type_t *v, value_type_t *A) { + // Loop indices + index_type_t i, j; + // Dot product between Householder vector and matrix row/column + value_type_t vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(index_type_t n, value_type_t shift1, + value_type_t shift2, value_type_t *alpha, + value_type_t *beta, value_type_t *V, + value_type_t *work) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + value_type_t bulge[16]; + + // Householder vector + value_type_t householder[3]; + // Householder matrix + value_type_t householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; + + // Loop indices + index_type_t i, j, pos; + // Temporary variable + value_type_t temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, + householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(value_type_t)); + for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, + 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 
3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer (host memory) to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer (host memory) to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. 
+ * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. + */ +template +static int lanczosRestart( + handle_t const &handle, index_type_t n, index_type_t iter, + index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, bool smallest_eig) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Loop index + index_type_t i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + index_type_t restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + value_type_t *ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + value_type_t *shifts_host; + + // Orthonormal matrix for similarity transform + value_type_t *V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout < *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = + cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, + beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = + beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_N, n, iter, 
beta_host + iter_new - 1, lanczosVecs_dev, + n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + + // Obtain new Lanczos vectors + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, + &one, lanczosVecs_dev, n, V_dev, iter, &zero, + work_dev, n, stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + + // Normalize residual to obtain new Lanczos vector + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, + beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, + lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + + return 0; +} + +} // namespace spectral + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. 
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *shift, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { + using namespace spectral; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Shift for implicit restart + value_type_t shiftUpper; + value_type_t shiftLower; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue to determine shift + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + + curandSetPseudoRandomGeneratorSeed(randGen, seed); + + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + *shift = 0; + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter - 1]; + + // ------------------------------------------------------- + // Compute eigenvectors of 
shifted matrix + // ------------------------------------------------------- + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. 
+ * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { + using namespace spectral; + + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + value_type_t shift; + int status = computeSmallestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); + + // Clean up and return + return status; +} + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. 
+ * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { + using namespace spectral; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector 
work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + value_type_t shift_val = 0.0; + value_type_t *shift = &shift_val; + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues + // but effItter can be larger than nEigVecs + // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the + // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest + // eigenvalues case because the first ones are the smallest ones + + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; + + // Debug : print nEigVecs largest eigenvalues + // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + // std::cout < +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + int status = computeLargestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); + + // Clean up and return + return status; +} + +} // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 9de242ea10..9e8606df3e 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -182,21 +182,337 @@ inline void cusparsecoosortByRow( // NOLINT * @defgroup Gemmi cusparse gemmi operations * @{ */ -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, - const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc) { +template +cusparseStatus_t cusparsegemmi( // NOLINT + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, + const T* A, int lda, const T* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const float* alpha, + const float* A, int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const float* beta, + float* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, - const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc) { +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const double* alpha, + const double* A, int lda, + 
const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const double* beta, + double* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup cusparse Create CSR operations + * @{ + */ +template +cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, int64_t nnz, + IndexT* csrRowOffsets, IndexT* csrColInd, + ValueT* csrValues); +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +/** @} */ +/** + * @defgroup cusparse CreateDnVec operations + * @{ + */ +template +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, T* values); +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, float* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, double* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); +} +/** @} */ + +/** + * @defgroup Csrmv cusparse SpMV operations + * @{ + */ +template +cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + 
CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_64F, alg, bufferSize); +} + +template +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, T* externalBuffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, + alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + double* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, + alg, externalBuffer); +} +/** @} */ +#else +/** + * @defgroup Csrmv cusparse csrmv operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmv( // NOLINT + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, + float* y, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} +template <> +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, const double* x, + const double* beta, double* y, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} +/** @} */ +#endif + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, 
cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_64F, alg, bufferSize); +} +template +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_32F, alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_64F, alg, externalBuffer); +} +/** @} */ +#else +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmm( // NOLINT + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, + const T* beta, T* y, const int ldy, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const float* alpha, const cusparseMatDescr_t descr, + const float* csrVal, const int* csrRowPtr, const int* csrColInd, + const float* x, const int ldx, const float* beta, float* y, const int ldy, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +template <> +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const double* alpha, const cusparseMatDescr_t descr, + const double* csrVal, const 
int* csrRowPtr, const int* csrColInd, + const double* x, const int ldx, const double* beta, double* y, const int ldy, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +/** @} */ +#endif + +/** + * @defgroup csr2coo cusparse CSR to COO converter methods + * @{ + */ +template +void cusparsecsr2coo( // NOLINT + cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, + T* cooRowInd, cudaStream_t stream); +template <> +inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, + const int* csrRowPtr, int* cooRowInd, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, + CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ + +/** + * @defgroup setpointermode cusparse set pointer mode method + * @{ + */ +// no T dependency... +// template +// cusparseStatus_t cusparsesetpointermode( // NOLINT +// cusparseHandle_t handle, +// cusparsePointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, + cusparsePointerMode_t mode, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSetPointerMode(handle, mode); +} +/** @} */ + } // namespace sparse } // namespace raft diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp new file mode 100644 index 0000000000..922ae7cfab --- /dev/null +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
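// Hedged usage sketch for the generic-API SpMM wrappers above (CUDA 11+ code path).
// Everything here is illustrative: the cusparse descriptors matA/matB/matC are assumed
// to have been created by the caller (cusparseCreateCsr / cusparseCreateDnMat), and
// CUSPARSE_CHECK / CUDA_TRY are the repo's existing error-checking macros.
template <typename T>
void spmm_once(cusparseHandle_t h, cusparseOperation_t opA, cusparseOperation_t opB,
               const T* alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB,
               const T* beta, cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg,
               cudaStream_t stream) {
  // query the scratch-buffer size, allocate it, then run the typed SpMM wrapper
  size_t buf_size{0};
  CUSPARSE_CHECK(raft::sparse::cusparsespmm_bufferSize(
    h, opA, opB, alpha, matA, matB, beta, matC, alg, &buf_size, stream));
  void* buf{nullptr};
  CUDA_TRY(cudaMalloc(&buf, buf_size));
  CUSPARSE_CHECK(raft::sparse::cusparsespmm(
    h, opA, opB, alpha, matA, matB, beta, matC, alg, static_cast<T*>(buf), stream));
  CUDA_TRY(cudaFree(buf));
}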
+ */ +#pragma once + +#include +#include // for std::pair + +namespace raft { + +using namespace matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct cluster_solver_config_t { + size_type_t n_clusters; + size_type_t maxIter; + + value_type_t tol; + + unsigned long long seed{123456}; +}; + +template +struct kmeans_solver_t { + explicit kmeans_solver_t(cluster_solver_config_t const& config) + : config_(config) {} + + template + std::pair solve( + handle_t const& handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const { + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); + value_type_t residual{}; + index_type_t iters{}; + kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, + config_.seed); + return std::make_pair(residual, iters); + } + + auto const& get_config(void) const { return config_; } + + private: + cluster_solver_config_t config_; +}; +} // namespace raft diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp new file mode 100644 index 0000000000..e36dca2e0c --- /dev/null +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
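// Hedged usage sketch for the k-means cluster solver defined above. The template
// parameter lists are elided in this diff, so the <int, double> instantiation and the
// names d_obs / d_codes are illustrative assumptions rather than the PR's own test code.
#include <raft/spectral/cluster_solvers.hpp>
#include <thrust/system/cuda/execution_policy.h>

void cluster_sketch(raft::handle_t const& handle, int n, int d,
                    double const* d_obs,  // device, d x n column-major observation matrix
                    int* d_codes) {       // device, n cluster assignments (output)
  raft::cluster_solver_config_t<int, double> cfg{/*n_clusters=*/8, /*maxIter=*/100,
                                                 /*tol=*/1e-4};
  raft::kmeans_solver_t<int, double> solver{cfg};
  auto policy = thrust::cuda::par.on(handle.get_stream());
  auto res_iters = solver.solve(handle, policy, n, d, d_obs, d_codes);
  // res_iters.first: residual sum of squares; res_iters.second: k-means iterations used.
}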
+ */ +#pragma once + +#include + +namespace raft { + +using namespace matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t(eigen_solver_config_t const& config) + : config_(config) {} + + index_type_t solve_smallest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, + eigVecs, config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; +} // namespace raft diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp new file mode 100644 index 0000000000..08913d41e7 --- /dev/null +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -0,0 +1,957 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
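// Hedged usage sketch for the Lanczos eigen-solver wrapper above. Template arguments
// and buffer names are assumptions (the parameter lists are elided in this diff); L is
// expected to wrap a symmetric matrix such as the graph Laplacian from matrix_wrappers.hpp.
#include <raft/spectral/eigen_solvers.hpp>

void eigen_sketch(raft::handle_t const& handle,
                  raft::matrix::sparse_matrix_t<int, double> const& L,
                  double* d_eigvals,    // device, n_eigVecs entries
                  double* d_eigvecs) {  // device, nrows * n_eigVecs entries, column-major
  raft::eigen_solver_config_t<int, double> cfg{/*n_eigVecs=*/3, /*maxIter=*/200,
                                               /*restartIter=*/64, /*tol=*/1e-6};
  raft::lanczos_solver_t<int, double> solver{cfg};
  int iters = solver.solve_smallest_eigenvectors(handle, L, d_eigvals, d_eigvecs);
  (void)iters;  // number of Lanczos iterations actually performed
}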
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace raft; +using namespace raft::linalg; +// ========================================================= +// Useful grid settings +// ========================================================= + +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); + +// ========================================================= +// CUDA kernels +// ========================================================= + +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. + */ +template +static __global__ void computeDistances( + index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += utils::shfl_down(dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) + utils::atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } +} + +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void minDistances(index_type_t n, index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. 
On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void computeClusterSizes( + index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). 
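// Standalone illustration of the warp-level reduction used inside computeDistances
// above (there it goes through utils::shfl_down); this free-standing variant uses the
// raw CUDA intrinsic and assumes all 32 lanes of the warp are active.
__inline__ __device__ float warp_reduce_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;  // after the loop, lane 0 holds the sum over the whole warp
}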
+ */ +template +static __global__ void divideCentroids( + index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { + // Global indices + index_type_t gidx, gidy; + + // Current cluster size + index_type_t clusterSize_private; + + // Observation vector is determined by global y-index + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Get cluster size from global memory + clusterSize_private = clusterSizes[gidy]; + + // Add vector entries to centroid matrix + // vector entris are determined by global x-index + gidx = threadIdx.x + blockIdx.x * blockDim.x; + while (gidx < d) { + centroids[IDX(gidx, gidy, d)] /= clusterSize_private; + gidx += blockDim.x * gridDim.x; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Randomly choose new centroids. + * Centroid is randomly chosen with k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param rand Random number drawn uniformly from [0,1). + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are n x d. + * @param dists (Input, device memory, 2*n entries) Workspace. The + * first n entries should be the distance between observation + * vectors and the closest centroid. + * @param centroid (Output, device memory, d entries) Centroid + * coordinates. + * @return Zero if successful. Otherwise non-zero. 
+ */ +template +static int chooseNewCentroid(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) { + // Cumulative sum of distances + value_type_t* distsCumSum = dists + n; + // Residual sum of squares + value_type_t distsSum{0}; + // Observation vector that is chosen as new centroid + index_type_t obsIndex; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Compute cumulative sum of distances + thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CHECK_CUDA(stream); + CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + // + //seg-faults due to Thrust bug + //on binary-search-like algorithms + //when run with stream dependent + //execution policies; fixed on Thrust GitHub + //hence replace w/ linear interpolation, + //until the Thrust issue gets resolved: + // + // obsIndex = (thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + //linear interpolation logic: + //{ + value_type_t minSum{0}; + CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(stream); + + if (distsSum > minSum) { + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + + CHECK_CUDA(stream); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + stream)); + + return 0; +} + +/** + * @brief Choose initial cluster centroids for k-means algorithm. + * Centroids are randomly chosen with k-means++ algorithm + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param dists (Output, device memory, 2*n entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. 
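// Host-side sketch of the k-means++ selection rule implemented in chooseNewCentroid
// above: draw index i with probability proportional to its squared distance to the
// nearest centroid. The device code currently replaces the lower_bound search with a
// linear interpolation to work around the Thrust issue noted in the comments above.
#include <algorithm>
#include <numeric>
#include <vector>

inline int choose_weighted_index(std::vector<double> const& sq_dists, double rand01) {
  if (sq_dists.empty()) return 0;
  std::vector<double> cumsum(sq_dists.size());
  std::partial_sum(sq_dists.begin(), sq_dists.end(), cumsum.begin());
  auto it = std::lower_bound(cumsum.begin(), cumsum.end(), rand01 * cumsum.back());
  return static_cast<int>(it - cumsum.begin());
}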
+ * @return Zero if successful. Otherwise non-zero. + */ +template +static int initializeCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, + unsigned long long seed) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + index_type_t i; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + constexpr index_type_t grid_lower_bound{65535}; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), + 1, 1}; + + // Assign observation vectors to code 0 + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + + // Choose first centroid + thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), 1); + CHECK_CUDA(stream); + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids, dists); + CHECK_CUDA(stream); + + // Choose remaining centroids + for (i = 1; i < k; ++i) { + // Choose ith centroid + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + CHECK_CUDA(stream); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, + codes, i); + CHECK_CUDA(stream); + } + + // Compute cluster sizes + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + computeClusterSizes<<>>(n, k, codes, + clusterSizes); + CHECK_CUDA(stream); + + return 0; +} + +/** + * @brief Find cluster centroids closest to observation vectors. + * Distance is measured with Euclidean norm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. 
Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Input, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int assignCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Compute distance between centroids and observation vectors + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, + dists); + CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, + clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n)); + + return 0; +} + +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. 
+ * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int updateCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const value_type_t one = 1; + const value_type_t zero = 0; + + constexpr index_type_t grid_lower_bound{65535}; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); + + // Take transpose of observation matrix + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, + d, &zero, (value_type_t*)NULL, n, + thrust::raw_pointer_cast(obs_copy), n, stream)); + + // Cluster assigned to each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, + thrust::modulus()); + CHECK_CUDA(stream); + thrust::gather(thrust_exec_policy, rows, rows + d * n, + thrust::device_pointer_cast(codes), codes_copy); + CHECK_CUDA(stream); + + // Row associated with each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, + thrust::divides()); + CHECK_CUDA(stream); + + // Sort and reduce to add observation vectors in same cluster + thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + CHECK_CUDA(stream); + thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + CHECK_CUDA(stream); + + // Divide sums by cluster size to get centroid matrix + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + + divideCentroids<<>>(d, k, clusterSizes, + centroids); + CHECK_CUDA(stream); + + return 0; +} + +} // namespace + +namespace raft { + +// ========================================================= +// k-means algorithm +// ========================================================= + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. 
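// Simplified 1-D illustration of the sort_by_key / reduce_by_key idiom that
// updateCentroids above uses to sum observation entries sharing a cluster code
// (the real code does this on the device, per matrix row, to build centroid sums).
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>

inline void sum_per_code(thrust::host_vector<int> codes,         // code per entry (copied, then sorted)
                         thrust::host_vector<double> vals,       // value per entry (copied)
                         thrust::host_vector<int>& out_codes,    // unique codes
                         thrust::host_vector<double>& out_sums)  // sum of vals per code
{
  thrust::sort_by_key(codes.begin(), codes.end(), vals.begin());
  out_codes.resize(codes.size());
  out_sums.resize(vals.size());
  auto ends = thrust::reduce_by_key(codes.begin(), codes.end(), vals.begin(),
                                    out_codes.begin(), out_sums.begin());
  out_codes.resize(ends.first - out_codes.begin());
  out_sums.resize(ends.second - out_sums.begin());
}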
+ * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @param seed random seed to be used. + * @return error flag. + */ +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, + value_type_t* residual_host, index_type_t* iters_host, + unsigned long long seed) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + index_type_t iter; + + constexpr index_type_t grid_lower_bound{65535}; + + // Residual sum of squares at previous iteration + value_type_t residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Trivial cases + if (k == 1) { + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), + cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), + grid_lower_bound)}; + + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, + work); + CHECK_CUDA(stream); + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work + n)); + CHECK_CUDA(stream); + return 0; + } + if (n <= k) { + 
thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + CHECK_CUDA(stream); + thrust::fill_n(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), n, 1); + CHECK_CUDA(stream); + + if (n < k) + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, + (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + CUBLAS_CHECK( + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, + uniformDist(rng), obs, work, + centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) + WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; +} + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. 
+ * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param iters on exit, number of k-means iterations. + * @param seed random seed to be used. + * @return error flag + */ +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, value_type_t& residual, + index_type_t& iters, unsigned long long seed = 123456) { + using namespace matrix; + + // Check that parameters are valid + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); + + // Allocate memory + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); + + // Perform k-means + return kmeans( + handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, + clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, + &iters, seed); +} + +} // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp new file mode 100644 index 0000000000..d14bf05f37 --- /dev/null +++ b/cpp/include/raft/spectral/lapack.hpp @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
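// Hedged sketch of calling the convenience kmeans overload above directly; the handle,
// the stream-bound Thrust policy, and the device buffers d_obs / d_codes are assumed to
// be set up by the caller. kmeans_solver_t (cluster_solvers.hpp) forwards to this entry point.
void kmeans_sketch(raft::handle_t const& handle, int n, int d,
                   double const* d_obs,  // device, d x n column-major observation matrix
                   int* d_codes) {       // device, n cluster assignments (output)
  double residual{0};
  int iters{0};
  auto policy = thrust::cuda::par.on(handle.get_stream());
  raft::kmeans(handle, policy, n, d, /*k=*/8, /*tol=*/1e-4, /*maxiter=*/100,
               d_obs, d_codes, residual, iters);  // default seed (123456) retained
}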
+ */ + +#pragma once +#include + +#include +#include +#include + +//for now; TODO: check if/where this `define` should be; +// +#define USE_LAPACK + +namespace raft { + +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status \ + << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ + } + +extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, + float *work, int *lwork, int *info); +extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, + double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, + float *a, int *lda, const float *tau, float *c, + int *ldc, float *work, int *lwork, int *info); +extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, + double *a, int *lda, const double *tau, double *c, + int *ldc, double *work, int *lwork, int *info); +extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, double *vr, + int *ldvr, double *work, int *lwork, int *info); + +extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, const float *B, int ldb, + const float *beta, float *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, const double *B, int ldb, + const double *beta, double *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, + int n, float *d, float *e, + float *z, int ldz, float *work, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, + int n, double *d, double *e, + double *z, int ldz, + double *work, int *info); + +template +class Lapack { + private: + Lapack(); + ~Lapack(); + + public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc); + + // special QR for lanczos + static void sterf(int n, T *d, T *e); + static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. 
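// Usage sketch (hedged, not part of this PR): the tridiagonal path above is typically
// driven like
//
//   std::vector<double> d(m), e(m - 1);         // diagonal / off-diagonal of a tridiagonal system
//   std::vector<double> z(m * m), work(2 * m);  // eigenvector matrix and scratch
//   Lapack<double>::steqr('I', m, d.data(), e.data(), z.data(), m, work.data());
//
// after which d holds the eigenvalues in ascending order and the columns of z the
// corresponding eigenvectors. The 'I' mode and the 2*m workspace size follow the
// standard LAPACK steqr convention and are assumptions, not values taken from this diff.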
+ + // multiply C by implicit Q + static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork); + + static void geev(T *A, T *eigenvalues, int dim, int lda); + static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr); + static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, + T *eigenvectors_i, int dim, int lda, int ldvr); + + private: + static void lapack_gemm(const char transa, const char transb, int m, int n, + int k, float alpha, const float *a, int lda, + const float *b, int ldb, float beta, float *c, + int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + } + + static void lapack_gemm(const signed char transa, const signed char transb, + int m, int n, int k, double alpha, const double *a, + int lda, const double *b, int ldb, double beta, + double *c, int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + } + + static void lapack_sterf(int n, float *d, float *e, int *info) { + cusolverDnSsterfHost(n, d, e, info); + } + + static void lapack_sterf(int n, double *d, double *e, int *info) { + cusolverDnDsterfHost(n, d, e, info); + } + + static void lapack_steqr(const signed char compz, int n, float *d, float *e, + float *z, int ldz, float *work, int *info) { + cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_steqr(const signed char compz, int n, double *d, double *e, + double *z, int ldz, double *work, int *info) { + cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, + float *work, int *lwork, int *info) { + sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, + double *work, int *lwork, int *info) { + dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, + int lda, float *tau, float *c, int ldc, float *work, + int *lwork, int *info) { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); + } + + static void lapack_ormqr(char side, char trans, int m, int n, int k, + double *a, int lda, double *tau, double *c, int ldc, + double *work, int *lwork, int *info) { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); + } + + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, + double *work, int *lwork, int *info) { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); + } + + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); + } + + // real eigenvalues + static void lapack_geev(T *A, 
T *eigenvalues, int dim, int lda) { + char job = 'N'; + std::vector WI(dim); + int ldv = 1; + T *vl = 0; + int work_size = 6 * dim; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, + &ldv, vl, &ldv, work.data(), &work_size, &info); + lapackCheckError(info); + } + + // real eigenpairs + static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; + std::vector WI(dim); + int work_size = 6 * dim; + T *vl = 0; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), + vl, &ldvl, eigenvectors, &ldvr, work.data(), + &work_size, &info); + lapackCheckError(info); + } + + // complex eigenpairs + static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; + int work_size = 8 * dim; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, + eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, + work.data(), &work_size, &info); + lapackCheckError(info); + } +}; + +template +void Lapack::check_lapack_enabled() { +#ifndef USE_LAPACK + RAFT_FAIL("Error: LAPACK not enabled."); +#endif +} + +template +void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + const char transA_char = transa ? 'T' : 'N'; + const char transB_char = transb ? 'T' : 'N'; + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); + //#endif +} + +template +void Lapack::sterf(int n, T *d, T *e) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_sterf(n, d, e, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_steqr(compz, n, d, e, z, ldz, work, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, + int *lwork) { + check_lapack_enabled(); +#ifdef USE_LAPACK + int info; + lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); + lapackCheckError(info); +#endif +} +template +void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork) { + check_lapack_enabled(); +#ifdef USE_LAPACK + char side = right_side ? 'R' : 'L'; + char trans = transq ? 
'T' : 'N'; + int info; + lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); + lapackCheckError(info); +#endif +} + +// real eigenvalues +template +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, dim, lda); +#endif +} +// real eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); +#endif +} +// complex eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, + int ldvr) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, + dim, lda, ldvr); +#endif +} + +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp new file mode 100644 index 0000000000..08f213cd3a --- /dev/null +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace raft { +namespace matrix { + +using size_type = int; // for now; TODO: move it in appropriate header + +// Vector "view"-like aggregate for linear algebra purposes +// +template +struct vector_view_t { + value_type* buffer_; + size_type size_; + + vector_view_t(value_type* buffer, size_type sz) + : buffer_(buffer), size_(sz) {} + + vector_view_t(vector_view_t&& other) + : buffer_(other.buffer_), size_(other.size_) { + other.buffer_ = nullptr; + other.size_ = 0; + } + + vector_view_t& operator=(vector_view_t&& other) { + buffer_ = other.buffer_; + size_ = other.size_; + + other.buffer_ = nullptr; + other.size_ = 0; + } +}; + +// allocatable vector, using raft handle allocator +// +template +class vector_t { + handle_t const& handle_; + value_type* buffer_; + size_type size_; + cudaStream_t stream_; + + public: + vector_t(handle_t const& raft_handle, size_type sz) + : handle_(raft_handle), + buffer_( + static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), + size_(sz), + stream_(raft_handle.get_stream()) {} + + ~vector_t(void) { + handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); + } + + size_type size(void) const { return size_; } + + value_type* raw(void) { return buffer_; } + + value_type const* raw(void) const { return buffer_; } + + template + value_type nrm1(ThrustExecPolicy t_exe_pol) const { + return thrust::reduce(t_exe_pol, buffer_, buffer_ + 
size_, value_type{0}, + [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); + } + + template + void fill(ThrustExecPolicy t_exe_pol, value_type value) { + thrust::fill_n(t_exe_pol, buffer_, size_, value); + } +}; + +template +struct sparse_matrix_t { + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : handle_(raft_handle), + row_offsets_(row_offsets), + col_indices_(col_indices), + values_(values), + nrows_(nrows), + nnz_(nnz) {} + + template + sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) + : handle_(raft_handle), + row_offsets_(csr_view.offsets), + col_indices_(csr_view.indices), + values_(csr_view.edge_data), + nrows_(csr_view.number_of_vertices), + nnz_(csr_view.number_of_edges) {} + + virtual ~sparse_matrix_t(void) = + default; // virtual because used as base for following matrix types + + // y = alpha*A*x + beta*y + //(Note: removed const-ness of x, because CUDA 11 SpMV + // descriptor creation works with non-const, and const-casting + // down is dangerous) + // + virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const { + using namespace sparse; + + RAFT_EXPECTS(x != nullptr, "Null x buffer."); + RAFT_EXPECTS(y != nullptr, "Null y buffer."); + + auto cusparse_h = handle_.get_cusparse_handle(); + auto stream = handle_.get_stream(); + + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + +#if __CUDACC_VER_MAJOR__ > 10 + + //create descriptors: + // + cusparseSpMatDescr_t matA; + CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_, + col_indices_, values_)); + + cusparseDnVecDescr_t vecX; + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x)); + + cusparseDnVecDescr_t vecY; + CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); + + //get (scratch) external device buffer size: + // + size_t bufferSize; + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, opA, &alpha, matA, vecX, + &beta, vecY, alg, &bufferSize, + stream)); + + //allocate external buffer: + // + vector_t external_buffer(handle_, bufferSize); + + //finally perform SpMV: + // + CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, + vecY, CUSPARSE_CSRMV_ALG1, + external_buffer.raw(), stream)); + + //free descriptors: + //(TODO: maybe wrap them in a RAII struct?) 
+ // + CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); + CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); + CUSPARSE_CHECK(cusparseDestroySpMat(matA)); +#else + CUSPARSE_CHECK( + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + cusparseMatDescr_t descr = 0; + CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); + if (symmetric) { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } else { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, + &alpha, descr, values_, row_offsets_, + col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); +#endif + } + + handle_t const& get_handle(void) const { return handle_; } + + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + + handle_t const& handle_; + index_type const* row_offsets_; + index_type const* col_indices_; + value_type const* values_; + index_type const nrows_; + index_type const nnz_; +}; + +template +struct laplacian_matrix_t : sparse_matrix_t { + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : sparse_matrix_t(raft_handle, row_offsets, + col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) { + vector_t ones{raft_handle, nrows}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); + } + + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : sparse_matrix_t(raft_handle, csr_m.row_offsets_, + csr_m.col_indices_, csr_m.values_, + csr_m.nrows_, csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) { + vector_t ones{raft_handle, csr_m.nrows_}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { + constexpr int BLOCK_SIZE = 1024; + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // scales y by beta: + // + if (beta == 0) { + CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + } else if (beta != 1) { + CUBLAS_CHECK(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + } + + // Apply diagonal matrix + // + dim3 gridDim{ + std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + + dim3 blockDim{BLOCK_SIZE, 1, 1}; + utils::diagmv<<>>(n, alpha, diagonal_.raw(), + x, y); + CHECK_CUDA(stream); + + // Apply adjacency matrix + // + sparse_matrix_t::mv(-alpha, x, 1, y); + } + + vector_t diagonal_; +}; + +template +struct modularity_matrix_t : laplacian_matrix_t { + template + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : laplacian_matrix_t( + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, + nrows, nnz) { + edge_sum_ = 
laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } + + template + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : laplacian_matrix_t(raft_handle, + thrust_exec_policy, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // y = A*x + // + sparse_matrix_t::mv(alpha, x, 0, y); + value_type dot_res; + + // gamma = d'*x + // + // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + CUBLAS_CHECK(linalg::cublasdot( + cublas_h, n, laplacian_matrix_t::diagonal_.raw(), + 1, x, 1, &dot_res, stream)); + + // y = y -(gamma/edge_sum)*d + // + value_type gamma_ = -dot_res / edge_sum_; + CUBLAS_CHECK(linalg::cublasaxpy( + cublas_h, n, &gamma_, + laplacian_matrix_t::diagonal_.raw(), 1, y, 1, + stream)); + } + + value_type edge_sum_; +}; + +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp new file mode 100644 index 0000000000..f8dfe5daa3 --- /dev/null +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) { + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace raft { +namespace spectral { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. 
+ * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = + eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + scale_obs(nEigVecs, n, eigVecs); + CHECK_CUDA(stream); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + vertex_t nClusters, + vertex_t const *__restrict__ clusters, + weight_t &modularity) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + vector_t part_i(handle, n); + vector_t Bx(handle, n); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp new file mode 100644 index 0000000000..841fca04d9 --- /dev/null +++ b/cpp/include/raft/spectral/partition.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft { +namespace spectral { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . 
+ */ +template +std::tuple partition( + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = + eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. 
+ */ +template +void analyzePartition(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + vertex_t nClusters, const vertex_t *__restrict__ clusters, + weight_t &edgeCut, weight_t &cost) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + weight_t partEdgesCut, clustersize; + + // Device memory + vector_t part_i(handle, n); + vector_t Lx(handle, n); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp new file mode 100644 index 0000000000..b77375b33b --- /dev/null +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace raft { +namespace spectral { + +template +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, + value_type_t* obs) { + index_type_t i, j, k, index, mm; + value_type_t alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +index_type_t next_pow2(index_type_t n) { + index_type_t v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { + index_type_t p2m; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + unsigned int xsize = max(2, min(p2m, 32)); + dim3 nthreads{xsize, 256 / xsize, 1}; + + dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel + <<>>(m, n, obs); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t n, + vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + CHECK_CUDA(stream); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + vector_t work(handle, nEigVecs * n); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t index, + edge_t n, weight_t& clustersize, weight_t& partStats, + vertex_t const* __restrict__ clusters, + vector_t& part_i, vector_t& Bx, + laplacian_matrix_t const& B) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + CHECK_CUDA(stream); + + // Compute size of ith partition + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { + return false; + } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + CUBLAS_CHECK( + cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp new file mode 100644 index 0000000000..406f1b7c7e --- /dev/null +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +#define STRINGIFY_DETAIL(x) #x +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) + +#ifdef DEBUG +#define COUT() (std::cout) +#define CERR() (std::cerr) + +//nope: +// +#define WARNING(message) \ + do { \ + std::stringstream ss; \ + ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ + CERR() << ss.str() << std::endl; \ + } while (0) +#else // DEBUG +#define WARNING(message) +#endif diff --git a/cpp/include/raft/utils/sm_utils.hpp b/cpp/include/raft/utils/sm_utils.hpp new file mode 100644 index 0000000000..34eeec16bd --- /dev/null +++ b/cpp/include/raft/utils/sm_utils.hpp @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#define DEFAULT_MASK 0xffffffff + +#define USE_CG 1 +//(__CUDACC_VER__ >= 80500) + +namespace raft { +namespace utils { +static __device__ __forceinline__ int lane_id() { + int id; + asm("mov.u32 %0, %%laneid;" : "=r"(id)); + return id; +} + +static __device__ __forceinline__ int lane_mask_lt() { + int mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int lane_mask_le() { + int mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } + +static __device__ __forceinline__ unsigned int ballot(int p, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __ballot_sync(mask, p); +#else + return __ballot(p); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0.0f; +#endif +} + +/// Warp shuffle down function +/** Warp shuffle functions on 64-bit floating point values are not + * natively implemented as of Compute Capability 5.0. This + * implementation has been copied from + * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). + * Once this is natively implemented, this function can be replaced + * by __shfl_down. 
+ * + */ +static __device__ __forceinline__ double shfl(double r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl(long long r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_down(int r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ float shfl_down(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_down(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_down(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +// specifically for triangles counting +static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(mask, a.x, offset, bound); + a.y = __shfl_down(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static 
__device__ __forceinline__ float shfl_up(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_up(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_up(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __inline__ __device__ double atomicFPAdd(double *addr, double val) { +// atomicAdd for double starts with sm_60 +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + unsigned long long old = __double_as_longlong(addr[0]), assumed; + + do { + assumed = old; + old = atomicCAS((unsigned long long *)addr, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + + return old; +#endif +} + +// atomicAdd for float starts with sm_20 +static __inline__ __device__ float atomicFPAdd(float *addr, float val) { + return atomicAdd(addr, val); +} + +static __inline__ __device__ double atomicFPMin(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(min(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMin(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(min(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPMax(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(max(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMax(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(max(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPOr(double *addr, double 
val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong((bool)val | (bool)assumed))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPOr(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int((bool)val | (bool)assumed))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPLog(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(-log(exp(-val) + exp(-assumed))))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPLog(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = + int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(-logf(expf(-val) + expf(-assumed))))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +// Apply diagonal matrix to vector: +// +template +static __global__ void diagmv(IndexType_ n, ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ *__restrict__ y) { + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + y[i] += alpha * D[i] * x[i]; + i += blockDim.x * gridDim.x; + } +} + +} // namespace utils + +} // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu new file mode 100644 index 0000000000..4ff6cdf5fa --- /dev/null +++ b/cpp/test/cluster_solvers.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, ClusterSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = h.get_stream(); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, + eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; + auto t_exe_p = thrust::cuda::par.on(stream); + + EXPECT_ANY_THROW(spectral::modularity_maximization( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW( + spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); +} + +} // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu new file mode 100644 index 0000000000..e6ee09262e --- /dev/null +++ b/cpp/test/eigen_solvers.cu @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, EigenSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + EXPECT_ANY_THROW( + eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + + EXPECT_ANY_THROW( + eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); +} + +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + + auto t_exe_p = thrust::cuda::par.on(stream); + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; + EXPECT_ANY_THROW(spectral::partition( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW( + spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); +} + +} // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu new file mode 100644 index 0000000000..e5c2d52764 --- /dev/null +++ b/cpp/test/spectral_matrix.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { +namespace { +template +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace +TEST(Raft, SpectralMatrices) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; + + int const sz = 10; + vector_t d_v{h, sz}; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, csr_v}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); + + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { + laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { + modularity_matrix_t mm2{h, t_exe_pol, sm2}; + }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args +} + +} // namespace raft
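The new tests above only exercise the error paths (null buffers are expected to throw), so the sketches that follow show how the additions are meant to be driven. None of them are part of the patch; the `<raft/handle.hpp>` include, the explicit `<int, double>` template arguments (the patch as rendered here drops the angle-bracket contents), and the toy triangle graph are assumptions on my part.

First, the basic building blocks from `matrix_wrappers.hpp`: a handle-allocated `vector_t` and a non-owning `sparse_matrix_t` wrapped around an existing device CSR, driven through `mv()`.

```cpp
// Usage sketch only: wrap a device CSR matrix and run y = A*x via mv().
#include <cuda_runtime.h>
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>  // assumed location of handle_t
#include <raft/spectral/matrix_wrappers.hpp>

#include <vector>

void spmv_sketch() {
  raft::handle_t handle;
  auto stream = handle.get_stream();
  auto t_exe_pol = thrust::cuda::par.on(stream);

  // Toy undirected triangle graph in CSR form (host side), unit weights.
  std::vector<int> h_offsets{0, 2, 4, 6};
  std::vector<int> h_indices{1, 2, 0, 2, 0, 1};
  std::vector<double> h_values(6, 1.0);
  int nrows = 3, nnz = 6;

  // The wrapper is non-owning and takes raw device pointers.
  int *d_offsets, *d_indices;
  double *d_values;
  cudaMalloc(&d_offsets, sizeof(int) * h_offsets.size());
  cudaMalloc(&d_indices, sizeof(int) * nnz);
  cudaMalloc(&d_values, sizeof(double) * nnz);
  cudaMemcpyAsync(d_offsets, h_offsets.data(), sizeof(int) * h_offsets.size(),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_indices, h_indices.data(), sizeof(int) * nnz,
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_values, h_values.data(), sizeof(double) * nnz,
                  cudaMemcpyHostToDevice, stream);

  raft::matrix::sparse_matrix_t<int, double> A{handle, d_offsets, d_indices,
                                               d_values, nrows, nnz};

  // x = ones, y = zeros; both buffers come from the handle's device allocator
  // and are released by ~vector_t().
  raft::matrix::vector_t<double> x{handle, nrows};
  raft::matrix::vector_t<double> y{handle, nrows};
  x.fill(t_exe_pol, 1.0);
  y.fill(t_exe_pol, 0.0);

  // y = 1*A*x + 0*y; for this graph y ends up holding the vertex degrees.
  A.mv(1.0, x.raw(), 0.0, y.raw());

  cudaStreamSynchronize(stream);
  cudaFree(d_values);
  cudaFree(d_indices);
  cudaFree(d_offsets);
}
```

Since the wrappers launch kernels, a sketch like this has to live in a `.cu` translation unit compiled with nvcc.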
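`laplacian_matrix_t` and `modularity_matrix_t` never materialize their operators. The Laplacian constructor computes the degree vector d = A·1 with one SpMV into `diagonal_`, and its `mv()` evaluates y = alpha·(D − A)·x + beta·y as a `utils::diagmv` launch plus an SpMV with −alpha. The modularity wrapper caches `edge_sum_` = Σd = 2|E| and, for alpha = 1 (the value the solvers use), its `mv()` applies B·x = A·x − (dᵀx / 2|E|)·d with one SpMV, one dot product and one axpy; note that this override overwrites y, so `beta` is effectively ignored. A cheap sanity check, under the same assumptions as the previous sketch, is that both operators annihilate the ones-vector:

```cpp
// L*1 = D*1 - A*1 = d - d = 0 and B*1 = A*1 - (d'1 / 2|E|)*d = d - d = 0.
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>  // assumed location of handle_t
#include <raft/spectral/matrix_wrappers.hpp>

void graph_operator_sketch(raft::handle_t const& handle, int const* d_offsets,
                           int const* d_indices, double const* d_values,
                           int nrows, int nnz) {
  auto t_exe_pol = thrust::cuda::par.on(handle.get_stream());

  raft::matrix::laplacian_matrix_t<int, double> L{
    handle, t_exe_pol, d_offsets, d_indices, d_values, nrows, nnz};
  raft::matrix::modularity_matrix_t<int, double> B{
    handle, t_exe_pol, d_offsets, d_indices, d_values, nrows, nnz};

  raft::matrix::vector_t<double> ones{handle, nrows};
  raft::matrix::vector_t<double> out{handle, nrows};
  ones.fill(t_exe_pol, 1.0);

  L.mv(1.0, ones.raw(), 0.0, out.raw());  // expect out == 0
  B.mv(1.0, ones.raw(), 0.0, out.raw());  // expect out == 0 again

  double degree_sum = L.diagonal_.nrm1(t_exe_pol);  // 2|E| for unit weights
  (void)degree_sum;
}
```

`nrm1` relies on a `__device__` lambda, so the translation unit needs nvcc's extended-lambda support, which the library's own use of `nrm1` already presupposes.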
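At the solver level, `partition()` takes the CSR wrapper plus an eigensolver and a cluster solver and returns a `std::tuple` of (eigensolver iterations, k-means residual, k-means iterations): it builds the Laplacian, computes the smallest eigenpairs with the supplied eigensolver (Lanczos in the tests), whitens the eigenvector matrix via `transform_eigen_matrix` (mean-center and rescale each column, then transpose with `cublasgeam`), and clusters the embedded points. The sketch below mirrors the new tests but with real output buffers; the solver header names and the `<index_type, value_type>` template arguments are assumptions (the rendered patch drops both), while the brace-initializer field orders are taken from the tests.

```cpp
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>                    // assumed location of handle_t
#include <raft/spectral/cluster_solvers.hpp>  // assumed: kmeans_solver_t
#include <raft/spectral/eigen_solvers.hpp>    // assumed: lanczos_solver_t
#include <raft/spectral/matrix_wrappers.hpp>
#include <raft/spectral/partition.hpp>

namespace raft {  // unqualified names resolve exactly as in the new tests

void partition_sketch(handle_t const& h, int const* row_offsets,
                      int const* col_indices, double const* values, int nrows,
                      int nnz) {
  using namespace matrix;
  using index_type = int;
  using value_type = double;

  auto stream = h.get_stream();
  auto t_exe_p = thrust::cuda::par.on(stream);

  sparse_matrix_t<index_type, value_type> csr{h, row_offsets, col_indices,
                                              values, nrows, nnz};

  // Illustrative solver parameters; field order follows the tests.
  index_type neigvs{4}, maxiter{100}, restart_iter{20}, k{4};
  value_type tol{1.0e-8};
  bool reorthog{true};
  unsigned long long seed{1234};

  eigen_solver_config_t<index_type, value_type> eig_cfg{
    neigvs, maxiter, restart_iter, tol, reorthog, seed};
  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};

  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
                                                            seed};
  kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};

  // Outputs: n cluster labels, neigvs eigenvalues, n x neigvs eigenvectors.
  vector_t<index_type> clusters{h, nrows};
  vector_t<value_type> eigvals{h, neigvs};
  vector_t<value_type> eigvecs{h, nrows * neigvs};

  // Returns {eigensolver iterations, k-means residual, k-means iterations}.
  auto stats =
    spectral::partition(h, t_exe_p, csr, eig_solver, cluster_solver,
                        clusters.raw(), eigvals.raw(), eigvecs.raw());

  value_type edge_cut{0}, cost{0};
  spectral::analyzePartition(h, t_exe_p, csr, k, clusters.raw(), edge_cut,
                             cost);
  (void)stats;
}

}  // namespace raft
```

`modularity_maximization()` follows the same call shape, substituting the modularity matrix, the largest eigenpairs, and an extra `scale_obs` pass before clustering.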
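Both `analyzePartition` and `analyzeModularity` reduce to a loop over clusters driven by `construct_indicator`: for cluster i it fills `part_i` with a 0/1 indicator through the `equal_to_i_op` functor, reports the cluster size as part_iᵀ·part_i, and reports part_iᵀ·(B·part_i) as the per-cluster statistic. With the Laplacian that quadratic form is the weight of edges leaving cluster i; summed over all clusters every cut edge is counted twice (once from each side), which is why `analyzePartition` accumulates `partEdgesCut / 2` into the edge cut. A standalone version of the same loop for the ratio-cut cost, again only a sketch:

```cpp
// Hypothetical helper reproducing analyzePartition's cost accumulation.
#include <thrust/system/cuda/execution_policy.h>

#include <raft/spectral/matrix_wrappers.hpp>
#include <raft/spectral/spectral_util.hpp>

namespace raft {  // unqualified names resolve as they do inside the library

double ratio_cut_sketch(handle_t const& h,
                        matrix::laplacian_matrix_t<int, double> const& L,
                        int const* d_clusters, int n, int n_clusters) {
  auto stream = h.get_stream();
  auto t_exe_pol = thrust::cuda::par.on(stream);

  // Scalar results of cublasdot come back through host pointers, so keep the
  // cuBLAS pointer mode at HOST, as the library routines do.
  linalg::cublassetpointermode(h.get_cublas_handle(), CUBLAS_POINTER_MODE_HOST,
                               stream);

  matrix::vector_t<double> part_i{h, n};
  matrix::vector_t<double> Lx{h, n};

  double cost{0};
  for (int i = 0; i < n_clusters; ++i) {
    double cluster_size{0}, part_cut{0};
    if (!spectral::construct_indicator(h, t_exe_pol, i, n, cluster_size,
                                       part_cut, d_clusters, part_i, Lx, L)) {
      continue;  // empty cluster, mirroring the WARNING path
    }
    cost += part_cut / cluster_size;
  }
  return cost;
}

}  // namespace raft
```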
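`scale_obs` normalizes each column of the (already transposed) eigenvector matrix by its 2-norm before k-means. The launch helper rounds m up with `next_pow2` and clamps the block's x-dimension to [2, 32] (m = 10, say, gives a 16×16 thread block), and the kernel accumulates squared entries with a sub-warp inclusive prefix sum built from `utils::shfl_up`, broadcasting each slice's running total with `utils::shfl`. The same scan pattern in isolation, assuming a single full warp:

```cpp
// Warp-level inclusive prefix sum, launched as <<<1, 32>>>:
//   out[i] = in[0] + ... + in[i]
// This is the k-loop scale_obs_kernel runs on each blockDim.x-sized slice.
#include <raft/utils/sm_utils.hpp>

__global__ void warp_inclusive_scan(const float* in, float* out) {
  float v = in[threadIdx.x];
  for (int k = 1; k < 32; k *= 2) {
    float up = raft::utils::shfl_up(v, k, 32);
    if (threadIdx.x >= k) v += up;  // lanes below k keep their own value
  }
  out[threadIdx.x] = v;
}
```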
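`utils::diagmv` is the kernel `laplacian_matrix_t::mv` uses to apply the degree matrix; on its own it performs the fused update y += alpha·D∘x over a grid-strided range. A hypothetical wrapper with the same launch shape as the library (1024-thread blocks, grid clamped to 65535):

```cpp
#include <algorithm>

#include <raft/utils/sm_utils.hpp>

void apply_diagonal(cudaStream_t stream, int n, double alpha,
                    const double* d_diag, const double* d_x, double* d_y) {
  constexpr int block = 1024;
  int grid = std::min((n + block - 1) / block, 65535);
  // y[i] += alpha * d_diag[i] * d_x[i] for i in [0, n)
  raft::utils::diagmv<<<grid, block, 0, stream>>>(n, alpha, d_diag, d_x, d_y);
}
```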
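Finally, the `atomicFP*` helpers paper over architecture differences: `atomicFPAdd(double*, double)` uses the native double `atomicAdd` from sm_60 onward and falls back to the compare-and-swap loop below that, and the min/max/or/log variants are built on the same CAS pattern. A small grid-strided reduction into a single double accumulator:

```cpp
#include <raft/utils/sm_utils.hpp>

// result must be zero-initialized (e.g. via cudaMemset) before the launch.
__global__ void accumulate_sum(const double* x, int n, double* result) {
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += blockDim.x * gridDim.x) {
    raft::utils::atomicFPAdd(result, x[i]);  // CAS loop below sm_60
  }
}
```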