diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eb61cc062..a55599eb4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - PR #22: Preserve order in comms workers for rank initialization - PR #38: Remove #include from `raft/mr/` - PR #39: Adding a virtual destructor to `raft::handle_t` and `raft::comms::comms_t` +- PR #41: Upgrade to `cusparseSpMV()`, alg selection, and rectangular matrices. ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1190d92c94..98189f15bc 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -213,7 +213,7 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, } /** @} */ -#if __CUDACC_VER_MAJOR__ > 10 +#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0 /** * @defgroup cusparse Create CSR operations * @{ @@ -226,9 +226,8 @@ cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, - int64_t nnz, int32_t* csrRowOffsets, - int32_t* csrColInd, - float* csrValues) { + int64_t nnz, int* csrRowOffsets, + int* csrColInd, float* csrValues) { return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, @@ -237,9 +236,8 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, - int64_t nnz, int32_t* csrRowOffsets, - int32_t* csrColInd, - double* csrValues) { + int64_t nnz, int* csrRowOffsets, + int* csrColInd, double* csrValues) { return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, @@ -514,5 +512,76 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, } /** @} */ +/** + * @defgroup CsrmvEx cusparse csrmvex operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, + const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, + const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, + const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize( + handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, + CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, + CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, + const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize( + handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, + CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, + CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); +} + +template +cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, + const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, + const T* beta, T* y, T* buffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, + const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const float* x, const float* beta, float* y, float* buffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, + descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, + CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, + CUDA_R_32F, buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, + const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const double* x, const double* beta, double* y, double* buffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, + descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, + CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, + CUDA_R_64F, buffer); +} +/** @} */ + } // namespace sparse } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 2923c76902..4b531231f0 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -38,6 +38,16 @@ namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header +// specifies type of algorithm used +// for SpMv: +// +enum struct sparse_mv_alg_t : int { + SPARSE_MV_UNDEFINED = -1, + SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix + SPARSE_MV_ALG1, // typical for CSR + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices +}; + // Vector "view"-like aggregate for linear algebra purposes // template @@ -109,6 +119,18 @@ class vector_t { template struct sparse_matrix_t { + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const ncols, + index_type const nnz) + : handle_(raft_handle), + row_offsets_(row_offsets), + col_indices_(col_indices), + values_(values), + nrows_(nrows), + ncols_(ncols), + nnz_(nnz) {} + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) @@ -117,6 +139,7 @@ struct sparse_matrix_t { col_indices_(col_indices), values_(values), nrows_(nrows), + ncols_(nrows), nnz_(nnz) {} template @@ -126,6 +149,7 @@ struct sparse_matrix_t { col_indices_(csr_view.indices), values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), + ncols_(csr_view.number_of_vertices), nnz_(csr_view.number_of_edges) {} virtual ~sparse_matrix_t(void) = @@ -137,8 +161,9 @@ struct sparse_matrix_t { // down is dangerous) // virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, - value_type* __restrict__ y, bool transpose = false, - bool symmetric = false) const { + value_type* __restrict__ y, + sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, + bool transpose = false, bool symmetric = false) const { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); @@ -151,26 +176,34 @@ struct sparse_matrix_t { transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose -#if __CUDACC_VER_MAJOR__ > 10 +#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0 + auto size_x = transpose ? nrows_ : ncols_; + auto size_y = transpose ? ncols_ : nrows_; + + cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); //create descriptors: + //(below casts are necessary, because + // cusparseCreateCsr(...) takes non-const + // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_, - col_indices_, values_)); + CUSPARSE_CHECK(cusparsecreatecsr( + &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), + const_cast(col_indices_), const_cast(values_))); cusparseDnVecDescr_t vecX; - CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x)); + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); cusparseDnVecDescr_t vecY; - CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); + CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); //get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK( - cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, CUSPARSE_CSRMV_ALG1, &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, + vecX, &beta, vecY, spmv_alg, + &bufferSize, stream)); //allocate external buffer: // @@ -179,8 +212,7 @@ struct sparse_matrix_t { //finally perform SpMV: // CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, CUSPARSE_CSRMV_ALG1, - external_buffer.raw(), stream)); + vecY, spmv_alg, external_buffer.raw(), stream)); //free descriptors: //(TODO: maybe wrap them in a RAII struct?) @@ -199,7 +231,7 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, &alpha, descr, values_, row_offsets_, col_indices_, x, &beta, y, stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); @@ -208,6 +240,19 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } +#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0 + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { + switch (alg) { + case sparse_mv_alg_t::SPARSE_MV_ALG1: + return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: + return CUSPARSE_CSRMV_ALG2; + default: + return CUSPARSE_MV_ALG_DEFAULT; + } + } +#endif + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate handle_t const& handle_; @@ -215,6 +260,7 @@ struct sparse_matrix_t { index_type const* col_indices_; value_type const* values_; index_type const nrows_; + index_type const ncols_; index_type const nnz_; }; @@ -252,8 +298,9 @@ struct laplacian_matrix_t : sparse_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type* __restrict__ x, value_type beta, - value_type* __restrict__ y, bool transpose = false, - bool symmetric = false) const override { + value_type* __restrict__ y, + sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, + bool transpose = false, bool symmetric = false) const override { constexpr int BLOCK_SIZE = 1024; auto n = sparse_matrix_t::nrows_; @@ -282,7 +329,8 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, + symmetric); } vector_t diagonal_; @@ -316,8 +364,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type* __restrict__ x, value_type beta, - value_type* __restrict__ y, bool transpose = false, - bool symmetric = false) const override { + value_type* __restrict__ y, + sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, + bool transpose = false, bool symmetric = false) const override { auto n = sparse_matrix_t::nrows_; auto cublas_h = @@ -327,7 +376,8 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, + symmetric); value_type dot_res; // gamma = d'*x