diff --git a/misc/test_gpu_sm_op.py b/misc/test_gpu_sm_op.py new file mode 100644 index 0000000000000..2de61af1d9a48 --- /dev/null +++ b/misc/test_gpu_sm_op.py @@ -0,0 +1,84 @@ +import numpy as np +import scipy +from numpy.random import default_rng +from scipy import stats +from scipy.sparse import random + +import taichi as ti + +ti.init(arch=ti.cuda) + +idx_dt = ti.i32 +val_dt = ti.f32 + +seed = 2 +np.random.seed(seed) + +rng = default_rng(seed) +rvs = stats.poisson(3, loc=1).rvs +N = 5 +np_dtype = np.float32 +rows = N +cols = N - 1 + +S1 = random(rows, cols, density=0.5, random_state=rng, + data_rvs=rvs).astype(np_dtype) +S2 = random(rows, cols, density=0.5, random_state=rng, + data_rvs=rvs).astype(np_dtype) + +nnz_A = len(S1.data) +nnz_B = len(S2.data) + +coo_S1 = scipy.sparse.coo_matrix(S1) +coo_S2 = scipy.sparse.coo_matrix(S2) + +row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt) +col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt) +value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt) +row_coo_A.from_numpy(coo_S1.row) +col_coo_A.from_numpy(coo_S1.col) +value_coo_A.from_numpy(coo_S1.data) + +row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt) +col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt) +value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt) +row_coo_B.from_numpy(coo_S2.row) +col_coo_B.from_numpy(coo_S2.col) +value_coo_B.from_numpy(coo_S2.data) + +A = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32) +B = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32) +A.build_coo(row_coo_A, col_coo_A, value_coo_A) +B.build_coo(row_coo_B, col_coo_B, value_coo_B) + +print('>>>> A:') +print(A) +print('>>>> B:') +print(B) + +print('>>>> C = A + B:') +C = A + B +print(C) +print('>>>> verify:') +S3 = S1 + S2 +print(S3.A) +print('>>>> C - A:') +D = C - A +print(D) +print('>>>> verify:') +print((S3 - S1).A) +print('>>>> A * 2.5:') +E = A * 2.5 +print(E) +print('>>>> verify:') +print((2.5 * S1).A) +print('>>>> A.T:') +F = A.transpose() +print(F) +print('>>>> verify:') +print(S1.T.A) +print('>>>> A @ B.T:') +G = A @ B.transpose() +print(G) +print('>>>> verify:') +print((S1 @ S2.T).A) diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index f142766111c64..af11862dafe06 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -33,6 +33,28 @@ struct key_hash { return h1 ^ h2; } }; + +template +void print_triplet_from_csr(int64_t n_rows, + int n_cols, + T *row, + T1 *col, + T2 *value, + std::ostringstream &ostr) { + using Triplets = Eigen::Triplet; + std::vector trips; + for (int64_t i = 1; i <= n_rows; ++i) { + auto n_i = row[i] - row[i - 1]; + for (auto j = 0; j < n_i; ++j) { + trips.push_back({i - 1, col[row[i - 1] + j], value[row[i - 1] + j]}); + } + } + Eigen::SparseMatrix m(n_rows, n_cols); + m.setFromTriplets(trips.begin(), trips.end()); + Eigen::IOFormat clean_fmt(4, 0, ", ", "\n", "[", "]"); + ostr << Eigen::MatrixXf(m.cast()).format(clean_fmt); +} + } // namespace namespace taichi { @@ -169,6 +191,14 @@ std::unique_ptr make_cu_sparse_matrix(int rows, std::make_unique(rows, cols, dt)); } +std::unique_ptr make_cu_sparse_matrix(cusparseSpMatDescr_t mat, + int rows, + int cols, + DataType dt) { + return std::unique_ptr( + std::make_unique(mat, rows, cols, dt)); +} + template void build_ndarray_template(SparseMatrix &sm, intptr_t data_ptr, @@ -204,11 +234,11 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, int nnz) { #if defined(TI_WITH_CUDA) // Step 1: Sort coo first - cusparseHandle_t cusparse_handle = NULL; + cusparseHandle_t 
cusparse_handle = nullptr; CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); cusparseSpVecDescr_t vec_permutation; cusparseDnVecDescr_t vec_values; - void *d_permutation = NULL, *d_values_sorted = NULL; + void *d_permutation = nullptr, *d_values_sorted = nullptr; CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int)); CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float)); CUSPARSEDriver::get_instance().cpCreateSpVec( @@ -220,7 +250,7 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt( cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr, &bufferSize); - void *dbuffer = NULL; + void *dbuffer = nullptr; if (bufferSize > 0) CUDADriver::get_instance().malloc(&dbuffer, bufferSize); // Setup permutation vector to identity @@ -234,7 +264,7 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, CUDADriver::get_instance().memcpy_device_to_device( coo_values_ptr, d_values_sorted, nnz * sizeof(float)); // Step 2: coo to csr - void *csr_row_offset_ptr = NULL; + void *csr_row_offset_ptr = nullptr; CUDADriver::get_instance().malloc(&csr_row_offset_ptr, sizeof(int) * (rows_ + 1)); CUSPARSEDriver::get_instance().cpCoo2Csr( @@ -276,6 +306,285 @@ void make_sparse_matrix_from_ndarray_cusparse(Program *prog, #endif } +// Reference::https://docs.nvidia.com/cuda/cusparse/index.html#csrgeam2 +std::unique_ptr CuSparseMatrix::addition( + const CuSparseMatrix &other, + const float alpha, + const float beta) const { +#if defined(TI_WITH_CUDA) + // Get information of this matrix: A + size_t nrows_A = 0, ncols_A = 0, nnz_A = 0; + void *drow_offsets_A = nullptr, *dcol_indices_A = nullptr, + *dvalues_A = nullptr; + cusparseIndexType_t csrRowOffsetsType_A, csrColIndType_A; + cusparseIndexBase_t idxBase_A; + cudaDataType valueType_A; + TI_ASSERT(matrix_ != nullptr); + + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &nrows_A, &ncols_A, &nnz_A, &drow_offsets_A, &dcol_indices_A, + &dvalues_A, &csrRowOffsetsType_A, &csrColIndType_A, &idxBase_A, + &valueType_A); + // Get information of other matrix: B + size_t nrows_B = 0, ncols_B = 0, nnz_B = 0; + void *drow_offsets_B = nullptr, *dcol_indices_B = nullptr, + *dvalues_B = nullptr; + cusparseIndexType_t csrRowOffsetsType_B, csrColIndType_B; + cusparseIndexBase_t idxBase_B; + cudaDataType valueType_B; + CUSPARSEDriver::get_instance().cpCsrGet( + other.matrix_, &nrows_B, &ncols_B, &nnz_B, &drow_offsets_B, + &dcol_indices_B, &dvalues_B, &csrRowOffsetsType_B, &csrColIndType_B, + &idxBase_B, &valueType_B); + + // Create sparse matrix: C + int *drow_offsets_C = nullptr; + int *dcol_indices_C = nullptr; + float *dvalues_C = nullptr; + cusparseMatDescr_t descrA = nullptr, descrB = nullptr, descrC = nullptr; + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrA); + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrB); + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrC); + CUSPARSEDriver::get_instance().cpSetMatType(descrA, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatType(descrB, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatType(descrC, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrC, + CUSPARSE_INDEX_BASE_ZERO); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrA, + CUSPARSE_INDEX_BASE_ZERO); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrB, + CUSPARSE_INDEX_BASE_ZERO); + + // Start to do addition + cusparseHandle_t 
cusparse_handle; + CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); + // alpha, nnzTotalDevHostPtr points to host memory + size_t BufferSizeInBytes; + char *buffer = nullptr; + int nnzC; + int *nnzTotalDevHostPtr = &nnzC; + CUSPARSEDriver::get_instance().cpSetPointerMode(cusparse_handle, + CUSPARSE_POINTER_MODE_HOST); + CUDADriver::get_instance().malloc((void **)(&drow_offsets_C), + sizeof(int) * (nrows_A + 1)); + // Prepare buffer + CUSPARSEDriver::get_instance().cpScsrgeam2_bufferSizeExt( + cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A, + dvalues_A, drow_offsets_A, dcol_indices_A, (void *)&beta, descrB, nnz_B, + dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C, + drow_offsets_C, dcol_indices_C, &BufferSizeInBytes); + + if (BufferSizeInBytes > 0) + CUDADriver::get_instance().malloc((void **)(&buffer), BufferSizeInBytes); + + // Determine drow_offsets_C and the total number of nonzero elements. + CUSPARSEDriver::get_instance().cpXcsrgeam2Nnz( + cusparse_handle, nrows_A, ncols_A, descrA, nnz_A, drow_offsets_A, + dcol_indices_A, descrB, nnz_B, drow_offsets_B, dcol_indices_B, descrC, + drow_offsets_C, nnzTotalDevHostPtr, buffer); + + int baseC; + if (nullptr != nnzTotalDevHostPtr) { + nnzC = *nnzTotalDevHostPtr; + } else { + CUDADriver::get_instance().memcpy_device_to_host( + (void *)(&nnzC), (void *)(drow_offsets_C + nrows_A), sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host( + (void *)(&baseC), (void *)(drow_offsets_C), sizeof(int)); + nnzC -= baseC; + } + + CUDADriver::get_instance().malloc((void **)&dcol_indices_C, + sizeof(int) * nnzC); + CUDADriver::get_instance().malloc((void **)&dvalues_C, sizeof(float) * nnzC); + + CUSPARSEDriver::get_instance().cpScsrgeam2( + cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A, + dvalues_A, drow_offsets_A, dcol_indices_A, (void *)(&beta), descrB, nnz_B, + dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C, + drow_offsets_C, dcol_indices_C, buffer); + + cusparseSpMatDescr_t matrix_C; + CUSPARSEDriver::get_instance().cpCreateCsr( + &matrix_C, rows_, cols_, nnzC, drow_offsets_C, dcol_indices_C, dvalues_C, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); + + CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrB); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrC); + CUDADriver::get_instance().mem_free(buffer); + return make_cu_sparse_matrix(matrix_C, rows_, cols_, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +std::unique_ptr CuSparseMatrix::matmul( + const CuSparseMatrix &other) const { +#if defined(TI_WITH_CUDA) + return gemm(other, 1.0f, 1.0f); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +// Reference: +// https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm +std::unique_ptr CuSparseMatrix::gemm(const CuSparseMatrix &other, + const float alpha, + const float beta) const { +#if defined(TI_WITH_CUDA) + cusparseHandle_t handle; + CUSPARSEDriver::get_instance().cpCreate(&handle); + cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE; + + size_t nrows_A = rows_; + size_t ncols_B = other.cols_; + auto mat_A = matrix_; + auto mat_B = other.matrix_; + + // 1. 
create resulting matrix `C` + cusparseSpMatDescr_t mat_C; + CUSPARSEDriver::get_instance().cpCreateCsr( + &mat_C, nrows_A, ncols_B, 0, nullptr, nullptr, nullptr, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); + + // 2. create gemm descr + cusparseSpGEMMDescr_t spgemm_desc; + CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc); + + // 3. ask buffer_size1 bytes for external memory + void *d_buffer1; + size_t buffer_size1 = 0; + CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation( + handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C, + CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, nullptr); + CUDADriver::get_instance().malloc((void **)&d_buffer1, buffer_size1); + // 4. inspect the matrices A and B to understand the memory requirement for + // the next step + CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation( + handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C, + CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, + d_buffer1); + + // 5. ask buffer_size2 bytes for external memory + size_t buffer_size2 = 0; + CUSPARSEDriver::get_instance().cpSpGEMM_compute( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr); + void *d_buffer2; + CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2); + + // 6. compute the intermediate product of A * B + CUSPARSEDriver::get_instance().cpSpGEMM_compute( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, d_buffer2); + + // 7. get info of matrix C + size_t nrows_C, cols_C, nnz_C; + CUSPARSEDriver::get_instance().cpGetSize(mat_C, &nrows_C, &cols_C, &nnz_C); + + // 8. allocate matric C + int *d_csr_row_ptr_C, *d_csr_col_ind_C; + float *d_values_C; + CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_C, + (nrows_A + 1) * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_csr_col_ind_C, + nnz_C * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_values_C, + nnz_C * sizeof(float)); + + // 9. update matrix C with new pointers + CUSPARSEDriver::get_instance().cpCsrSetPointers(mat_C, d_csr_row_ptr_C, + d_csr_col_ind_C, d_values_C); + + // 10. copy the final products of C. + CUSPARSEDriver::get_instance().cpSpGEMM_copy( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc); + + CUDADriver::get_instance().mem_free(d_buffer1); + CUDADriver::get_instance().mem_free(d_buffer2); + CUSPARSEDriver::get_instance().cpDestroy(handle); + CUSPARSEDriver::get_instance().cpDestroySpGEMM(spgemm_desc); + + return make_cu_sparse_matrix(mat_C, nrows_A, ncols_B, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +// Convert CSR to CSC format using routine `Csr2cscEx2` +// to implement transpose. 
+// Reference +// https://stackoverflow.com/questions/57368010/how-to-transpose-a-sparse-matrix-in-cusparse +std::unique_ptr<SparseMatrix> CuSparseMatrix::transpose() const { +#if defined(TI_WITH_CUDA) + cusparseHandle_t handle; + CUSPARSEDriver::get_instance().cpCreate(&handle); + size_t nrows_A, ncols_A, nnz; + void *d_csr_val = nullptr, *d_csr_val_AT = nullptr; + int *d_csr_row_ptr = nullptr, *d_csr_col_ind = nullptr; + int *d_csr_row_ptr_AT = nullptr, *d_csr_col_ptr_AT = nullptr; + cusparseIndexType_t csr_row_ptr_type, csr_col_ptr_type; + cusparseIndexBase_t idx_base_type; + cudaDataType value_type; + size_t buffer_size; + + // 1. get pointers of A + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &nrows_A, &ncols_A, &nnz, (void **)&d_csr_row_ptr, + (void **)&d_csr_col_ind, (void **)&d_csr_val, &csr_row_ptr_type, + &csr_col_ptr_type, &idx_base_type, &value_type); + + // 2. ask buffer size for Csr2cscEx2 + CUSPARSEDriver::get_instance().cpCsr2cscEx2_bufferSize( + handle, nrows_A, ncols_A, nnz, (void *)&d_csr_val, (int *)&d_csr_row_ptr, + (int *)&d_csr_col_ind, (void *)&d_csr_val_AT, (int *)&d_csr_row_ptr_AT, + (int *)&d_csr_col_ptr_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size); + void *buffer = nullptr; + CUDADriver::get_instance().malloc((void **)&buffer, buffer_size); + + CUDADriver::get_instance().malloc((void **)&d_csr_val_AT, + nnz * sizeof(float)); + CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_AT, + (ncols_A + 1) * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_csr_col_ptr_AT, + nnz * sizeof(int)); + + // 3. execute Csr2cscEx2 + CUSPARSEDriver::get_instance().cpCsr2cscEx2( + handle, nrows_A, ncols_A, nnz, d_csr_val, d_csr_row_ptr, d_csr_col_ind, + d_csr_val_AT, d_csr_row_ptr_AT, d_csr_col_ptr_AT, CUDA_R_32F, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + buffer); + + // 4. create AT. 
+ cusparseSpMatDescr_t mat_AT; + CUSPARSEDriver::get_instance().cpCreateCsr( + &mat_AT, ncols_A, nrows_A, nnz, (void *)d_csr_row_ptr_AT, + (void *)d_csr_col_ptr_AT, (void *)d_csr_val_AT, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUDADriver::get_instance().mem_free(buffer); + CUSPARSEDriver::get_instance().cpDestroy(handle); + return make_cu_sparse_matrix(mat_AT, ncols_A, nrows_A, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { #if defined(TI_WITH_CUDA) size_t dX = prog->get_ndarray_data_ptr_as_int(&x); @@ -295,7 +604,7 @@ void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX, &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize); - void *dBuffer = NULL; + void *dBuffer = nullptr; if (bufferSize > 0) CUDADriver::get_instance().malloc(&dBuffer, bufferSize); CUSPARSEDriver::get_instance().cpSpMV( @@ -309,5 +618,34 @@ void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { #endif } +const std::string CuSparseMatrix::to_string() const { + std::ostringstream ostr; +#ifdef TI_WITH_CUDA + size_t rows, cols, nnz; + float *dR; + int *dC, *dV; + cusparseIndexType_t row_type, column_type; + cusparseIndexBase_t idx_base; + cudaDataType value_type; + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV, + &row_type, &column_type, &idx_base, &value_type); + + auto *hR = new int[rows + 1]; + auto *hC = new int[nnz]; + auto *hV = new float[nnz]; + + CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR, + (rows + 1) * sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC, + (nnz) * sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV, + (nnz) * sizeof(float)); + + print_triplet_from_csr(rows, cols, hR, hC, hV, ostr); +#endif + return ostr.str(); +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index 53d262919cb06..2065703bdb252 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -126,6 +126,7 @@ class EigenSparseMatrix : public SparseMatrix { } ~EigenSparseMatrix() override = default; + void build_triplets(void *triplets_adr) override; const std::string to_string() const override; @@ -211,19 +212,64 @@ class CuSparseMatrix : public SparseMatrix { } #endif } + explicit CuSparseMatrix(cusparseSpMatDescr_t A, + int rows, + int cols, + DataType dt) + : SparseMatrix(rows, cols, dt), matrix_(A) { + } + CuSparseMatrix(const CuSparseMatrix &sm) + : SparseMatrix(sm.rows_, sm.cols_, sm.dtype_), matrix_(sm.matrix_) { + } virtual ~CuSparseMatrix(); + + // TODO: Overload +=, -= and *= + friend std::unique_ptr operator+(const CuSparseMatrix &lhs, + const CuSparseMatrix &rhs) { + auto m = lhs.addition(rhs, 1.0, 1.0); + return m; + }; + + friend std::unique_ptr operator-(const CuSparseMatrix &lhs, + const CuSparseMatrix &rhs) { + return lhs.addition(rhs, 1.0, -1.0); + }; + + friend std::unique_ptr operator*(const CuSparseMatrix &sm, + float scale) { + return sm.addition(sm, scale, 0.0); + } + + friend std::unique_ptr operator*(float scale, + const CuSparseMatrix &sm) { + return sm.addition(sm, scale, 0.0); + } + + std::unique_ptr addition(const CuSparseMatrix &other, + const float alpha, + const float 
beta) const; + + std::unique_ptr matmul(const CuSparseMatrix &other) const; + + std::unique_ptr gemm(const CuSparseMatrix &other, + const float alpha, + const float beta) const; + + std::unique_ptr transpose() const; + void build_csr_from_coo(void *coo_row_ptr, void *coo_col_ptr, void *coo_values_ptr, int nnz) override; + void spmv(Program *prog, const Ndarray &x, Ndarray &y); const void *get_matrix() const override { return &matrix_; }; - void print_info(); + const std::string to_string() const override; private: cusparseSpMatDescr_t matrix_; @@ -237,6 +283,10 @@ std::unique_ptr make_sparse_matrix( std::unique_ptr make_cu_sparse_matrix(int rows, int cols, DataType dt); +std::unique_ptr make_cu_sparse_matrix(cusparseSpMatDescr_t mat, + int rows, + int cols, + DataType dt); void make_sparse_matrix_from_ndarray(Program *prog, SparseMatrix &sm, diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 92a3067f0f0bc..3795ba380aebb 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -1221,7 +1221,15 @@ void export_lang(py::module &m) { py::class_(m, "CuSparseMatrix") .def(py::init()) - .def("spmv", &CuSparseMatrix::spmv); + .def(py::init()) + .def("spmv", &CuSparseMatrix::spmv) + .def(py::self + py::self) + .def(py::self - py::self) + .def(py::self * float32()) + .def(float32() * py::self) + .def("matmul", &CuSparseMatrix::matmul) + .def("transpose", &CuSparseMatrix::transpose) + .def("to_string", &CuSparseMatrix::to_string); py::class_(m, "SparseSolver") .def("compute", &SparseSolver::compute) diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index 88bb33951e3e7..d44abd7124992 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -447,6 +447,10 @@ struct cusparseSpMatDescr; typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t; typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t; typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t; + +struct cusparseSpGEMMDescr; +typedef struct cusparseSpGEMMDescr *cusparseSpGEMMDescr_t; + typedef enum { CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector ///< indices @@ -504,6 +508,27 @@ typedef enum { CUSPARSE_SPMV_COO_ALG2 = 4 } cusparseSpMVAlg_t; +typedef enum { + CUSPARSE_SPGEMM_DEFAULT = 0, + CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC = 1, + CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2 +} cusparseSpGEMMAlg_t; + +typedef enum { + CUSPARSE_POINTER_MODE_HOST = 0, + CUSPARSE_POINTER_MODE_DEVICE = 1 +} cusparsePointerMode_t; + +typedef enum { + CUSPARSE_ACTION_SYMBOLIC = 0, + CUSPARSE_ACTION_NUMERIC = 1 +} cusparseAction_t; + +typedef enum { + CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc + CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc +} cusparseCsr2CscAlg_t; + typedef enum { CUSPARSE_MATRIX_TYPE_GENERAL = 0, CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h index 1a70e36de7883..53e7881e036b7 100644 --- a/taichi/rhi/cuda/cusparse_functions.inc.h +++ b/taichi/rhi/cuda/cusparse_functions.inc.h @@ -10,6 +10,7 @@ PER_CUSPARSE_FUNCTION(cpCreateCsr, cusparseCreateCsr, cusparseSpMatDescr_t*, int PER_CUSPARSE_FUNCTION(cpCoo2Csr, cusparseXcoo2csr, cusparseHandle_t ,const void*, int, int,void*, cusparseIndexBase_t ); PER_CUSPARSE_FUNCTION(cpCsrGet, cusparseCsrGet, cusparseSpMatDescr_t ,size_t *,size_t *,size_t *,void**,void**,void**,cusparseIndexType_t* ,cusparseIndexType_t* ,cusparseIndexBase_t* ,cudaDataType*); 
PER_CUSPARSE_FUNCTION(cpCreateMatDescr, cusparseCreateMatDescr, cusparseMatDescr_t*); +PER_CUSPARSE_FUNCTION(cpDestroyMatDescr, cusparseDestroyMatDescr, cusparseMatDescr_t); PER_CUSPARSE_FUNCTION(cpSetMatType, cusparseSetMatType, cusparseMatDescr_t, cusparseMatrixType_t); PER_CUSPARSE_FUNCTION(cpSetMatIndexBase, cusparseSetMatIndexBase, cusparseMatDescr_t, cusparseIndexBase_t); PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t); @@ -19,6 +20,8 @@ PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermuta PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t); +PER_CUSPARSE_FUNCTION(cpSetPointerMode, cusparseSetPointerMode, cusparseHandle_t, cusparsePointerMode_t); +PER_CUSPARSE_FUNCTION(cpCsrSetPointers, cusparseCsrSetPointers, cusparseSpMatDescr_t,void*, void*, void*); // cusparse dense vector description PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t*, int, void*, cudaDataType); @@ -27,3 +30,19 @@ PER_CUSPARSE_FUNCTION(cpDestroyDnVec, cusparseDestroyDnVec, cusparseDnVecDescr_t // cusparse sparse matrix-vector multiplication PER_CUSPARSE_FUNCTION(cpSpMV_bufferSize, cusparseSpMV_bufferSize, cusparseHandle_t, cusparseOperation_t, const void*,cusparseSpMatDescr_t, cusparseDnVecDescr_t,const void*, cusparseDnVecDescr_t,cudaDataType, cusparseSpMVAlg_t, size_t*); PER_CUSPARSE_FUNCTION(cpSpMV, cusparseSpMV, cusparseHandle_t, cusparseOperation_t, const void*,cusparseSpMatDescr_t, cusparseDnVecDescr_t,const void*, cusparseDnVecDescr_t,cudaDataType, cusparseSpMVAlg_t, void*); + + +// cusparse sparse matrix-matrix operation +PER_CUSPARSE_FUNCTION(cpScsrgeam2_bufferSizeExt, cusparseScsrgeam2_bufferSizeExt, cusparseHandle_t ,int,int,void*,const cusparseMatDescr_t ,int, void*, void*, void*,void*,const cusparseMatDescr_t ,int, void*,void*,void*,const cusparseMatDescr_t ,void*,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpGetSize, cusparseSpMatGetSize, cusparseSpMatDescr_t, size_t* , size_t* , size_t*); +PER_CUSPARSE_FUNCTION(cpXcsrgeam2Nnz, cusparseXcsrgeam2Nnz, cusparseHandle_t,int,int,const cusparseMatDescr_t ,int,void*,void*,const cusparseMatDescr_t ,int,void*,void*,const cusparseMatDescr_t ,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpScsrgeam2, cusparseScsrgeam2, cusparseHandle_t,int,int,void*,const cusparseMatDescr_t ,int, void*,void*,void*,void*,const cusparseMatDescr_t ,int,void*,void*,void*,const cusparseMatDescr_t ,void*,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_workEstimation, cusparseSpGEMM_workEstimation, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t,size_t*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_compute, cusparseSpGEMM_compute, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t,size_t*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_copy, cusparseSpGEMM_copy, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const 
void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateSpGEMM, cusparseSpGEMM_createDescr, cusparseSpGEMMDescr_t*); +PER_CUSPARSE_FUNCTION(cpDestroySpGEMM, cusparseSpGEMM_destroyDescr, cusparseSpGEMMDescr_t); + +// cusparse sparse matrix conversions +PER_CUSPARSE_FUNCTION(cpCsr2cscEx2_bufferSize, cusparseCsr2cscEx2_bufferSize, cusparseHandle_t, int, int, int, const void*, const int*, const int*, void*, int*, int*, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, size_t*); +PER_CUSPARSE_FUNCTION(cpCsr2cscEx2, cusparseCsr2cscEx2, cusparseHandle_t, int, int, int, const void*, const int*, const int*, void*, int*, int*, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, void*);
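Reviewer note (not part of the patch): for anyone without a CUDA device, below is a minimal CPU-side sketch of the semantics the new kernels are expected to match: csrgeam2 computes C = alpha*A + beta*B, the SpGEMM path computes A @ B, Csr2cscEx2 realizes transpose as a CSR-to-CSC conversion, and to_string() rebuilds (row, col, value) triplets from the raw CSR arrays the same way print_triplet_from_csr does. The helper name csr_to_triplets is illustrative only; everything here uses plain NumPy/SciPy.

import numpy as np
import scipy.sparse as sp

def csr_to_triplets(indptr, indices, data):
    # Mirror of print_triplet_from_csr: walk the row-offset array and
    # emit one (row, col, value) triplet per stored entry.
    rows, cols, vals = [], [], []
    for i in range(1, len(indptr)):
        for k in range(indptr[i - 1], indptr[i]):
            rows.append(i - 1)
            cols.append(indices[k])
            vals.append(data[k])
    return rows, cols, vals

rng = np.random.default_rng(0)
A = sp.random(5, 4, density=0.5, random_state=rng, format='csr', dtype=np.float32)
B = sp.random(5, 4, density=0.5, random_state=rng, format='csr', dtype=np.float32)
alpha, beta = 1.0, -1.0

C = alpha * A + beta * B   # what the cpScsrgeam2 path produces (here: A - B)
P = A @ B.T                # what the SpGEMM path (gemm/matmul) produces
AT = A.T.tocsr()           # transpose, i.e. reinterpret CSR of A as CSC of A^T

r, c, v = csr_to_triplets(A.indptr, A.indices, A.data)
A_rebuilt = sp.coo_matrix((v, (r, c)), shape=A.shape).toarray()
assert np.allclose(A_rebuilt, A.toarray())
assert np.allclose(C.toarray(), A.toarray() - B.toarray())
assert np.allclose(P.toarray(), A.toarray() @ B.toarray().T)
assert np.allclose(AT.toarray(), A.toarray().T)

This is the same set of identities the checks in misc/test_gpu_sm_op.py exercise against SciPy, just without Taichi in the loop.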