From 8d43c87a5862ff0bff7635252a0c2fc0a6b61ec6 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:53:46 +0800 Subject: [PATCH] [Lang] Support GPU solve with analyzePattern and factorize (#6158) Issue: #2906 --- misc/test_coo_cusolver.py | 12 +++ python/taichi/linalg/sparse_solver.py | 10 ++ taichi/program/sparse_matrix.cpp | 4 + taichi/program/sparse_matrix.h | 17 ++++ taichi/program/sparse_solver.cpp | 110 ++++++++++++++++++++-- taichi/program/sparse_solver.h | 34 +++++-- taichi/python/export_lang.cpp | 1 + taichi/rhi/cuda/cuda_types.h | 6 ++ taichi/rhi/cuda/cusolver_functions.inc.h | 8 ++ tests/python/test_sparse_linear_solver.py | 55 +++++++++++ 10 files changed, 240 insertions(+), 17 deletions(-) diff --git a/misc/test_coo_cusolver.py b/misc/test_coo_cusolver.py index 7570f8cf53bb1..1be57f456aed9 100644 --- a/misc/test_coo_cusolver.py +++ b/misc/test_coo_cusolver.py @@ -61,3 +61,15 @@ def print_x(x: ti.types.ndarray(), ncols: ti.i32): print( f"The solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}" ) + +solver = ti.linalg.SparseSolver() +solver.analyze_pattern(A_ti) +solver.factorize(A_ti) +solver.solve_rf(A_ti, b, x_ti) + +ti.sync() +print_x(x_ti, ncols) +ti.sync() +print( + f"The cusolver rf solution and numpy solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}" +) diff --git a/python/taichi/linalg/sparse_solver.py b/python/taichi/linalg/sparse_solver.py index aa237bb5af2b8..19d78d2ff61d0 100644 --- a/python/taichi/linalg/sparse_solver.py +++ b/python/taichi/linalg/sparse_solver.py @@ -100,6 +100,16 @@ def solve_cu(self, sparse_matrix, b, x): f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now." ) + def solve_rf(self, sparse_matrix, b, x): + if isinstance(sparse_matrix, SparseMatrix) and isinstance( + b, Ndarray) and isinstance(x, Ndarray): + self.solver.solve_rf(get_runtime().prog, sparse_matrix.matrix, + b.arr, x.arr) + else: + raise TaichiRuntimeError( + f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now." + ) + def info(self): """Check if the linear systems are solved successfully. diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 23ca3371057e4..7d62b31dcfb4b 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -282,6 +282,10 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, CUDADriver::get_instance().mem_free(d_values_sorted); CUDADriver::get_instance().mem_free(d_permutation); CUDADriver::get_instance().mem_free(dbuffer); + csr_row_ptr_ = csr_row_offset_ptr; + csr_col_ind_ = coo_col_ptr; + csr_val_ = coo_values_ptr; + nnz_ = nnz; #endif } diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index 90f4a49d1dad9..1126da088aa30 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -270,8 +270,25 @@ class CuSparseMatrix : public SparseMatrix { const std::string to_string() const override; + void *get_row_ptr() const { + return csr_row_ptr_; + } + void *get_col_ind() const { + return csr_col_ind_; + } + void *get_val_ptr() const { + return csr_val_; + } + int get_nnz() const { + return nnz_; + } + private: cusparseSpMatDescr_t matrix_; + void *csr_row_ptr_{nullptr}; + void *csr_col_ind_{nullptr}; + void *csr_val_{nullptr}; + int nnz_{0}; }; std::unique_ptr make_sparse_matrix( diff --git a/taichi/program/sparse_solver.cpp b/taichi/program/sparse_solver.cpp index 06d29f26ea0ba..7d6d5cb0c9ed5 100644 --- a/taichi/program/sparse_solver.cpp +++ b/taichi/program/sparse_solver.cpp @@ -82,6 +82,73 @@ CuSparseSolver::CuSparseSolver() { } #endif } +// Reference: +// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp +void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { +#if defined(TI_WITH_CUDA) + // Retrive the info of the sparse matrix + SparseMatrix *sm_no_cv = const_cast(&sm); + CuSparseMatrix *A = dynamic_cast(sm_no_cv); + size_t rowsA = A->num_rows(); + size_t nnzA = A->get_nnz(); + void *d_csrRowPtrA = A->get_row_ptr(); + void *d_csrColIndA = A->get_col_ind(); + + CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_); + CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_); + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_); + CUSPARSEDriver::get_instance().cpSetMatType(descr_, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_, + CUSPARSE_INDEX_BASE_ZERO); + + // step 1: create opaque info structure + CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_); + + // step 2: analyze chol(A) to know structure of L + CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis( + cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_); + +#else + TI_NOT_IMPLEMENTED +#endif +} + +void CuSparseSolver::factorize(const SparseMatrix &sm) { +#if defined(TI_WITH_CUDA) + // Retrive the info of the sparse matrix + SparseMatrix *sm_no_cv = const_cast(&sm); + CuSparseMatrix *A = dynamic_cast(sm_no_cv); + size_t rowsA = A->num_rows(); + size_t nnzA = A->get_nnz(); + void *d_csrRowPtrA = A->get_row_ptr(); + void *d_csrColIndA = A->get_col_ind(); + void *d_csrValA = A->get_val_ptr(); + + size_t size_internal = 0; + size_t size_chol = 0; // size of working space for csrlu + // step 1: workspace for chol(A) + CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo( + cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA, + d_csrColIndA, info_, &size_internal, &size_chol); + + if (size_chol > 0) + CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol); + + // step 2: compute A = L*L^T + CUSOLVERDriver::get_instance().csSpScsrcholFactor( + cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA, + d_csrColIndA, info_, gpu_buffer_); + // step 3: check if the matrix is singular + const double tol = 1.e-14; + int singularity = 0; + CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_, + tol, &singularity); + TI_ASSERT(singularity == -1); +#else + TI_NOT_IMPLEMENTED +#endif +} void CuSparseSolver::solve_cu(Program *prog, const SparseMatrix &sm, @@ -100,16 +167,15 @@ void CuSparseSolver::solve_cu(Program *prog, printf("Cusolver version: %d.%d.%d\n", major_version, minor_version, patch_level); - const cusparseSpMatDescr_t *A = - (const cusparseSpMatDescr_t *)(sm.get_matrix()); - size_t nrows = 0, ncols = 0, nnz = 0; - void *drow_offsets = NULL, *dcol_indices = NULL, *dvalues = NULL; - cusparseIndexType_t csrRowOffsetsType, csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueType; - CUSPARSEDriver::get_instance().cpCsrGet( - *A, &nrows, &ncols, &nnz, &drow_offsets, &dcol_indices, &dvalues, - &csrRowOffsetsType, &csrColIndType, &idxBase, &valueType); + // Retrive the info of the sparse matrix + SparseMatrix *sm_no_cv = const_cast(&sm); + CuSparseMatrix *A = dynamic_cast(sm_no_cv); + size_t nrows = A->num_rows(); + size_t ncols = A->num_cols(); + size_t nnz = A->get_nnz(); + void *drow_offsets = A->get_row_ptr(); + void *dcol_indices = A->get_col_ind(); + void *dvalues = A->get_val_ptr(); size_t db = prog->get_ndarray_data_ptr_as_int(&b); size_t dx = prog->get_ndarray_data_ptr_as_int(&x); @@ -249,6 +315,30 @@ void CuSparseSolver::solve_cu(Program *prog, #endif } +void CuSparseSolver::solve_rf(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + Ndarray &x) { +#if defined(TI_WITH_CUDA) + // Retrive the info of the sparse matrix + SparseMatrix *sm_no_cv = const_cast(&sm); + CuSparseMatrix *A = dynamic_cast(sm_no_cv); + size_t rowsA = A->num_rows(); + size_t d_b = prog->get_ndarray_data_ptr_as_int(&b); + size_t d_x = prog->get_ndarray_data_ptr_as_int(&x); + CUSOLVERDriver::get_instance().csSpScsrcholSolve( + cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_); + + // TODO: free allocated memory and handles + // CUDADriver::get_instance().mem_free(gpu_buffer_); + // CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_); + // CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_); + // CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA); +#else + TI_NOT_IMPLEMENTED +#endif +} + std::unique_ptr make_sparse_solver(DataType dt, const std::string &solver_type, const std::string &ordering) { diff --git a/taichi/program/sparse_solver.h b/taichi/program/sparse_solver.h index 96e19a2e896c3..639f0c2e94715 100644 --- a/taichi/program/sparse_solver.h +++ b/taichi/program/sparse_solver.h @@ -15,6 +15,10 @@ class SparseSolver { virtual void analyze_pattern(const SparseMatrix &sm) = 0; virtual void factorize(const SparseMatrix &sm) = 0; virtual Eigen::VectorXf solve(const Eigen::Ref &b) = 0; + virtual void solve_rf(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + Ndarray &x) = 0; virtual void solve_cu(Program *prog, const SparseMatrix &sm, const Ndarray &b, @@ -36,24 +40,36 @@ class EigenSparseSolver : public SparseSolver { void solve_cu(Program *prog, const SparseMatrix &sm, const Ndarray &b, - Ndarray &x) override{}; + Ndarray &x) override { + TI_NOT_IMPLEMENTED; + }; + void solve_rf(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + Ndarray &x) override { + TI_NOT_IMPLEMENTED; + }; + bool info() override; }; class CuSparseSolver : public SparseSolver { private: + csrcholInfo_t info_{nullptr}; + cusolverSpHandle_t cusolver_handle_{nullptr}; + cusparseHandle_t cusparse_handel_{nullptr}; + cusparseMatDescr_t descr_{nullptr}; + void *gpu_buffer_{nullptr}; + public: CuSparseSolver(); ~CuSparseSolver() override = default; bool compute(const SparseMatrix &sm) override { TI_NOT_IMPLEMENTED; }; - void analyze_pattern(const SparseMatrix &sm) override { - TI_NOT_IMPLEMENTED; - }; - void factorize(const SparseMatrix &sm) override { - TI_NOT_IMPLEMENTED; - }; + void analyze_pattern(const SparseMatrix &sm) override; + + void factorize(const SparseMatrix &sm) override; Eigen::VectorXf solve(const Eigen::Ref &b) override { TI_NOT_IMPLEMENTED; }; @@ -61,6 +77,10 @@ class CuSparseSolver : public SparseSolver { const SparseMatrix &sm, const Ndarray &b, Ndarray &x) override; + void solve_rf(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + Ndarray &x) override; bool info() override { TI_NOT_IMPLEMENTED; }; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index f223719b3cf5f..5b4014c9b09ae 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -1249,6 +1249,7 @@ void export_lang(py::module &m) { .def("factorize", &SparseSolver::factorize) .def("solve", &SparseSolver::solve) .def("solve_cu", &SparseSolver::solve_cu) + .def("solve_rf", &SparseSolver::solve_rf) .def("info", &SparseSolver::info); m.def("make_sparse_solver", &make_sparse_solver); diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index d44abd7124992..e731ece7929de 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -555,4 +555,10 @@ typedef enum libraryPropertyType_t { struct cusolverSpContext; typedef struct cusolverSpContext *cusolverSpHandle_t; + +// copy from cusolverSp_LOWLEVEL_PREVIEW.h +struct csrcholInfoHost; +typedef struct csrcholInfoHost *csrcholInfoHost_t; +struct csrcholInfo; +typedef struct csrcholInfo *csrcholInfo_t; #endif diff --git a/taichi/rhi/cuda/cusolver_functions.inc.h b/taichi/rhi/cuda/cusolver_functions.inc.h index bc9672e4f745c..fbde791da6853 100644 --- a/taichi/rhi/cuda/cusolver_functions.inc.h +++ b/taichi/rhi/cuda/cusolver_functions.inc.h @@ -12,3 +12,11 @@ PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHa PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *); PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *); PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *); + +// cusolver preview API +PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*); +PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t ); +PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *); +PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *); +PER_CUSOLVER_FUNCTION(csSpScsrcholZeroPivot, cusolverSpScsrcholZeroPivot, cusolverSpHandle_t, csrcholInfo_t ,float ,void *); +PER_CUSOLVER_FUNCTION(csSpScsrcholSolve, cusolverSpScsrcholSolve, cusolverSpHandle_t ,int ,void *,void *,csrcholInfo_t ,void *); diff --git a/tests/python/test_sparse_linear_solver.py b/tests/python/test_sparse_linear_solver.py index a3021b855c431..6f2b09d8d19d9 100644 --- a/tests/python/test_sparse_linear_solver.py +++ b/tests/python/test_sparse_linear_solver.py @@ -58,3 +58,58 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), x = solver.solve(b) for i in range(n): assert x[i] == test_utils.approx(res[i]) + + +@test_utils.test(arch=ti.cuda) +def test_gpu_sparse_solver(): + from scipy.sparse import coo_matrix + + @ti.kernel + def init_b(b: ti.types.ndarray(), nrows: ti.i32): + for i in range(nrows): + b[i] = 1.0 + i / nrows + + """ + Generate a positive definite matrix with a given number of rows and columns. + Reference: https://stackoverflow.com/questions/619335/a-simple-algorithm-for-generating-positive-semidefinite-matrices + """ + matrixSize = 10 + A = np.random.rand(matrixSize, matrixSize) + A_psd = np.dot(A, A.transpose()) + + A_raw_coo = coo_matrix(A_psd) + nrows, ncols = A_raw_coo.shape + nnz = A_raw_coo.nnz + + A_csr = A_raw_coo.tocsr() + b = ti.ndarray(shape=nrows, dtype=ti.f32) + init_b(b, nrows) + + # solve Ax = b using cusolver + A_coo = A_csr.tocoo() + d_row_coo = ti.ndarray(shape=nnz, dtype=ti.i32) + d_col_coo = ti.ndarray(shape=nnz, dtype=ti.i32) + d_val_coo = ti.ndarray(shape=nnz, dtype=ti.f32) + d_row_coo.from_numpy(A_coo.row) + d_col_coo.from_numpy(A_coo.col) + d_val_coo.from_numpy(A_coo.data) + + A_ti = ti.linalg.SparseMatrix(n=nrows, m=ncols, dtype=ti.float32) + A_ti.build_coo(d_row_coo, d_col_coo, d_val_coo) + x_ti = ti.ndarray(shape=ncols, dtype=ti.float32) + solver = ti.linalg.SparseSolver() + solver.solve_cu(A_ti, b, x_ti) + ti.sync() + + # solve Ax=b using numpy + b_np = b.to_numpy() + x_np = np.linalg.solve(A_psd, b_np) + assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)) + + # solve Ax=b using cusolver refectorization + solver = ti.linalg.SparseSolver() + solver.analyze_pattern(A_ti) + solver.factorize(A_ti) + solver.solve_rf(A_ti, b, x_ti) + ti.sync() + assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1))