Skip to content

Commit

Permalink
[Lang] Support GPU solve with analyzePattern and factorize (#6158)
Browse files Browse the repository at this point in the history
Issue: #2906
  • Loading branch information
FantasyVR authored Sep 29, 2022
1 parent 50e5e3f commit 8d43c87
Show file tree
Hide file tree
Showing 10 changed files with 240 additions and 17 deletions.
12 changes: 12 additions & 0 deletions misc/test_coo_cusolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,15 @@ def print_x(x: ti.types.ndarray(), ncols: ti.i32):
print(
f"The solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
)

solver = ti.linalg.SparseSolver()
solver.analyze_pattern(A_ti)
solver.factorize(A_ti)
solver.solve_rf(A_ti, b, x_ti)

ti.sync()
print_x(x_ti, ncols)
ti.sync()
print(
f"The cusolver rf solution and numpy solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
)
10 changes: 10 additions & 0 deletions python/taichi/linalg/sparse_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ def solve_cu(self, sparse_matrix, b, x):
f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now."
)

def solve_rf(self, sparse_matrix, b, x):
if isinstance(sparse_matrix, SparseMatrix) and isinstance(
b, Ndarray) and isinstance(x, Ndarray):
self.solver.solve_rf(get_runtime().prog, sparse_matrix.matrix,
b.arr, x.arr)
else:
raise TaichiRuntimeError(
f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now."
)

def info(self):
"""Check if the linear systems are solved successfully.
Expand Down
4 changes: 4 additions & 0 deletions taichi/program/sparse_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
CUDADriver::get_instance().mem_free(d_values_sorted);
CUDADriver::get_instance().mem_free(d_permutation);
CUDADriver::get_instance().mem_free(dbuffer);
csr_row_ptr_ = csr_row_offset_ptr;
csr_col_ind_ = coo_col_ptr;
csr_val_ = coo_values_ptr;
nnz_ = nnz;
#endif
}

Expand Down
17 changes: 17 additions & 0 deletions taichi/program/sparse_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,25 @@ class CuSparseMatrix : public SparseMatrix {

const std::string to_string() const override;

void *get_row_ptr() const {
return csr_row_ptr_;
}
void *get_col_ind() const {
return csr_col_ind_;
}
void *get_val_ptr() const {
return csr_val_;
}
int get_nnz() const {
return nnz_;
}

private:
cusparseSpMatDescr_t matrix_;
void *csr_row_ptr_{nullptr};
void *csr_col_ind_{nullptr};
void *csr_val_{nullptr};
int nnz_{0};
};

std::unique_ptr<SparseMatrix> make_sparse_matrix(
Expand Down
110 changes: 100 additions & 10 deletions taichi/program/sparse_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,73 @@ CuSparseSolver::CuSparseSolver() {
}
#endif
}
// Reference:
// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp
void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();

CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_);
CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_);
CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_);
CUSPARSEDriver::get_instance().cpSetMatType(descr_,
CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_,
CUSPARSE_INDEX_BASE_ZERO);

// step 1: create opaque info structure
CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_);

// step 2: analyze chol(A) to know structure of L
CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis(
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_);

#else
TI_NOT_IMPLEMENTED
#endif
}

void CuSparseSolver::factorize(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();

size_t size_internal = 0;
size_t size_chol = 0; // size of working space for csrlu
// step 1: workspace for chol(A)
CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, &size_internal, &size_chol);

if (size_chol > 0)
CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol);

// step 2: compute A = L*L^T
CUSOLVERDriver::get_instance().csSpScsrcholFactor(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, gpu_buffer_);
// step 3: check if the matrix is singular
const double tol = 1.e-14;
int singularity = 0;
CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_,
tol, &singularity);
TI_ASSERT(singularity == -1);
#else
TI_NOT_IMPLEMENTED
#endif
}

void CuSparseSolver::solve_cu(Program *prog,
const SparseMatrix &sm,
Expand All @@ -100,16 +167,15 @@ void CuSparseSolver::solve_cu(Program *prog,
printf("Cusolver version: %d.%d.%d\n", major_version, minor_version,
patch_level);

const cusparseSpMatDescr_t *A =
(const cusparseSpMatDescr_t *)(sm.get_matrix());
size_t nrows = 0, ncols = 0, nnz = 0;
void *drow_offsets = NULL, *dcol_indices = NULL, *dvalues = NULL;
cusparseIndexType_t csrRowOffsetsType, csrColIndType;
cusparseIndexBase_t idxBase;
cudaDataType valueType;
CUSPARSEDriver::get_instance().cpCsrGet(
*A, &nrows, &ncols, &nnz, &drow_offsets, &dcol_indices, &dvalues,
&csrRowOffsetsType, &csrColIndType, &idxBase, &valueType);
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t nrows = A->num_rows();
size_t ncols = A->num_cols();
size_t nnz = A->get_nnz();
void *drow_offsets = A->get_row_ptr();
void *dcol_indices = A->get_col_ind();
void *dvalues = A->get_val_ptr();

size_t db = prog->get_ndarray_data_ptr_as_int(&b);
size_t dx = prog->get_ndarray_data_ptr_as_int(&x);
Expand Down Expand Up @@ -249,6 +315,30 @@ void CuSparseSolver::solve_cu(Program *prog,
#endif
}

void CuSparseSolver::solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_);

// TODO: free allocated memory and handles
// CUDADriver::get_instance().mem_free(gpu_buffer_);
// CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
// CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
// CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
#else
TI_NOT_IMPLEMENTED
#endif
}

std::unique_ptr<SparseSolver> make_sparse_solver(DataType dt,
const std::string &solver_type,
const std::string &ordering) {
Expand Down
34 changes: 27 additions & 7 deletions taichi/program/sparse_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ class SparseSolver {
virtual void analyze_pattern(const SparseMatrix &sm) = 0;
virtual void factorize(const SparseMatrix &sm) = 0;
virtual Eigen::VectorXf solve(const Eigen::Ref<const Eigen::VectorXf> &b) = 0;
virtual void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) = 0;
virtual void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Expand All @@ -36,31 +40,47 @@ class EigenSparseSolver : public SparseSolver {
void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override{};
Ndarray &x) override {
TI_NOT_IMPLEMENTED;
};
void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override {
TI_NOT_IMPLEMENTED;
};

bool info() override;
};

class CuSparseSolver : public SparseSolver {
private:
csrcholInfo_t info_{nullptr};
cusolverSpHandle_t cusolver_handle_{nullptr};
cusparseHandle_t cusparse_handel_{nullptr};
cusparseMatDescr_t descr_{nullptr};
void *gpu_buffer_{nullptr};

public:
CuSparseSolver();
~CuSparseSolver() override = default;
bool compute(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void analyze_pattern(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void factorize(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void analyze_pattern(const SparseMatrix &sm) override;

void factorize(const SparseMatrix &sm) override;
Eigen::VectorXf solve(const Eigen::Ref<const Eigen::VectorXf> &b) override {
TI_NOT_IMPLEMENTED;
};
void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override;
void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override;
bool info() override {
TI_NOT_IMPLEMENTED;
};
Expand Down
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,7 @@ void export_lang(py::module &m) {
.def("factorize", &SparseSolver::factorize)
.def("solve", &SparseSolver::solve)
.def("solve_cu", &SparseSolver::solve_cu)
.def("solve_rf", &SparseSolver::solve_rf)
.def("info", &SparseSolver::info);

m.def("make_sparse_solver", &make_sparse_solver);
Expand Down
6 changes: 6 additions & 0 deletions taichi/rhi/cuda/cuda_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -555,4 +555,10 @@ typedef enum libraryPropertyType_t {

struct cusolverSpContext;
typedef struct cusolverSpContext *cusolverSpHandle_t;

// copy from cusolverSp_LOWLEVEL_PREVIEW.h
struct csrcholInfoHost;
typedef struct csrcholInfoHost *csrcholInfoHost_t;
struct csrcholInfo;
typedef struct csrcholInfo *csrcholInfo_t;
#endif
8 changes: 8 additions & 0 deletions taichi/rhi/cuda/cusolver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,11 @@ PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHa
PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *);
PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *);
PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *);

// cusolver preview API
PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*);
PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t );
PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *);
PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *);
PER_CUSOLVER_FUNCTION(csSpScsrcholZeroPivot, cusolverSpScsrcholZeroPivot, cusolverSpHandle_t, csrcholInfo_t ,float ,void *);
PER_CUSOLVER_FUNCTION(csSpScsrcholSolve, cusolverSpScsrcholSolve, cusolverSpHandle_t ,int ,void *,void *,csrcholInfo_t ,void *);
55 changes: 55 additions & 0 deletions tests/python/test_sparse_linear_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,58 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
x = solver.solve(b)
for i in range(n):
assert x[i] == test_utils.approx(res[i])


@test_utils.test(arch=ti.cuda)
def test_gpu_sparse_solver():
from scipy.sparse import coo_matrix

@ti.kernel
def init_b(b: ti.types.ndarray(), nrows: ti.i32):
for i in range(nrows):
b[i] = 1.0 + i / nrows

"""
Generate a positive definite matrix with a given number of rows and columns.
Reference: https://stackoverflow.com/questions/619335/a-simple-algorithm-for-generating-positive-semidefinite-matrices
"""
matrixSize = 10
A = np.random.rand(matrixSize, matrixSize)
A_psd = np.dot(A, A.transpose())

A_raw_coo = coo_matrix(A_psd)
nrows, ncols = A_raw_coo.shape
nnz = A_raw_coo.nnz

A_csr = A_raw_coo.tocsr()
b = ti.ndarray(shape=nrows, dtype=ti.f32)
init_b(b, nrows)

# solve Ax = b using cusolver
A_coo = A_csr.tocoo()
d_row_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
d_col_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
d_val_coo = ti.ndarray(shape=nnz, dtype=ti.f32)
d_row_coo.from_numpy(A_coo.row)
d_col_coo.from_numpy(A_coo.col)
d_val_coo.from_numpy(A_coo.data)

A_ti = ti.linalg.SparseMatrix(n=nrows, m=ncols, dtype=ti.float32)
A_ti.build_coo(d_row_coo, d_col_coo, d_val_coo)
x_ti = ti.ndarray(shape=ncols, dtype=ti.float32)
solver = ti.linalg.SparseSolver()
solver.solve_cu(A_ti, b, x_ti)
ti.sync()

# solve Ax=b using numpy
b_np = b.to_numpy()
x_np = np.linalg.solve(A_psd, b_np)
assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1))

# solve Ax=b using cusolver refectorization
solver = ti.linalg.SparseSolver()
solver.analyze_pattern(A_ti)
solver.factorize(A_ti)
solver.solve_rf(A_ti, b, x_ti)
ti.sync()
assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1))

0 comments on commit 8d43c87

Please sign in to comment.