Skip to content

Commit

Permalink
[Lang] Support sparse matrix on GPU (#5185)
Browse files Browse the repository at this point in the history
* cusparse loaded

* load cusolver

* add driver base class

* update comments

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* make CUDADriver a derived class from CUDADriverBase

* clean code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* clean code

* create sparse matrix using cusparse

* spmv with bugs

* bug fix: int -> size_t

* clean

* add safe loader

* separate sparse matrix maker and spmv func

* refactor

* fix parameter bug

* fix test bug

* fix

* fix mac/windows failed tests

* fix

* add tests for gpu sparse matrix

* fix test

* fix

* fix cuMemAlloc_v2 bug and windows dll name bug

* fix ci

* csr datatype checking

* Apply suggestions from code review

Co-authored-by: Yi Xu <[email protected]>

* arch fix

* format

Co-authored-by: Jiafeng-Liu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Yi Xu <[email protected]>
  • Loading branch information
4 people authored Aug 17, 2022
1 parent 1884cc0 commit 1dad79f
Show file tree
Hide file tree
Showing 11 changed files with 413 additions and 26 deletions.
50 changes: 49 additions & 1 deletion python/taichi/linalg/sparse_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from taichi.lang.impl import get_runtime
from taichi.lang.matrix import Ndarray
from taichi.lang.util import warning
from taichi.types import annotations, f32
from taichi.types import annotations, f32, i32


class SparseMatrix:
Expand Down Expand Up @@ -198,6 +198,54 @@ def build_from_ndarray(self, ndarray):
'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
)

def build_csr_cusparse(self, data, indices, indptr):
    """Build a CSR-format sparse matrix on the GPU via cuSparse.

    The column indices for row ``i`` live in
    ``indices[indptr[i]:indptr[i+1]]`` and the matching values in
    ``data[indptr[i]:indptr[i+1]]``.

    Args:
        data (ti.ndarray): CSR data array (float32).
        indices (ti.ndarray): CSR column-index array (int32).
        indptr (ti.ndarray): CSR row-pointer array (int32).

    Raises:
        TaichiRuntimeError: if any argument is not an ndarray, or the
            dtypes are not float32 data / int32 indices and indptr.
    """
    # Guard clause 1: all three inputs must be Taichi ndarrays.
    if not all(
            isinstance(arr, Ndarray) for arr in (data, indices, indptr)):
        raise TaichiRuntimeError(
            'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
        )
    # Guard clause 2: cuSparse path is hard-wired to f32 values and
    # i32 index arrays.
    if data.dtype != f32 or indices.dtype != i32 or indptr.dtype != i32:
        raise TaichiRuntimeError(
            'Sparse matrix only supports building from float32 data and int32 indices/indptr.'
        )
    get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
        self.matrix, indptr.arr, indices.arr, data.arr)

def spmv(self, x, y):
    """Sparse matrix-vector multiplication ``y = A @ x`` using cuSparse.

    Args:
        x (ti.ndarray): the vector to be multiplied.
        y (ti.ndarray): the result of matrix-vector multiplication
            (written in place).

    Raises:
        TaichiRuntimeError: if ``x``/``y`` are not ndarrays, or the
            matrix column count does not match ``x``.

    Example::
        >>> x = ti.ndarray(shape=4, dtype=val_dt)
        >>> y = ti.ndarray(shape=4, dtype=val_dt)
        >>> A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
        >>> A.build_csr_cusparse(value_csr, col_csr, row_csr)
        >>> A.spmv(x, y)
    """
    if not isinstance(x, Ndarray) or not isinstance(y, Ndarray):
        # Fixed message: this is a multiplication, not a build step.
        raise TaichiRuntimeError(
            'Sparse matrix-vector multiplication only supports [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray] operands.'
        )
    # A is (n, m); x must have m entries for A @ x to be defined.
    if self.m != x.shape[0]:
        raise TaichiRuntimeError(
            f"Dimension mismatch between sparse matrix ({self.n}, {self.m}) and vector ({x.shape})"
        )

    self.matrix.spmv(get_runtime().prog, x.arr, y.arr)


class SparseMatrixBuilder:
"""A python wrap around sparse matrix builder.
Expand Down
77 changes: 77 additions & 0 deletions taichi/program/sparse_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,13 @@ std::unique_ptr<SparseMatrix> make_sparse_matrix(
storage_format);
}

// Creates a cuSPARSE-backed sparse matrix of the given shape.
// `dt` is the element data type (the CUDA path currently builds the
// descriptor with CUDA_R_32F; see CuSparseMatrix::build_csr).
std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
                                                    int cols,
                                                    DataType dt) {
  // make_unique<CuSparseMatrix> converts implicitly to
  // unique_ptr<SparseMatrix>; the extra unique_ptr wrap was redundant.
  return std::make_unique<CuSparseMatrix>(rows, cols, dt);
}

template <typename T>
void build_ndarray_template(SparseMatrix &sm,
intptr_t data_ptr,
Expand Down Expand Up @@ -191,5 +198,75 @@ void make_sparse_matrix_from_ndarray(Program *prog,
}
}

// Builds the cuSPARSE CSR descriptor (matrix_) from raw device pointers.
// `csr_ptr`: row-offset array (CUSPARSE_INDEX_32I).
// `csr_indices_ptr`: column-index array (CUSPARSE_INDEX_32I).
// `csr_values_ptr`: value array (CUDA_R_32F, i.e. float32 only).
// `nnz`: number of stored non-zeros.
// Compiles to a no-op when Taichi is built without CUDA support.
void CuSparseMatrix::build_csr(void *csr_ptr,
                               void *csr_indices_ptr,
                               void *csr_values_ptr,
                               int nnz) {
#if defined(TI_WITH_CUDA)
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_, rows_, cols_, nnz, csr_ptr, csr_indices_ptr, csr_values_ptr,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);
#endif
}
// Releases the cuSPARSE matrix descriptor.
// NOTE(review): matrix_ is destroyed unconditionally, even when
// build_csr() was never called and the descriptor was never created —
// confirm cpDestroySpMat tolerates an uninitialized/null handle.
CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
  CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
#endif
}
// Populates `sm` (expected to be a CuSparseMatrix) with CSR data taken
// from three Taichi ndarrays that already live in device memory.
// `row_offsets`/`col_indices`/`values` are the standard CSR arrays;
// nnz is derived from the element count of `values`.
// Loads the cusparse shared library lazily on first use; no-op without CUDA.
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
                                              SparseMatrix &sm,
                                              const Ndarray &row_offsets,
                                              const Ndarray &col_indices,
                                              const Ndarray &values) {
#if defined(TI_WITH_CUDA)
  // (Removed unused local `sdtype`; dtype validation happens on the
  // Python side in build_csr_cusparse.)
  if (!CUSPARSEDriver::get_instance().is_loaded()) {
    bool load_success = CUSPARSEDriver::get_instance().load_cusparse();
    if (!load_success) {
      TI_ERROR("Failed to load cusparse library!");
    }
  }
  // Device addresses of the three CSR arrays, as integers.
  size_t row_csr = prog->get_ndarray_data_ptr_as_int(&row_offsets);
  size_t col_csr = prog->get_ndarray_data_ptr_as_int(&col_indices);
  size_t values_csr = prog->get_ndarray_data_ptr_as_int(&values);
  int nnz = values.get_nelement();
  sm.build_csr((void *)row_csr, (void *)col_csr, (void *)values_csr, nnz);
#endif
}

// Computes y = 1.0 * A * x + 0.0 * y with cusparseSpMV.
// `x` must have cols_ elements and `y` rows_ elements (f32); both must
// already reside in device memory. No-op when built without CUDA.
void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) {
#if defined(TI_WITH_CUDA)
  // Device addresses of the dense input/output vectors.
  size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
  size_t dY = prog->get_ndarray_data_ptr_as_int(&y);

  cusparseDnVecDescr_t vecX, vecY;
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecX, cols_, (void *)dX,
                                               CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, rows_, (void *)dY,
                                               CUDA_R_32F);

  // NOTE(review): a fresh cuSPARSE handle is created and destroyed on
  // every call — consider caching one per device if spmv is hot.
  cusparseHandle_t cusparse_handle;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  float alpha = 1.0f, beta = 0.0f;

  // Ask cusparseSpMV how much scratch space it needs, then allocate it.
  size_t bufferSize = 0;
  CUSPARSEDriver::get_instance().cpSpMV_bufferSize(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);

  void *dBuffer = nullptr;
  if (bufferSize > 0)
    CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
  CUSPARSEDriver::get_instance().cpSpMV(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);

  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecX);
  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  // Bug fix: only free the scratch buffer when one was actually
  // allocated — cuMemFree on a null pointer reports an error.
  if (dBuffer != nullptr)
    CUDADriver::get_instance().mem_free(dBuffer);
#endif
}

} // namespace lang
} // namespace taichi
38 changes: 37 additions & 1 deletion taichi/program/sparse_matrix.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "taichi/rhi/cuda/cuda_driver.h"
#include "taichi/common/core.h"
#include "taichi/inc/constants.h"
#include "taichi/ir/type_utils.h"
Expand Down Expand Up @@ -58,7 +59,16 @@ class SparseMatrix {
}
virtual ~SparseMatrix() = default;

virtual void build_triplets(void *triplets_adr){};
virtual void build_triplets(void *triplets_adr) {
TI_NOT_IMPLEMENTED;
};

virtual void build_csr(void *csr_ptr,
void *csr_indices_ptr,
void *csr_values_ptr,
int nnz) {
TI_NOT_IMPLEMENTED;
};

inline const int num_rows() const {
return rows_;
Expand Down Expand Up @@ -189,14 +199,40 @@ class EigenSparseMatrix : public SparseMatrix {
EigenMatrix matrix_;
};

class CuSparseMatrix : public SparseMatrix {
public:
explicit CuSparseMatrix(int rows, int cols, DataType dt)
: SparseMatrix(rows, cols, dt) {
}

virtual ~CuSparseMatrix();
void build_csr(void *csr_ptr,
void *csr_indices_ptr,
void *csr_values_ptr,
int nnz) override;

void spmv(Program *prog, const Ndarray &x, Ndarray &y);

private:
cusparseSpMatDescr_t matrix_;
};

std::unique_ptr<SparseMatrix> make_sparse_matrix(
int rows,
int cols,
DataType dt,
const std::string &storage_format);
std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
int cols,
DataType dt);

void make_sparse_matrix_from_ndarray(Program *prog,
SparseMatrix &sm,
const Ndarray &ndarray);
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_offsets,
const Ndarray &col_indices,
const Ndarray &values);
} // namespace lang
} // namespace taichi
29 changes: 23 additions & 6 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,23 +380,37 @@ void export_lang(py::module &m) {
[](Program *program, int n, int m, uint64 max_num_entries,
DataType dtype, const std::string &storage_format) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
"SparseMatrix Builder only supports CPU for now.");
return SparseMatrixBuilder(n, m, max_num_entries, dtype,
storage_format);
})
.def("create_sparse_matrix",
[](Program *program, int n, int m, DataType dtype,
std::string storage_format) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
return make_sparse_matrix(n, m, dtype, storage_format);
TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
!arch_is_cuda(program->config.arch),
"SparseMatrix only supports CPU and CUDA for now.");
if (arch_is_cpu(program->config.arch))
return make_sparse_matrix(n, m, dtype, storage_format);
else
return make_cu_sparse_matrix(n, m, dtype);
})
.def("make_sparse_matrix_from_ndarray",
[](Program *program, SparseMatrix &sm, const Ndarray &ndarray) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
!arch_is_cuda(program->config.arch),
"SparseMatrix only supports CPU and CUDA for now.");
return make_sparse_matrix_from_ndarray(program, sm, ndarray);
})
.def("make_sparse_matrix_from_ndarray_cusparse",
[](Program *program, CuSparseMatrix &sm, const Ndarray &row_csr,
const Ndarray &col_csr, const Ndarray &val_csr) {
TI_ERROR_IF(
!arch_is_cuda(program->config.arch),
"SparseMatrix based on GPU only supports CUDA for now.");
return make_sparse_matrix_from_ndarray_cusparse(
program, sm, row_csr, col_csr, val_csr);
})
.def("no_activate",
[](Program *program, SNode *snode) {
// TODO(#2193): Also apply to @ti.func?
Expand Down Expand Up @@ -1171,6 +1185,9 @@ void export_lang(py::module &m) {
MAKE_SPARSE_MATRIX(64, ColMajor, d);
MAKE_SPARSE_MATRIX(64, RowMajor, d);

py::class_<CuSparseMatrix>(m, "CuSparseMatrix")
.def("spmv", &CuSparseMatrix::spmv);

py::class_<SparseSolver>(m, "SparseSolver")
.def("compute", &SparseSolver::compute)
.def("analyze_pattern", &SparseSolver::analyze_pattern)
Expand Down
4 changes: 4 additions & 0 deletions taichi/rhi/arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ bool arch_is_cpu(Arch arch) {
}
}

// Returns true iff `arch` is the CUDA backend.
bool arch_is_cuda(Arch arch) {
  return arch == Arch::cuda;
}

bool arch_uses_llvm(Arch arch) {
return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda ||
arch == Arch::wasm);
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Arch arch_from_name(const std::string &arch);

bool arch_is_cpu(Arch arch);

bool arch_is_cuda(Arch arch);

bool arch_uses_llvm(Arch arch);

bool arch_is_gpu(Arch arch);
Expand Down
65 changes: 53 additions & 12 deletions taichi/rhi/cuda/cuda_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,39 @@ std::string get_cuda_error_message(uint32 err) {
return fmt::format("CUDA Error {}: {}", err_name_ptr, err_string_ptr);
}

bool CUDADriver::detected() {
return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded();
}

CUDADriver::CUDADriver() {
// Reads the TI_ENABLE_CUDA environment variable (default 1); setting it
// to 0 disables the CUDA driver entirely.
CUDADriverBase::CUDADriverBase() {
  disabled_by_env_ = (get_environ_config("TI_ENABLE_CUDA", 1) == 0);
  if (disabled_by_env_) {
    // Fixed typo in the log message: "enviroment" -> "environment".
    TI_TRACE(
        "CUDA driver disabled by environment variable \"TI_ENABLE_CUDA\".");
  }
}

// Opens the platform-appropriate shared library into loader_ and
// reports whether it could be loaded.
// `lib_linux` / `lib_windows` are the candidate file names per platform.
bool CUDADriverBase::load_lib(std::string lib_linux, std::string lib_windows) {
#if defined(TI_PLATFORM_LINUX)
  const std::string lib_name = lib_linux;
#elif defined(TI_PLATFORM_WINDOWS)
  const std::string lib_name = lib_windows;
#else
  static_assert(false, "Taichi CUDA driver supports only Windows and Linux.");
#endif

  loader_ = std::make_unique<DynamicLoader>(lib_name);
  const bool ok = loader_->loaded();
  if (ok) {
    TI_TRACE("{} loaded!", lib_name);
  } else {
    TI_WARN("{} lib not found.", lib_name);
  }
  return ok;
}

// True when CUDA is usable: not disabled via TI_ENABLE_CUDA, the driver
// version passed validation, and the CUDA library was loaded.
bool CUDADriver::detected() {
  return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded();
}

CUDADriver::CUDADriver() {
if (!load_lib("libcuda.so", "nvcuda.dll"))
return;

loader_->load_function("cuGetErrorName", get_error_name);
loader_->load_function("cuGetErrorString", get_error_string);
Expand Down Expand Up @@ -79,4 +88,36 @@ CUDADriver &CUDADriver::get_instance() {
return get_instance_without_context();
}

// Intentionally empty: the cusparse library is loaded lazily via
// load_cusparse() on first use.
CUSPARSEDriver::CUSPARSEDriver() {
}

// Returns the process-wide CUSPARSEDriver singleton. The instance is
// heap-allocated and never freed — presumably to sidestep static
// destruction-order issues at program exit.
CUSPARSEDriver &CUSPARSEDriver::get_instance() {
  static CUSPARSEDriver *instance = new CUSPARSEDriver();
  return *instance;
}

// Dynamically loads the cusparse shared library and resolves every
// cuSPARSE entry point listed in cusparse_functions.inc.h.
// Returns false when the library cannot be found; true once all
// wrappers are bound.
bool CUSPARSEDriver::load_cusparse() {
  cusparse_loaded_ = load_lib("libcusparse.so", "cusparse64_11.dll");

  if (!cusparse_loaded_) {
    return false;
  }
// For each declared wrapper: resolve the symbol, attach the shared
// lock, and record both names (for diagnostics).
#define PER_CUSPARSE_FUNCTION(name, symbol_name, ...) \
  name.set(loader_->load_function(#symbol_name));     \
  name.set_lock(&lock_);                              \
  name.set_names(#name, #symbol_name);
#include "taichi/rhi/cuda/cusparse_functions.inc.h"
#undef PER_CUSPARSE_FUNCTION
  return cusparse_loaded_;
}

// Eagerly loads the cusolver shared library on construction; the load
// result is ignored here.
// NOTE(review): "cusolver.dll" looks inconsistent with the versioned
// cuSPARSE name ("cusparse64_11.dll") — confirm the actual DLL name
// shipped with the targeted CUDA toolkit on Windows.
CUSOLVERDriver::CUSOLVERDriver() {
  load_lib("libcusolver.so", "cusolver.dll");
}

// Returns the process-wide CUSOLVERDriver singleton. Heap-allocated and
// never freed — presumably to sidestep static destruction-order issues.
CUSOLVERDriver &CUSOLVERDriver::get_instance() {
  static CUSOLVERDriver *instance = new CUSOLVERDriver();
  return *instance;
}

TLANG_NAMESPACE_END
Loading

0 comments on commit 1dad79f

Please sign in to comment.