[lang] Merge triplets in the same position when building GPU sparse matrix #6605

Merged · 15 commits · Nov 21, 2022
40 changes: 0 additions & 40 deletions misc/test_build_cusm_from_coo.py

This file was deleted.

75 changes: 0 additions & 75 deletions misc/test_coo_cusolver.py

This file was deleted.

38 changes: 11 additions & 27 deletions python/taichi/linalg/sparse_matrix.py
@@ -1,12 +1,13 @@
from functools import reduce

import numpy as np
from taichi._lib import core as _ti_core
from taichi.lang._ndarray import Ndarray, ScalarNdarray
from taichi.lang.exception import TaichiRuntimeError
from taichi.lang.field import Field
from taichi.lang.impl import get_runtime
from taichi.lang.util import warning
from taichi.types import annotations, f32, i32
from taichi.types import annotations, f32


class SparseMatrix:
@@ -206,30 +207,6 @@ def build_from_ndarray(self, ndarray):
'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
)

def build_coo(self, row_coo, col_coo, value_coo):
"""Build a CSR format sparse matrix from COO format inputs.

Args:
row_coo (ti.ndarray): the row indices of the matrix entries.
col_coo (ti.ndarray): the column indices of the matrix entries.
value_coo (ti.ndarray): the entries of the matrix.

Raises:
TaichiRuntimeError: If the inputs are not ``ti.ndarray`` or the datatypes of the ndarray are not correct.
"""
if not isinstance(row_coo, Ndarray) or not isinstance(
col_coo, Ndarray) or not isinstance(value_coo, Ndarray):
raise TaichiRuntimeError(
'Sparse matrix only supports COO format building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
)
elif value_coo.dtype != f32 or row_coo.dtype != i32 or col_coo.dtype != i32:
raise TaichiRuntimeError(
'Sparse matrix only supports COO format building from float32 data and int32 row/col indices.'
)
else:
get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
self.matrix, row_coo.arr, col_coo.arr, value_coo.arr)


class SparseMatrixBuilder:
"""A python wrap around sparse matrix builder.
@@ -270,8 +247,15 @@ def print_triplets(self):

def build(self, dtype=f32, _format='CSR'):
"""Create a sparse matrix using the triplets"""
sm = self.ptr.build()
return SparseMatrix(sm=sm)
taichi_arch = get_runtime().prog.config().arch
if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
sm = self.ptr.build()
return SparseMatrix(sm=sm)
if taichi_arch == _ti_core.Arch.cuda:
sm = self.ptr.build_cuda()
return SparseMatrix(sm=sm)
raise TaichiRuntimeError(
'Sparse matrix only supports CPU and CUDA backends.')
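
With this change, `build()` dispatches on the active backend: the Eigen-based path on x64/arm64, and the new `build_cuda()` path on CUDA. A minimal usage sketch (assuming the public `ti.linalg` API; the kernel and sizes are illustrative), showing that triplets written to the same position are summed during the build:

```python
import taichi as ti

ti.init(arch=ti.cuda)  # on x64/arm64 the same call takes the CPU path

n = 4
builder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)

@ti.kernel
def fill(triplets: ti.types.sparse_matrix_builder()):
    for i in range(n):
        triplets[i, i] += 1.0
        triplets[i, i] += 1.0  # same position: merged (summed) at build time

fill(builder)
A = builder.build()  # dispatches to build_cuda() on the CUDA backend
print(A)             # each diagonal entry is 2.0
```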


# TODO: remove this in 1.0 release
2 changes: 1 addition & 1 deletion python/taichi/linalg/sparse_solver.py
@@ -51,7 +51,7 @@ def compute(self, sparse_matrix):
if isinstance(sparse_matrix, SparseMatrix):
self.matrix = sparse_matrix
taichi_arch = taichi.lang.impl.get_runtime().prog.config().arch
if taichi_arch == _ti_core.Arch.x64:
if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
self.solver.compute(sparse_matrix.matrix)
elif taichi_arch == _ti_core.Arch.cuda:
self.analyze_pattern(self.matrix)
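
For reference, a self-contained sketch of driving `compute()` (hedged: assumes the public `ti.linalg.SparseSolver` API; on CUDA, `compute()` internally runs the analyze/factorize pair shown above):

```python
import taichi as ti

ti.init(arch=ti.cpu)  # x64 and arm64 now both take the direct compute() path

n = 4
builder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=2 * n)

@ti.kernel
def fill(triplets: ti.types.sparse_matrix_builder()):
    for i in range(n):
        triplets[i, i] += 2.0  # simple SPD system: 2x = 1

b = ti.field(ti.f32, shape=n)
b.fill(1.0)

fill(builder)
A = builder.build()

solver = ti.linalg.SparseSolver(solver_type="LLT")
solver.compute(A)        # analyze + factorize in one call
x = solver.solve(b)      # each x[i] should be 0.5
assert solver.info()     # True if factorization/solve succeeded
```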
98 changes: 75 additions & 23 deletions taichi/program/sparse_matrix.cpp
@@ -1,5 +1,6 @@
#include "taichi/program/sparse_matrix.h"

#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
@@ -145,6 +146,58 @@ std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build() {
return sm;
}

std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build_cuda() {
TI_ASSERT(built_ == false);
built_ = true;
auto sm = make_cu_sparse_matrix(rows_, cols_, dtype_);
#ifdef TI_WITH_CUDA
num_triplets_ = ndarray_data_base_ptr_->read_int(std::vector<int>{0});
std::map<int, std::tuple<int, int, float32>> entries;
for (auto i = 0; i < num_triplets_; i++) {
auto idx = 3 * i + 1;
auto row = ndarray_data_base_ptr_->read_int(std::vector<int>{idx});
auto col = ndarray_data_base_ptr_->read_int(std::vector<int>{idx + 1});
auto val = ndarray_data_base_ptr_->read_float(std::vector<int>{idx + 2});
auto e_idx = row * cols_ + col;
if (entries.find(e_idx) == entries.end()) {
entries[e_idx] = std::make_tuple(row, col, val);
} else {
auto [r, c, v] = entries[e_idx];
entries[e_idx] = std::make_tuple(r, c, v + val);
}
}
auto entry_size = entries.size();
int *row_host = (int *)malloc(sizeof(int) * entry_size);
int *col_host = (int *)malloc(sizeof(int) * entry_size);
float32 *value_host = (float32 *)malloc(sizeof(float32) * entry_size);
int count = 0;
for (auto entry : entries) {
auto [row, col, value] = entry.second;
row_host[count] = row;
col_host[count] = col;
value_host[count] = value;
count++;
}
void *row_device = nullptr, *col_device = nullptr, *value_device = nullptr;
CUDADriver::get_instance().malloc(&row_device, entry_size * sizeof(int));
CUDADriver::get_instance().malloc(&col_device, entry_size * sizeof(int));
CUDADriver::get_instance().malloc(&value_device,
entry_size * sizeof(float32));
CUDADriver::get_instance().memcpy_host_to_device(row_device, (void *)row_host,
entry_size * sizeof(int));
CUDADriver::get_instance().memcpy_host_to_device(col_device, (void *)col_host,
entry_size * sizeof(int));
CUDADriver::get_instance().memcpy_host_to_device(
value_device, (void *)value_host, entry_size * sizeof(float32));
sm->build_csr_from_coo(row_device, col_device, value_device, entry_size);
clear();
free(row_host);
free(col_host);
free(value_host);
#endif
return sm;
}
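
The merge itself is keyed accumulation: each triplet is bucketed under its flattened position `row * cols_ + col`, and a value landing in an occupied bucket is added to it. A plain-Python sketch of the same semantics (the flat `[count, r0, c0, v0, r1, c1, v1, ...]` layout mirrors the ndarray buffer read above):

```python
def merge_triplets(buf, cols):
    """Merge COO triplets stored flat as [count, r0, c0, v0, r1, c1, v1, ...]."""
    entries = {}  # flattened position -> (row, col, accumulated value)
    count = int(buf[0])
    for i in range(count):
        idx = 3 * i + 1
        row, col, val = int(buf[idx]), int(buf[idx + 1]), buf[idx + 2]
        r, c, v = entries.get(row * cols + col, (row, col, 0.0))
        entries[row * cols + col] = (r, c, v + val)
    # Ascending-key iteration (like std::map) yields row-major order,
    # which is what build_csr_from_coo expects.
    return [entries[k] for k in sorted(entries)]

# Duplicate entries at position (0, 1) are summed:
print(merge_triplets([3, 0, 1, 1.0, 0, 1, 2.0, 1, 0, 4.0], cols=2))
# -> [(0, 1, 3.0), (1, 0, 4.0)]
```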

void SparseMatrixBuilder::clear() {
built_ = false;
ndarray_data_base_ptr_->write_int(std::vector<int>{0}, 0);
@@ -286,14 +339,20 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
&matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
// TODO: free csr_row_offset_ptr
// CUDADriver::get_instance().mem_free(csr_row_offset_ptr);
CUDADriver::get_instance().mem_free(d_values_sorted);
CUDADriver::get_instance().mem_free(d_permutation);
CUDADriver::get_instance().mem_free(dbuffer);
if (vec_permutation)
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
if (vec_values)
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
if (cusparse_handle)
CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
if (coo_row_ptr)
CUDADriver::get_instance().mem_free(coo_row_ptr);
if (d_values_sorted)
CUDADriver::get_instance().mem_free(d_values_sorted);
if (d_permutation)
CUDADriver::get_instance().mem_free(d_permutation);
if (dbuffer)
CUDADriver::get_instance().mem_free(dbuffer);
csr_row_ptr_ = csr_row_offset_ptr;
csr_col_ind_ = coo_col_ptr;
csr_val_ = coo_values_ptr;
@@ -303,21 +362,14 @@

CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
#endif
}
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_coo,
const Ndarray &col_coo,
const Ndarray &val_coo) {
#if defined(TI_WITH_CUDA)
size_t coo_row_ptr = prog->get_ndarray_data_ptr_as_int(&row_coo);
size_t coo_col_ptr = prog->get_ndarray_data_ptr_as_int(&col_coo);
size_t coo_val_ptr = prog->get_ndarray_data_ptr_as_int(&val_coo);
int nnz = val_coo.get_nelement();
sm.build_csr_from_coo((void *)coo_row_ptr, (void *)coo_col_ptr,
(void *)coo_val_ptr, nnz);
if (matrix_)
CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
if (csr_row_ptr_)
CUDADriver::get_instance().mem_free(csr_row_ptr_);
if (csr_col_ind_)
CUDADriver::get_instance().mem_free(csr_col_ind_);
if (csr_val_)
CUDADriver::get_instance().mem_free(csr_val_);
#endif
}
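
The destructor now follows the same defensive pattern as `build_csr_from_coo`: every handle is null-initialized and freed only if it was actually set, so a partially constructed matrix destructs cleanly. A small Python analogue of that pattern (`free_device` is a hypothetical stand-in for the CUDA/cuSPARSE release calls):

```python
def free_device(handle):
    """Stand-in (hypothetical) for CUDADriver::mem_free / cpDestroySpMat."""
    print(f"releasing {handle}")

class CuSparseMatrixHandle:
    """Python analogue of the guarded-cleanup pattern used above."""

    def __init__(self):
        # Null-initialize every resource, mirroring the {nullptr} member
        # initializers, so cleanup is safe even if the build never ran.
        self.matrix = None
        self.csr_row_ptr = None
        self.csr_col_ind = None
        self.csr_val = None

    def __del__(self):
        # Free only what was actually allocated: a partially constructed
        # matrix no longer double-frees or touches dangling pointers.
        for handle in (self.matrix, self.csr_row_ptr,
                       self.csr_col_ind, self.csr_val):
            if handle is not None:
                free_device(handle)
```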

9 changes: 3 additions & 6 deletions taichi/program/sparse_matrix.h
@@ -28,6 +28,8 @@ class SparseMatrixBuilder {

std::unique_ptr<SparseMatrix> build();

std::unique_ptr<SparseMatrix> build_cuda();

void clear();

private:
@@ -287,7 +289,7 @@ class CuSparseMatrix : public SparseMatrix {
}

private:
cusparseSpMatDescr_t matrix_;
cusparseSpMatDescr_t matrix_{nullptr};
void *csr_row_ptr_{nullptr};
void *csr_col_ind_{nullptr};
void *csr_val_{nullptr};
@@ -310,9 +312,4 @@ std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(cusparseSpMatDescr_t mat,
void make_sparse_matrix_from_ndarray(Program *prog,
SparseMatrix &sm,
const Ndarray &ndarray);
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_indices,
const Ndarray &col_indices,
const Ndarray &values);
} // namespace taichi::lang
10 changes: 1 addition & 9 deletions taichi/python/export_lang.cpp
@@ -422,15 +422,6 @@ void export_lang(py::module &m) {
"SparseMatrix only supports CPU and CUDA for now.");
return make_sparse_matrix_from_ndarray(program, sm, ndarray);
})
.def("make_sparse_matrix_from_ndarray_cusparse",
[](Program *program, CuSparseMatrix &sm, const Ndarray &row_coo,
const Ndarray &col_coo, const Ndarray &val_coo) {
TI_ERROR_IF(
!arch_is_cuda(program->this_thread_config().arch),
"SparseMatrix based on GPU only supports CUDA for now.");
return make_sparse_matrix_from_ndarray_cusparse(
program, sm, row_coo, col_coo, val_coo);
})
.def("no_activate",
[](Program *program, SNode *snode) {
// TODO(#2193): Also apply to @ti.func?
@@ -1207,6 +1198,7 @@
.def("print_triplets", &SparseMatrixBuilder::print_triplets)
.def("get_ndarray_data_ptr", &SparseMatrixBuilder::get_ndarray_data_ptr)
.def("build", &SparseMatrixBuilder::build)
.def("build_cuda", &SparseMatrixBuilder::build_cuda)
.def("get_addr", [](SparseMatrixBuilder *mat) { return uint64(mat); });

py::class_<SparseMatrix>(m, "SparseMatrix")