Skip to content

Commit

Permalink
SD: cuBLAS and cuSOLVER libraries are initialized only once per time
Browse files Browse the repository at this point in the history
step
  • Loading branch information
biermanncarl committed Jun 29, 2020
1 parent e768253 commit 85eb621
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 58 deletions.
151 changes: 93 additions & 58 deletions libs/stokesian_dynamics/src/device_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,18 @@ int dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b,

namespace internal {

// handle types for GPU libraries, dummy type in case no GPU is used
#if defined(__CUDACC__)
using blas_handle_t = cublasHandle_t;
using solver_handle_t = cusolverDnHandle_t;
#elif defined(__HIPCC__)
typedef rocblas_handle blas_handle_t;
typedef rocsolver_handle solver_handle_t;
#else
typedef int blas_handle_t;
typedef int solver_handle_t;
#endif

/** The `cublas` struct channels access to efficient basic matrix operations
* provided by either the cuBLAS library (after which it is named), which
* executes on an Nvidia GPU, rocBLAS, which executes on an AMD GPU, or by
Expand All @@ -125,20 +137,15 @@ struct cublas<policy::device, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void geam(double const *A, double *C, int m, int n) {
static void geam(blas_handle_t &handle, double const *A, double *C, int m, int n) {
double const alpha = 1;
double const beta = 0;

MAYBE_UNUSED cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
assert(CUBLAS_STATUS_SUCCESS == stat);

stat = cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, n, m, &alpha, A, m,
&beta, A, m, C, n);
assert(CUBLAS_STATUS_SUCCESS == stat);

cublasDestroy(handle);
}

/** Matrix matrix multiplication
Expand All @@ -149,22 +156,17 @@ struct cublas<policy::device, double> {
* \param n number of columns of B
* \param k number of columns of A, rows of B
*/
static void gemm(const double *A, const double *B, double *C, int m, int k,
int n) {
static void gemm(blas_handle_t &handle, const double *A, const double *B,
double *C, int m, int k, int n) {
int lda = m, ldb = k, ldc = m;
double alpha = 1;
double beta = 0;

MAYBE_UNUSED cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
assert(CUBLAS_STATUS_SUCCESS == stat);

stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A,
lda, B, ldb, &beta, C, ldc);
assert(CUBLAS_STATUS_SUCCESS == stat);

cublasDestroy(handle);
}

/** Matrix vector multiplication
Expand All @@ -174,24 +176,19 @@ struct cublas<policy::device, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void gemv(const double *A, const double *x, double *y, int m,
int n) {
static void gemv(blas_handle_t &handle, const double *A, const double *x,
double *y, int m, int n) {
int lda = m;
double alpha = 1;
double beta = 0;
int incx = 1;
int incy = 1;

MAYBE_UNUSED cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
assert(CUBLAS_STATUS_SUCCESS == stat);

stat = cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, lda, x, incx,
&beta, y, incy);
assert(CUBLAS_STATUS_SUCCESS == stat);

cublasDestroy(handle);
}
};
#elif defined(__HIPCC__)
Expand All @@ -205,21 +202,17 @@ struct cublas<policy::device, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void geam(double const *A, double *C, int m, int n) {
static void geam(blas_handle_t &handle, double const *A, double *C, int m,
int n) {
double const alpha = 1;
double const beta = 0;

MAYBE_UNUSED rocblas_status stat;
rocblas_handle handle;
stat = rocblas_create_handle(&handle);
assert(rocblas_status_success == stat);

stat = rocblas_dgeam(handle, rocblas_operation_transpose,
rocblas_operation_transpose, n, m, &alpha, A, m,
&beta, A, m, C, n);
assert(rocblas_status_success == stat);

rocblas_destroy_handle(handle);
}

/** Matrix matrix multiplication
Expand All @@ -230,23 +223,18 @@ struct cublas<policy::device, double> {
* \param n number of columns of B
* \param k number of columns of A, rows of B
*/
static void gemm(const double *A, const double *B, double *C, int m, int k,
int n) {
static void gemm(blas_handle_t &handle, const double *A, const double *B,
double *C, int m, int k, int n) {
int lda = m, ldb = k, ldc = m;
double alpha = 1;
double beta = 0;

MAYBE_UNUSED rocblas_status stat;
rocblas_handle handle;
stat = rocblas_create_handle(&handle);
assert(rocblas_status_success == stat);

stat = rocblas_dgemm(handle, rocblas_operation_none,
rocblas_operation_none, m, n, k, &alpha, A, lda,
B, ldb, &beta, C, ldc);
assert(rocblas_status_success == stat);

rocblas_destroy_handle(handle);
}

/** Matrix vector multiplication
Expand All @@ -256,24 +244,19 @@ struct cublas<policy::device, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void gemv(const double *A, const double *x, double *y, int m,
int n) {
static void gemv(blas_handle_t &handle, const double *A, const double *x,
double *y, int m, int n) {
int lda = m;
double alpha = 1;
double beta = 0;
int incx = 1;
int incy = 1;

MAYBE_UNUSED rocblas_status stat;
rocblas_handle handle;
stat = rocblas_create_handle(&handle);
assert(rocblas_status_success == stat);

stat = rocblas_dgemv(handle, rocblas_operation_none, m, n, &alpha, A,
lda, x, incx, &beta, y, incy);
assert(rocblas_status_success == stat);

rocblas_destroy_handle(handle);
}
};
#else
Expand All @@ -287,7 +270,8 @@ struct cublas<policy::host, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void geam(double const *A, double *C, int m, int n) {
static void geam(blas_handle_t &handle, double const *A, double *C, int m,
int n) {
// m = m_rows, n = m_cols
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
Expand All @@ -305,8 +289,8 @@ struct cublas<policy::host, double> {
* \param n number of columns of B
* \param k number of columns of A, rows of B
*/
static void gemm(const double *A, const double *B, double *C, int m, int k,
int n) {
static void gemm(blas_handle_t &handle, const double *A, const double *B,
double *C, int m, int k, int n) {
int lda = m, ldb = k, ldc = m;
double alpha = 1;
double beta = 0;
Expand All @@ -323,8 +307,8 @@ struct cublas<policy::host, double> {
* \param m number of rows of A
* \param n number of columns of A
*/
static void gemv(const double *A, const double *x, double *y, int m,
int n) {
static void gemv(blas_handle_t &handle, const double *A, const double *x,
double *y, int m, int n) {
int lda = m;
double alpha = 1;
double beta = 0;
Expand Down Expand Up @@ -367,11 +351,8 @@ struct cusolver<policy::device, double> {
* serves as output for the inverse
* \param N size of the matrix
*/
static void potrf(double *A, double *B, int N) {
static void potrf(solver_handle_t &handle, double *A, double *B, int N) {
MAYBE_UNUSED cusolverStatus_t stat;
cusolverDnHandle_t handle;
stat = cusolverDnCreate(&handle);
assert(CUSOLVER_STATUS_SUCCESS == stat);

int lwork = -1;
stat = cusolverDnDpotrf_bufferSize(handle, CUBLAS_FILL_MODE_UPPER, N, A,
Expand All @@ -392,8 +373,6 @@ struct cusolver<policy::device, double> {
N, thrust_wrapper::raw_pointer_cast(info.data()));
assert(CUSOLVER_STATUS_SUCCESS == stat);
assert(info[0] == 0);

cusolverDnDestroy(handle);
}
};
#elif defined(__HIPCC__)
Expand All @@ -411,15 +390,12 @@ struct cusolver<policy::device, double> {
* serves as output for the inverse
* \param N size of the matrix
*/
static void potrf(double *A, double *B, int N) {
static void potrf(solver_handle_t &handle, double *A, double *B, int N) {
// copy input so that it is available twice
thrust_wrapper::device_vector<double> C(N*N);
thrust_wrapper::copy_n(policy::device::par(), A, N*N, C.begin());

MAYBE_UNUSED rocsolver_status stat;
rocsolver_handle handle;
stat = rocsolver_create_handle(&handle);
assert(rocblas_status_success == stat);

// Cholesky decomposition
thrust_wrapper::device_vector<rocsolver_int> info(1);
Expand All @@ -442,8 +418,6 @@ struct cusolver<policy::device, double> {
thrust_wrapper::raw_pointer_cast(ipiv.data()),
B, N);
assert(rocblas_status_success == stat);

rocblas_destroy_handle(handle);
}
};
#else
Expand All @@ -459,7 +433,7 @@ struct cusolver<policy::host, double> {
* serves as output for the inverse
* \param N size of the matrix
*/
static void potrf(double *A, double *B, int N) {
static void potrf(solver_handle_t &handle, double *A, double *B, int N) {
char uplo = 'U';
int info;

Expand Down Expand Up @@ -525,12 +499,60 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
size_type m_cols;
storage_type m_data;

// Handles for GPU accelerated libraries
static bool libs_init_done;
static internal::blas_handle_t blas_handle;
static internal::solver_handle_t solver_handle;

public:
device_matrix() : m_rows(0), m_cols(0), m_data(m_rows * m_cols) {}

explicit device_matrix(size_type rows, size_type cols)
: m_rows(rows), m_cols(cols), m_data(m_rows * m_cols) {}

/// Checks if GPU libraries are initialized. If not, it initializes them.
static void initialize_gpu_libraries() {
if (!libs_init_done) {
#if defined(__CUDACC__)
MAYBE_UNUSED cublasStatus_t blas_stat;
blas_stat = cublasCreate(&blas_handle);
assert(CUBLAS_STATUS_SUCCESS == blas_stat);
MAYBE_UNUSED cusolverStatus_t solver_stat;
solver_stat = cusolverDnCreate(&solver_handle);
assert(CUSOLVER_STATUS_SUCCESS == solver_stat);
#elif defined(__HIPCC__)
MAYBE_UNUSED rocblas_status blas_stat;
blas_stat = rocblas_create_handle(&blas_handle);
assert(rocblas_status_success == blas_stat);
MAYBE_UNUSED rocsolver_status solver_stat;
solver_stat = rocsolver_create_handle(&solver_handle);
assert(rocblas_status_success == solver_stat);
#else
//prevent empty function warnings
blas_handle = 42;
solver_handle = 42;
#endif
libs_init_done = true;
}
}

// Cleans up GPU libraries
static void destroy_gpu_libraries() {
if (libs_init_done) {
#if defined(__CUDACC__)
cublasDestroy(blas_handle);
cusolverDnDestroy(solver_handle);
#elif defined(__HIPCC__)
rocblas_destroy_handle(blas_handle);
rocblas_destroy_handle(solver_handle);
#else
blas_handle = 0;
solver_handle = 0;
#endif
libs_init_done = false;
}
}

reference operator()(size_type row, size_type col) noexcept {
return m_data[row + col * m_rows];
}
Expand Down Expand Up @@ -568,6 +590,7 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
assert(m_cols == B.m_rows);
device_matrix C(m_rows, B.m_cols);
internal::cublas<Policy, value_type>::gemm(
blas_handle,
thrust_wrapper::raw_pointer_cast(data()),
thrust_wrapper::raw_pointer_cast(B.data()),
thrust_wrapper::raw_pointer_cast(C.data()), m_rows, m_cols, B.m_cols);
Expand All @@ -582,6 +605,7 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
assert(m_cols == x.size());
storage_type y(m_rows);
internal::cublas<Policy, value_type>::gemv(
blas_handle,
thrust_wrapper::raw_pointer_cast(data()),
thrust_wrapper::raw_pointer_cast(x.data()),
thrust_wrapper::raw_pointer_cast(y.data()), m_rows, m_cols);
Expand Down Expand Up @@ -665,6 +689,7 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
"BLAS/LAPACK operations");
device_matrix C(m_cols, m_rows);
internal::cublas<Policy, value_type>::geam(
blas_handle,
thrust_wrapper::raw_pointer_cast(data()),
thrust_wrapper::raw_pointer_cast(C.data()), m_rows, m_cols);
return C;
Expand All @@ -680,6 +705,7 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
device_matrix B = device_matrix::Identity(m_rows, m_cols);

internal::cusolver<Policy, double>::potrf(
solver_handle,
thrust_wrapper::raw_pointer_cast(A.data()),
thrust_wrapper::raw_pointer_cast(B.data()), m_rows);

Expand Down Expand Up @@ -710,6 +736,15 @@ class device_matrix { // NOLINT(bugprone-exception-escape)
/// \}
};

// static members of device_matrix
template <typename T, typename Policy>
bool device_matrix<T,Policy>::libs_init_done = false;
template <typename T, typename Policy>
internal::blas_handle_t device_matrix<T,Policy>::blas_handle = 0;
template <typename T, typename Policy>
internal::solver_handle_t device_matrix<T,Policy>::solver_handle = 0;



/** Read-only reference to another device_matrix object. More accurately, it
* is a separate object that has different routines by which the same region
Expand Down
7 changes: 7 additions & 0 deletions libs/stokesian_dynamics/src/sd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1477,6 +1477,8 @@ struct solver {
assert(a_host.size() == n_part);
vector_type<T> a(a_host.begin(), a_host.end());

device_matrix<T, Policy>::initialize_gpu_libraries();

// TODO: More efficient!
// generate lookup table of all pairs
device_matrix<std::size_t, Policy> part_id(2, n_pair);
Expand Down Expand Up @@ -1554,6 +1556,7 @@ struct solver {
// The square root of R_FU is a byproduct of the matrix inversion,
// which we use for the thermalization
device_matrix<T, Policy> rfu_sqrt;

// 6. invert resistance matrix to obtain mobility matrix
// The grand mobility matrix is now finished.
thrust_wrapper::tie(rfu_inv, rfu_sqrt) = rfu.inverse_and_cholesky();
Expand Down Expand Up @@ -1585,6 +1588,10 @@ struct solver {
std::vector<T> out(u.size());
thrust_wrapper::copy(u.begin(), u.end(), out.begin());

// TODO: move this somewhere else where it is only called once at the
// end of the whole program.
device_matrix<T, Policy>::destroy_gpu_libraries();

return out;
}
};
Expand Down

0 comments on commit 85eb621

Please sign in to comment.