From 0a0e19a3bd756a14d714652c400fe8d2a7cd7de3 Mon Sep 17 00:00:00 2001
From: Critsium
Date: Wed, 15 Jan 2025 09:52:37 -0500
Subject: [PATCH] [Feature] Add vector_mul_vector, vector_div_vector and
 vector_add_vector in blas_connector and added some GPU tests. (#5858)

* Added some other necessary kernels

* Fix compiling bug

* XX

* Finish CUDA kernel

* Fix macros

* Fix typename

* GPU implementation

* Fix bugs

* add vector_add_vector kernel

* Add blas_connector CPU tests

* Fix blas usage

* Add initializer and GPU tests
---
 source/module_base/blas_connector.cpp         | 121 +++++++-
 source/module_base/blas_connector.h           |  32 +-
 source/module_base/kernels/cuda/math_op.cu    |   2 +-
 .../module_base/test/blas_connector_test.cpp  | 291 +++++++++++++++++-
 .../kernels/cuda/math_kernel_op.cu            |   1 +
 5 files changed, 438 insertions(+), 9 deletions(-)

diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp
index 106eb6c4e8..14fb76e2ed 100644
--- a/source/module_base/blas_connector.cpp
+++ b/source/module_base/blas_connector.cpp
@@ -1,4 +1,5 @@
 #include "blas_connector.h"
+#include "macros.h"
 
 #ifdef __DSP
 #include "module_base/kernels/dsp/dsp_connector.h"
@@ -8,12 +9,10 @@
 #ifdef __CUDA
 #include
 #include
-#include
-#include
-#include "module_base/tool_quit.h"
-
 #include "cublas_v2.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/module_device/memory_op.h"
+
 
 namespace BlasUtils{
@@ -652,4 +651,116 @@ void BlasConnector::copy(const long n, const std::complex<double> *a, const int
 	if (device_type == base_device::AbacusDevice_t::CpuDevice) {
 		zcopy_(&n, a, &incx, b, &incy);
 	}
+}
+
+
+template <typename T>
+void BlasConnector::vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
+	using Real = typename GetTypeReal<T>::type;
+	if (device_type == base_device::AbacusDevice_t::CpuDevice) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] * vector2[i];
+		}
+	}
+	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+		hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+#endif
+	}
+}
+
+
+template <typename T>
+void BlasConnector::vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
+	using Real = typename GetTypeReal<T>::type;
+	if (device_type == base_device::AbacusDevice_t::CpuDevice) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] / vector2[i];
+		}
+	}
+	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+		hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+#endif
+	}
+}
+
+void BlasConnector::vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type)
+{
+	if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(float))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+		}
+	}
+	else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+		hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+	}
+}
+
+void BlasConnector::vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type)
+{
+	if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(double))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+		}
+	}
+	else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+		hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+	}
+}
+
+void BlasConnector::vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type)
+{
+	if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<float>))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+		}
+	}
+	else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+		hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+	}
+}
+
+void BlasConnector::vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type)
+{
+	if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<double>))
+#endif
+		for (int i = 0; i < dim; i++)
+		{
+			result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+		}
+	}
+	else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+		hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+	}
+}
\ No newline at end of file
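For orientation, here is a minimal sketch of how the new element-wise helpers could be called on the CPU path. It is not part of the patch: the buffer names are illustrative, and since the templated vector_mul_vector/vector_div_vector are defined in the .cpp file, callers additionally rely on explicit instantiations that are not visible in these hunks.

// Illustrative only, not part of the patch. Exercises the CPU branch of
// the new helpers; the device argument defaults to CpuDevice.
#include "module_base/blas_connector.h"
#include <vector>

void elementwise_demo()
{
	const int dim = 4;
	std::vector<double> a = {1.0, 2.0, 3.0, 4.0};
	std::vector<double> b = {2.0, 2.0, 2.0, 2.0};
	std::vector<double> out(dim);

	// out[i] = a[i] * b[i]
	BlasConnector::vector_mul_vector(dim, out.data(), a.data(), b.data());
	// out[i] = a[i] / b[i]
	BlasConnector::vector_div_vector(dim, out.data(), a.data(), b.data());
	// out[i] = 2.0 * a[i] + 0.5 * b[i]
	BlasConnector::vector_add_vector(dim, out.data(), a.data(), 2.0, b.data(), 0.5);
}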
diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h
index 7675429520..387ff32a46 100644
--- a/source/module_base/blas_connector.h
+++ b/source/module_base/blas_connector.h
@@ -3,6 +3,7 @@
 #include <complex>
 #include "module_base/module_device/types.h"
+#include "macros.h"
 
 // These still need to be linked in the header file
 // Because quite a lot of code will directly use the original cblas kernels.
@@ -303,9 +304,38 @@ class BlasConnector
 	static
 	void copy(const long n, const std::complex<double> *a, const int incx, std::complex<double> *b, const int incy, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
 
-	// A is symmetric
+	// There are some other operators needed, so they are implemented manually here.
+	template <typename T>
+	static
+	void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+	template <typename T>
+	static
+	void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+	// result = constant1 * vector1 + constant2 * vector2
+	static
+	void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+	static
+	void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+	static
+	void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+	static
+	void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
 };
 
+#ifdef __CUDA
+
+namespace BlasUtils{
+	void createGpuBlasHandle();
+	void destoryBLAShandle();
+}
+
+#endif
+
 
 // If GATHER_INFO is defined, the original function is replaced with a "i" suffix,
 // preventing changes on the original code.
 // The real function call is at gather_math_lib_info.cpp
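All four vector_add_vector overloads compute result = constant1 * vector1 + constant2 * vector2, with the constants kept real even for complex vectors, mirroring hsolver::constantvector_addORsub_constantVector_op. The same contract could be written as a single template; a hypothetical sketch (not in the patch), assuming the GetTypeReal trait that macros.h supplies:

// Hypothetical alternative, not in the patch: one template instead of four
// overloads. GetTypeReal<T>::type is T for float/double and the underlying
// real type for std::complex<float>/std::complex<double>.
template <typename T>
void vector_add_vector_sketch(const int& dim, T* result,
	const T* vector1, const typename GetTypeReal<T>::type constant1,
	const T* vector2, const typename GetTypeReal<T>::type constant2)
{
	for (int i = 0; i < dim; i++) {
		result[i] = vector1[i] * constant1 + vector2[i] * constant2;
	}
}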
diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu
index cee820414e..cef3c04f3f 100644
--- a/source/module_base/kernels/cuda/math_op.cu
+++ b/source/module_base/kernels/cuda/math_op.cu
@@ -1,4 +1,4 @@
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
 
 #include "module_base/kernels/math_op.h"
 
 #include
 
diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp
index cf445ecb7a..34f4cb51bb 100644
--- a/source/module_base/test/blas_connector_test.cpp
+++ b/source/module_base/test/blas_connector_test.cpp
@@ -1,4 +1,5 @@
 #include "../blas_connector.h"
+#include "../module_device/memory_op.h"
 #include "gtest/gtest.h"
 #include
 
@@ -84,8 +85,7 @@ TEST(blas_connector, Scal) {
 	};
 	for (int i = 0; i < size; i++)
 		answer[i] = result[i] * scale;
-	BlasConnector bs;
-	bs.scal(size,scale,result,incx);
+	BlasConnector::scal(size,scale,result,incx);
 	// incx is the spacing between elements of result
 	for (int i = 0; i < size; i++) {
 		EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
@@ -93,6 +93,33 @@
 	}
 }
 
+#ifdef __CUDA
+
+TEST(blas_connector, ScalGpu) {
+	const int size = 8;
+	const std::complex<double> scale = {2, 3};
+	const int incx = 1;
+	std::complex<double> result[8], answer[8];
+	std::complex<double>* result_gpu = nullptr;
+	resmem_zd_op()(gpu_ctx, result_gpu, 8 * sizeof(std::complex<double>));
+	for (int i = 0; i < size; i++) {
+		result[i] = std::complex<double>{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	};
+	for (int i = 0; i < size; i++)
+		answer[i] = result[i] * scale;
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result, sizeof(std::complex<double>) * 8);
+	BlasConnector::scal(size, scale, result_gpu, incx, base_device::AbacusDevice_t::GpuDevice);
+	syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result, result_gpu, sizeof(std::complex<double>) * 8);
+	delmem_zd_op()(gpu_ctx, result_gpu);
+	// incx is the spacing between elements of result
+	for (int i = 0; i < size; i++) {
+		EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+		EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+	}
+}
+
+#endif
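The GPU tests drive device memory through the helper functors from module_device/memory_op.h. The alias names encode the element type and the devices involved (z for std::complex<double>, h for host, d for device); as a rough guide, the aliases used in these tests presumably expand along these lines:

// Presumed shape of the aliases used in these tests (the authoritative
// definitions live in module_base/module_device/memory_op.h):
using resmem_zd_op = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
using delmem_zd_op = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
using syncmem_z2z_h2d_op = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_z2z_d2h_op = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;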
 
 TEST(blas_connector, daxpy_) {
 	typedef double T;
@@ -136,6 +163,67 @@ TEST(blas_connector, zaxpy_) {
 	}
 }
 
+TEST(blas_connector, Axpy) {
+	typedef std::complex<double> T;
+	const int size = 8;
+	const T scale = {2, 3};
+	const int incx = 1;
+	const int incy = 1;
+	std::array<T, size> x_const, result, answer;
+	std::generate(x_const.begin(), x_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result.begin(), result.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	for (int i = 0; i < size; i++)
+		answer[i] = x_const[i] * scale + result[i];
+	BlasConnector::axpy(size, scale, x_const.data(), incx, result.data(), incy);
+	for (int i = 0; i < size; i++) {
+		EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+		EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+	}
+}
+
+#ifdef __CUDA
+
+TEST(blas_connector, AxpyGpu) {
+	typedef std::complex<double> T;
+	const int size = 8;
+	const T scale = {2, 3};
+	const int incx = 1;
+	const int incy = 1;
+	std::array<T, size> x_const, result, answer;
+	T* x_gpu = nullptr;
+	T* result_gpu = nullptr;
+	resmem_zd_op()(gpu_ctx, x_gpu, size * sizeof(std::complex<double>));
+	resmem_zd_op()(gpu_ctx, result_gpu, size * sizeof(std::complex<double>));
+	std::generate(x_const.begin(), x_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result.begin(), result.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	for (int i = 0; i < size; i++)
+		answer[i] = x_const[i] * scale + result[i];
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size);
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, x_gpu, x_const.data(), sizeof(std::complex<double>) * size);
+	BlasConnector::axpy(size, scale, x_gpu, incx, result_gpu, incy, base_device::AbacusDevice_t::GpuDevice);
+	syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size);
+	delmem_zd_op()(gpu_ctx, result_gpu);
+	delmem_zd_op()(gpu_ctx, x_gpu);
+	for (int i = 0; i < size; i++) {
+		EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+		EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+	}
+}
+
+#endif
+
 
 TEST(blas_connector, dcopy_) {
 	typedef double T;
 	long const size = 8;
@@ -313,6 +401,84 @@ TEST(blas_connector, zgemv_) {
 	}
 }
 
+TEST(blas_connector, Gemv) {
+	typedef std::complex<double> T;
+	const char transa_m = 'N';
+	const char transa_n = 'T';
+	const char transa_h = 'C';
+	const int size_m = 3;
+	const int size_n = 4;
+	const T alpha_const = {2, 3};
+	const T beta_const = {3, 4};
+	const int lda = 5;
+	const int incx = 1;
+	const int incy = 1;
+	std::array<T, size_m> x_const_m, x_const_c, result_m, answer_m, c_dot_m{};
+	std::array<T, size_n> x_const_n, result_n, result_c, answer_n, answer_c,
+		c_dot_n{}, c_dot_c{};
+	std::generate(x_const_n.begin(), x_const_n.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result_n.begin(), result_n.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(x_const_m.begin(), x_const_m.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result_m.begin(), result_m.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	// x_const_c and result_c feed the 'C' (conjugate transpose) case below;
+	// they must be initialized too, or the test reads indeterminate values.
+	std::generate(x_const_c.begin(), x_const_c.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result_c.begin(), result_c.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::array<T, size_n * lda> a_const;
+	std::generate(a_const.begin(), a_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	for (int i = 0; i < size_m; i++) {
+		for (int j = 0; j < size_n; j++) {
+			c_dot_m[i] += a_const[i + j * lda] * x_const_n[j];
+		}
+		answer_m[i] = alpha_const * c_dot_m[i] + beta_const * result_m[i];
+	}
+	BlasConnector::gemv(transa_m, size_m, size_n, alpha_const, a_const.data(), lda,
+		x_const_n.data(), incx, beta_const, result_m.data(), incy);
+
+	for (int j = 0; j < size_n; j++) {
+		for (int i = 0; i < size_m; i++) {
+			c_dot_n[j] += a_const[i + j * lda] * x_const_m[i];
+		}
+		answer_n[j] = alpha_const * c_dot_n[j] + beta_const * result_n[j];
+	}
+	BlasConnector::gemv(transa_n, size_m, size_n, alpha_const, a_const.data(), lda,
+		x_const_m.data(), incx, beta_const, result_n.data(), incy);
+
+	for (int j = 0; j < size_n; j++) {
+		for (int i = 0; i < size_m; i++) {
+			c_dot_c[j] += conj(a_const[i + j * lda]) * x_const_c[i];
+		}
+		answer_c[j] = alpha_const * c_dot_c[j] + beta_const * result_c[j];
+	}
+	BlasConnector::gemv(transa_h, size_m, size_n, alpha_const, a_const.data(), lda,
+		x_const_c.data(), incx, beta_const, result_c.data(), incy);
+
+	for (int i = 0; i < size_m; i++) {
+		EXPECT_DOUBLE_EQ(answer_m[i].real(), result_m[i].real());
+		EXPECT_DOUBLE_EQ(answer_m[i].imag(), result_m[i].imag());
+	}
+	for (int j = 0; j < size_n; j++) {
+		EXPECT_DOUBLE_EQ(answer_n[j].real(), result_n[j].real());
+		EXPECT_DOUBLE_EQ(answer_n[j].imag(), result_n[j].imag());
+	}
+	for (int j = 0; j < size_n; j++) {
+		EXPECT_DOUBLE_EQ(answer_c[j].real(), result_c[j].real());
+		EXPECT_DOUBLE_EQ(answer_c[j].imag(), result_c[j].imag());
+	}
+}
+
 
 TEST(blas_connector, dgemm_) {
 	typedef double T;
 	const char transa_m = 'N';
@@ -404,7 +570,128 @@ TEST(blas_connector, zgemm_) {
 	}
 }
 
+TEST(blas_connector, Gemm) {
+	typedef std::complex<double> T;
+	const char transa_m = 'N';
+	const char transb_m = 'N';
+	const int size_m = 3;
+	const int size_n = 4;
+	const int size_k = 5;
+	const T alpha_const = {2, 3};
+	const T beta_const = {3, 4};
+	const int lda = 6;
+	const int ldb = 5;
+	const int ldc = 4;
+	std::array<T, size_k * lda> a_const;
+	std::array<T, size_n * ldb> b_const;
+	std::array<T, size_n * ldc> c_dot{}, answer, result;
+	std::generate(a_const.begin(), a_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(b_const.begin(), b_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result.begin(), result.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	for (int i = 0; i < size_m; i++) {
+		for (int j = 0; j < size_n; j++) {
+			for (int k = 0; k < size_k; k++) {
+				c_dot[i + j * ldc] +=
+					a_const[i + k * lda] * b_const[k + j * ldb];
+			}
+			answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] +
+				beta_const * result[i + j * ldc];
+		}
+	}
+	BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const,
+		a_const.data(), lda, b_const.data(), ldb, beta_const,
+		result.data(), ldc);
+
+	for (int i = 0; i < size_m; i++)
+		for (int j = 0; j < size_n; j++) {
+			EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(),
+				result[i + j * ldc].real());
+			EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(),
+				result[i + j * ldc].imag());
+		}
+}
+
+#ifdef __CUDA
+
+TEST(blas_connector, GemmGpu) {
+	typedef std::complex<double> T;
+	const char transa_m = 'N';
+	const char transb_m = 'N';
+	const int size_m = 3;
+	const int size_n = 4;
+	const int size_k = 5;
+	const T alpha_const = {2, 3};
+	const T beta_const = {3, 4};
+	const int lda = 6;
+	const int ldb = 5;
+	const int ldc = 4;
+	std::array<T, size_k * lda> a_const;
+	std::array<T, size_n * ldb> b_const;
+	std::array<T, size_n * ldc> c_dot{}, answer, result;
+	std::complex<double>* a_gpu = nullptr;
+	std::complex<double>* b_gpu = nullptr;
+	std::complex<double>* result_gpu = nullptr;
+	resmem_zd_op()(gpu_ctx, a_gpu, size_k * lda * sizeof(std::complex<double>));
+	resmem_zd_op()(gpu_ctx, b_gpu, size_n * ldb * sizeof(std::complex<double>));
+	resmem_zd_op()(gpu_ctx, result_gpu, size_n * ldc * sizeof(std::complex<double>));
+	std::generate(a_const.begin(), a_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(b_const.begin(), b_const.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	std::generate(result.begin(), result.end(), []() {
+		return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+			static_cast<double>(std::rand() / double(RAND_MAX))};
+	});
+	for (int i = 0; i < size_m; i++) {
+		for (int j = 0; j < size_n; j++) {
+			for (int k = 0; k < size_k; k++) {
+				c_dot[i + j * ldc] +=
+					a_const[i + k * lda] * b_const[k + j * ldb];
+			}
+			answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] +
+				beta_const * result[i + j * ldc];
+		}
+	}
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, a_gpu, a_const.data(), sizeof(std::complex<double>) * size_k * lda);
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, b_gpu, b_const.data(), sizeof(std::complex<double>) * size_n * ldb);
+	syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size_n * ldc);
+	BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const,
+		a_gpu, lda, b_gpu, ldb, beta_const,
+		result_gpu, ldc, base_device::AbacusDevice_t::GpuDevice);
+	syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size_n * ldc);
+	delmem_zd_op()(gpu_ctx, result_gpu);
+	delmem_zd_op()(gpu_ctx, a_gpu);
+	delmem_zd_op()(gpu_ctx, b_gpu);
+	for (int i = 0; i < size_m; i++)
+		for (int j = 0; j < size_n; j++) {
+			EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(),
+				result[i + j * ldc].real());
+			EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(),
+				result[i + j * ldc].imag());
+		}
+}
+
+#endif
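Both Gemm tests check gemm_cm against a straightforward triple loop. gemm_cm assumes column-major storage, so element (i, j) of a matrix with leading dimension ld sits at data[i + j * ld]; a hypothetical accessor (not in the patch) makes the reference computation explicit:

// Hypothetical column-major accessor, illustrative only.
template <typename T>
inline T& at_cm(T* data, const int i, const int j, const int ld)
{
	return data[i + j * ld];
}

// The reference update in the Gemm tests is then equivalent to:
//   at_cm(answer.data(), i, j, ldc) =
//       alpha_const * at_cm(c_dot.data(), i, j, ldc)
//       + beta_const * at_cm(result.data(), i, j, ldc);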
 
 int main(int argc, char **argv) {
+#ifdef __CUDA
+	std::cout << "Initializing CublasHandle..." << std::endl;
+	BlasUtils::createGpuBlasHandle();
+	std::cout << "Initializing CublasHandle Done." << std::endl;
+#endif
 	testing::InitGoogleTest(&argc, argv);
 	return RUN_ALL_TESTS();
 }
 
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..70ed5ebf0b 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -1043,6 +1043,7 @@ template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
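Putting the pieces together, the GPU path added by this patch follows a create-handle / copy-in / compute / copy-out / tear-down pattern. A minimal end-to-end sketch, assuming a CUDA build; the resmem_dd_op / syncmem_d2d_* / delmem_dd_op aliases for plain double are hypothetical here, named by analogy with the complex<double> aliases the tests use:

// Illustrative sketch only. The double-on-device aliases below are assumed
// by analogy with resmem_zd_op and friends; they are not shown in this patch.
#include "module_base/blas_connector.h"
#include "module_base/module_device/memory_op.h"

void gpu_axpby_demo(const double* h_a, const double* h_b, double* h_out, const int dim)
{
	BlasUtils::createGpuBlasHandle();   // must precede any GPU BlasConnector call

	double* d_a = nullptr;
	double* d_b = nullptr;
	double* d_out = nullptr;
	resmem_dd_op()(gpu_ctx, d_a, dim * sizeof(double));
	resmem_dd_op()(gpu_ctx, d_b, dim * sizeof(double));
	resmem_dd_op()(gpu_ctx, d_out, dim * sizeof(double));
	syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_a, h_a, dim * sizeof(double));
	syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_b, h_b, dim * sizeof(double));

	// d_out[i] = 1.0 * d_a[i] - 1.0 * d_b[i], evaluated on the GPU
	BlasConnector::vector_add_vector(dim, d_out, d_a, 1.0, d_b, -1.0,
		base_device::AbacusDevice_t::GpuDevice);

	syncmem_d2d_d2h_op()(cpu_ctx, gpu_ctx, h_out, d_out, dim * sizeof(double));
	delmem_dd_op()(gpu_ctx, d_a);
	delmem_dd_op()(gpu_ctx, d_b);
	delmem_dd_op()(gpu_ctx, d_out);

	BlasUtils::destoryBLAShandle();     // release the cuBLAS handle when done
}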