From 37cc1a3631f3227e929ae917bcd62fbd42fafd11 Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sun, 3 Dec 2023 00:13:21 +0800 Subject: [PATCH] Fix cuda and rocm bugs (#3286) * fix cuda error * fix rocm implementation * fix cuda compilation within hsolver * fix-rocm-implementation * fix-typo --- CMakeLists.txt | 5 +- examples/scf/pw_Si2/INPUT | 19 +- .../ATen/kernels/cuda/linalg.cu | 10 +- .../rocm/{blas_op.hip.cu => blas.hip.cu} | 6 +- .../rocm/{lapack_op.hip.cu => lapack.hip.cu} | 6 +- .../ATen/kernels/rocm/linalg.hip.cu | 475 ++++++++++++++++++ .../ATen/kernels/rocm/linalg_op.hip.cu | 304 ----------- .../ATen/kernels/rocm/memory.hip.cu | 219 ++++++++ .../ATen/kernels/rocm/memory_op.hip.cu | 211 -------- .../module_hcontainer/transfer.h | 6 +- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 29 +- .../kernels/rocm/stress_op.hip.cu | 19 +- source/module_hsolver/diago_cg.cpp | 12 +- source/module_hsolver/diago_cg.h | 1 - .../kernels/cuda/math_kernel_op.cu | 83 ++- .../kernels/rocm/dngvd_op.hip.cu | 4 + .../kernels/rocm/math_kernel_op.hip.cu | 102 ++-- 17 files changed, 841 insertions(+), 670 deletions(-) rename source/module_base/module_container/ATen/kernels/rocm/{blas_op.hip.cu => blas.hip.cu} (98%) rename source/module_base/module_container/ATen/kernels/rocm/{lapack_op.hip.cu => lapack.hip.cu} (98%) create mode 100644 source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu delete mode 100644 source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu create mode 100644 source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu delete mode 100644 source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index b3452043de..f8fb0914df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,10 @@ endif() # Warning: CMake add support to HIP in version 3.21. This is rather a new version. # Use cmake with AMD-ROCm: https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html if(USE_ROCM) - if (NOT DEFINED ROCM_PATH ) + if(COMMIT_INFO) + message(FATAL_ERROR "Commit info is not supported on ROCm.") + endif() + if(NOT DEFINED ROCM_PATH ) set (ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory." ) endif () if(NOT DEFINED HIP_PATH) diff --git a/examples/scf/pw_Si2/INPUT b/examples/scf/pw_Si2/INPUT index 786cbed3fa..9dd1930909 100644 --- a/examples/scf/pw_Si2/INPUT +++ b/examples/scf/pw_Si2/INPUT @@ -1,9 +1,12 @@ INPUT_PARAMETERS -#Parameters (General) -pseudo_dir ../../../tests/PP_ORB -symmetry 1 -#Parameters (Accuracy) -basis_type pw -ecutwfc 60 -scf_thr 1e-8 -scf_nmax 100 \ No newline at end of file +#Parameters (General) +pseudo_dir ../../../tests/PP_ORB +symmetry 1 +#Parameters (Accuracy) +basis_type pw +ecutwfc 60 +scf_thr 1e-7 +scf_nmax 100 +device cpu +ks_solver cg +precision double diff --git a/source/module_base/module_container/ATen/kernels/cuda/linalg.cu b/source/module_base/module_container/ATen/kernels/cuda/linalg.cu index 97ccb514be..c59a75deb0 100644 --- a/source/module_base/module_container/ATen/kernels/cuda/linalg.cu +++ b/source/module_base/module_container/ATen/kernels/cuda/linalg.cu @@ -31,7 +31,7 @@ __global__ void do_add_kernel( T* z) { // Perform add operation for the specified range [begin, end) in the output Tensor. 
- for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] + beta * y[o_idx]; } @@ -44,7 +44,7 @@ __global__ void do_mul_kernel( const T* x, T* y) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. y[o_idx] = alpha * x[o_idx]; } @@ -58,7 +58,7 @@ __global__ void do_mul_kernel( const T* y, T* z) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] * y[o_idx]; } @@ -72,7 +72,7 @@ __global__ void do_div_kernel( const T* y, T* z) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] / y[o_idx]; } @@ -88,7 +88,7 @@ __global__ void do_fma_kernel( const T* z, T* out) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. out[o_idx] = alpha * x[o_idx] * y[o_idx] + beta * z[o_idx]; } diff --git a/source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu similarity index 98% rename from source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu rename to source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu index d0f0525019..a505c6267c 100644 --- a/source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu +++ b/source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu @@ -1,11 +1,11 @@ -#include +#include #include #include #include namespace container { -namespace op { +namespace kernels { static hipblasHandle_t hipblas_handle = nullptr; @@ -241,5 +241,5 @@ template struct blas_gemm_batched_strided; template struct blas_gemm_batched_strided, DEVICE_GPU>; template struct blas_gemm_batched_strided, DEVICE_GPU>; -} // namespace op +} // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu similarity index 98% rename from source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu rename to source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu index 3b6cf2ec6d..cc0a77cb4c 100644 --- a/source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu +++ b/source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -7,7 +7,7 @@ #include namespace container { -namespace op { +namespace kernels { static hipsolverHandle_t hipsolver_handle = nullptr; @@ -155,5 +155,5 @@ template struct lapack_dngvd; 
template struct lapack_dngvd, DEVICE_GPU>; template struct lapack_dngvd, DEVICE_GPU>; -} // namespace op +} // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu new file mode 100644 index 0000000000..ef43a5408c --- /dev/null +++ b/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu @@ -0,0 +1,475 @@ +#include +#include +#include +#include + +#include +#include + +namespace container { +namespace kernels { + +template +__device__ static inline +T conj(T& in) { + return in; +} + +template +__device__ static inline +thrust::complex conj(thrust::complex& in) { + return thrust::conj(in); +} + +template +__global__ void do_add_kernel( + const int num_element, + const T alpha, + const T* x, + const T beta, + const T* y, + T* z) +{ + // Perform add operation for the specified range [begin, end) in the output Tensor. + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] + beta * y[o_idx]; + } +} + +template +__global__ void do_mul_kernel( + const int num_element, + const T alpha, + const T* x, + T* y) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + y[o_idx] = alpha * x[o_idx]; + } +} + +template +__global__ void do_mul_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + T* z) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] * y[o_idx]; + } +} + +template +__global__ void do_div_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + T* z) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] / y[o_idx]; + } +} + +template +__global__ void do_fma_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + const T beta, + const T* z, + T* out) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + out[o_idx] = alpha * x[o_idx] * y[o_idx] + beta * z[o_idx]; + } +} + +template +__global__ void do_transpose_kernel( + int ndim, + int64_t num_elements, + const T* p, + const int* perm, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + for (int64_t o_idx = 0; o_idx < num_elements; o_idx++) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculate the ratio of the current output Tensor index 'current_o_idx' in the current dimension. + const int64_t ratio = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. 
+ current_o_idx -= ratio * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + i_idx += ratio * in_strides[perm[ii]]; + } + // Check if conjugation is needed. + if (Conjugate) { + // Assign the conjugate value of the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = kernels::conj(p[i_idx]); + } else { + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx]; + } + } +} + +template +__global__ void do_stride_kernel( + int ndim, + int64_t size, + const T* p, + const int64_t* stride, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + // Perform stride operation for the specified range [begin, end) in the output Tensor. + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculate the index in the current dimension. + // It is natural to view a tensor as a multi-dimentional array. + const int64_t current_dim_idx = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. + current_o_idx -= current_dim_idx * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + i_idx += current_dim_idx * stride[ii] * in_strides[ii]; + } + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx]; + } +} + +template +__global__ void do_inflate_kernel( + int ndim, + int64_t size, + const T* p, + const int64_t* stride, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + // Perform stride operation for the specified range [begin, end) in the output Tensor. + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + bool valid = true; + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculte the ratio of the current output Tensor index 'current_o_idx' in the current dimension. + const int64_t current_dim_idx = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. + current_o_idx -= current_dim_idx * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + if (current_dim_idx % stride[ii] == 0) { + i_idx += (current_dim_idx / stride[ii]) * in_strides[ii]; + } + else { + valid = false; + break; + } + } + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx] * static_cast(valid ? 
1.0 : 0.0); + } +} + +template +__global__ void do_reduce_kernel( + int64_t size, + int64_t inner_most_dim, + const T* p, + T* q) +{ + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + T sum = 0; + for (int64_t i_idx = o_idx * inner_most_dim; i_idx < inner_most_dim + o_idx * inner_most_dim; i_idx++) { + sum += p[i_idx]; + } + q[o_idx] = sum; + } +} + +template +static std::vector compute_stride(const std::vector& shape) { + int ndims = shape.size(); + std::vector strides(ndims); + T stride = 1; + + auto it = shape.end(); // Start from the last element + for (int ii = ndims - 1; ii >= 0; ii--) { + it--; + strides[ii] = stride; + stride *= static_cast(*it); + } + return std::move(strides); +} + +template +void add::operator()(const int& num_element, const T& alpha, const T* x, const T& beta, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + auto beta_ = *reinterpret_cast(&beta); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_add_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), + beta_, reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void mul::operator()(const int& num_element, const T& alpha, const T* x, T* y) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_mul_kernel<<>> ( + num_element, alpha_, + reinterpret_cast(x), reinterpret_cast(y)); +} + +template +void mul::operator()(const int& num_element, const T& alpha, const T* x, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_mul_kernel<<>> ( + num_element, alpha_, + reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void div::operator()(const int& num_element, const T& alpha, const T* x, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_div_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void fma::operator()(const int& num_element, const T& alpha, const T* x, const T* y, const T& beta, const T* z, T* out) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + auto beta_ = *reinterpret_cast(&beta); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_fma_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), reinterpret_cast(y), + beta_, reinterpret_cast(z), reinterpret_cast(out)); +} + +template +void transpose::operator()( + const std::vector &perm, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? 
num_elements : 0; + + Tensor t_perm(DataType::DT_INT, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_perm.data(), perm.data(), perm.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_transpose_kernel<<>> ( + ndim, num_elements, p_, t_perm.data(), + t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void stride::operator()( + const std::vector &stride, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? num_elements : 0; + + Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_stride.data(), stride.data(), stride.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_stride_kernel<<>> ( + ndim, num_elements, p_, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void inflate::operator()( + const std::vector &inflate, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? 
num_elements : 0; + + Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_stride.data(), inflate.data(), inflate.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_inflate_kernel<<>> ( + ndim, num_elements, p_, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void reduce::operator()( + const int64_t &num_element, + const int64_t &inner_most_dim, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (static_cast(num_element) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_reduce_kernel<<>> ( + num_element, inner_most_dim, p_, q_); +} + +template struct add; +template struct add; +template struct add; +template struct add; +template struct add, DEVICE_GPU>; +template struct add, DEVICE_GPU>; + +template struct mul; +template struct mul; +template struct mul; +template struct mul; +template struct mul, DEVICE_GPU>; +template struct mul, DEVICE_GPU>; + +template struct div; +template struct div; +template struct div; +template struct div; +template struct div, DEVICE_GPU>; +template struct div, DEVICE_GPU>; + +template struct fma; +template struct fma; +template struct fma; +template struct fma; +template struct fma, DEVICE_GPU>; +template struct fma, DEVICE_GPU>; + +template struct transpose; +template struct transpose; +template struct transpose; +template struct transpose; +template struct transpose, DEVICE_GPU>; +template struct transpose, DEVICE_GPU>; + +template struct stride; +template struct stride; +template struct stride; +template struct stride; +template struct stride, DEVICE_GPU>; +template struct stride, DEVICE_GPU>; + +template struct inflate; +template struct inflate; +template struct inflate; +template struct inflate; +template struct inflate, DEVICE_GPU>; +template struct inflate, DEVICE_GPU>; + +template struct reduce; +template struct reduce; +template struct reduce; +template struct reduce; +template struct reduce, DEVICE_GPU>; +template struct reduce, DEVICE_GPU>; + +} // namespace kernels +} // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu deleted file mode 100644 index 4627f14f28..0000000000 --- a/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu +++ /dev/null @@ -1,304 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -namespace container { -namespace op { - -template -__device__ static inline -T conj(T& in) { - return in; -} - -template -__device__ static inline -thrust::complex conj(thrust::complex& in) { - return thrust::conj(in); -} - -template -__global__ void do_transpose_kernel( - int ndim, - int64_t size, - const T* p, - const int* perm, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // 
Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculate the current_dim_idx of the current output Tensor index 'current_o_idx' in the current dimension. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. - i_idx += current_dim_idx * in_strides[perm[ii]]; - } - // Check if conjugation is needed. - if (Conjugate) { - // Assign the conjugate value of the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = op::conj(p[i_idx]); - } else { - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx]; - } - } -} - -template -__global__ void do_stride_kernel( - int ndim, - int64_t size, - const T* p, - const int64_t* stride, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - // Perform stride operation for the specified range [begin, end) in the output Tensor. - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculate the index in the current dimension. - // It is natural to view a tensor as a multi-dimentional array. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. - i_idx += current_dim_idx * stride[ii] * in_strides[ii]; - } - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx]; - } -} - -template -__global__ void do_inflate_kernel( - int ndim, - int64_t size, - const T* p, - const int64_t* stride, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - // Perform stride operation for the specified range [begin, end) in the output Tensor. - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - bool valid = true; - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculte the ratio of the current output Tensor index 'current_o_idx' in the current dimension. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. 
- if (current_dim_idx % stride[ii] == 0) { - i_idx += (current_dim_idx / stride[ii]) * in_strides[ii]; - } - else { - valid = false; - break; - } - } - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx] * static_cast(valid ? 1.0 : 0.0); - } -} - -template -__global__ void do_reduce_kernel( - int ndim, - int64_t size, - int64_t inner_most_dim, - const T* p, - T* q) -{ - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - T sum = 0; - for (int64_t i_idx = o_idx * inner_most_dim; i_idx < inner_most_dim + o_idx * inner_most_dim; i_idx++) { - sum += p[i_idx]; - } - q[o_idx] = sum; - } -} - - -template -static std::vector compute_stride(const std::vector& shape) { - int ndims = shape.size(); - std::vector strides(ndims); - T stride = 1; - - auto it = shape.end(); // Start from the last element - for (int ii = ndims - 1; ii >= 0; ii--) { - it--; - strides[ii] = stride; - stride *= static_cast(*it); - } - return std::move(strides); -} - -template -struct transpose_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const std::vector& perm, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_perm(DataType::DT_INT, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_perm.data(), perm.data(), perm.size()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_transpose_kernel<<>> ( - ndim, output_size, p, t_perm.data(), - t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct stride_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const TensorShape& stride, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_stride.data(), stride.dims().data(), t_stride.NumElements()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_stride_kernel<<>> ( - ndim, output_size, p, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct inflate_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const TensorShape& stride, - Tensor& 
output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_stride.data(), stride.dims().data(), t_stride.NumElements()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_inflate_kernel<<>> ( - ndim, output_size, p, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct reduce_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const int64_t& inner_most_dim, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_reduce_kernel<<>> ( - ndim, output_size, inner_most_dim, p, q); - } -}; - -template struct transpose_op; -template struct transpose_op; -template struct transpose_op; -template struct transpose_op; -template struct transpose_op, DEVICE_GPU>; -template struct transpose_op, DEVICE_GPU>; - -template struct stride_op; -template struct stride_op; -template struct stride_op; -template struct stride_op; -template struct stride_op, DEVICE_GPU>; -template struct stride_op, DEVICE_GPU>; - -template struct inflate_op; -template struct inflate_op; -template struct inflate_op; -template struct inflate_op; -template struct inflate_op, DEVICE_GPU>; -template struct inflate_op, DEVICE_GPU>; - -template struct reduce_op; -template struct reduce_op; -template struct reduce_op; -template struct reduce_op; -template struct reduce_op, DEVICE_GPU>; -template struct reduce_op, DEVICE_GPU>; - -} // namespace op -} // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu new file mode 100644 index 0000000000..b45a7cfd56 --- /dev/null +++ b/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu @@ -0,0 +1,219 @@ +#include +#include + +#include +#include + +namespace container { +namespace kernels { + +template +__global__ void do_set_memory( + T* out, + const T var, + const size_t size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + out[idx] = var; +} + +template +__global__ void do_cast_memory( + T_out* out, + const T_in* in, + const int size) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + out[idx] = static_cast(in[idx]); +} + +template +__global__ void do_cast_memory( + std::complex* out, + const std::complex* in, + const int size) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + auto* _out = reinterpret_cast*>(out); + const auto* _in = reinterpret_cast*>(in); + _out[idx] = 
static_cast>(_in[idx]); +} + +template +void resize_memory::operator()( + T*& arr, + const size_t& size, + const char* record_in) +{ + if (arr != nullptr) { + delete_memory()(arr); + } + hipMalloc((void **)&arr, sizeof(T) * size); +} + +template +void set_memory::operator()( + T* arr, + const T& var, + const size_t& size) +{ + const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_set_memory<<>>(arr, var, size); +} + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToHost); + } +}; + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyHostToDevice); + } +}; + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToDevice); + } +}; + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + const int block = static_cast((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); + do_cast_memory<<>>(arr_out, arr_in, size); + } +}; + + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + T_in * arr = nullptr; + hipMalloc((void **)&arr, sizeof(T_in) * size); + hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyHostToDevice); + const int block = static_cast((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); + do_cast_memory<<>>(arr_out, arr, size); + hipFree(arr); + } +}; + + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + auto * arr = (T_in*) malloc(sizeof(T_in) * size); + hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyDeviceToHost); + for (int ii = 0; ii < size; ii++) { + arr_out[ii] = static_cast(arr[ii]); + } + free(arr); + } +}; + +template +void delete_memory::operator() ( + T* arr) +{ + hipFree(arr); +} + +template struct resize_memory; +template struct resize_memory; +template struct resize_memory; +template struct resize_memory; +template struct resize_memory, container::DEVICE_GPU>; +template struct resize_memory, container::DEVICE_GPU>; + +template struct set_memory; +template struct set_memory; +template struct set_memory; +template struct set_memory; +template struct set_memory, container::DEVICE_GPU>; +template struct set_memory, container::DEVICE_GPU>; + +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_GPU>; + 
+template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; + +template struct delete_memory; +template struct delete_memory; +template struct delete_memory; +template struct delete_memory; +template struct delete_memory, container::DEVICE_GPU>; +template struct delete_memory, container::DEVICE_GPU>; + +} // end of namespace kernels +} // end of namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu deleted file mode 100644 index 53af15e8f8..0000000000 --- a/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu +++ /dev/null @@ -1,211 +0,0 @@ -#include -#include - -#include -#include - -namespace container { -namespace op { - -template -__global__ void set_memory( - T* out, - const T var, - const size_t size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - out[idx] = var; -} - -template -__global__ void cast_memory( - T_out* out, - const T_in* in, - const int size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - out[idx] = static_cast(in[idx]); -} - -template -__global__ void cast_memory( - std::complex* out, - const std::complex* in, - const int size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - auto* _out = reinterpret_cast*>(out); - const auto* _in = reinterpret_cast*>(in); - _out[idx] = static_cast>(_in[idx]); -} - -template -void resize_memory_op::operator()( - T*& arr, - const size_t size, - const char* record_in) -{ - if (arr != nullptr) { - delete_memory_op()(arr); - } - hipMalloc((void **)&arr, sizeof(T) * size); -} - -template -void set_memory_op::operator()( - T* arr, - const T& var, - const size_t& size) -{ - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - set_memory<<>>(arr, var, size); -} - -template -void synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToHost); -} - -template -void 
synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyHostToDevice); -} - -template -void synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToDevice); -} - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - cast_memory<<>>(arr_out, arr_in, size); - } -}; - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - T_in * arr = nullptr; - hipMalloc((void **)&arr, sizeof(T_in) * size); - hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyHostToDevice); - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - cast_memory<<>>(arr_out, arr, size); - hipFree(arr); - } -}; - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - auto * arr = (T_in*) malloc(sizeof(T_in) * size); - hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyDeviceToHost); - for (int ii = 0; ii < size; ii++) { - arr_out[ii] = static_cast(arr[ii]); - } - free(arr); - } -}; - -template -void delete_memory_op::operator() ( - T* arr) -{ - hipFree(arr); -} - -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op, container::DEVICE_GPU>; -template struct resize_memory_op, container::DEVICE_GPU>; - -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op, container::DEVICE_GPU>; -template struct set_memory_op, container::DEVICE_GPU>; - -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_GPU>; - -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct 
cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; - -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op, container::DEVICE_GPU>; -template struct delete_memory_op, container::DEVICE_GPU>; - -} // end of namespace container -} // end of namespace op \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_hcontainer/transfer.h b/source/module_hamilt_lcao/module_hcontainer/transfer.h index 1031c9fe0c..deea0f5d7c 100644 --- a/source/module_hamilt_lcao/module_hcontainer/transfer.h +++ b/source/module_hamilt_lcao/module_hcontainer/transfer.h @@ -154,7 +154,7 @@ struct MPITraits; template <> struct MPITraits { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_INT; } @@ -163,7 +163,7 @@ struct MPITraits template <> struct MPITraits { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_DOUBLE; } @@ -172,7 +172,7 @@ struct MPITraits template <> struct MPITraits> { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_DOUBLE_COMPLEX; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 306eb8bb46..13122029d1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -31,7 +31,7 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( - const bool multi_proj, + const bool nondiagonal, const int wg_nc, const int ntype, const int spin, @@ -46,6 +46,8 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const thrust::complex *becp, const thrust::complex *dbecp, @@ -61,30 +63,33 @@ __global__ void cal_force_nl( } int Nprojs = atom_nh[it]; - double fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { - // double ps = GlobalC::ppcell.deeq[GlobalV::CURRENT_SPIN, iat, ip, ip]; - double ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]; + // FPTYPE ps = GlobalC::ppcell.deeq[GlobalV::CURRENT_SPIN, iat, ip, ip]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; const int inkb = sum + ip; //out<<"\n ps = "<::operator() ( template void 
cal_force_nl_op::operator() ( const psi::DEVICE_GPU *ctx, - const bool &multi_proj, + const bool &nondiagonal, const int &nbands_occ, const int &wg_nc, const int &ntype, @@ -144,19 +149,21 @@ void cal_force_nl_op::operator() ( const int *atom_na, const FPTYPE &tpiba, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const std::complex *becp, const std::complex *dbecp, FPTYPE *force) { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, - multi_proj, + nondiagonal, wg_nc, ntype, spin, deeq_2, deeq_3, deeq_4, forcenl_nc, nbands, ik, nkb, atom_nh, atom_na, tpiba, - d_wg, deeq, + d_wg, d_ekb, qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), force);// array of data diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index d4a2f55709..54e1fb2789 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -71,7 +71,7 @@ __global__ void cal_dbecp_noevc_nl( template __global__ void cal_stress_nl( - const bool multi_proj, + const bool nondiagonal, const int ipol, const int jpol, const int nkb, @@ -85,6 +85,8 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const thrust::complex *becp, const thrust::complex *dbecp, @@ -99,16 +101,17 @@ __global__ void cal_stress_nl( sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0; + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; const int Nprojs = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { int ip1 = ii / Nprojs, ip2 = ii % Nprojs; - if(!multi_proj && ip1 != ip2) { + if(!nondiagonal && ip1 != ip2) { continue; } - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "<::operator() ( template void cal_stress_nl_op::operator() ( const psi::DEVICE_GPU *ctx, - const bool &multi_proj, + const bool& nondiagonal, const int &ipol, const int &jpol, const int &nkb, @@ -178,13 +181,15 @@ void cal_stress_nl_op::operator() ( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const std::complex *becp, const std::complex *dbecp, FPTYPE *stress) { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, - multi_proj, + nondiagonal, ipol, jpol, nkb, @@ -198,6 +203,8 @@ void cal_stress_nl_op::operator() ( atom_nh, atom_na, d_wg, + d_ekb, + qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), diff --git a/source/module_hsolver/diago_cg.cpp b/source/module_hsolver/diago_cg.cpp index 2adca28072..566e0b8df8 100644 --- a/source/module_hsolver/diago_cg.cpp +++ b/source/module_hsolver/diago_cg.cpp @@ -19,9 +19,9 @@ DiagoCG::DiagoCG(const Real* precondition_in) this->precondition = precondition_in; test_cg = 0; reorder = false; - this->one = &this->cs.one; - this->zero = &this->cs.zero; - this->neg_one = &this->cs.neg_one; + this->one = new 
T(static_cast(1.0)); + this->zero = new T(static_cast(0.0)); + this->neg_one = new T(static_cast(-1.0)); } template @@ -35,6 +35,10 @@ DiagoCG::~DiagoCG() { delmem_complex_op()(this->ctx, this->gradient); delmem_complex_op()(this->ctx, this->g0); delmem_complex_op()(this->ctx, this->lagrange); + + delete this->one; + delete this->zero; + delete this->neg_one; } template @@ -531,7 +535,7 @@ void DiagoCG::schmit_orth( // be careful , here reduce m+1 Parallel_Reduce::reduce_pool(lagrange_so, m + 1); - T var = cs.zero; + T var = {}; syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, &var, lagrange_so + m, 1); Real psi_norm = get_real(var); diff --git a/source/module_hsolver/diago_cg.h b/source/module_hsolver/diago_cg.h index 13d0a87e08..36fe4b6fc4 100644 --- a/source/module_hsolver/diago_cg.h +++ b/source/module_hsolver/diago_cg.h @@ -111,7 +111,6 @@ class DiagoCG : public DiagH using setmem_var_h_op = psi::memory::set_memory_op; using syncmem_var_h2d_op = psi::memory::synchronize_memory_op; - consts cs; const T * one = nullptr, * zero = nullptr, * neg_one = nullptr; }; diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 5a0f74eb1d..267d5491f7 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -21,6 +21,20 @@ struct GetTypeReal> { using type = double; /**< The return type specialization for std::complex. */ }; namespace hsolver { +template +struct GetTypeThrust { + using type = T; +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. */ +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. 
*/ +}; static cublasHandle_t cublas_handle = nullptr; @@ -260,14 +274,14 @@ __global__ void vector_div_vector_kernel( } } -template +template __global__ void constantvector_addORsub_constantVector_kernel( const int size, T* result, const T* vector1, - const typename GetTypeReal::type constant1, + const Real constant1, const T* vector2, - const typename GetTypeReal::type constant2) + const Real constant2) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) @@ -563,61 +577,26 @@ void vector_div_vector_op, psi::DEVICE_GPU>::operator()( vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2); } // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template <> -void constantvector_addORsub_constantVector_op::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - double* result, - const double* vector1, - const double constant1, - const double* vector2, - const double constant2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel << > > (dim, result, vector1, constant1, vector2, constant2); -} -template -inline void constantvector_addORsub_constantVector_complex_wrapper( +template +void constantvector_addORsub_constantVector_op::operator()( const psi::DEVICE_GPU* d, const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE constant1, - const std::complex* vector2, - const FPTYPE constant2) + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) { - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - const thrust::complex* vector2_tmp = reinterpret_cast*>(vector2); + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); int thread = 1024; int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel> << > > (dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const float constant1, - const std::complex* vector2, - const float constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const double constant1, - const std::complex* vector2, - const double constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); + constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); } template <> diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu index 1872fbdd2e..b1aebe833b 100644 --- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -12,6 +12,7 @@ void destroyGpuSolverHandle() { return; } +#ifdef __LCAO template <> void dngvd_op::operator()(const psi::DEVICE_GPU* ctx, const int nstart, @@ -32,6 +33,7 @@ void 
dngvd_op::operator()(const psi::DEVICE_GPU* ctx, hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice); hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#endif // __LCAO template <> void dngvd_op, psi::DEVICE_GPU>::operator()(const psi::DEVICE_GPU* ctx, @@ -75,6 +77,7 @@ void dngvd_op, psi::DEVICE_GPU>::operator()(const psi::DEVI hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#ifdef __LCAO template <> void dnevx_op::operator()(const psi::DEVICE_GPU* ctx, const int nstart, @@ -93,6 +96,7 @@ void dnevx_op::operator()(const psi::DEVICE_GPU* ctx, hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice); hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#endif // __LCAO template <> void dnevx_op, psi::DEVICE_GPU>::operator()(const psi::DEVICE_GPU* ctx, diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu index a03ac0bc7a..a0a259621a 100644 --- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu @@ -9,9 +9,31 @@ #define WARP_SIZE 32 #define FULL_MASK 0xffffffff #define THREAD_PER_BLOCK 256 +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. */ +}; namespace hsolver { +template +struct GetTypeThrust { + using type = T; +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. */ +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. 
*/ +}; static hipblasHandle_t cublas_handle = nullptr; @@ -233,7 +255,7 @@ __global__ void vector_mul_vector_kernel( } } -template +template __launch_bounds__(1024) __global__ void vector_div_vector_kernel( const int size, @@ -248,15 +270,15 @@ __global__ void vector_div_vector_kernel( } } -template +template __launch_bounds__(1024) __global__ void constantvector_addORsub_constantVector_kernel( const int size, T* result, const T* vector1, - const typename GetTypeReal::type constant1, + const Real constant1, const T* vector2, - const typename GetTypeReal::type constant2) + const Real constant2) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) @@ -408,7 +430,7 @@ double dot_real_op, psi::DEVICE_GPU>::operator()( } template <> -void vector_div_constant_op::operator()( +void vector_div_constant_op::operator()( const psi::DEVICE_GPU* d, const int dim, double* result, @@ -553,62 +575,26 @@ void vector_div_vector_op, psi::DEVICE_GPU>::operator()( } // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template <> -void constantvector_addORsub_constantVector_op::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - double* result, - const double* vector1, - const double constant1, - const double* vector2, - const double constant2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(constantvector_addORsub_constantVector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, constant1, vector2, constant2); -} - -template -inline void constantvector_addORsub_constantVector_complex_wrapper( +template +void constantvector_addORsub_constantVector_op::operator()( const psi::DEVICE_GPU* d, const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE constant1, - const std::complex* vector2, - const FPTYPE constant2) + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) { - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - const thrust::complex* vector2_tmp = reinterpret_cast*>(vector2); + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); int thread = 1024; int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(constantvector_addORsub_constantVector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const float constant1, - const std::complex* vector2, - const float constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const double constant1, - const std::complex* vector2, - const double constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); + constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); } 
template <> @@ -675,7 +661,7 @@ void gemv_op::operator()( else if (trans == 'C') { cutrans = HIPBLAS_OP_C; } - hipblasZgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); + hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); } template <> @@ -700,7 +686,7 @@ void gemv_op, psi::DEVICE_GPU>::operator()( else if (trans == 'T'){ cutrans = HIPBLAS_OP_T; } - hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); + hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incx); } template <> @@ -1024,7 +1010,7 @@ template struct matrixSetToAnother, psi::DEVICE_GPU>; template struct dot_real_op; template struct vector_div_constant_op; template struct vector_mul_vector_op; -template struct vector_div_vector_op; +template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; #endif
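
Note on the unified constantvector_addORsub_constantVector_op introduced above: the patch collapses the separate per-type specializations (double, std::complex<float>, std::complex<double>) into one template, with a GetTypeThrust / GetTypeReal-style trait supplying the matching real scalar type and the thrust counterpart of each std::complex type. Below is a minimal host-side sketch of that trait pattern only, using std::complex in place of thrust::complex and a hypothetical axby() helper so it compiles without HIP/CUDA; it is an illustration, not code from the patch.

#include <complex>
#include <cstdio>
#include <vector>

// Trait mapping a (possibly complex) element type T to its real scalar type,
// mirroring the role GetTypeReal plays in math_kernel_op.
template <typename T> struct GetTypeReal { using type = T; };
template <> struct GetTypeReal<std::complex<float>>  { using type = float;  };
template <> struct GetTypeReal<std::complex<double>> { using type = double; };

// result[i] = vector1[i] * constant1 + vector2[i] * constant2.
// One template covers double and both complex element types, as in the unified
// GPU op; axby() is a hypothetical stand-in name for the kernel wrapper.
template <typename T>
void axby(const int dim,
          T* result,
          const T* vector1, const typename GetTypeReal<T>::type constant1,
          const T* vector2, const typename GetTypeReal<T>::type constant2)
{
    for (int i = 0; i < dim; ++i) {
        result[i] = vector1[i] * constant1 + vector2[i] * constant2;
    }
}

int main()
{
    std::vector<std::complex<double>> v1(4, {1.0, 2.0});
    std::vector<std::complex<double>> v2(4, {3.0, -1.0});
    std::vector<std::complex<double>> r(4);
    axby(4, r.data(), v1.data(), 0.5, v2.data(), 2.0);
    // 0.5*(1+2i) + 2.0*(3-1i) = (6.5, -1.0)
    std::printf("%f %f\n", r[0].real(), r[0].imag());
    return 0;
}

The same trait is what lets the GPU kernel declare its scalars as const Real constant1 / constant2 while result, vector1, and vector2 keep the element type T, so a single kernel body serves every instantiated element type instead of three hand-written specializations.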