From 37cc1a3631f3227e929ae917bcd62fbd42fafd11 Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sun, 3 Dec 2023 00:13:21 +0800 Subject: [PATCH] Fix cuda and rocm bugs (#3286) * fix cuda error * fix rocm implementation * fix cuda compilation within hsolver * fix-rocm-implementation * fix-typo --- CMakeLists.txt | 5 +- examples/scf/pw_Si2/INPUT | 19 +- .../ATen/kernels/cuda/linalg.cu | 10 +- .../rocm/{blas_op.hip.cu => blas.hip.cu} | 6 +- .../rocm/{lapack_op.hip.cu => lapack.hip.cu} | 6 +- .../ATen/kernels/rocm/linalg.hip.cu | 475 ++++++++++++++++++ .../ATen/kernels/rocm/linalg_op.hip.cu | 304 ----------- .../ATen/kernels/rocm/memory.hip.cu | 219 ++++++++ .../ATen/kernels/rocm/memory_op.hip.cu | 211 -------- .../module_hcontainer/transfer.h | 6 +- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 29 +- .../kernels/rocm/stress_op.hip.cu | 19 +- source/module_hsolver/diago_cg.cpp | 12 +- source/module_hsolver/diago_cg.h | 1 - .../kernels/cuda/math_kernel_op.cu | 83 ++- .../kernels/rocm/dngvd_op.hip.cu | 4 + .../kernels/rocm/math_kernel_op.hip.cu | 102 ++-- 17 files changed, 841 insertions(+), 670 deletions(-) rename source/module_base/module_container/ATen/kernels/rocm/{blas_op.hip.cu => blas.hip.cu} (98%) rename source/module_base/module_container/ATen/kernels/rocm/{lapack_op.hip.cu => lapack.hip.cu} (98%) create mode 100644 source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu delete mode 100644 source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu create mode 100644 source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu delete mode 100644 source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index b3452043de..f8fb0914df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,10 @@ endif() # Warning: CMake add support to HIP in version 3.21. This is rather a new version. # Use cmake with AMD-ROCm: https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html if(USE_ROCM) - if (NOT DEFINED ROCM_PATH ) + if(COMMIT_INFO) + message(FATAL_ERROR "Commit info is not supported on ROCm.") + endif() + if(NOT DEFINED ROCM_PATH ) set (ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory." ) endif () if(NOT DEFINED HIP_PATH) diff --git a/examples/scf/pw_Si2/INPUT b/examples/scf/pw_Si2/INPUT index 786cbed3fa..9dd1930909 100644 --- a/examples/scf/pw_Si2/INPUT +++ b/examples/scf/pw_Si2/INPUT @@ -1,9 +1,12 @@ INPUT_PARAMETERS -#Parameters (General) -pseudo_dir ../../../tests/PP_ORB -symmetry 1 -#Parameters (Accuracy) -basis_type pw -ecutwfc 60 -scf_thr 1e-8 -scf_nmax 100 \ No newline at end of file +#Parameters (General) +pseudo_dir ../../../tests/PP_ORB +symmetry 1 +#Parameters (Accuracy) +basis_type pw +ecutwfc 60 +scf_thr 1e-7 +scf_nmax 100 +device cpu +ks_solver cg +precision double diff --git a/source/module_base/module_container/ATen/kernels/cuda/linalg.cu b/source/module_base/module_container/ATen/kernels/cuda/linalg.cu index 97ccb514be..c59a75deb0 100644 --- a/source/module_base/module_container/ATen/kernels/cuda/linalg.cu +++ b/source/module_base/module_container/ATen/kernels/cuda/linalg.cu @@ -31,7 +31,7 @@ __global__ void do_add_kernel( T* z) { // Perform add operation for the specified range [begin, end) in the output Tensor. 
- for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] + beta * y[o_idx]; } @@ -44,7 +44,7 @@ __global__ void do_mul_kernel( const T* x, T* y) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. y[o_idx] = alpha * x[o_idx]; } @@ -58,7 +58,7 @@ __global__ void do_mul_kernel( const T* y, T* z) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] * y[o_idx]; } @@ -72,7 +72,7 @@ __global__ void do_div_kernel( const T* y, T* z) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. z[o_idx] = alpha * x[o_idx] / y[o_idx]; } @@ -88,7 +88,7 @@ __global__ void do_fma_kernel( const T* z, T* out) { - for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. out[o_idx] = alpha * x[o_idx] * y[o_idx] + beta * z[o_idx]; } diff --git a/source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu similarity index 98% rename from source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu rename to source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu index d0f0525019..a505c6267c 100644 --- a/source/module_base/module_container/ATen/kernels/rocm/blas_op.hip.cu +++ b/source/module_base/module_container/ATen/kernels/rocm/blas.hip.cu @@ -1,11 +1,11 @@ -#include +#include #include #include #include namespace container { -namespace op { +namespace kernels { static hipblasHandle_t hipblas_handle = nullptr; @@ -241,5 +241,5 @@ template struct blas_gemm_batched_strided; template struct blas_gemm_batched_strided, DEVICE_GPU>; template struct blas_gemm_batched_strided, DEVICE_GPU>; -} // namespace op +} // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu similarity index 98% rename from source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu rename to source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu index 3b6cf2ec6d..cc0a77cb4c 100644 --- a/source/module_base/module_container/ATen/kernels/rocm/lapack_op.hip.cu +++ b/source/module_base/module_container/ATen/kernels/rocm/lapack.hip.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -7,7 +7,7 @@ #include namespace container { -namespace op { +namespace kernels { static hipsolverHandle_t hipsolver_handle = nullptr; @@ -155,5 +155,5 @@ template struct lapack_dngvd; 
template struct lapack_dngvd, DEVICE_GPU>; template struct lapack_dngvd, DEVICE_GPU>; -} // namespace op +} // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu new file mode 100644 index 0000000000..ef43a5408c --- /dev/null +++ b/source/module_base/module_container/ATen/kernels/rocm/linalg.hip.cu @@ -0,0 +1,475 @@ +#include +#include +#include +#include + +#include +#include + +namespace container { +namespace kernels { + +template +__device__ static inline +T conj(T& in) { + return in; +} + +template +__device__ static inline +thrust::complex conj(thrust::complex& in) { + return thrust::conj(in); +} + +template +__global__ void do_add_kernel( + const int num_element, + const T alpha, + const T* x, + const T beta, + const T* y, + T* z) +{ + // Perform add operation for the specified range [begin, end) in the output Tensor. + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] + beta * y[o_idx]; + } +} + +template +__global__ void do_mul_kernel( + const int num_element, + const T alpha, + const T* x, + T* y) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + y[o_idx] = alpha * x[o_idx]; + } +} + +template +__global__ void do_mul_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + T* z) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] * y[o_idx]; + } +} + +template +__global__ void do_div_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + T* z) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + z[o_idx] = alpha * x[o_idx] / y[o_idx]; + } +} + +template +__global__ void do_fma_kernel( + const int num_element, + const T alpha, + const T* x, + const T* y, + const T beta, + const T* z, + T* out) +{ + for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) { + // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'. + out[o_idx] = alpha * x[o_idx] * y[o_idx] + beta * z[o_idx]; + } +} + +template +__global__ void do_transpose_kernel( + int ndim, + int64_t num_elements, + const T* p, + const int* perm, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + for (int64_t o_idx = 0; o_idx < num_elements; o_idx++) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculate the ratio of the current output Tensor index 'current_o_idx' in the current dimension. + const int64_t ratio = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. 
+ current_o_idx -= ratio * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + i_idx += ratio * in_strides[perm[ii]]; + } + // Check if conjugation is needed. + if (Conjugate) { + // Assign the conjugate value of the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = kernels::conj(p[i_idx]); + } else { + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx]; + } + } +} + +template +__global__ void do_stride_kernel( + int ndim, + int64_t size, + const T* p, + const int64_t* stride, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + // Perform stride operation for the specified range [begin, end) in the output Tensor. + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculate the index in the current dimension. + // It is natural to view a tensor as a multi-dimentional array. + const int64_t current_dim_idx = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. + current_o_idx -= current_dim_idx * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + i_idx += current_dim_idx * stride[ii] * in_strides[ii]; + } + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx]; + } +} + +template +__global__ void do_inflate_kernel( + int ndim, + int64_t size, + const T* p, + const int64_t* stride, + const int64_t* in_strides, + const int64_t* out_strides, + T* q) +{ + // Perform stride operation for the specified range [begin, end) in the output Tensor. + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + int64_t i_idx = 0; // Initialize the index for the input Tensor element. + int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. + bool valid = true; + // Iterate over each dimension of the output Tensor. + for (int ii = 0; ii < ndim; ++ii) { + // Calculte the ratio of the current output Tensor index 'current_o_idx' in the current dimension. + const int64_t current_dim_idx = current_o_idx / out_strides[ii]; + // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. + current_o_idx -= current_dim_idx * out_strides[ii]; + // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. + if (current_dim_idx % stride[ii] == 0) { + i_idx += (current_dim_idx / stride[ii]) * in_strides[ii]; + } + else { + valid = false; + break; + } + } + // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. + q[o_idx] = p[i_idx] * static_cast(valid ? 
1.0 : 0.0); + } +} + +template +__global__ void do_reduce_kernel( + int64_t size, + int64_t inner_most_dim, + const T* p, + T* q) +{ + for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { + T sum = 0; + for (int64_t i_idx = o_idx * inner_most_dim; i_idx < inner_most_dim + o_idx * inner_most_dim; i_idx++) { + sum += p[i_idx]; + } + q[o_idx] = sum; + } +} + +template +static std::vector compute_stride(const std::vector& shape) { + int ndims = shape.size(); + std::vector strides(ndims); + T stride = 1; + + auto it = shape.end(); // Start from the last element + for (int ii = ndims - 1; ii >= 0; ii--) { + it--; + strides[ii] = stride; + stride *= static_cast(*it); + } + return std::move(strides); +} + +template +void add::operator()(const int& num_element, const T& alpha, const T* x, const T& beta, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + auto beta_ = *reinterpret_cast(&beta); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_add_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), + beta_, reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void mul::operator()(const int& num_element, const T& alpha, const T* x, T* y) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_mul_kernel<<>> ( + num_element, alpha_, + reinterpret_cast(x), reinterpret_cast(y)); +} + +template +void mul::operator()(const int& num_element, const T& alpha, const T* x, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_mul_kernel<<>> ( + num_element, alpha_, + reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void div::operator()(const int& num_element, const T& alpha, const T* x, const T* y, T* z) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_div_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(z)); +} + +template +void fma::operator()(const int& num_element, const T& alpha, const T* x, const T* y, const T& beta, const T* z, T* out) { + using Type = typename GetTypeThrust::type; + auto alpha_ = *reinterpret_cast(&alpha); + auto beta_ = *reinterpret_cast(&beta); + const int block = (num_element + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_fma_kernel<<>> ( + num_element, alpha_, reinterpret_cast(x), reinterpret_cast(y), + beta_, reinterpret_cast(z), reinterpret_cast(out)); +} + +template +void transpose::operator()( + const std::vector &perm, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? 
num_elements : 0; + + Tensor t_perm(DataType::DT_INT, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_perm.data(), perm.data(), perm.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_transpose_kernel<<>> ( + ndim, num_elements, p_, t_perm.data(), + t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void stride::operator()( + const std::vector &stride, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? num_elements : 0; + + Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_stride.data(), stride.data(), stride.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_stride_kernel<<>> ( + ndim, num_elements, p_, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void inflate::operator()( + const std::vector &inflate, + const std::vector &p_shape, + const std::vector &q_shape, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + REQUIRES_OK(p_shape.size() == q_shape.size(), + "transpose: p and q must have the same number of dimensions"); + const int ndim = static_cast(p_shape.size()); + auto in_strides = compute_stride(p_shape); + auto out_strides = compute_stride(q_shape); + + int num_elements = 1; + for (int ii = 0; ii < ndim; ++ii) { + num_elements *= static_cast(q_shape[ii]); + } + num_elements = ndim ? 
num_elements : 0; + + Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); + + kernels::synchronize_memory()( + t_stride.data(), inflate.data(), inflate.size()); + kernels::synchronize_memory()( + t_in_strides.data(), in_strides.data(), in_strides.size()); + kernels::synchronize_memory()( + t_out_strides.data(), out_strides.data(), out_strides.size()); + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_inflate_kernel<<>> ( + ndim, num_elements, p_, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q_); +} + +template +void reduce::operator()( + const int64_t &num_element, + const int64_t &inner_most_dim, + const T *p, + T *q) +{ + using Type = typename GetTypeThrust::type; + + const Type* p_ = reinterpret_cast(p); + Type* q_ = reinterpret_cast((q)); + + const int block = (static_cast(num_element) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_reduce_kernel<<>> ( + num_element, inner_most_dim, p_, q_); +} + +template struct add; +template struct add; +template struct add; +template struct add; +template struct add, DEVICE_GPU>; +template struct add, DEVICE_GPU>; + +template struct mul; +template struct mul; +template struct mul; +template struct mul; +template struct mul, DEVICE_GPU>; +template struct mul, DEVICE_GPU>; + +template struct div; +template struct div; +template struct div; +template struct div; +template struct div, DEVICE_GPU>; +template struct div, DEVICE_GPU>; + +template struct fma; +template struct fma; +template struct fma; +template struct fma; +template struct fma, DEVICE_GPU>; +template struct fma, DEVICE_GPU>; + +template struct transpose; +template struct transpose; +template struct transpose; +template struct transpose; +template struct transpose, DEVICE_GPU>; +template struct transpose, DEVICE_GPU>; + +template struct stride; +template struct stride; +template struct stride; +template struct stride; +template struct stride, DEVICE_GPU>; +template struct stride, DEVICE_GPU>; + +template struct inflate; +template struct inflate; +template struct inflate; +template struct inflate; +template struct inflate, DEVICE_GPU>; +template struct inflate, DEVICE_GPU>; + +template struct reduce; +template struct reduce; +template struct reduce; +template struct reduce; +template struct reduce, DEVICE_GPU>; +template struct reduce, DEVICE_GPU>; + +} // namespace kernels +} // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu deleted file mode 100644 index 4627f14f28..0000000000 --- a/source/module_base/module_container/ATen/kernels/rocm/linalg_op.hip.cu +++ /dev/null @@ -1,304 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -namespace container { -namespace op { - -template -__device__ static inline -T conj(T& in) { - return in; -} - -template -__device__ static inline -thrust::complex conj(thrust::complex& in) { - return thrust::conj(in); -} - -template -__global__ void do_transpose_kernel( - int ndim, - int64_t size, - const T* p, - const int* perm, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // 
Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculate the current_dim_idx of the current output Tensor index 'current_o_idx' in the current dimension. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. - i_idx += current_dim_idx * in_strides[perm[ii]]; - } - // Check if conjugation is needed. - if (Conjugate) { - // Assign the conjugate value of the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = op::conj(p[i_idx]); - } else { - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx]; - } - } -} - -template -__global__ void do_stride_kernel( - int ndim, - int64_t size, - const T* p, - const int64_t* stride, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - // Perform stride operation for the specified range [begin, end) in the output Tensor. - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculate the index in the current dimension. - // It is natural to view a tensor as a multi-dimentional array. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. - i_idx += current_dim_idx * stride[ii] * in_strides[ii]; - } - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx]; - } -} - -template -__global__ void do_inflate_kernel( - int ndim, - int64_t size, - const T* p, - const int64_t* stride, - const int64_t* in_strides, - const int64_t* out_strides, - T* q) -{ - // Perform stride operation for the specified range [begin, end) in the output Tensor. - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - int64_t i_idx = 0; // Initialize the index for the input Tensor element. - int64_t current_o_idx = o_idx; // Calculate the index for the output Tensor element. - bool valid = true; - // Iterate over each dimension of the output Tensor. - for (int ii = 0; ii < ndim; ++ii) { - // Calculte the ratio of the current output Tensor index 'current_o_idx' in the current dimension. - const int64_t current_dim_idx = current_o_idx / out_strides[ii]; - // Update the output Tensor index 'current_o_idx' by removing the offset in the current dimension. - current_o_idx -= current_dim_idx * out_strides[ii]; - // Calculate the offset for the corresponding index position in the input Tensor and accumulate it in 'i_idx'. 
- if (current_dim_idx % stride[ii] == 0) { - i_idx += (current_dim_idx / stride[ii]) * in_strides[ii]; - } - else { - valid = false; - break; - } - } - // Assign the input Tensor element at index 'i_idx' to the output Tensor element at index 'o_idx'. - q[o_idx] = p[i_idx] * static_cast(valid ? 1.0 : 0.0); - } -} - -template -__global__ void do_reduce_kernel( - int ndim, - int64_t size, - int64_t inner_most_dim, - const T* p, - T* q) -{ - for (int64_t o_idx = threadIdx.x; o_idx < size; o_idx += blockDim.x) { - T sum = 0; - for (int64_t i_idx = o_idx * inner_most_dim; i_idx < inner_most_dim + o_idx * inner_most_dim; i_idx++) { - sum += p[i_idx]; - } - q[o_idx] = sum; - } -} - - -template -static std::vector compute_stride(const std::vector& shape) { - int ndims = shape.size(); - std::vector strides(ndims); - T stride = 1; - - auto it = shape.end(); // Start from the last element - for (int ii = ndims - 1; ii >= 0; ii--) { - it--; - strides[ii] = stride; - stride *= static_cast(*it); - } - return std::move(strides); -} - -template -struct transpose_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const std::vector& perm, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_perm(DataType::DT_INT, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_perm.data(), perm.data(), perm.size()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_transpose_kernel<<>> ( - ndim, output_size, p, t_perm.data(), - t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct stride_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const TensorShape& stride, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_stride.data(), stride.dims().data(), t_stride.NumElements()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_stride_kernel<<>> ( - ndim, output_size, p, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct inflate_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const TensorShape& stride, - Tensor& 
output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - auto in_strides = compute_stride(input.shape().dims()); - auto out_strides = compute_stride(output.shape().dims()); - - Tensor t_stride(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_in_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - Tensor t_out_strides(DataType::DT_INT64, DeviceType::GpuDevice, {ndim}); - - op::synchronize_memory_op()( - t_stride.data(), stride.dims().data(), t_stride.NumElements()); - op::synchronize_memory_op()( - t_in_strides.data(), in_strides.data(), in_strides.size()); - op::synchronize_memory_op()( - t_out_strides.data(), out_strides.data(), out_strides.size()); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_inflate_kernel<<>> ( - ndim, output_size, p, t_stride.data(), t_in_strides.data(), t_out_strides.data(), q); - } -}; - -template -struct reduce_op { - using Type = typename GetTypeThrust::type; - void operator()( - const Tensor& input, - const int64_t& inner_most_dim, - Tensor& output) - { - const int ndim = input.shape().ndim(); - const int64_t output_size = output.NumElements(); - - const Type* p = reinterpret_cast(input.data()); - Type* q = reinterpret_cast((output.data())); - - const int block = (output_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - do_reduce_kernel<<>> ( - ndim, output_size, inner_most_dim, p, q); - } -}; - -template struct transpose_op; -template struct transpose_op; -template struct transpose_op; -template struct transpose_op; -template struct transpose_op, DEVICE_GPU>; -template struct transpose_op, DEVICE_GPU>; - -template struct stride_op; -template struct stride_op; -template struct stride_op; -template struct stride_op; -template struct stride_op, DEVICE_GPU>; -template struct stride_op, DEVICE_GPU>; - -template struct inflate_op; -template struct inflate_op; -template struct inflate_op; -template struct inflate_op; -template struct inflate_op, DEVICE_GPU>; -template struct inflate_op, DEVICE_GPU>; - -template struct reduce_op; -template struct reduce_op; -template struct reduce_op; -template struct reduce_op; -template struct reduce_op, DEVICE_GPU>; -template struct reduce_op, DEVICE_GPU>; - -} // namespace op -} // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu new file mode 100644 index 0000000000..b45a7cfd56 --- /dev/null +++ b/source/module_base/module_container/ATen/kernels/rocm/memory.hip.cu @@ -0,0 +1,219 @@ +#include +#include + +#include +#include + +namespace container { +namespace kernels { + +template +__global__ void do_set_memory( + T* out, + const T var, + const size_t size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + out[idx] = var; +} + +template +__global__ void do_cast_memory( + T_out* out, + const T_in* in, + const int size) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + out[idx] = static_cast(in[idx]); +} + +template +__global__ void do_cast_memory( + std::complex* out, + const std::complex* in, + const int size) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + auto* _out = reinterpret_cast*>(out); + const auto* _in = reinterpret_cast*>(in); + _out[idx] = 
static_cast>(_in[idx]); +} + +template +void resize_memory::operator()( + T*& arr, + const size_t& size, + const char* record_in) +{ + if (arr != nullptr) { + delete_memory()(arr); + } + hipMalloc((void **)&arr, sizeof(T) * size); +} + +template +void set_memory::operator()( + T* arr, + const T& var, + const size_t& size) +{ + const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + do_set_memory<<>>(arr, var, size); +} + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToHost); + } +}; + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyHostToDevice); + } +}; + +template +struct synchronize_memory { + void operator()( + T *arr_out, + const T *arr_in, + const size_t& size) + { + hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToDevice); + } +}; + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + const int block = static_cast((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); + do_cast_memory<<>>(arr_out, arr_in, size); + } +}; + + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + T_in * arr = nullptr; + hipMalloc((void **)&arr, sizeof(T_in) * size); + hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyHostToDevice); + const int block = static_cast((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); + do_cast_memory<<>>(arr_out, arr, size); + hipFree(arr); + } +}; + + +template +struct cast_memory { + void operator()( + T_out* arr_out, + const T_in* arr_in, + const size_t& size) + { + auto * arr = (T_in*) malloc(sizeof(T_in) * size); + hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyDeviceToHost); + for (int ii = 0; ii < size; ii++) { + arr_out[ii] = static_cast(arr[ii]); + } + free(arr); + } +}; + +template +void delete_memory::operator() ( + T* arr) +{ + hipFree(arr); +} + +template struct resize_memory; +template struct resize_memory; +template struct resize_memory; +template struct resize_memory; +template struct resize_memory, container::DEVICE_GPU>; +template struct resize_memory, container::DEVICE_GPU>; + +template struct set_memory; +template struct set_memory; +template struct set_memory; +template struct set_memory; +template struct set_memory, container::DEVICE_GPU>; +template struct set_memory, container::DEVICE_GPU>; + +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory; +template struct synchronize_memory, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct synchronize_memory, container::DEVICE_GPU, container::DEVICE_GPU>; + 
+template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; +template struct cast_memory, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; + +template struct delete_memory; +template struct delete_memory; +template struct delete_memory; +template struct delete_memory; +template struct delete_memory, container::DEVICE_GPU>; +template struct delete_memory, container::DEVICE_GPU>; + +} // end of namespace kernels +} // end of namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu b/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu deleted file mode 100644 index 53af15e8f8..0000000000 --- a/source/module_base/module_container/ATen/kernels/rocm/memory_op.hip.cu +++ /dev/null @@ -1,211 +0,0 @@ -#include -#include - -#include -#include - -namespace container { -namespace op { - -template -__global__ void set_memory( - T* out, - const T var, - const size_t size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - out[idx] = var; -} - -template -__global__ void cast_memory( - T_out* out, - const T_in* in, - const int size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - out[idx] = static_cast(in[idx]); -} - -template -__global__ void cast_memory( - std::complex* out, - const std::complex* in, - const int size) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= size) {return;} - auto* _out = reinterpret_cast*>(out); - const auto* _in = reinterpret_cast*>(in); - _out[idx] = static_cast>(_in[idx]); -} - -template -void resize_memory_op::operator()( - T*& arr, - const size_t size, - const char* record_in) -{ - if (arr != nullptr) { - delete_memory_op()(arr); - } - hipMalloc((void **)&arr, sizeof(T) * size); -} - -template -void set_memory_op::operator()( - T* arr, - const T& var, - const size_t& size) -{ - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - set_memory<<>>(arr, var, size); -} - -template -void synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToHost); -} - -template -void 
synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyHostToDevice); -} - -template -void synchronize_memory_op::operator()( - T* arr_out, - const T* arr_in, - const size_t size) -{ - hipMemcpy(arr_out, arr_in, sizeof(T) * size, hipMemcpyDeviceToDevice); -} - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - cast_memory<<>>(arr_out, arr_in, size); - } -}; - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - T_in * arr = nullptr; - hipMalloc((void **)&arr, sizeof(T_in) * size); - hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyHostToDevice); - const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - cast_memory<<>>(arr_out, arr, size); - hipFree(arr); - } -}; - -template -struct cast_memory_op { - void operator()( - T_out* arr_out, - const T_in* arr_in, - const size_t size) - { - auto * arr = (T_in*) malloc(sizeof(T_in) * size); - hipMemcpy(arr, arr_in, sizeof(T_in) * size, hipMemcpyDeviceToHost); - for (int ii = 0; ii < size; ii++) { - arr_out[ii] = static_cast(arr[ii]); - } - free(arr); - } -}; - -template -void delete_memory_op::operator() ( - T* arr) -{ - hipFree(arr); -} - -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op; -template struct resize_memory_op, container::DEVICE_GPU>; -template struct resize_memory_op, container::DEVICE_GPU>; - -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op; -template struct set_memory_op, container::DEVICE_GPU>; -template struct set_memory_op, container::DEVICE_GPU>; - -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op; -template struct synchronize_memory_op, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct synchronize_memory_op, container::DEVICE_GPU, container::DEVICE_GPU>; - -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_GPU>; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct 
cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op, std::complex, container::DEVICE_GPU, container::DEVICE_CPU>; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; -template struct cast_memory_op, std::complex, container::DEVICE_CPU, container::DEVICE_GPU>; - -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op; -template struct delete_memory_op, container::DEVICE_GPU>; -template struct delete_memory_op, container::DEVICE_GPU>; - -} // end of namespace container -} // end of namespace op \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_hcontainer/transfer.h b/source/module_hamilt_lcao/module_hcontainer/transfer.h index 1031c9fe0c..deea0f5d7c 100644 --- a/source/module_hamilt_lcao/module_hcontainer/transfer.h +++ b/source/module_hamilt_lcao/module_hcontainer/transfer.h @@ -154,7 +154,7 @@ struct MPITraits; template <> struct MPITraits { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_INT; } @@ -163,7 +163,7 @@ struct MPITraits template <> struct MPITraits { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_DOUBLE; } @@ -172,7 +172,7 @@ struct MPITraits template <> struct MPITraits> { - static constexpr MPI_Datatype datatype() + static MPI_Datatype datatype() { return MPI_DOUBLE_COMPLEX; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 306eb8bb46..13122029d1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -31,7 +31,7 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( - const bool multi_proj, + const bool nondiagonal, const int wg_nc, const int ntype, const int spin, @@ -46,6 +46,8 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const thrust::complex *becp, const thrust::complex *dbecp, @@ -61,30 +63,33 @@ __global__ void cal_force_nl( } int Nprojs = atom_nh[it]; - double fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { - // double ps = GlobalC::ppcell.deeq[GlobalV::CURRENT_SPIN, iat, ip, ip]; - double ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]; + // FPTYPE ps = GlobalC::ppcell.deeq[GlobalV::CURRENT_SPIN, iat, ip, ip]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; const int inkb = sum + ip; //out<<"\n ps = "<::operator() ( template void 
cal_force_nl_op::operator() ( const psi::DEVICE_GPU *ctx, - const bool &multi_proj, + const bool &nondiagonal, const int &nbands_occ, const int &wg_nc, const int &ntype, @@ -144,19 +149,21 @@ void cal_force_nl_op::operator() ( const int *atom_na, const FPTYPE &tpiba, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const std::complex *becp, const std::complex *dbecp, FPTYPE *force) { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, - multi_proj, + nondiagonal, wg_nc, ntype, spin, deeq_2, deeq_3, deeq_4, forcenl_nc, nbands, ik, nkb, atom_nh, atom_na, tpiba, - d_wg, deeq, + d_wg, d_ekb, qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), force);// array of data diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index d4a2f55709..54e1fb2789 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -71,7 +71,7 @@ __global__ void cal_dbecp_noevc_nl( template __global__ void cal_stress_nl( - const bool multi_proj, + const bool nondiagonal, const int ipol, const int jpol, const int nkb, @@ -85,6 +85,8 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const thrust::complex *becp, const thrust::complex *dbecp, @@ -99,16 +101,17 @@ __global__ void cal_stress_nl( sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0; + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; const int Nprojs = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { int ip1 = ii / Nprojs, ip2 = ii % Nprojs; - if(!multi_proj && ip1 != ip2) { + if(!nondiagonal && ip1 != ip2) { continue; } - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "<::operator() ( template void cal_stress_nl_op::operator() ( const psi::DEVICE_GPU *ctx, - const bool &multi_proj, + const bool& nondiagonal, const int &ipol, const int &jpol, const int &nkb, @@ -178,13 +181,15 @@ void cal_stress_nl_op::operator() ( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, const FPTYPE *deeq, const std::complex *becp, const std::complex *dbecp, FPTYPE *stress) { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, - multi_proj, + nondiagonal, ipol, jpol, nkb, @@ -198,6 +203,8 @@ void cal_stress_nl_op::operator() ( atom_nh, atom_na, d_wg, + d_ekb, + qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), diff --git a/source/module_hsolver/diago_cg.cpp b/source/module_hsolver/diago_cg.cpp index 2adca28072..566e0b8df8 100644 --- a/source/module_hsolver/diago_cg.cpp +++ b/source/module_hsolver/diago_cg.cpp @@ -19,9 +19,9 @@ DiagoCG::DiagoCG(const Real* precondition_in) this->precondition = precondition_in; test_cg = 0; reorder = false; - this->one = &this->cs.one; - this->zero = &this->cs.zero; - this->neg_one = &this->cs.neg_one; + this->one = new 
T(static_cast(1.0)); + this->zero = new T(static_cast(0.0)); + this->neg_one = new T(static_cast(-1.0)); } template @@ -35,6 +35,10 @@ DiagoCG::~DiagoCG() { delmem_complex_op()(this->ctx, this->gradient); delmem_complex_op()(this->ctx, this->g0); delmem_complex_op()(this->ctx, this->lagrange); + + delete this->one; + delete this->zero; + delete this->neg_one; } template @@ -531,7 +535,7 @@ void DiagoCG::schmit_orth( // be careful , here reduce m+1 Parallel_Reduce::reduce_pool(lagrange_so, m + 1); - T var = cs.zero; + T var = {}; syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, &var, lagrange_so + m, 1); Real psi_norm = get_real(var); diff --git a/source/module_hsolver/diago_cg.h b/source/module_hsolver/diago_cg.h index 13d0a87e08..36fe4b6fc4 100644 --- a/source/module_hsolver/diago_cg.h +++ b/source/module_hsolver/diago_cg.h @@ -111,7 +111,6 @@ class DiagoCG : public DiagH using setmem_var_h_op = psi::memory::set_memory_op; using syncmem_var_h2d_op = psi::memory::synchronize_memory_op; - consts cs; const T * one = nullptr, * zero = nullptr, * neg_one = nullptr; }; diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 5a0f74eb1d..267d5491f7 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -21,6 +21,20 @@ struct GetTypeReal> { using type = double; /**< The return type specialization for std::complex. */ }; namespace hsolver { +template +struct GetTypeThrust { + using type = T; +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. */ +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. 
*/ +}; static cublasHandle_t cublas_handle = nullptr; @@ -260,14 +274,14 @@ __global__ void vector_div_vector_kernel( } } -template +template __global__ void constantvector_addORsub_constantVector_kernel( const int size, T* result, const T* vector1, - const typename GetTypeReal::type constant1, + const Real constant1, const T* vector2, - const typename GetTypeReal::type constant2) + const Real constant2) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) @@ -563,61 +577,26 @@ void vector_div_vector_op, psi::DEVICE_GPU>::operator()( vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2); } // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template <> -void constantvector_addORsub_constantVector_op::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - double* result, - const double* vector1, - const double constant1, - const double* vector2, - const double constant2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel << > > (dim, result, vector1, constant1, vector2, constant2); -} -template -inline void constantvector_addORsub_constantVector_complex_wrapper( +template +void constantvector_addORsub_constantVector_op::operator()( const psi::DEVICE_GPU* d, const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE constant1, - const std::complex* vector2, - const FPTYPE constant2) + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) { - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - const thrust::complex* vector2_tmp = reinterpret_cast*>(vector2); + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); int thread = 1024; int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel> << > > (dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const float constant1, - const std::complex* vector2, - const float constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const double constant1, - const std::complex* vector2, - const double constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); + constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); } template <> diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu index 1872fbdd2e..b1aebe833b 100644 --- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -12,6 +12,7 @@ void destroyGpuSolverHandle() { return; } +#ifdef __LCAO template <> void dngvd_op::operator()(const psi::DEVICE_GPU* ctx, const int nstart, @@ -32,6 +33,7 @@ void 
dngvd_op::operator()(const psi::DEVICE_GPU* ctx, hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice); hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#endif // __LCAO template <> void dngvd_op, psi::DEVICE_GPU>::operator()(const psi::DEVICE_GPU* ctx, @@ -75,6 +77,7 @@ void dngvd_op, psi::DEVICE_GPU>::operator()(const psi::DEVI hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#ifdef __LCAO template <> void dnevx_op::operator()(const psi::DEVICE_GPU* ctx, const int nstart, @@ -93,6 +96,7 @@ void dnevx_op::operator()(const psi::DEVICE_GPU* ctx, hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice); hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice); } +#endif // __LCAO template <> void dnevx_op, psi::DEVICE_GPU>::operator()(const psi::DEVICE_GPU* ctx, diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu index a03ac0bc7a..a0a259621a 100644 --- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu @@ -9,9 +9,31 @@ #define WARP_SIZE 32 #define FULL_MASK 0xffffffff #define THREAD_PER_BLOCK 256 +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. */ +}; namespace hsolver { +template +struct GetTypeThrust { + using type = T; +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. */ +}; + +template <> +struct GetTypeThrust> { + using type = thrust::complex; /**< The return type specialization for std::complex. 
*/ +}; static hipblasHandle_t cublas_handle = nullptr; @@ -233,7 +255,7 @@ __global__ void vector_mul_vector_kernel( } } -template +template __launch_bounds__(1024) __global__ void vector_div_vector_kernel( const int size, @@ -248,15 +270,15 @@ __global__ void vector_div_vector_kernel( } } -template +template __launch_bounds__(1024) __global__ void constantvector_addORsub_constantVector_kernel( const int size, T* result, const T* vector1, - const typename GetTypeReal::type constant1, + const Real constant1, const T* vector2, - const typename GetTypeReal::type constant2) + const Real constant2) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) @@ -408,7 +430,7 @@ double dot_real_op, psi::DEVICE_GPU>::operator()( } template <> -void vector_div_constant_op::operator()( +void vector_div_constant_op::operator()( const psi::DEVICE_GPU* d, const int dim, double* result, @@ -553,62 +575,26 @@ void vector_div_vector_op, psi::DEVICE_GPU>::operator()( } // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template <> -void constantvector_addORsub_constantVector_op::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - double* result, - const double* vector1, - const double constant1, - const double* vector2, - const double constant2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(constantvector_addORsub_constantVector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, constant1, vector2, constant2); -} - -template -inline void constantvector_addORsub_constantVector_complex_wrapper( +template +void constantvector_addORsub_constantVector_op::operator()( const psi::DEVICE_GPU* d, const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE constant1, - const std::complex* vector2, - const FPTYPE constant2) + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) { - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - const thrust::complex* vector2_tmp = reinterpret_cast*>(vector2); + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); int thread = 1024; int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(constantvector_addORsub_constantVector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const float constant1, - const std::complex* vector2, - const float constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); -} -template <> -void constantvector_addORsub_constantVector_op, psi::DEVICE_GPU>::operator()( - const psi::DEVICE_GPU* d, - const int& dim, - std::complex* result, - const std::complex* vector1, - const double constant1, - const std::complex* vector2, - const double constant2) -{ - constantvector_addORsub_constantVector_complex_wrapper(d, dim, result, vector1, constant1, vector2, constant2); + constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); } 
template <> @@ -675,7 +661,7 @@ void gemv_op::operator()( else if (trans == 'C') { cutrans = HIPBLAS_OP_C; } - hipblasZgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); + hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); } template <> @@ -700,7 +686,7 @@ void gemv_op, psi::DEVICE_GPU>::operator()( else if (trans == 'T'){ cutrans = HIPBLAS_OP_T; } - hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx); + hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incx); } template <> @@ -1024,7 +1010,7 @@ template struct matrixSetToAnother, psi::DEVICE_GPU>; template struct dot_real_op; template struct vector_div_constant_op; template struct vector_mul_vector_op; -template struct vector_div_vector_op; +template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; #endif
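
Note on the unified constantvector_addORsub_constantVector_op introduced above: the patch collapses the separate per-type specializations (double, std::complex<float>, std::complex<double>) into one template, with a GetTypeThrust / GetTypeReal-style trait supplying the matching real scalar type and the thrust counterpart of each std::complex type. Below is a minimal host-side sketch of that trait pattern only, using std::complex in place of thrust::complex and a hypothetical axby() helper so it compiles without HIP/CUDA; it is an illustration, not code from the patch.

#include <complex>
#include <cstdio>
#include <vector>

// Trait mapping a (possibly complex) element type T to its real scalar type,
// mirroring the role GetTypeReal plays in math_kernel_op.
template <typename T> struct GetTypeReal { using type = T; };
template <> struct GetTypeReal<std::complex<float>>  { using type = float;  };
template <> struct GetTypeReal<std::complex<double>> { using type = double; };

// result[i] = vector1[i] * constant1 + vector2[i] * constant2.
// One template covers double and both complex element types, as in the unified
// GPU op; axby() is a hypothetical stand-in name for the kernel wrapper.
template <typename T>
void axby(const int dim,
          T* result,
          const T* vector1, const typename GetTypeReal<T>::type constant1,
          const T* vector2, const typename GetTypeReal<T>::type constant2)
{
    for (int i = 0; i < dim; ++i) {
        result[i] = vector1[i] * constant1 + vector2[i] * constant2;
    }
}

int main()
{
    std::vector<std::complex<double>> v1(4, {1.0, 2.0});
    std::vector<std::complex<double>> v2(4, {3.0, -1.0});
    std::vector<std::complex<double>> r(4);
    axby(4, r.data(), v1.data(), 0.5, v2.data(), 2.0);
    // 0.5*(1+2i) + 2.0*(3-1i) = (6.5, -1.0)
    std::printf("%f %f\n", r[0].real(), r[0].imag());
    return 0;
}

The same trait is what lets the GPU kernel declare its scalars as const Real constant1 / constant2 while result, vector1, and vector2 keep the element type T, so a single kernel body serves every instantiated element type instead of three hand-written specializations.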