From 019270302f22404da6c067bc64c184f11e0b62a3 Mon Sep 17 00:00:00 2001
From: haozhihan <haozhi.han@stu.pku.edu.cn>
Date: Fri, 14 Jun 2024 16:26:02 +0800
Subject: [PATCH 1/2] delete some useless files

---
 source/module_psi/kernels/cuda/memory_op.cu   | 288 --------
 source/module_psi/kernels/device.cpp          | 687 ------------------
 source/module_psi/kernels/device.h            |  56 --
 source/module_psi/kernels/memory_op.h         | 234 ------
 source/module_psi/kernels/psi_memory_op.cpp   | 338 ---------
 .../module_psi/kernels/rocm/memory_op.hip.cu  | 269 -------
 source/module_psi/kernels/test/CMakeLists.txt |   5 -
 .../module_psi/kernels/test/device_test.cpp   |  31 -
 .../module_psi/kernels/test/memory_test.cpp   | 349 ---------
 source/module_psi/kernels/types.h             |  16 -
 10 files changed, 2273 deletions(-)
 delete mode 100644 source/module_psi/kernels/cuda/memory_op.cu
 delete mode 100644 source/module_psi/kernels/device.cpp
 delete mode 100644 source/module_psi/kernels/device.h
 delete mode 100644 source/module_psi/kernels/memory_op.h
 delete mode 100644 source/module_psi/kernels/psi_memory_op.cpp
 delete mode 100644 source/module_psi/kernels/rocm/memory_op.hip.cu
 delete mode 100644 source/module_psi/kernels/test/CMakeLists.txt
 delete mode 100644 source/module_psi/kernels/test/device_test.cpp
 delete mode 100644 source/module_psi/kernels/test/memory_test.cpp
 delete mode 100644 source/module_psi/kernels/types.h
diff --git a/source/module_psi/kernels/cuda/memory_op.cu b/source/module_psi/kernels/cuda/memory_op.cu
deleted file mode 100644
index 89b745528d..0000000000
--- a/source/module_psi/kernels/cuda/memory_op.cu
+++ /dev/null
@@ -1,288 +0,0 @@
-// #include "module_psi/kernels/memory_op.h"
-
-// #include <complex>
-// #include <type_traits>
-
-// #include <cuda_runtime.h>
-// #include <thrust/complex.h>
-// #include <base/macros/macros.h>
-
-// #define THREADS_PER_BLOCK 256
-
-// namespace psi {
-// namespace memory {
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// __global__ void cast_memory(
-//         FPTYPE_out* out,
-//         const FPTYPE_in* in,
-//         const int size)
-// {
-//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-//     if(idx >= size) {return;}
-//     out[idx] = static_cast<FPTYPE_out>(in[idx]);
-// }
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// __global__ void cast_memory(
-//         std::complex<FPTYPE_out>* out,
-//         const std::complex<FPTYPE_in>* in,
-//         const int size)
-// {
-//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-//     if(idx >= size) {return;}
-//     auto* _out = reinterpret_cast<thrust::complex<FPTYPE_out>*>(out);
-//     const auto* _in = reinterpret_cast<const thrust::complex<FPTYPE_in>*>(in);
-//     _out[idx] = static_cast<thrust::complex<FPTYPE_out>>(_in[idx]);
-// }
-
-// template <typename FPTYPE>
-// void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-//                                                                    FPTYPE*& arr,
-//                                                                    const size_t size,
-//                                                                    const char* record_in)
-// {
-//   if (arr != nullptr) {
-//       delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
-//   }
-//   cudaErrcheck(cudaMalloc((void **)&arr, sizeof(FPTYPE) * size));
-// }
-
-// template <typename FPTYPE>
-// void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-//                                                                 FPTYPE* arr,
-//                                                                 const int var,
-//                                                                 const size_t size)
-// {
-//   cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
-//     const base_device::DEVICE_CPU* dev_out,
-//     const base_device::DEVICE_GPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
-//     const base_device::DEVICE_GPU* dev_out,
-//     const base_device::DEVICE_CPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
-//     const base_device::DEVICE_GPU* dev_out,
-//     const base_device::DEVICE_GPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToDevice));  
-// }
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-//         if (size == 0) {return;}
-//         const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//         cast_memory<<<block, THREADS_PER_BLOCK>>>(arr_out, arr_in, size);
-
-//         cudaErrcheck(cudaGetLastError());
-//         cudaErrcheck(cudaDeviceSynchronize());
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, psi::DEVICE_GPU, psi::DEVICE_CPU> {
-//     void operator()(const psi::DEVICE_GPU* dev_out,
-//                     const psi::DEVICE_CPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size) {
-        
-//         if (size == 0) {return;}
-//         // No need to cast the memory if the data types are the same.
-//         if (std::is_same<FPTYPE_out, FPTYPE_in>::value) 
-//         {
-//             synchronize_memory_op<FPTYPE_out, psi::DEVICE_GPU, psi::DEVICE_CPU>()(dev_out,
-//                                                                          dev_in,
-//                                                                          arr_out,
-//                                                                          reinterpret_cast<const FPTYPE_out*>(arr_in),
-//                                                                          size);
-//             return;
-//         }
-//         FPTYPE_in * arr = nullptr;
-//         cudaErrcheck(cudaMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
-//         cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyHostToDevice));
-//         const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//         cast_memory<<<block, THREADS_PER_BLOCK>>>(arr_out, arr, size);
-//         cudaErrcheck(cudaGetLastError());
-//         cudaErrcheck(cudaDeviceSynchronize());
-//         cudaErrcheck(cudaFree(arr));
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, psi::DEVICE_CPU, psi::DEVICE_GPU> {
-//     void operator()(const psi::DEVICE_CPU* dev_out,
-//                     const psi::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size) {
-//         if (size == 0) {return;}
-//         // No need to cast the memory if the data types are the same.
-//         if (std::is_same<FPTYPE_out, FPTYPE_in>::value) 
-//         {
-//             synchronize_memory_op<FPTYPE_out, psi::DEVICE_CPU, psi::DEVICE_GPU>()(dev_out,
-//                                                                                   dev_in,
-//                                                                                   arr_out,
-//                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
-//                                                                                   size);
-//             return;
-//         }
-//         auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
-//         cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
-//         for (int ii = 0; ii < size; ii++) {
-//             arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
-//         }
-//         free(arr);
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-//         auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
-//         cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
-//         for (int ii = 0; ii < size; ii++) {
-//             arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
-//         }
-//         free(arr);
-//     }
-// };
-
-// template <typename FPTYPE>
-// void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
-// {
-//   cudaErrcheck(cudaFree(arr));
-// }
-
-// template struct resize_memory_op<int, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<float, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<double, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct set_memory_op<int, base_device::DEVICE_GPU>;
-// template struct set_memory_op<float, base_device::DEVICE_GPU>;
-// template struct set_memory_op<double, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-
-// template struct delete_memory_op<int, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<float, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<double, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// } // end of namespace gpu_cuda
-// } // end of namespace psi
\ No newline at end of file
diff --git a/source/module_psi/kernels/device.cpp b/source/module_psi/kernels/device.cpp
deleted file mode 100644
index b2505388cb..0000000000
--- a/source/module_psi/kernels/device.cpp
+++ /dev/null
@@ -1,687 +0,0 @@
-
-// #include <iostream>
-// #include <cstring>
-// #include "module_base/module_device/types.h"
-// #include "module_base/tool_quit.h"
-
-// #include <base/macros/macros.h>
-
-// #if defined(__CUDA)
-// #include <cuda_runtime.h>
-// #endif
-
-// #if defined(__ROCM)
-// #include <hip/hip_runtime.h>
-// #endif
-
-// #ifdef __MPI
-// #include "mpi.h"
-// #endif
-
-// namespace psi{
-
-// namespace device{
-
-// // static bool is_init = false;
-
-// // // functions used in custom ops
-// // template <>
-// // base_device::AbacusDevice_t get_device_type<base_device::DEVICE_CPU>(const base_device::DEVICE_CPU* dev)
-// // {
-// //     return base_device::CpuDevice;
-// // }
-// // template<> std::string get_current_precision(const double* var) {
-// //     return "double";
-// // }
-// // template<> std::string get_current_precision (const std::complex<float> * var) {
-// //     return "single";
-// // }
-// // template<> std::string get_current_precision (const std::complex<double> * var) {
-// //     return "double";
-// // }
-
-// #if ((defined __CUDA) || (defined __ROCM))
-// // template <>
-// // base_device::AbacusDevice_t get_device_type<base_device::DEVICE_GPU>(const base_device::DEVICE_GPU* dev)
-// // {
-// //     return base_device::GpuDevice;
-// // }
-
-// void set_device(const int rank) {
-// #if defined (__CUDA)
-//     cudaErrcheck(cudaSetDevice(rank));
-// #elif defined (__ROCM)
-//     hipErrcheck(hipSetDevice(rank));
-// #endif
-// }
-
-// int get_device_num() {
-//     int device_num = -1;
-// #if defined (__CUDA)
-//     cudaErrcheck(cudaGetDeviceCount(&device_num));
-// #elif defined (__ROCM)
-//     hipErrcheck(hipGetDeviceCount(&device_num));
-// #endif
-//     return device_num;
-// }
-// #endif
-
-// // #if defined(__CUDA)
-// // template <>
-// // void print_device_info<base_device::DEVICE_GPU>(const base_device::DEVICE_GPU* ctx, std::ofstream& ofs_device)
-// // {
-// //     if (is_init)
-// //     {
-// //         return;
-// //     }
-// //     int deviceCount = 0;
-// //     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
-// //     if (error_id != cudaSuccess)
-// //     {
-// //         ofs_device << "cudaGetDeviceCount returned " << static_cast<int>(error_id) << "\n-> "
-// //                    << cudaGetErrorString(error_id) << std::endl;
-// //         ModuleBase::WARNING_QUIT("device", "GPU returned is without cudaSuccess");
-// //     }
-// //     // This function call returns 0 if there are no CUDA capable devices.
-// //     if (deviceCount == 0)
-// //     {
-// //         ofs_device << "There are no available device(s) that support CUDA\n";
-// //     }
-// //     else
-// //     {
-// //         ofs_device << "Detected " << deviceCount << " CUDA Capable device(s)\n";
-// //     }
-// //   int dev = 0, driverVersion = 0, runtimeVersion = 0;
-// //   cudaErrcheck(cudaSetDevice(dev));
-// //   cudaDeviceProp deviceProp;
-// //   cudaErrcheck(cudaGetDeviceProperties(&deviceProp, dev));
-// //   ofs_device << "\nDevice " << dev << ":\t " << deviceProp.name << std::endl;
-// //   // Console log
-// //   cudaErrcheck(cudaDriverGetVersion(&driverVersion));
-// //   cudaErrcheck(cudaRuntimeGetVersion(&runtimeVersion));
-// //   char msg[1024];
-// //   sprintf(msg,
-// //           "  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
-// //           driverVersion / 1000, (driverVersion % 100) / 10,
-// //           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
-// //   ofs_device << msg << std::endl;
-// //   sprintf(msg,
-// //           "  CUDA Capability Major/Minor version number:    %d.%d\n",
-// //           deviceProp.major, deviceProp.minor);
-// //   ofs_device << msg << std::endl;
-// //   sprintf(msg,
-// //           "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
-// //           "GHz)\n",
-// //           deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
-// //   ofs_device << msg << std::endl;
-// //   // This is supported in CUDA 5.0 (runtime API device properties)
-// //   sprintf(msg, 
-// //           "  Memory Clock rate:                             %.0f Mhz\n",
-// //           deviceProp.memoryClockRate * 1e-3f);
-// //   ofs_device << msg << std::endl;   
-   
-// //   sprintf(msg, 
-// //           "  Memory Bus Width:                              %d-bit\n",
-// //           deviceProp.memoryBusWidth);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg,
-// //       "  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
-// //       "%d), 3D=(%d, %d, %d)\n",
-// //       deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
-// //       deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
-// //       deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
-// //   ofs_device << msg << std::endl;   
-  
-// //   sprintf(msg, 
-// //       "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
-// //       deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, 
-// //       "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
-// //       "layers\n",
-// //       deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
-// //       deviceProp.maxTexture2DLayered[2]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total amount of constant memory:               %zu bytes\n",
-// //          deviceProp.totalConstMem);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total amount of shared memory per block:       %zu bytes\n",
-// //          deviceProp.sharedMemPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total shared memory per multiprocessor:        %zu bytes\n",
-// //          deviceProp.sharedMemPerMultiprocessor);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total number of registers available per block: %d\n",
-// //          deviceProp.regsPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Warp size:                                     %d\n",
-// //          deviceProp.warpSize);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum number of threads per multiprocessor:  %d\n",
-// //          deviceProp.maxThreadsPerMultiProcessor);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum number of threads per block:           %d\n",
-// //          deviceProp.maxThreadsPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
-// //          deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
-// //          deviceProp.maxThreadsDim[2]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
-// //          deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
-// //          deviceProp.maxGridSize[2]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum memory pitch:                          %zu bytes\n",
-// //          deviceProp.memPitch);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Texture alignment:                             %zu bytes\n",
-// //          deviceProp.textureAlignment);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, 
-// //       "  Concurrent copy and kernel execution:          %s with %d copy "
-// //       "engine(s)\n",
-// //       (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Run time limit on kernels:                     %s\n",
-// //          deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Integrated GPU sharing Host Memory:            %s\n",
-// //          deviceProp.integrated ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Support host page-locked memory mapping:       %s\n",
-// //          deviceProp.canMapHostMemory ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Alignment requirement for Surfaces:            %s\n",
-// //          deviceProp.surfaceAlignment ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device has ECC support:                        %s\n",
-// //          deviceProp.ECCEnabled ? "Enabled" : "Disabled");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device supports Unified Addressing (UVA):      %s\n",
-// //          deviceProp.unifiedAddressing ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device supports Managed Memory:                %s\n",
-// //          deviceProp.managedMemory ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device supports Compute Preemption:            %s\n",
-// //          deviceProp.computePreemptionSupported ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Supports Cooperative Kernel Launch:            %s\n",
-// //          deviceProp.cooperativeLaunch ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Supports MultiDevice Co-op Kernel Launch:      %s\n",
-// //          deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
-// //          deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
-// //   ofs_device << msg << std::endl;   
-// //   const char *sComputeMode[] = {
-// //       "Default (multiple host threads can use ::cudaSetDevice() with device "
-// //       "simultaneously)",
-// //       "Exclusive (only one host thread in one process is able to use "
-// //       "::cudaSetDevice() with this device)",
-// //       "Prohibited (no host thread can use ::cudaSetDevice() with this "
-// //       "device)",
-// //       "Exclusive Process (many threads in one process is able to use "
-// //       "::cudaSetDevice() with this device)",
-// //       "Unknown", NULL};
-// //   sprintf(msg, "  Compute Mode:\n");
-// //   ofs_device << msg << std::endl;   
-// //   ofs_device << "  " << sComputeMode[deviceProp.computeMode] << std::endl << std::endl;
-
-// //   // If there are 2 or more GPUs, query to determine whether RDMA is supported
-// //   if (deviceCount >= 2) {
-// //     cudaDeviceProp prop[64];
-// //     int gpuid[64];  // we want to find the first two GPUs that can support P2P
-// //     int gpu_p2p_count = 0;
-
-// //     for (int i = 0; i < deviceCount; i++) {
-// //       cudaErrcheck(cudaGetDeviceProperties(&prop[i], i));
-
-// //       // Only boards based on Fermi or later can support P2P
-// //       if (prop[i].major >= 2) {
-// //         // This is an array of P2P capable GPUs
-// //         gpuid[gpu_p2p_count++] = i;
-// //       }
-// //     }
-
-// //     // Show all the combinations of support P2P GPUs
-// //     int can_access_peer;
-
-// //     if (gpu_p2p_count >= 2) {
-// //       for (int i = 0; i < gpu_p2p_count; i++) {
-// //         for (int j = 0; j < gpu_p2p_count; j++) {
-// //           if (gpuid[i] == gpuid[j]) {
-// //             continue;
-// //           }
-// //           cudaErrcheck(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
-// //           sprintf(msg, "> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
-// //                  prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
-// //                  can_access_peer ? "Yes" : "No");
-// //           ofs_device << msg << std::endl;
-// //         }
-// //       }
-// //     }
-// //   }
-
-// //   // csv masterlog info
-// //   // *****************************
-// //   // exe and CUDA driver name
-// //   std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
-// //   char cTemp[16];
-
-// //   // driver version
-// //   sProfileString += ", CUDA Driver Version = ";
-
-// //   snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
-// //            (driverVersion % 100) / 10);
-// //   sProfileString += cTemp;
-
-// //   // Runtime version
-// //   sProfileString += ", CUDA Runtime Version = ";
-// //   snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
-// //            (runtimeVersion % 100) / 10);
-// //   sProfileString += cTemp;
-
-// //   // Device count
-// //   sProfileString += ", NumDevs = ";
-// //   snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
-// //   sProfileString += cTemp;
-// //   sProfileString += "\n";
-
-// //   ofs_device << sProfileString.c_str() << std::endl;
-// //   is_init = true;
-// //   ofs_device << "End of device informations." << std::endl << std::endl;
-// // }
-
-// // template <>
-// // void record_device_memory<base_device::DEVICE_GPU>(const base_device::DEVICE_GPU* ctx,
-// //                                                    std::ofstream& ofs_device,
-// //                                                    std::string str,
-// //                                                    size_t size)
-// // {
-// //     ofs_device << "Allocate " << static_cast<double>(size) / 8 / 1024 / 1024 << " \tMB device memory\t"
-// //                << "from " << str << std::endl
-// //                << std::endl;
-// // }
-
-// // #elif defined(__ROCM)
-// // template <>
-// // void print_device_info<base_device::DEVICE_GPU>(const base_device::DEVICE_GPU* ctx, std::ofstream& ofs_device)
-// // {
-// //     if (is_init)
-// //     {
-// //         return;
-// //     }
-// //     int deviceCount = 0;
-// //     hipError_t error_id = hipGetDeviceCount(&deviceCount);
-// //     if (error_id != hipSuccess)
-// //     {
-// //         ofs_device << "hipGetDeviceCount returned " << static_cast<int>(error_id) << "\n-> "
-// //                    << hipGetErrorString(error_id) << std::endl;
-// //         ModuleBase::WARNING_QUIT("device", "GPU returned is without hipSuccess");
-// //     }
-// //     // This function call returns 0 if there are no CUDA capable devices.
-// //     if (deviceCount == 0)
-// //     {
-// //         ofs_device << "There are no available device(s) that support CUDA\n";
-// //     }
-// //     else
-// //     {
-// //         ofs_device << "Detected " << deviceCount << " CUDA Capable device(s)\n";
-// //     }
-// //   int dev = 0, driverVersion = 0, runtimeVersion = 0;
-// //   hipErrcheck(hipSetDevice(dev));
-// //   hipDeviceProp_t deviceProp;
-// //   hipErrcheck(hipGetDeviceProperties(&deviceProp, dev));
-// //   ofs_device << "\nDevice " << dev << ":\t " << deviceProp.name << std::endl;
-// //   // Console log
-// //   hipErrcheck(hipDriverGetVersion(&driverVersion));
-// //   hipErrcheck(hipRuntimeGetVersion(&runtimeVersion));
-// //   char msg[1024];
-// //   sprintf(msg,
-// //           "  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
-// //           driverVersion / 1000, (driverVersion % 100) / 10,
-// //           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
-// //   ofs_device << msg << std::endl;
-// //   sprintf(msg,
-// //           "  CUDA Capability Major/Minor version number:    %d.%d\n",
-// //           deviceProp.major, deviceProp.minor);
-// //   ofs_device << msg << std::endl;
-// //   sprintf(msg,
-// //           "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
-// //           "GHz)\n",
-// //           deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
-// //   ofs_device << msg << std::endl;
-// //   // This is supported in CUDA 5.0 (runtime API device properties)
-// //   sprintf(msg, 
-// //           "  Memory Clock rate:                             %.0f Mhz\n",
-// //           deviceProp.memoryClockRate * 1e-3f);
-// //   ofs_device << msg << std::endl;   
-   
-// //   sprintf(msg, 
-// //           "  Memory Bus Width:                              %d-bit\n",
-// //           deviceProp.memoryBusWidth);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg,
-// //       "  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
-// //       "%d), 3D=(%d, %d, %d)\n",
-// //       deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
-// //       deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
-// //       deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
-// //   ofs_device << msg << std::endl;   
-  
-// //   sprintf(msg, "  Total amount of constant memory:               %zu bytes\n",
-// //          deviceProp.totalConstMem);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total amount of shared memory per block:       %zu bytes\n",
-// //          deviceProp.sharedMemPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Total number of registers available per block: %d\n",
-// //          deviceProp.regsPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Warp size:                                     %d\n",
-// //          deviceProp.warpSize);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum number of threads per multiprocessor:  %d\n",
-// //          deviceProp.maxThreadsPerMultiProcessor);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum number of threads per block:           %d\n",
-// //          deviceProp.maxThreadsPerBlock);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
-// //          deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
-// //          deviceProp.maxThreadsDim[2]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
-// //          deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
-// //          deviceProp.maxGridSize[2]);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Maximum memory pitch:                          %zu bytes\n",
-// //          deviceProp.memPitch);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Texture alignment:                             %zu bytes\n",
-// //          deviceProp.textureAlignment);
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Run time limit on kernels:                     %s\n",
-// //          deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Integrated GPU sharing Host Memory:            %s\n",
-// //          deviceProp.integrated ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Support host page-locked memory mapping:       %s\n",
-// //          deviceProp.canMapHostMemory ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device has ECC support:                        %s\n",
-// //          deviceProp.ECCEnabled ? "Enabled" : "Disabled");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device supports Managed Memory:                %s\n",
-// //          deviceProp.managedMemory ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Supports Cooperative Kernel Launch:            %s\n",
-// //          deviceProp.cooperativeLaunch ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Supports MultiDevice Co-op Kernel Launch:      %s\n",
-// //          deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
-// //   ofs_device << msg << std::endl;   
-// //   sprintf(msg, "  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
-// //          deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
-// //   ofs_device << msg << std::endl;   
-// //   const char *sComputeMode[] = {
-// //       "Default (multiple host threads can use ::hipSetDevice() with device "
-// //       "simultaneously)",
-// //       "Exclusive (only one host thread in one process is able to use "
-// //       "::hipSetDevice() with this device)",
-// //       "Prohibited (no host thread can use ::hipSetDevice() with this "
-// //       "device)",
-// //       "Exclusive Process (many threads in one process is able to use "
-// //       "::hipSetDevice() with this device)",
-// //       "Unknown", NULL};
-// //   sprintf(msg, "  Compute Mode:\n");
-// //   ofs_device << msg << std::endl;   
-// //   ofs_device << "  " << sComputeMode[deviceProp.computeMode] << std::endl << std::endl;
-
-// //   // If there are 2 or more GPUs, query to determine whether RDMA is supported
-// //   if (deviceCount >= 2) {
-// //     hipDeviceProp_t prop[64];
-// //     int gpuid[64];  // we want to find the first two GPUs that can support P2P
-// //     int gpu_p2p_count = 0;
-
-// //     for (int i = 0; i < deviceCount; i++) {
-// //       hipErrcheck(hipGetDeviceProperties(&prop[i], i));
-
-// //       // Only boards based on Fermi or later can support P2P
-// //       if (prop[i].major >= 2) {
-// //         // This is an array of P2P capable GPUs
-// //         gpuid[gpu_p2p_count++] = i;
-// //       }
-// //     }
-
-// //     // Show all the combinations of support P2P GPUs
-// //     int can_access_peer;
-
-// //     if (gpu_p2p_count >= 2) {
-// //       for (int i = 0; i < gpu_p2p_count; i++) {
-// //         for (int j = 0; j < gpu_p2p_count; j++) {
-// //           if (gpuid[i] == gpuid[j]) {
-// //             continue;
-// //           }
-// //           hipErrcheck(hipDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
-// //           sprintf(msg, "> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
-// //                  prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
-// //                  can_access_peer ? "Yes" : "No");
-// //           ofs_device << msg << std::endl;
-// //         }
-// //       }
-// //     }
-// //   }
-
-// //   // csv masterlog info
-// //   // *****************************
-// //   // exe and CUDA driver name
-// //   std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
-// //   char cTemp[16];
-
-// //   // driver version
-// //   sProfileString += ", CUDA Driver Version = ";
-
-// //   snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
-// //            (driverVersion % 100) / 10);
-// //   sProfileString += cTemp;
-
-// //   // Runtime version
-// //   sProfileString += ", CUDA Runtime Version = ";
-// //   snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
-// //            (runtimeVersion % 100) / 10);
-// //   sProfileString += cTemp;
-
-// //   // Device count
-// //   sProfileString += ", NumDevs = ";
-// //   snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
-// //   sProfileString += cTemp;
-// //   sProfileString += "\n";
-
-// //   ofs_device << sProfileString.c_str() << std::endl;
-// //   is_init = true;
-// //   ofs_device << "End of device informations." << std::endl << std::endl;
-// // }
-
-// // template <>
-// // void record_device_memory<base_device::DEVICE_GPU>(const base_device::DEVICE_GPU* ctx,
-// //                                                    std::ofstream& ofs_device,
-// //                                                    std::string str,
-// //                                                    size_t size)
-// // {
-// //     ofs_device << "Allocate " << static_cast<double>(size) / 8 / 1024 / 1024 << " \tMB device memory\t"
-// //                << "from " << str << std::endl
-// //                << std::endl;
-// // }
-
-// // #endif
-
-// // #if __MPI
-// // int stringCmp(const void *a, const void* b)
-// // {
-// //     char* m = (char*)a;
-// //     char* n = (char*)b;
-// //     int i, sum = 0;
-
-// //     for(i = 0; i < MPI_MAX_PROCESSOR_NAME; i++)
-// //         if (m[i] == n[i])
-// //             continue;
-// //         else
-// //         {
-// //             sum = m[i] - n[i];
-// //             break;
-// //         }
-// //     return sum;
-// // }
-
-// // int get_node_rank() {
-// //     char host_name[MPI_MAX_PROCESSOR_NAME];
-// //     memset(host_name, '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME);
-// //     char (*host_names)[MPI_MAX_PROCESSOR_NAME];
-// //     int n, namelen, color, rank, nprocs, myrank;
-// //     size_t bytes;
-// //     MPI_Comm nodeComm;
-    
-// //     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-// //     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-// //     MPI_Get_processor_name(host_name,&namelen);
-
-// //     bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
-// //     host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
-// //     for (int ii = 0; ii < nprocs; ii++) {
-// //         memset(host_names[ii], '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME);
-// //     }
-
-// //     strcpy(host_names[rank], host_name);
-
-// //     for (n=0; n<nprocs; n++)
-// //         MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
-// //     qsort(host_names, nprocs,  sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
-
-// //     color = 0;
-// //     for (n=0; n<nprocs-1; n++)
-// //     {
-// //         if(strcmp(host_name, host_names[n]) == 0)
-// //         {
-// //             break;
-// //         }
-// //         if(strcmp(host_names[n], host_names[n+1]))
-// //         {
-// //             color++;
-// //         }
-// //     }
-
-// //     MPI_Comm_split(MPI_COMM_WORLD, color, 0, &nodeComm);
-// //     MPI_Comm_rank(nodeComm, &myrank);
-
-// //     MPI_Barrier(MPI_COMM_WORLD);
-// //     int looprank=myrank;
-// //     // printf (" Assigning device %d  to process on node %s rank %d, OK\n",looprank,  host_name, rank );
-// //     free(host_names);
-// //     return looprank;
-// // }
-// // #endif
-
-// std::string get_device_flag(const std::string& device, const std::string& ks_solver,
-//                             const std::string& basis_type, const bool& gamma_only) {
-//     if (device == "cpu") {
-//         return "cpu";
-//     }
-//     else if (device == "gpu") {
-// #if ((defined __CUDA) || (defined __ROCM))
-//         int device_num = device::get_device_num();
-//         if (device_num <= 0) {
-//             std::string msg = "Cannot find GPU on this computer!";
-//             ModuleBase::WARNING_QUIT("device", msg);
-//             return "unknown";
-//         }
-// #else
-//         std::string msg = "The GPU is not supported in this build!";
-//         ModuleBase::WARNING_QUIT("device", msg);
-//         return "unknown";
-// #endif
-//         if (basis_type == "lcao_in_pw") {
-//             std::string msg = "The GPU currently does not support the basis type \"lcao_in_pw\"!";
-//             ModuleBase::WARNING_QUIT("device", msg);
-//             return "unknown";
-//         }
-//         else if (basis_type == "lcao" && gamma_only == false) {
-//             std::string msg = "The GPU currently does not support the basis type \"lcao\" with \"gamma_only\" set to \"0\"!";
-//             ModuleBase::WARNING_QUIT("device", msg);
-//             return "unknow";
-//         }
-//         else {
-//             return "gpu";
-//         }
-//     }
-//     else {
-//         std::string msg = "INPUT device can only be set to \"cpu\" or \"gpu\"!";
-//         ModuleBase::WARNING_QUIT("device", msg);
-//         return "unknown";
-//     }
-// }
-
-// int get_device_kpar(const int& kpar) {
-// #if __MPI && (__CUDA || __ROCM)
-//     int temp_nproc;
-//     MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc);
-//     if (temp_nproc != kpar)
-//     {
-//         ModuleBase::WARNING("Input_conv", "None kpar set in INPUT file, auto set kpar value.");
-//     }
-//     // GlobalV::KPAR = temp_nproc;
-//     // band the CPU processor to the devices
-//     int node_rank = psi::device::get_node_rank();
-//     int device_num = psi::device::get_device_num();
-//     psi::device::set_device(node_rank % device_num);
-//     return temp_nproc;
-// #endif
-//     return kpar;
-// }
-
-// // std::string get_device_info(std::string device_flag) 
-// // {
-// //     std::string device_info = "Unknown";
-
-// // #if defined(__CUDA)
-// //     if (device_flag == "gpu") {
-// //         int dev = 0;
-// //         cudaDeviceProp deviceProp;
-// //         cudaErrcheck(cudaGetDeviceProperties(&deviceProp, dev));
-// //         device_info = deviceProp.name;
-// //     }
-// // #elif defined(__ROCM)
-// //     if (device_flag == "gpu") {
-// //         int dev = 0;
-// //         hipDeviceProp_t deviceProp;
-// //         hipErrcheck(hipGetDeviceProperties(&deviceProp, dev));
-// //         device_info = deviceProp.name;
-// //     }
-// // #endif
-// //     if (device_flag == "cpu") {
-// //         std::ifstream cpuinfo("/proc/cpuinfo");
-// //         std::string line = "", cpu_name = "";
-
-// //         while (std::getline(cpuinfo, line)) {
-// //             if (line.find("model name") != std::string::npos) {
-// //                 // Extract the CPU name from the line
-// //                 size_t colonPos = line.find(":");
-// //                 if (colonPos != std::string::npos) {
-// //                     cpu_name = line.substr(colonPos + 2); // Skip the colon and space
-// //                     break; // Stop after the first match
-// //                 }
-// //             }
-// //         }
-// //         if (cpu_name != "") {
-// //             device_info = cpu_name;
-// //         }
-// //         cpuinfo.close();
-// //     }
-// //     return device_info;
-// // }
-
-// } // end of namespace device
-// } // end of namespace psi
\ No newline at end of file
diff --git a/source/module_psi/kernels/device.h b/source/module_psi/kernels/device.h
deleted file mode 100644
index ffe4853e2f..0000000000
--- a/source/module_psi/kernels/device.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// // TODO: This is a temperary location for these functions.
-// // And will be moved to a global module(module base) later.
-// #ifndef MODULE_PSI_DEVICE_H_
-// #define MODULE_PSI_DEVICE_H_
-
-// #include "module_base/module_device/types.h"
-
-// #include <complex>
-// #include <iostream>
-
-// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-// static __inline__ __device__ double atomicAdd(double *address, double val) {
-//   unsigned long long int *address_as_ull = (unsigned long long int *)address;
-//   unsigned long long int old = *address_as_ull, assumed;
-//   do {
-//     assumed = old;
-//     old = atomicCAS(address_as_ull, assumed,
-//                     __double_as_longlong(val + __longlong_as_double(assumed)));
-//     // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
-//     // NaN) } while (assumed != old);
-//   } while (assumed != old);
-//   return __longlong_as_double(old);
-// }
-// #endif
-
-// namespace psi {
-// namespace device {
-
-// template <typename Device>
-// base_device::AbacusDevice_t get_device_type(const Device* dev);
-
-// template<typename T> std::string get_current_precision(const T* var);
-
-// template<typename Device> void print_device_info (const Device* dev, std::ofstream& ofs_device) {return;}
-
-// template<typename Device> void record_device_memory (const Device* dev, std::ofstream& ofs_device, std::string str, size_t size) {return;}
-
-// std::string get_device_info(std::string device_flag);
-
-// int get_device_kpar(const int& kpar);
-// std::string get_device_flag(const std::string& device, const std::string& ks_solver, const std::string& basis_type, const bool& gamma_only);
-
-// #if __MPI
-// int get_node_rank();
-// int stringCmp(const void *a, const void* b);
-// #endif
-
-// #if ((defined __CUDA) || (defined __ROCM))
-// int get_device_num();
-// void set_device(const int rank);
-// #endif
-
-// } // end of namespace device
-// } // end of namespace psi
-
-// #endif  // MODULE_PSI_DEVICE_H_
\ No newline at end of file
diff --git a/source/module_psi/kernels/memory_op.h b/source/module_psi/kernels/memory_op.h
deleted file mode 100644
index a8154426ef..0000000000
--- a/source/module_psi/kernels/memory_op.h
+++ /dev/null
@@ -1,234 +0,0 @@
-// // TODO: This is a temperary location for these functions.
-// // And will be moved to a global module(module base) later.
-// #ifndef MODULE_PSI_MEMORY_H_
-// #define MODULE_PSI_MEMORY_H_
-
-// #include "module_base/module_device/types.h"
-
-// #include <vector>
-// #include <complex>
-// #include <stddef.h>
-
-// namespace psi {
-// namespace memory {
-
-// template <typename FPTYPE, typename Device> 
-// struct resize_memory_op {
-//   /// @brief Allocate memory for a given pointer. Note this op will free the pointer first.
-//   ///
-//   /// Input Parameters
-//   /// \param dev : the type of computing device
-//   /// \param size : array size
-//   /// \param record_string : label for memory record
-//   ///
-//   /// Output Parameters
-//   /// \param arr : allocated array
-//   void operator()(const Device* dev, FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
-// };
-
-// template <typename FPTYPE, typename Device> 
-// struct set_memory_op {
-//   /// @brief memset for multi-device
-//   ///
-//   /// Input Parameters
-//   /// \param dev : the type of computing device
-//   /// \param var : the specified constant value
-//   /// \param size : array size
-//   ///
-//   /// Output Parameters
-//   /// \param arr : output array initialized by the input value
-//   void operator()(const Device* dev, FPTYPE* arr, const int var, const size_t size);
-// };
-
-// template <typename FPTYPE, typename Device_out, typename Device_in> 
-// struct synchronize_memory_op {
-//   /// @brief memcpy for multi-device
-//   ///
-//   /// Input Parameters
-//   /// \param dev_out : the type of computing device of arr_out
-//   /// \param dev_in : the type of computing device of arr_in
-//   /// \param arr_in : input array
-//   /// \param size : array size
-//   ///
-//   /// Output Parameters
-//   /// \param arr_out : output array initialized by the input array
-//   void operator()(
-//       const Device_out* dev_out, 
-//       const Device_in* dev_in, 
-//       FPTYPE* arr_out, 
-//       const FPTYPE* arr_in, 
-//       const size_t size);
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in, typename Device_out, typename Device_in>
-// struct cast_memory_op {
-//     /// @brief memcpy for multi-device
-//     ///
-//     /// Input Parameters
-//     /// \param dev_out : the type of computing device of arr_out
-//     /// \param dev_in : the type of computing device of arr_in
-//     /// \param arr_in : input array
-//     /// \param size : array size
-//     ///
-//     /// Output Parameters
-//     /// \param arr_out : output array initialized by the input array
-//     void operator()(
-//             const Device_out* dev_out,
-//             const Device_in* dev_in,
-//             FPTYPE_out* arr_out,
-//             const FPTYPE_in* arr_in,
-//             const size_t size);
-// };
-
-// template <typename FPTYPE, typename Device>
-// struct delete_memory_op {
-//   /// @brief free memory for multi-device
-//   ///
-//   /// Input Parameters
-//   /// \param dev : the type of computing device
-//   /// \param arr : the input array
-//   void operator()(const Device* dev, FPTYPE* arr);
-// };
-
-// #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-// // Partially specialize operator for base_device::GpuDevice.
-// template <typename FPTYPE>
-// struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev,
-//                     FPTYPE*& arr,
-//                     const size_t size,
-//                     const char* record_in = nullptr);
-// };
-
-// template <typename FPTYPE>
-// struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size);
-// };
-
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size);
-// };
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_CPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size);
-// };
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size);
-// };
-
-// template <typename FPTYPE>
-// struct delete_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr);
-// };
-// #endif
-// // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM 
-
-// } // end of namespace memory
-// } // end of namespace psi
-
-// // using resmem_sh_op = base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>;
-// // using resmem_dh_op = base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>;
-// // using resmem_ch_op = base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// // using resmem_zh_op = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// // using resmem_sd_op = base_device::memory::resize_memory_op<float, base_device::DEVICE_GPU>;
-// // using resmem_dd_op = base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>;
-// // using resmem_cd_op = base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// // using resmem_zd_op = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// // using setmem_sh_op = base_device::memory::set_memory_op<float, base_device::DEVICE_CPU>;
-// // using setmem_dh_op = base_device::memory::set_memory_op<double, base_device::DEVICE_CPU>;
-// // using setmem_ch_op = base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// // using setmem_zh_op = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// // using setmem_sd_op = base_device::memory::set_memory_op<float, base_device::DEVICE_GPU>;
-// // using setmem_dd_op = base_device::memory::set_memory_op<double, base_device::DEVICE_GPU>;
-// // using setmem_cd_op = base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// // using setmem_zd_op = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// // using delmem_sh_op = base_device::memory::delete_memory_op<float, base_device::DEVICE_CPU>;
-// // using delmem_dh_op = base_device::memory::delete_memory_op<double, base_device::DEVICE_CPU>;
-// // using delmem_ch_op = base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// // using delmem_zh_op = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// // using delmem_sd_op = base_device::memory::delete_memory_op<float, base_device::DEVICE_GPU>;
-// // using delmem_dd_op = base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>;
-// // using delmem_cd_op = base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// // using delmem_zd_op = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// // using syncmem_s2s_h2h_op
-// //     = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using syncmem_s2s_h2d_op
-// //     = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using syncmem_s2s_d2h_op
-// //     = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// // using syncmem_d2d_h2h_op
-// //     = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using syncmem_d2d_h2d_op
-// //     = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using syncmem_d2d_d2h_op
-// //     = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-
-// // using syncmem_c2c_h2h_op
-// //     = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using syncmem_c2c_h2d_op
-// //     = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using syncmem_c2c_d2h_op
-// //     = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// // using syncmem_z2z_h2h_op = base_device::memory::
-// //     synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using syncmem_z2z_h2d_op = base_device::memory::
-// //     synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using syncmem_z2z_d2h_op = base_device::memory::
-// //     synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-
-// // using castmem_s2d_h2h_op
-// //     = base_device::memory::cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using castmem_s2d_h2d_op
-// //     = base_device::memory::cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using castmem_s2d_d2h_op
-// //     = base_device::memory::cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// // using castmem_d2s_h2h_op
-// //     = base_device::memory::cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using castmem_d2s_h2d_op
-// //     = base_device::memory::cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using castmem_d2s_d2h_op
-// //     = base_device::memory::cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-
-// // using castmem_c2z_h2h_op = psi::memory::
-// //     cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using castmem_c2z_h2d_op = psi::memory::
-// //     cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using castmem_c2z_d2h_op = psi::memory::
-// //     cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// // using castmem_z2c_h2h_op = psi::memory::
-// //     cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// // using castmem_z2c_h2d_op = psi::memory::
-// //     cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// // using castmem_z2c_d2h_op = psi::memory::
-// //     cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-
-// // static base_device::DEVICE_CPU* cpu_ctx = {};
-// // static base_device::DEVICE_GPU* gpu_ctx = {};
-
-// #endif // MODULE_PSI_MEMORY_H_
\ No newline at end of file
diff --git a/source/module_psi/kernels/psi_memory_op.cpp b/source/module_psi/kernels/psi_memory_op.cpp
deleted file mode 100644
index 06b63ebd7b..0000000000
--- a/source/module_psi/kernels/psi_memory_op.cpp
+++ /dev/null
@@ -1,338 +0,0 @@
-// #include <iostream>
-// #include <complex>
-// #include <string.h>
-// #include "module_base/module_device/types.h"
-// #include "module_psi/kernels/memory_op.h"
-// #include "module_base/memory.h"
-// #include "module_base/tool_threading.h"
-
-// namespace psi{
-// namespace memory{
-
-// template <typename FPTYPE>
-// struct resize_memory_op<FPTYPE, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev, FPTYPE*& arr, const size_t size, const char* record_in)
-//     {
-//         if (arr != nullptr)
-//         {
-//             free(arr);
-//         }
-//         arr = (FPTYPE*)malloc(sizeof(FPTYPE) * size);
-//         std::string record_string;
-//         if (record_in != nullptr)
-//         {
-//             record_string = record_in;
-//         }
-//         else
-//         {
-//             record_string = "no_record";
-//         }
-
-//         if (record_string != "no_record")
-//         {
-//             ModuleBase::Memory::record(record_string, sizeof(FPTYPE) * size);
-//         }
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct set_memory_op<FPTYPE, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr, const int var, const size_t size)
-//     {
-//         ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) {
-//             int beg = 0, len = 0;
-//             ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, size, (size_t)4096 / sizeof(FPTYPE), beg, len);
-//             memset(arr + beg, var, sizeof(FPTYPE) * len);
-//         });
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_CPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size)
-//     {
-//         ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) {
-//             int beg = 0, len = 0;
-//             ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, size, (size_t)4096 / sizeof(FPTYPE), beg, len);
-//             memcpy(arr_out + beg, arr_in + beg, sizeof(FPTYPE) * len);
-//         });
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_CPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-// #ifdef _OPENMP
-// #pragma omp parallel for schedule(static, 4096/sizeof(FPTYPE_out))
-// #endif
-//         for (int ii = 0; ii < size; ii++) {
-//             arr_out[ii] = static_cast<FPTYPE_out>(arr_in[ii]);
-//         }
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct delete_memory_op<FPTYPE, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr)
-//     {
-//         free(arr);
-//     }
-// };
-
-// template struct resize_memory_op<int, base_device::DEVICE_CPU>;
-// template struct resize_memory_op<float, base_device::DEVICE_CPU>;
-// template struct resize_memory_op<double, base_device::DEVICE_CPU>;
-// template struct resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// template struct resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// template struct set_memory_op<int, base_device::DEVICE_CPU>;
-// template struct set_memory_op<float, base_device::DEVICE_CPU>;
-// template struct set_memory_op<double, base_device::DEVICE_CPU>;
-// template struct set_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// template struct set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-
-// template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_CPU>;
-
-// template struct delete_memory_op<int, base_device::DEVICE_CPU>;
-// template struct delete_memory_op<float, base_device::DEVICE_CPU>;
-// template struct delete_memory_op<double, base_device::DEVICE_CPU>;
-// template struct delete_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
-// template struct delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-// #if !(defined(__CUDA) || defined(__ROCM))
-// template <typename FPTYPE>
-// struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev,
-//                     FPTYPE*& arr,
-//                     const size_t size,
-//                     const char* record_in = nullptr)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_CPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE* arr_out,
-//                     const FPTYPE* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev_out,
-//                     const base_device::DEVICE_CPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_CPU* dev_out,
-//                     const base_device::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size)
-//     {
-//     }
-// };
-
-// template <typename FPTYPE>
-// struct delete_memory_op<FPTYPE, base_device::DEVICE_GPU>
-// {
-//     void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
-//     {
-//     }
-// };
-
-// template struct resize_memory_op<int, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<float, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<double, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct set_memory_op<int, base_device::DEVICE_GPU>;
-// template struct set_memory_op<float, base_device::DEVICE_GPU>;
-// template struct set_memory_op<double, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-
-// template struct delete_memory_op<int, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<float, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<double, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-// #endif
-
-// }
-// }
\ No newline at end of file
diff --git a/source/module_psi/kernels/rocm/memory_op.hip.cu b/source/module_psi/kernels/rocm/memory_op.hip.cu
deleted file mode 100644
index 03ed12632d..0000000000
--- a/source/module_psi/kernels/rocm/memory_op.hip.cu
+++ /dev/null
@@ -1,269 +0,0 @@
-// #include "module_psi/kernels/memory_op.h"
-
-// #include <complex>
-// #include <type_traits>
-
-// #include <hip/hip_runtime.h>
-// #include <thrust/complex.h>
-// #include <base/macros/macros.h>
-
-// #define THREADS_PER_BLOCK 256
-
-// namespace psi {
-// namespace memory {
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// __global__ void cast_memory(
-//         FPTYPE_out* out,
-//         const FPTYPE_in* in,
-//         const int size)
-// {
-//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-//     if(idx >= size) {return;}
-//     out[idx] = static_cast<FPTYPE_out>(in[idx]);
-// }
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// __global__ void cast_memory(
-//         std::complex<FPTYPE_out>* out,
-//         const std::complex<FPTYPE_in>* in,
-//         const int size)
-// {
-//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-//     if(idx >= size) {return;}
-//     auto* _out = reinterpret_cast<thrust::complex<FPTYPE_out>*>(out);
-//     const auto* _in = reinterpret_cast<const thrust::complex<FPTYPE_in>*>(in);
-//     _out[idx] = static_cast<thrust::complex<FPTYPE_out>>(_in[idx]);
-// }
-
-// template <typename FPTYPE>
-// void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-//                                                                    FPTYPE*& arr,
-//                                                                    const size_t size,
-//                                                                    const char* record_in)
-// {
-//   if (arr != nullptr) {
-//       delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
-//   }
-//   hipErrcheck(hipMalloc((void **)&arr, sizeof(FPTYPE) * size));
-// }
-
-// template <typename FPTYPE>
-// void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-//                                                                 FPTYPE* arr,
-//                                                                 const int var,
-//                                                                 const size_t size)
-// {
-//   hipErrcheck(hipMemset(arr, var, sizeof(FPTYPE) * size));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
-//     const base_device::DEVICE_CPU* dev_out,
-//     const base_device::DEVICE_GPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   hipErrcheck(hipMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
-//     const base_device::DEVICE_GPU* dev_out,
-//     const base_device::DEVICE_CPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   hipErrcheck(hipMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, hipMemcpyHostToDevice));  
-// }
-
-// template <typename FPTYPE>
-// void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
-//     const base_device::DEVICE_GPU* dev_out,
-//     const base_device::DEVICE_GPU* dev_in,
-//     FPTYPE* arr_out,
-//     const FPTYPE* arr_in,
-//     const size_t size)
-// {
-//   hipErrcheck(hipMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, hipMemcpyDeviceToDevice));  
-// }
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, psi::DEVICE_GPU, psi::DEVICE_GPU> {
-//     void operator()(const psi::DEVICE_GPU* dev_out,
-//                     const psi::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size) {
-        
-//         if (size == 0) {return;}
-//         const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//         hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr_in, size);
-//         hipErrcheck(hipGetLastError());
-//         hipErrcheck(hipDeviceSynchronize());
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, psi::DEVICE_GPU, psi::DEVICE_CPU> {
-//     void operator()(const psi::DEVICE_GPU* dev_out,
-//                     const psi::DEVICE_CPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size) {
-
-//         if (size == 0) {return;}
-//         // No need to cast the memory if the data types are the same.
-//         if (std::is_same<FPTYPE_out, FPTYPE_in>::value) 
-//         {
-//             synchronize_memory_op<FPTYPE_out, psi::DEVICE_GPU, psi::DEVICE_CPU>()(dev_out,
-//                                                                                   dev_in,
-//                                                                                   arr_out,
-//                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
-//                                                                                   size);
-//             return;
-//         }
-//         FPTYPE_in * arr = nullptr;
-//         hipErrcheck(hipMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
-//         hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyHostToDevice));
-//         const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//         hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr, size);
-//         hipErrcheck(hipGetLastError());
-//         hipErrcheck(hipDeviceSynchronize());
-//         hipErrcheck(hipFree(arr));
-//     }
-// };
-
-// template <typename FPTYPE_out, typename FPTYPE_in>
-// struct cast_memory_op<FPTYPE_out, FPTYPE_in, psi::DEVICE_CPU, psi::DEVICE_GPU> {
-//     void operator()(const psi::DEVICE_CPU* dev_out,
-//                     const psi::DEVICE_GPU* dev_in,
-//                     FPTYPE_out* arr_out,
-//                     const FPTYPE_in* arr_in,
-//                     const size_t size) {
-        
-//         if (size == 0) {return;}
-//         // No need to cast the memory if the data types are the same.
-//         if (std::is_same<FPTYPE_out, FPTYPE_in>::value) 
-//         {
-//             synchronize_memory_op<FPTYPE_out, psi::DEVICE_CPU, psi::DEVICE_GPU>()(dev_out,
-//                                                                                   dev_in,
-//                                                                                   arr_out,
-//                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
-//                                                                                   size);
-//             return;
-//         }
-//         auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
-//         hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyDeviceToHost));
-//         for (int ii = 0; ii < size; ii++) {
-//             arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
-//         }
-//         free(arr);
-//     }
-// };
-
-// template <typename FPTYPE>
-// void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
-// {
-//   hipErrcheck(hipFree(arr));
-// }
-
-// template struct resize_memory_op<int, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<float, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<double, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct set_memory_op<int, base_device::DEVICE_GPU>;
-// template struct set_memory_op<float, base_device::DEVICE_GPU>;
-// template struct set_memory_op<double, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_GPU,
-//                                base_device::DEVICE_CPU>;
-// template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<float>,
-//                                std::complex<double>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-// template struct cast_memory_op<std::complex<double>,
-//                                std::complex<float>,
-//                                base_device::DEVICE_CPU,
-//                                base_device::DEVICE_GPU>;
-
-// template struct delete_memory_op<int, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<float, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<double, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
-// template struct delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-
-// } // end of namespace gpu_cuda
-// } // end of namespace psi
\ No newline at end of file
diff --git a/source/module_psi/kernels/test/CMakeLists.txt b/source/module_psi/kernels/test/CMakeLists.txt
deleted file mode 100644
index 2c09c8741b..0000000000
--- a/source/module_psi/kernels/test/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# AddTest(
-#   TARGET Module_Psi_UTs
-#   LIBS ${math_libs} base device
-#   SOURCES memory_test.cpp device_test.cpp
-# )
\ No newline at end of file
diff --git a/source/module_psi/kernels/test/device_test.cpp b/source/module_psi/kernels/test/device_test.cpp
deleted file mode 100644
index b5aff72fd8..0000000000
--- a/source/module_psi/kernels/test/device_test.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <complex>
-#include <iostream>
-#include <gtest/gtest.h>
-#include "module_base/module_device/types.h"
-
-class TestModulePsiDevice : public ::testing::Test
-{
-  protected:
-    const base_device::DEVICE_CPU* cpu_ctx = {};
-    const base_device::DEVICE_GPU* gpu_ctx = {};
-
-    void SetUp() override {
-    }
-    void TearDown() override {
-    }
-};
-
-TEST_F(TestModulePsiDevice, get_device_type_cpu)
-{
-    base_device::AbacusDevice_t device = psi::device::get_device_type<base_device::DEVICE_CPU>(cpu_ctx);
-    EXPECT_EQ(device, base_device::CpuDevice);
-}
-
-#if __UT_USE_CUDA || __UT_USE_ROCM
-TEST_F(TestModulePsiDevice, get_device_type_gpu)
-{
-    base_device::AbacusDevice_t device = psi::device::get_device_type<base_device::DEVICE_GPU>(gpu_ctx);
-    EXPECT_EQ(device, base_device::GpuDevice);
-}
-#endif // __UT_USE_CUDA || __UT_USE_ROCM
-
diff --git a/source/module_psi/kernels/test/memory_test.cpp b/source/module_psi/kernels/test/memory_test.cpp
deleted file mode 100644
index d154cbc4fd..0000000000
--- a/source/module_psi/kernels/test/memory_test.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-#include <complex>
-#include <iostream>
-#include <gtest/gtest.h>
-#include "module_psi/kernels/memory_op.h"
-#if __UT_USE_CUDA || __UT_USE_ROCM
-#include <thrust/copy.h>
-#include <thrust/fill.h>
-#include <thrust/complex.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_free.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_malloc.h>
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#endif
-
-class TestModulePsiMemory : public ::testing::Test
-{
-  protected:
-    const std::vector<double> xx = {
-        -0.65412617, -0.74208893, -2.21731157,  0.42540039, 
-        3.36373004,  -2.51647562, -2.985111  , -0.53251562, 
-        0.37908265,   0.81605825,  1.66281318,  2.71761869, 
-        2.2010268 ,   0.65498149,  1.51153638,  0.71501482, 
-        0.53546578,   1.4564317 , -2.36701143,  1.23009056, 
-        3.41302551,  -2.3175205 , -0.27628221, -1.35701656
-    };
-
-    const std::vector<std::complex<double> > z_xx = {
-        {-0.65412617, -0.74208893}, {-2.21731157,  0.42540039},
-        {3.36373004,  -2.51647562}, {-2.985111  , -0.53251562},
-        {0.37908265,   0.81605825}, { 1.66281318,  2.71761869},
-        {2.2010268 ,   0.65498149}, { 1.51153638,  0.71501482},
-        {0.53546578,   1.4564317 }, {-2.36701143,  1.23009056},
-        {3.41302551,  -2.3175205 }, {-0.27628221, -1.35701656}
-    };
-
-    const int z_dim = z_xx.size();
-
-    const base_device::DEVICE_CPU* cpu_ctx = {};
-    const base_device::DEVICE_GPU* gpu_ctx = {};
-
-    void SetUp() override {
-    }
-    void TearDown() override {
-    }
-
-    using set_memory_double_cpu_op = base_device::memory::set_memory_op<double, base_device::DEVICE_CPU>;
-    using set_memory_complex_double_cpu_op
-        = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-    using resize_memory_double_cpu_op = base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>;
-    using resize_memory_comlex_double_cpu_op
-        = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-    using synchronize_memory_double_cpu_to_cpu_op
-        = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-    using synchronize_memory_complex_double_cpu_to_cpu_op = base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
-    using delete_memory_double_cpu_op = base_device::memory::delete_memory_op<double, base_device::DEVICE_CPU>;
-    using delete_memory_complex_double_cpu_op
-        = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
-
-#if __UT_USE_CUDA || __UT_USE_ROCM
-    using set_memory_double_gpu_op = base_device::memory::set_memory_op<double, base_device::DEVICE_GPU>;
-    using set_memory_complex_double_gpu_op
-        = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-    using resize_memory_double_gpu_op = base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>;
-    using resize_memory_comlex_double_gpu_op
-        = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-    using synchronize_memory_double_cpu_to_gpu_op
-        = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-    using synchronize_memory_double_gpu_to_cpu_op
-        = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-    using synchronize_memory_double_gpu_to_gpu_op
-        = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-    using synchronize_memory_complex_double_cpu_to_gpu_op = base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
-    using synchronize_memory_complex_double_gpu_to_cpu_op = base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
-    using synchronize_memory_complex_double_gpu_to_gpu_op = base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
-    using delete_memory_double_gpu_op = base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>;
-    using delete_memory_complex_double_gpu_op
-        = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
-#endif // __UT_USE_CUDA || __UT_USE_ROCM
-};
-
-TEST_F(TestModulePsiMemory, set_memory_op_double_cpu)
-{
-  std::vector<double> v_xx = xx;
-  set_memory_double_cpu_op()(cpu_ctx, v_xx.data(), 0, xx.size());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(v_xx[ii], 0.0);
-  }
-}
-
-TEST_F(TestModulePsiMemory, set_memory_op_complex_double_cpu)
-{
-  std::vector<std::complex<double>> vz_xx = z_xx;
-  set_memory_complex_double_cpu_op()(cpu_ctx, vz_xx.data(), 0, z_xx.size());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(vz_xx[ii], std::complex<double>(0.0, 0.0));
-  }
-}
-
-TEST_F(TestModulePsiMemory, resize_memory_op_double_cpu)
-{
-  double* xx_tmp = NULL;
-  resize_memory_double_cpu_op()(cpu_ctx, xx_tmp, xx.size());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    xx_tmp[ii] = xx[ii];
-  }
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(xx_tmp[ii], xx[ii]);
-  }
-  free(xx_tmp);
-}
-
-TEST_F(TestModulePsiMemory, resize_memory_op_comlex_double_cpu)
-{
-  std::complex<double>* z_xx_tmp = NULL;
-  resize_memory_comlex_double_cpu_op()(cpu_ctx, z_xx_tmp, z_xx.size());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    z_xx_tmp[ii] = z_xx[ii];
-  }
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(z_xx_tmp[ii], z_xx[ii]);
-  }
-  free(z_xx_tmp);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_cpu)
-{
-  std::vector<double> h_xx(xx.size(), 0);
-  synchronize_memory_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, h_xx.data(), xx.data(), xx.size());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], xx[ii]);
-  }
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_cpu)
-{
-  std::vector<std::complex<double>> hz_xx(z_xx.size(), std::complex<double>(0, 0));
-  synchronize_memory_complex_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, hz_xx.data(), z_xx.data(), z_xx.size());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(hz_xx[ii], z_xx[ii]);
-  }
-}
-
-TEST_F(TestModulePsiMemory, delete_memory_op_double_cpu)
-{
-  double * h_xx = (double*)malloc(sizeof(double) * xx.size());
-  delete_memory_double_cpu_op()(cpu_ctx, h_xx);
-}
-
-TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_cpu)
-{
-  std::complex<double> * hz_xx = (std::complex<double>*)malloc(sizeof(std::complex<double>) * z_xx.size());
-  delete_memory_complex_double_cpu_op()(cpu_ctx, hz_xx);
-}
-
-
-#if __UT_USE_CUDA || __UT_USE_ROCM
-TEST_F(TestModulePsiMemory, set_memory_op_double_gpu)
-{
-  thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
-  thrust::copy(xx.begin(), xx.end(), d_xx);
-  set_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx), 0, xx.size());
-  thrust::host_vector<double> h_xx(xx.size());
-  thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], 0.0);
-  }
-}
-
-TEST_F(TestModulePsiMemory, set_memory_op_complex_double_gpu)
-{
-  thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
-  set_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx), 0, z_xx.size());
-  thrust::host_vector<std::complex<double>> h_xx(z_xx.size());
-  thrust::copy(dz_xx, dz_xx + z_xx.size(), h_xx.begin());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], std::complex<double>(0.0, 0.0));
-  }
-}
-
-TEST_F(TestModulePsiMemory, resize_memory_op_double_gpu)
-{
-  double* xx_tmp = NULL;
-  resize_memory_double_gpu_op()(gpu_ctx, xx_tmp, xx.size());
-
-  thrust::device_ptr<double> d_xx(xx_tmp);
-  thrust::copy(xx.begin(), xx.end(), d_xx);
-
-  thrust::host_vector<double> h_xx(xx.size());
-  thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], xx[ii]);
-  }
-  thrust::device_free(d_xx);
-}
-
-TEST_F(TestModulePsiMemory, resize_memory_op_complex_double_gpu)
-{
-  std::complex<double>* z_xx_tmp = NULL;
-  resize_memory_comlex_double_gpu_op()(gpu_ctx, z_xx_tmp, z_xx.size());
-
-  thrust::device_ptr<std::complex<double>> dz_xx(z_xx_tmp);
-  thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
-
-  thrust::host_vector<std::complex<double>> h_z_xx(z_xx.size());
-  thrust::copy(dz_xx, dz_xx + z_xx.size(), h_z_xx.begin());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(h_z_xx[ii], z_xx[ii]);
-  }
-  thrust::device_free(dz_xx);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_gpu)
-{
-  thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
-  std::vector<double> hv_xx(xx.size(), 0);
-  thrust::copy(hv_xx.begin(), hv_xx.end(), d_xx);
-  synchronize_memory_double_cpu_to_gpu_op()(
-    gpu_ctx, 
-    cpu_ctx, 
-    thrust::raw_pointer_cast(d_xx), 
-    xx.data(), 
-    xx.size());
-
-  thrust::host_vector<double> h_xx(xx.size());
-  thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], xx[ii]);
-  }
-  thrust::device_free(d_xx);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_cpu)
-{
-  thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
-  thrust::copy(xx.begin(), xx.end(), d_xx);
-  thrust::host_vector<double> h_xx(xx.size());
-  synchronize_memory_double_gpu_to_cpu_op()(
-    cpu_ctx, 
-    gpu_ctx, 
-    thrust::raw_pointer_cast(h_xx.data()), 
-    thrust::raw_pointer_cast(d_xx), 
-    xx.size());
-
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], xx[ii]);
-  }
-  thrust::device_free(d_xx);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_gpu)
-{
-  thrust::device_ptr<double> d1_xx = thrust::device_malloc<double>(xx.size());
-  thrust::device_ptr<double> d2_xx = thrust::device_malloc<double>(xx.size());
-  thrust::copy(xx.begin(), xx.end(), d1_xx);
-  synchronize_memory_double_gpu_to_gpu_op()(
-    gpu_ctx, 
-    gpu_ctx, 
-    thrust::raw_pointer_cast(d2_xx), 
-    thrust::raw_pointer_cast(d1_xx), 
-    xx.size());
-
-  thrust::host_vector<double> h_xx(xx.size());
-  thrust::copy(d2_xx, d2_xx + xx.size(), h_xx.begin());
-  for (int ii = 0; ii < xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], xx[ii]);
-  }
-  thrust::device_free(thrust::device_ptr<double>(d1_xx));
-  thrust::device_free(thrust::device_ptr<double>(d2_xx));
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_gpu)
-{
-  thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  std::vector<std::complex<double>> hvz_xx(z_xx.size(), 0);
-  thrust::copy(hvz_xx.begin(), hvz_xx.end(), dz_xx);
-  synchronize_memory_complex_double_cpu_to_gpu_op()(
-    gpu_ctx, 
-    cpu_ctx, 
-    thrust::raw_pointer_cast(dz_xx), 
-    z_xx.data(), 
-    z_xx.size());
-
-  thrust::host_vector<std::complex<double>> hz_xx(z_xx.size());
-  thrust::copy(dz_xx, dz_xx + z_xx.size(), hz_xx.begin());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(hz_xx[ii], z_xx[ii]);
-  }
-  thrust::device_free(dz_xx);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_cpu)
-{
-  thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
-  thrust::host_vector<std::complex<double>> hz_xx(z_xx.size());
-  synchronize_memory_complex_double_gpu_to_cpu_op()(
-    cpu_ctx, 
-    gpu_ctx, 
-    thrust::raw_pointer_cast(hz_xx.data()), 
-    thrust::raw_pointer_cast(dz_xx), 
-    z_xx.size());
-
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(hz_xx[ii], z_xx[ii]);
-  }
-  thrust::device_free(dz_xx);
-}
-
-TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_gpu)
-{
-  thrust::device_ptr<std::complex<double>> dz1_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  thrust::device_ptr<std::complex<double>> dz2_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  thrust::copy(z_xx.begin(), z_xx.end(), dz1_xx);
-  synchronize_memory_complex_double_gpu_to_gpu_op()(
-    gpu_ctx, 
-    gpu_ctx, 
-    thrust::raw_pointer_cast(dz2_xx), 
-    thrust::raw_pointer_cast(dz1_xx), 
-    z_xx.size());
-
-  thrust::host_vector<std::complex<double>> h_xx(z_xx.size());
-  thrust::copy(dz2_xx, dz2_xx + z_xx.size(), h_xx.begin());
-  for (int ii = 0; ii < z_xx.size(); ii++) {
-    EXPECT_EQ(h_xx[ii], z_xx[ii]);
-  }
-  thrust::device_free(thrust::device_ptr<std::complex<double>>(dz1_xx));
-  thrust::device_free(thrust::device_ptr<std::complex<double>>(dz2_xx));
-}
-
-TEST_F(TestModulePsiMemory, delete_memory_op_double_gpu)
-{
-  thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
-  delete_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx));
-}
-
-TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_gpu)
-{
-  thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-  delete_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx));
-}
-
-#endif // __UT_USE_CUDA || __UT_USE_ROCM
diff --git a/source/module_psi/kernels/types.h b/source/module_psi/kernels/types.h
deleted file mode 100644
index 6c07620d17..0000000000
--- a/source/module_psi/kernels/types.h
+++ /dev/null
@@ -1,16 +0,0 @@
-// // TODO: This is a temperary location for these functions.
-// // And will be moved to a global module(module base) later.
-// #ifndef MODULE_PSI_TYPES_H_
-// #define MODULE_PSI_TYPES_H_
-
-// namespace psi {
-
-// struct DEVICE_CPU;
-// struct DEVICE_GPU;
-// struct DEVICE_SYCL;
-
-// enum base_device::AbacusDevice_t {UnKnown, CpuDevice, GpuDevice, SyclDevice};
-
-// }  // end of namespace psi
-
-// #endif  // MODULE_PSI_TYPES_H_
\ No newline at end of file

From 3538efd2b34d7fdf3877756f98f49d16ab017d3b Mon Sep 17 00:00:00 2001
From: haozhihan <haozhi.han@stu.pku.edu.cn>
Date: Fri, 14 Jun 2024 20:05:10 +0800
Subject: [PATCH 2/2] fix build bug

---
 source/module_base/kernels/test/math_op_test.cpp                 | 1 -
 source/module_base/math_ylmreal.cpp                              | 1 -
 source/module_basis/module_pw/kernels/test/pw_op_test.cpp        | 1 -
 source/module_elecstate/kernels/test/elecstate_op_test.cpp       | 1 -
 source/module_hamilt_pw/hamilt_pwdft/forces.h                    | 1 -
 .../hamilt_pwdft/kernels/test/ekinetic_op_test.cpp               | 1 -
 .../module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp | 1 -
 .../module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp  | 1 -
 .../hamilt_pwdft/kernels/test/nonlocal_op_test.cpp               | 1 -
 .../hamilt_pwdft/kernels/test/stress_op_test.cpp                 | 1 -
 .../module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp  | 1 -
 .../module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp   | 1 -
 source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp | 1 -
 source/module_hsolver/diago_bpcg.h                               | 1 -
 source/module_hsolver/kernels/cuda/math_kernel_op.cu             | 1 -
 source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu         | 1 -
 source/module_hsolver/kernels/test/math_dngvd_test.cpp           | 1 -
 source/module_hsolver/kernels/test/math_kernel_test.cpp          | 1 -
 source/module_hsolver/kernels/test/perf_math_kernel.cpp          | 1 -
 source/module_psi/psi.h                                          | 1 -
 20 files changed, 20 deletions(-)

diff --git a/source/module_base/kernels/test/math_op_test.cpp b/source/module_base/kernels/test/math_op_test.cpp
index ea736c97c9..7136ab8d35 100644
--- a/source/module_base/kernels/test/math_op_test.cpp
+++ b/source/module_base/kernels/test/math_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_base/kernels/math_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_base/math_ylmreal.cpp b/source/module_base/math_ylmreal.cpp
index ecd9aebbf4..331b867c93 100644
--- a/source/module_base/math_ylmreal.cpp
+++ b/source/module_base/math_ylmreal.cpp
@@ -4,7 +4,6 @@
 #include "module_base/kernels/math_op.h"
 #include "module_base/libm/libm.h"
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 #include "realarray.h"
 #include "timer.h"
 #include "tool_quit.h"
diff --git a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
index 62d94adc37..96cc760383 100644
--- a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
+++ b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_basis/module_pw/kernels/pw_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_elecstate/kernels/test/elecstate_op_test.cpp b/source/module_elecstate/kernels/test/elecstate_op_test.cpp
index 58018a836a..79635c7895 100644
--- a/source/module_elecstate/kernels/test/elecstate_op_test.cpp
+++ b/source/module_elecstate/kernels/test/elecstate_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_elecstate/kernels/elecstate_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index 2f067e92ec..979620d7ec 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -11,7 +11,6 @@
 #include "module_elecstate/elecstate.h"
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-// #include "module_psi/kernels/memory_op.h"
 #include "module_psi/psi.h"
 #include "structure_factor.h"
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
index fd4126801a..03a5792bed 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/ekinetic_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
index 089f2c720f..fbe10d9af3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
index 58790c8c2b..d9e9244004 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/meta_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
index ebecc24c24..8591182d4b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/nonlocal_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
index 96cc2d9d77..a6e482cdae 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
index 747c02ce93..318646f063 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/veff_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
index be4df99e1a..77ec09beac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
index 09ec5ee1c0..8b46679d67 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
@@ -1,7 +1,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/wf_op.h"
 
 #include "module_base/module_device/memory_op.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h
index 75b4474b36..71040960c3 100644
--- a/source/module_hsolver/diago_bpcg.h
+++ b/source/module_hsolver/diago_bpcg.h
@@ -6,7 +6,6 @@
 #include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
 
 #include "module_base/module_device/types.h"
-// #include "module_psi/kernels/memory_op.h"
 #include "module_base/module_device/memory_op.h"
 
 #include "module_hsolver/kernels/math_kernel_op.h"
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index a731930628..ed1b9379f9 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -1,6 +1,5 @@
 #include "module_base/module_device/memory_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index 2d83f4172f..89cf59f6fa 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -1,6 +1,5 @@
 #include "module_base/module_device/memory_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
diff --git a/source/module_hsolver/kernels/test/math_dngvd_test.cpp b/source/module_hsolver/kernels/test/math_dngvd_test.cpp
index 230ab05afd..8b614ae9a0 100644
--- a/source/module_hsolver/kernels/test/math_dngvd_test.cpp
+++ b/source/module_hsolver/kernels/test/math_dngvd_test.cpp
@@ -3,7 +3,6 @@
 #include "module_base/module_device/memory_op.h"
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
 
 #include <algorithm>
 #include <complex>
diff --git a/source/module_hsolver/kernels/test/math_kernel_test.cpp b/source/module_hsolver/kernels/test/math_kernel_test.cpp
index 56d7309d45..c2c66fb936 100644
--- a/source/module_hsolver/kernels/test/math_kernel_test.cpp
+++ b/source/module_hsolver/kernels/test/math_kernel_test.cpp
@@ -2,7 +2,6 @@
 #include "module_base/constants.h"
 #include "module_base/module_device/memory_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <gtest/gtest.h>
diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
index da5994e1a9..173ef8b40b 100644
--- a/source/module_hsolver/kernels/test/perf_math_kernel.cpp
+++ b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
@@ -2,7 +2,6 @@
 #include "module_base/constants.h"
 #include "module_base/module_device/memory_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
 
 #include <complex>
 #include <benchmark/benchmark.h>
diff --git a/source/module_psi/psi.h b/source/module_psi/psi.h
index 95ca8028e2..39956321fd 100644
--- a/source/module_psi/psi.h
+++ b/source/module_psi/psi.h
@@ -3,7 +3,6 @@
 
 #include "module_base/module_device/memory_op.h"
 #include "module_base/module_device/types.h"
-// #include "module_psi/kernels/memory_op.h"
 
 #include <tuple>