Merge branch 'develop' into float-fftw-info
denghuilu authored Dec 2, 2023
2 parents d83e59f + 37cc1a3 commit 98ff034
Showing 17 changed files with 841 additions and 670 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -286,7 +286,10 @@ endif()
 # Warning: CMake add support to HIP in version 3.21. This is rather a new version.
 # Use cmake with AMD-ROCm: https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
 if(USE_ROCM)
-if (NOT DEFINED ROCM_PATH )
+if(COMMIT_INFO)
+message(FATAL_ERROR "Commit info is not supported on ROCm.")
+endif()
+if(NOT DEFINED ROCM_PATH )
 set (ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory." )
 endif ()
 if(NOT DEFINED HIP_PATH)
19 changes: 11 additions & 8 deletions examples/scf/pw_Si2/INPUT
@@ -1,9 +1,12 @@
 INPUT_PARAMETERS
-#Parameters (General)
-pseudo_dir ../../../tests/PP_ORB
-symmetry 1
-#Parameters (Accuracy)
-basis_type pw
-ecutwfc 60
-scf_thr 1e-8
-scf_nmax 100
+#Parameters (General)
+pseudo_dir ../../../tests/PP_ORB
+symmetry 1
+#Parameters (Accuracy)
+basis_type pw
+ecutwfc 60
+scf_thr 1e-7
+scf_nmax 100
+device cpu
+ks_solver cg
+precision double
10 changes: 5 additions & 5 deletions source/module_base/module_container/ATen/kernels/cuda/linalg.cu
@@ -31,7 +31,7 @@ __global__ void do_add_kernel(
 T* z)
 {
 // Perform add operation for the specified range [begin, end) in the output Tensor.
-for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
+for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
 // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'.
 z[o_idx] = alpha * x[o_idx] + beta * y[o_idx];
 }
@@ -44,7 +44,7 @@ __global__ void do_mul_kernel(
 const T* x,
 T* y)
 {
-for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
+for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
 // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'.
 y[o_idx] = alpha * x[o_idx];
 }
@@ -58,7 +58,7 @@ __global__ void do_mul_kernel(
 const T* y,
 T* z)
 {
-for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
+for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
 // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'.
 z[o_idx] = alpha * x[o_idx] * y[o_idx];
 }
@@ -72,7 +72,7 @@ __global__ void do_div_kernel(
 const T* y,
 T* z)
 {
-for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
+for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
 // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'.
 z[o_idx] = alpha * x[o_idx] / y[o_idx];
 }
@@ -88,7 +88,7 @@ __global__ void do_fma_kernel(
 const T* z,
 T* out)
 {
-for (auto o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
+for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
 // Assign the sum of the input Tensor elements at index 'o_idx' to the output Tensor element at index 'o_idx'.
 out[o_idx] = alpha * x[o_idx] * y[o_idx] + beta * z[o_idx];
 }
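
The only functional change across these five kernels is the loop-index declaration. `threadIdx.x` has type `unsigned int`, so `auto o_idx = threadIdx.x` deduces an unsigned index, and comparing it against a signed `num_element` mixes signedness; switching to `int` keeps the index arithmetic and the loop condition signed, which is presumably the motivation here. Below is a minimal, self-contained sketch of the same block-stride loop pattern; the kernel name `axpy_block_stride`, the launch configuration, and the `main` driver are illustrative assumptions, not part of the ABACUS sources.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative block-stride kernel mirroring the pattern used in linalg.cu:
// each thread starts at threadIdx.x and strides by blockDim.x over the flat array.
// The index is declared as int (not auto) so the comparison with num_element stays signed.
__global__ void axpy_block_stride(const int num_element,
                                  const float alpha,
                                  const float* x,
                                  const float beta,
                                  const float* y,
                                  float* z)
{
    for (int o_idx = threadIdx.x; o_idx < num_element; o_idx += blockDim.x) {
        z[o_idx] = alpha * x[o_idx] + beta * y[o_idx];
    }
}

int main()
{
    const int n = 1024;
    float *x, *y, *z;
    cudaMallocManaged(&x, n * sizeof(float));
    cudaMallocManaged(&y, n * sizeof(float));
    cudaMallocManaged(&z, n * sizeof(float));
    for (int i = 0; i < n; ++i) { x[i] = 1.0f; y[i] = 2.0f; }

    // One block of 256 threads: each thread handles every 256th element via the stride loop,
    // matching how do_add_kernel and the other kernels above walk the flattened tensor.
    axpy_block_stride<<<1, 256>>>(n, 2.0f, x, 3.0f, y, z);
    cudaDeviceSynchronize();

    printf("z[0] = %f (expected 8.0)\n", z[0]);
    cudaFree(x); cudaFree(y); cudaFree(z);
    return 0;
}
```
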
Changes in the hipBLAS kernels source (file path not shown in this view):
@@ -1,11 +1,11 @@
-#include <ATen/kernels/blas_op.h>
+#include <ATen/kernels/blas.h>
 #include <base/third_party/blas.h>

 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>

 namespace container {
-namespace op {
+namespace kernels {

 static hipblasHandle_t hipblas_handle = nullptr;

@@ -241,5 +241,5 @@ template struct blas_gemm_batched_strided<double, DEVICE_GPU>;
 template struct blas_gemm_batched_strided<std::complex<float >, DEVICE_GPU>;
 template struct blas_gemm_batched_strided<std::complex<double>, DEVICE_GPU>;

-} // namespace op
+} // namespace kernels
 } // namespace container
Changes in the hipSOLVER (LAPACK) kernels source (file path not shown in this view):
@@ -1,13 +1,13 @@
 #include <vector>
-#include <ATen/kernels/lapack_op.h>
+#include <ATen/kernels/lapack.h>
 #include <base/third_party/lapack.h>

 #include <hip/hip_runtime.h>
 #include <thrust/complex.h>
 #include <hipsolver/hipsolver.h>

 namespace container {
-namespace op {
+namespace kernels {


 static hipsolverHandle_t hipsolver_handle = nullptr;

@@ -155,5 +155,5 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<float>, DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;

-} // namespace op
+} // namespace kernels
 } // namespace container
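
Both HIP files carry the same refactor: the per-kernel headers drop the `_op` suffix (`blas_op.h` → `blas.h`, `lapack_op.h` → `lapack.h`) and the inner namespace is renamed from `container::op` to `container::kernels`. The sketch below only illustrates what that rename means for calling code; `my_functor` is a hypothetical placeholder that mimics the functor-struct style visible in the diff (e.g. `blas_gemm_batched_strided<T, DEVICE_GPU>`), not an actual ABACUS type.

```cuda
#include <cstdio>

namespace container {
namespace kernels {   // previously: namespace op

// Hypothetical functor standing in for a real BLAS/LAPACK wrapper struct.
template <typename T>
struct my_functor {
    T operator()(const T alpha, const T x) const {
        return alpha * x;
    }
};

} // namespace kernels (previously: } // namespace op)
} // namespace container

int main() {
    // Call sites change only in the namespace they spell out:
    //   container::op::my_functor<double>()  ->  container::kernels::my_functor<double>()
    container::kernels::my_functor<double> f;
    printf("%f\n", f(2.0, 3.0));
    return 0;
}
```
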