Merge branch 'develop' into fix-dcu-single

denghuilu authored May 22, 2024
2 parents 8d8db65 + 0ecc169 commit 490169e
Showing 11 changed files with 223 additions and 122 deletions.
11 changes: 10 additions & 1 deletion docs/quick_start/easy_install.md
@@ -176,7 +176,16 @@ Use 4 MPI processes to run, for example:
mpirun -n 4 abacus
```

> The total thread count(i.e. OpenMP per-process thread count * MPI process count) should not exceed the number of cores in your machine.
The total thread count (i.e. OpenMP per-process thread count * MPI process count) should not exceed the number of cores in your machine.
To use 4 OpenMP threads per MPI process with 4 MPI processes, set the environment variable `OMP_NUM_THREADS` before running `mpirun`:

```bash
OMP_NUM_THREADS=4 mpirun -n 4 abacus
```

In this case, the total thread count is 16.

ABACUS will try to determine the number of threads used by each process if `OMP_NUM_THREADS` is not set. However, it is **required** to set `OMP_NUM_THREADS` before running `mpirun` to avoid potential performance issues.
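
As an illustration (not part of the patch itself), a minimal sketch of budgeting threads on a Linux node, assuming `nproc` reports the number of cores available to the job:

```bash
# Sketch: divide the available cores evenly among the MPI ranks
NRANKS=4
NCORES=$(nproc)                        # cores available on this node
export OMP_NUM_THREADS=$(( NCORES / NRANKS ))
mpirun -n "${NRANKS}" abacus
```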

Please refer to [hands-on guide](./hands_on.md) for more instructions.

25 changes: 9 additions & 16 deletions source/module_base/global_variable.cpp
@@ -108,26 +108,19 @@ std::vector<double> ocp_kb;
bool out_mul = false; // qifeng add 2019/9/10
//----------------------------------------------------------
// EXPLAIN : Parallel information
// GLOBAL VARIABLES :
// NAME : NPROC( global number of process )
// NAME : KPAR( global number of pools )
// NAME : MY_RANK( global index of process )
// NAME : MY_POOL( global index of pool (count in pool))
// NAME : NPROC_IN_POOL( local number of process in a pool.)
// NAME : RANK_IN_POOL( global index of pool (count in process),
// my_rank in each pool)
//----------------------------------------------------------
int NPROC = 1;
int KPAR = 1;

int NPROC = 1; ///< global number of process
int KPAR = 1; ///< global number of pools
int NSTOGROUP = 1;
int MY_RANK = 0;
int MY_POOL = 0;
int MY_RANK = 0; ///< global index of process
int MY_POOL = 0; ///< global index of pool (count in pool)
int MY_STOGROUP = 0;
int NPROC_IN_POOL = 1;
int NPROC_IN_POOL = 1; ///< local number of process in a pool
int NPROC_IN_STOGROUP = 1;
int RANK_IN_POOL = 0;
int RANK_IN_POOL = 0; ///< global index of pool (count in process), my_rank in each pool
int RANK_IN_STOGROUP = 0;
int DRANK = -1; // mohan add 2012-01-13, must be -1, so we can recognize who didn't in DIAG_WORLD
int DRANK = -1; ///< mohan add 2012-01-13, must be -1, so we can recognize who didn't in DIAG_WORLD
int DSIZE = KPAR;
int DCOLOR = -1;
int GRANK = MY_RANK;
@@ -237,7 +230,7 @@ double of_tole = 2e-6;
double of_tolp = 1e-5;
double of_tf_weight = 1.;
double of_vw_weight = 1.;
double of_wt_alpha = 5./6.;
double of_wt_alpha = 5./6.;
double of_wt_beta = 5./6.;
double of_wt_rho0 = 0.;
bool of_hold_rho0 = false;
46 changes: 31 additions & 15 deletions source/module_base/module_device/cuda/memory_op.cu
@@ -4,6 +4,9 @@
#include <cuda_runtime.h>
#include <thrust/complex.h>

#include <complex>
#include <type_traits>

#define THREADS_PER_BLOCK 256

namespace base_device
@@ -112,21 +115,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_CPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{

if (size == 0)
const size_t size) {

if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
FPTYPE_in* arr = nullptr;
cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE_in) * size));
FPTYPE_in * arr = nullptr;
cudaErrcheck(cudaMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyHostToDevice));
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
cast_memory<<<block, THREADS_PER_BLOCK>>>(arr_out, arr, size);
@@ -137,18 +145,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_CPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
auto* arr = (FPTYPE_in*)malloc(sizeof(FPTYPE_in) * size);
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++)
const size_t size) {
if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++) {
arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
}
free(arr);
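
The same-type fast path added in this file hinges on a compile-time `std::is_same` check: when the source and destination types match, a plain memory copy replaces the element-wise cast. A host-only sketch of that dispatch pattern (standalone, not the ABACUS operators themselves) might look like:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <type_traits>
#include <vector>

// Sketch: if Out and In are the same type, a memcpy (standing in for
// synchronize_memory_op) is enough; otherwise fall back to an element-wise
// cast (standing in for the cast_memory kernel).
template <typename Out, typename In>
void copy_or_cast(Out* dst, const In* src, std::size_t n)
{
    if (n == 0) { return; }
    if (std::is_same<Out, In>::value)
    {
        std::memcpy(dst, src, sizeof(Out) * n);
        return;
    }
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] = static_cast<Out>(src[i]);
    }
}

int main()
{
    std::vector<float> in = {1.5f, 2.5f, 3.5f};
    std::vector<double> out(in.size());
    copy_or_cast(out.data(), in.data(), in.size()); // types differ, so the cast branch runs
    std::printf("%f %f %f\n", out[0], out[1], out[2]);
    return 0;
}
```
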
56 changes: 39 additions & 17 deletions source/module_base/module_device/rocm/memory_op.hip.cu
@@ -4,6 +4,9 @@
#include <hip/hip_runtime.h>
#include <thrust/complex.h>

#include <complex>
#include <type_traits>

#define THREADS_PER_BLOCK 256

namespace base_device
@@ -91,14 +94,14 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
}

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
const size_t size) {

if (size == 0) {return;}
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr_in, size);
hipErrcheck(hipGetLastError());
@@ -107,16 +110,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_CPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
FPTYPE_in* arr = nullptr;
hipErrcheck(hipMalloc((void**)&arr, sizeof(FPTYPE_in) * size));
const size_t size) {

if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
FPTYPE_in * arr = nullptr;
hipErrcheck(hipMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyHostToDevice));
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr, size);
@@ -127,18 +140,27 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_CPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
auto* arr = (FPTYPE_in*)malloc(sizeof(FPTYPE_in) * size);
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++)
const size_t size) {
if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++) {
arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
}
free(arr);
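
For readers less familiar with HIP, the `hipLaunchKernelGGL` calls above are the portable counterpart of CUDA's triple-chevron launch, and the grid size is the usual ceiling division of the element count by the block size. A standalone sketch of that launch pattern (hypothetical `scale` kernel, error checking omitted for brevity):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

#define THREADS_PER_BLOCK 256

// Hypothetical kernel: multiply each element by 2 in place.
__global__ void scale(double* data, size_t n)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= 2.0; }
}

int main()
{
    const size_t n = 1000;
    double* d = nullptr;
    hipMalloc((void**)&d, n * sizeof(double));
    hipMemset(d, 0, n * sizeof(double));
    // Ceiling division so that every element is covered by some thread.
    const int block = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    hipLaunchKernelGGL(scale, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, d, n);
    hipDeviceSynchronize();
    hipFree(d);
    return 0;
}
```
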
65 changes: 39 additions & 26 deletions source/module_base/parallel_global.cpp
@@ -172,43 +172,56 @@ void Parallel_Global::read_mpi_parameters(int argc,char **argv)

// GlobalV::KPAR = atoi(argv[1]); // mohan abandon 2010-06-09

// get the size --> GlobalV::NPROC
// get the rank --> GlobalV::MY_RANK
MPI_Comm_size(MPI_COMM_WORLD,&GlobalV::NPROC);
// get world size --> GlobalV::NPROC
// get global rank --> GlobalV::MY_RANK
MPI_Comm_size(MPI_COMM_WORLD,&GlobalV::NPROC);
MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK);
int process_num = 0; // number of processes in the current node
int local_rank = 0; // rank of the process in the current node
MPI_Comm shmcomm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
MPI_Comm_size(shmcomm, &process_num);
MPI_Comm_rank(shmcomm, &local_rank);
MPI_Comm_free(&shmcomm);

// determining appropriate thread number for OpenMP
// Determining appropriate thread number for OpenMP:
// 1. If the number of threads is set by the user by `OMP_NUM_THREADS`, use it.
// 2. Otherwise, set to number of CPU cores / number of processes.
// 3. If the number of threads is larger than the hardware availability (should only happen if route 1 is taken),
// output a warning message.
// 4. If the number of threads is smaller than the hardware availability, output an info message.
// CAVEAT: The user should set the number of threads properly to avoid oversubscribing.
// This mechanism only handles the worst case for the default setting (not setting number of threads at all, causing oversubscribing and extremely slow performance), not guaranteed to be optimal.
const int max_thread_num = std::thread::hardware_concurrency(); // Consider Hyperthreading disabled.
#ifdef _OPENMP
int current_thread_num = omp_get_max_threads();
int current_thread_num = omp_get_max_threads(); // Get the number of threads set by the user.
if (current_thread_num == max_thread_num && process_num >= 1) // Avoid oversubscribing when the number of threads is not set.
{
current_thread_num = max_thread_num / process_num;
omp_set_num_threads(current_thread_num);
}
#else
int current_thread_num = 1;
#endif
MPI_Comm shmcomm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
int process_num = 0, local_rank = 0;
MPI_Comm_size(shmcomm, &process_num);
MPI_Comm_rank(shmcomm, &local_rank);
MPI_Comm_free(&shmcomm);
mpi_number = process_num;
omp_number = current_thread_num;
if (current_thread_num * process_num > max_thread_num && local_rank==0)
{
std::stringstream mess;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%% WARNING: Total thread number(" << current_thread_num * process_num << ") "
<< "is larger than hardware availability(" << max_thread_num << ")." << std::endl;
mess << "%% WARNING: The results may be INCORRECT. Please be sure what you are doing." << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
std::cerr << mess.str() << std::endl;
mess << "WARNING: Total thread number(" << current_thread_num * process_num << ") "
<< "is larger than hardware availability(" << max_thread_num << ")." << std::endl
<< "The results may be INCORRECT. Please set the environment variable OMP_NUM_THREADS to a proper value."
<< std::endl;
std::cerr << mess.str() << std::endl;
// the user may take their own risk by setting the OMP_NUM_THREADS env var.
if (std::getenv("OMP_NUM_THREADS") == nullptr)
{
exit(1);
}
}
else if (current_thread_num * process_num < max_thread_num && local_rank==0)
{
// only output info in local rank 0
std::cerr << "WARNING: Total thread number on this node mismatches with hardware availability. "
"This may cause poor performance."<< std::endl;
std::cerr << "Info: Local MPI proc number: " << process_num << ","
<< "OpenMP thread number: " << current_thread_num << ","
<< "Total thread number: " << current_thread_num * process_num << ","
@@ -336,25 +349,25 @@ void Parallel_Global::divide_pools(void)
{
std::cout<<"\n NPROC=" << GlobalV::NPROC << " KPAR=" << GlobalV::KPAR;
std::cout<<"Error : Too many pools !"<<std::endl;
exit(0);
exit(1);
}

// (1) per process in each stogroup
if(GlobalV::NPROC%GlobalV::NSTOGROUP!=0)
{
std::cout<<"\n Error! NPROC="<<GlobalV::NPROC
<<" must be divided evenly by BNDPAR="<<GlobalV::NSTOGROUP<<std::endl;
exit(0);
exit(1);
}
GlobalV::NPROC_IN_STOGROUP = GlobalV::NPROC/GlobalV::NSTOGROUP;
GlobalV::MY_STOGROUP = int(GlobalV::MY_RANK / GlobalV::NPROC_IN_STOGROUP);
GlobalV::RANK_IN_STOGROUP = GlobalV::MY_RANK%GlobalV::NPROC_IN_STOGROUP;
if (GlobalV::NPROC_IN_STOGROUP < GlobalV::KPAR)
{
std::cout<<"\n Error! NPROC_IN_BNDGROUP=" << GlobalV::NPROC_IN_STOGROUP
std::cout<<"\n Error! NPROC_IN_BNDGROUP=" << GlobalV::NPROC_IN_STOGROUP
<<" is smaller than"<< " KPAR=" << GlobalV::KPAR<<std::endl;
std::cout<<" Please reduce KPAR or reduce BNDPAR"<<std::endl;
exit(0);
exit(1);
}

// (2) per process in each pool
@@ -370,7 +383,7 @@ void Parallel_Global::divide_pools(void)
GlobalV::MY_POOL = int( (GlobalV::RANK_IN_STOGROUP-GlobalV::NPROC_IN_STOGROUP%GlobalV::KPAR) / GlobalV::NPROC_IN_POOL);
GlobalV::RANK_IN_POOL = (GlobalV::RANK_IN_STOGROUP-GlobalV::NPROC_IN_STOGROUP%GlobalV::KPAR)%GlobalV::NPROC_IN_POOL;
}




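To make the pool arithmetic above concrete, here is a standalone sketch with hypothetical values (NPROC = 8, NSTOGROUP = 2, KPAR = 2) covering only the evenly divisible case; the remainder handling of the real divide_pools() is omitted:

```cpp
#include <cstdio>

int main()
{
    const int NPROC = 8;     // hypothetical world size
    const int NSTOGROUP = 2; // hypothetical number of stochastic (BNDPAR) groups
    const int KPAR = 2;      // hypothetical number of k-point pools

    const int nproc_in_stogroup = NPROC / NSTOGROUP;
    const int nproc_in_pool = nproc_in_stogroup / KPAR;

    for (int my_rank = 0; my_rank < NPROC; ++my_rank)
    {
        const int my_stogroup = my_rank / nproc_in_stogroup;
        const int rank_in_stogroup = my_rank % nproc_in_stogroup;
        const int my_pool = rank_in_stogroup / nproc_in_pool;
        const int rank_in_pool = rank_in_stogroup % nproc_in_pool;
        std::printf("rank %d -> stogroup %d, pool %d, rank_in_pool %d\n",
                    my_rank, my_stogroup, my_pool, rank_in_pool);
    }
    return 0;
}
```
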
7 changes: 4 additions & 3 deletions source/module_cell/read_atoms.cpp
@@ -1027,6 +1027,7 @@ void UnitCell::print_tau(void) const
bool direct = (Coordinate == "Direct");
std::string table;
table += direct? "DIRECT COORDINATES\n": FmtCore::format("CARTESIAN COORDINATES ( UNIT = %20.12f Bohr ).\n", lat0);
const std::string redundant_header = direct? "taud_": "tauc_";
table += FmtCore::format("%8s%20s%20s%20s%8s%20s%20s%20s\n", "atom", "x", "y", "z", "mag", "vx", "vy", "vz");
for(int it = 0; it < ntype; it++)
{
Expand All @@ -1035,9 +1036,9 @@ void UnitCell::print_tau(void) const
const double& x = direct? atoms[it].taud[ia].x: atoms[it].tau[ia].x;
const double& y = direct? atoms[it].taud[ia].y: atoms[it].tau[ia].y;
const double& z = direct? atoms[it].taud[ia].z: atoms[it].tau[ia].z;
table += FmtCore::format("%3s%-5d%20.10f%20.10f%20.10f%8.4f%20.10f%20.10f%20.10f\n",
atoms[it].label, ia+1, x, y, z, atoms[it].mag[ia],
atoms[it].vel[ia].x, atoms[it].vel[ia].y, atoms[it].vel[ia].z);
table += FmtCore::format("%5s%-s%-5d%20.10f%20.10f%20.10f%8.4f%20.10f%20.10f%20.10f\n", // I dont know why there must be a redundant "tau[c|d]_" in the output. So ugly, it should be removed!
redundant_header, atoms[it].label, ia+1, x, y, z, atoms[it].mag[ia],
atoms[it].vel[ia].x, atoms[it].vel[ia].y, atoms[it].vel[ia].z);
}
}
table += "\n";