Merge branch 'develop' into fix-dcu-single

denghuilu authored May 22, 2024
2 parents 8d8db65 + 0ecc169 commit 490169e
Showing 11 changed files with 223 additions and 122 deletions.
11 changes: 10 additions & 1 deletion docs/quick_start/easy_install.md
@@ -176,7 +176,16 @@ Use 4 MPI processes to run, for example:
mpirun -n 4 abacus
```

> The total thread count(i.e. OpenMP per-process thread count * MPI process count) should not exceed the number of cores in your machine.
The total thread count (i.e. OpenMP per-process thread count * MPI process count) should not exceed the number of cores in your machine.
To use 4 OpenMP threads per MPI process with 4 MPI processes, set the environment variable `OMP_NUM_THREADS` before running `mpirun`:

```bash
OMP_NUM_THREADS=4 mpirun -n 4 abacus
```

In this case, the total thread count is 16.

ABACUS will try to determine the number of threads used by each process if `OMP_NUM_THREADS` is not set. However, it is **required** to set `OMP_NUM_THREADS` before running `mpirun` to avoid potential performance issues.
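
As an illustration (not part of the patch itself), a minimal sketch of budgeting threads on a Linux node, assuming `nproc` reports the number of cores available to the job:

```bash
# Sketch: divide the available cores evenly among the MPI ranks
NRANKS=4
NCORES=$(nproc)                        # cores available on this node
export OMP_NUM_THREADS=$(( NCORES / NRANKS ))
mpirun -n "${NRANKS}" abacus
```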

Please refer to [hands-on guide](./hands_on.md) for more instructions.

25 changes: 9 additions & 16 deletions source/module_base/global_variable.cpp
@@ -108,26 +108,19 @@ std::vector<double> ocp_kb;
bool out_mul = false; // qifeng add 2019/9/10
//----------------------------------------------------------
// EXPLAIN : Parallel information
// GLOBAL VARIABLES :
// NAME : NPROC( global number of process )
// NAME : KPAR( global number of pools )
// NAME : MY_RANK( global index of process )
// NAME : MY_POOL( global index of pool (count in pool))
// NAME : NPROC_IN_POOL( local number of process in a pool.)
// NAME : RANK_IN_POOL( global index of pool (count in process),
// my_rank in each pool)
//----------------------------------------------------------
int NPROC = 1;
int KPAR = 1;

int NPROC = 1; ///< global number of process
int KPAR = 1; ///< global number of pools
int NSTOGROUP = 1;
int MY_RANK = 0;
int MY_POOL = 0;
int MY_RANK = 0; ///< global index of process
int MY_POOL = 0; ///< global index of pool (count in pool)
int MY_STOGROUP = 0;
int NPROC_IN_POOL = 1;
int NPROC_IN_POOL = 1; ///< local number of process in a pool
int NPROC_IN_STOGROUP = 1;
int RANK_IN_POOL = 0;
int RANK_IN_POOL = 0; ///< global index of pool (count in process), my_rank in each pool
int RANK_IN_STOGROUP = 0;
int DRANK = -1; // mohan add 2012-01-13, must be -1, so we can recognize who didn't in DIAG_WORLD
int DRANK = -1; ///< mohan add 2012-01-13, must be -1, so we can recognize who didn't in DIAG_WORLD
int DSIZE = KPAR;
int DCOLOR = -1;
int GRANK = MY_RANK;
@@ -237,7 +230,7 @@ double of_tole = 2e-6;
double of_tolp = 1e-5;
double of_tf_weight = 1.;
double of_vw_weight = 1.;
double of_wt_alpha = 5./6.;
double of_wt_alpha = 5./6.;
double of_wt_beta = 5./6.;
double of_wt_rho0 = 0.;
bool of_hold_rho0 = false;
46 changes: 31 additions & 15 deletions source/module_base/module_device/cuda/memory_op.cu
@@ -4,6 +4,9 @@
#include <cuda_runtime.h>
#include <thrust/complex.h>

#include <complex>
#include <type_traits>

#define THREADS_PER_BLOCK 256

namespace base_device
@@ -112,21 +115,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_CPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{

if (size == 0)
const size_t size) {

if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
FPTYPE_in* arr = nullptr;
cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE_in) * size));
FPTYPE_in * arr = nullptr;
cudaErrcheck(cudaMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyHostToDevice));
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
cast_memory<<<block, THREADS_PER_BLOCK>>>(arr_out, arr, size);
@@ -137,18 +145,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_CPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
auto* arr = (FPTYPE_in*)malloc(sizeof(FPTYPE_in) * size);
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++)
const size_t size) {
if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
cudaErrcheck(cudaMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, cudaMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++) {
arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
}
free(arr);
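
The same-type fast path added in this file hinges on a compile-time `std::is_same` check: when the source and destination types match, a plain memory copy replaces the element-wise cast. A host-only sketch of that dispatch pattern (standalone, not the ABACUS operators themselves) might look like:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <type_traits>
#include <vector>

// Sketch: if Out and In are the same type, a memcpy (standing in for
// synchronize_memory_op) is enough; otherwise fall back to an element-wise
// cast (standing in for the cast_memory kernel).
template <typename Out, typename In>
void copy_or_cast(Out* dst, const In* src, std::size_t n)
{
    if (n == 0) { return; }
    if (std::is_same<Out, In>::value)
    {
        std::memcpy(dst, src, sizeof(Out) * n);
        return;
    }
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] = static_cast<Out>(src[i]);
    }
}

int main()
{
    std::vector<float> in = {1.5f, 2.5f, 3.5f};
    std::vector<double> out(in.size());
    copy_or_cast(out.data(), in.data(), in.size()); // types differ, so the cast branch runs
    std::printf("%f %f %f\n", out[0], out[1], out[2]);
    return 0;
}
```
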
56 changes: 39 additions & 17 deletions source/module_base/module_device/rocm/memory_op.hip.cu
@@ -4,6 +4,9 @@
#include <hip/hip_runtime.h>
#include <thrust/complex.h>

#include <complex>
#include <type_traits>

#define THREADS_PER_BLOCK 256

namespace base_device
@@ -91,14 +94,14 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
}

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
const size_t size) {

if (size == 0) {return;}
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr_in, size);
hipErrcheck(hipGetLastError());
@@ -107,16 +110,26 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
void operator()(const base_device::DEVICE_GPU* dev_out,
const base_device::DEVICE_CPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
FPTYPE_in* arr = nullptr;
hipErrcheck(hipMalloc((void**)&arr, sizeof(FPTYPE_in) * size));
const size_t size) {

if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
FPTYPE_in * arr = nullptr;
hipErrcheck(hipMalloc((void **)&arr, sizeof(FPTYPE_in) * size));
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyHostToDevice));
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
hipLaunchKernelGGL(cast_memory, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, arr_out, arr, size);
@@ -127,18 +140,27 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
};

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
void operator()(const base_device::DEVICE_CPU* dev_out,
const base_device::DEVICE_GPU* dev_in,
FPTYPE_out* arr_out,
const FPTYPE_in* arr_in,
const size_t size)
{
auto* arr = (FPTYPE_in*)malloc(sizeof(FPTYPE_in) * size);
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++)
const size_t size) {
if (size == 0) {return;}
// No need to cast the memory if the data types are the same.
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
{
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
dev_in,
arr_out,
reinterpret_cast<const FPTYPE_out*>(arr_in),
size);
return;
}
auto * arr = (FPTYPE_in*) malloc(sizeof(FPTYPE_in) * size);
hipErrcheck(hipMemcpy(arr, arr_in, sizeof(FPTYPE_in) * size, hipMemcpyDeviceToHost));
for (int ii = 0; ii < size; ii++) {
arr_out[ii] = static_cast<FPTYPE_out>(arr[ii]);
}
free(arr);
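
For readers less familiar with HIP, the `hipLaunchKernelGGL` calls above are the portable counterpart of CUDA's triple-chevron launch, and the grid size is the usual ceiling division of the element count by the block size. A standalone sketch of that launch pattern (hypothetical `scale` kernel, error checking omitted for brevity):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

#define THREADS_PER_BLOCK 256

// Hypothetical kernel: multiply each element by 2 in place.
__global__ void scale(double* data, size_t n)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= 2.0; }
}

int main()
{
    const size_t n = 1000;
    double* d = nullptr;
    hipMalloc((void**)&d, n * sizeof(double));
    hipMemset(d, 0, n * sizeof(double));
    // Ceiling division so that every element is covered by some thread.
    const int block = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    hipLaunchKernelGGL(scale, dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, d, n);
    hipDeviceSynchronize();
    hipFree(d);
    return 0;
}
```
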
65 changes: 39 additions & 26 deletions source/module_base/parallel_global.cpp
@@ -172,43 +172,56 @@ void Parallel_Global::read_mpi_parameters(int argc,char **argv)

// GlobalV::KPAR = atoi(argv[1]); // mohan abandon 2010-06-09

// get the size --> GlobalV::NPROC
// get the rank --> GlobalV::MY_RANK
MPI_Comm_size(MPI_COMM_WORLD,&GlobalV::NPROC);
// get world size --> GlobalV::NPROC
// get global rank --> GlobalV::MY_RANK
MPI_Comm_size(MPI_COMM_WORLD,&GlobalV::NPROC);
MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK);
int process_num = 0; // number of processes in the current node
int local_rank = 0; // rank of the process in the current node
MPI_Comm shmcomm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
MPI_Comm_size(shmcomm, &process_num);
MPI_Comm_rank(shmcomm, &local_rank);
MPI_Comm_free(&shmcomm);

// determining appropriate thread number for OpenMP
// Determining appropriate thread number for OpenMP:
// 1. If the number of threads is set by the user by `OMP_NUM_THREADS`, use it.
// 2. Otherwise, set to number of CPU cores / number of processes.
// 3. If the number of threads is larger than the hardware availability (should only happen if route 1 is taken),
// output a warning message.
// 4. If the number of threads is smaller than the hardware availability, output an info message.
// CAVEAT: The user should set the number of threads properly to avoid oversubscribing.
// This mechanism only handles the worst case for the default setting (not setting number of threads at all, causing oversubscribing and extremely slow performance), not guaranteed to be optimal.
const int max_thread_num = std::thread::hardware_concurrency(); // Consider Hyperthreading disabled.
#ifdef _OPENMP
int current_thread_num = omp_get_max_threads();
int current_thread_num = omp_get_max_threads(); // Get the number of threads set by the user.
if (current_thread_num == max_thread_num && process_num >= 1) // Avoid oversubscribing when the number of threads is not set.
{
current_thread_num = max_thread_num / process_num;
omp_set_num_threads(current_thread_num);
}
#else
int current_thread_num = 1;
#endif
MPI_Comm shmcomm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
int process_num = 0, local_rank = 0;
MPI_Comm_size(shmcomm, &process_num);
MPI_Comm_rank(shmcomm, &local_rank);
MPI_Comm_free(&shmcomm);
mpi_number = process_num;
omp_number = current_thread_num;
if (current_thread_num * process_num > max_thread_num && local_rank==0)
{
std::stringstream mess;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%% WARNING: Total thread number(" << current_thread_num * process_num << ") "
<< "is larger than hardware availability(" << max_thread_num << ")." << std::endl;
mess << "%% WARNING: The results may be INCORRECT. Please be sure what you are doing." << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
mess << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" << std::endl;
std::cerr << mess.str() << std::endl;
mess << "WARNING: Total thread number(" << current_thread_num * process_num << ") "
<< "is larger than hardware availability(" << max_thread_num << ")." << std::endl
<< "The results may be INCORRECT. Please set the environment variable OMP_NUM_THREADS to a proper value."
<< std::endl;
std::cerr << mess.str() << std::endl;
// the user may take their own risk by setting the OMP_NUM_THREADS env var.
if (std::getenv("OMP_NUM_THREADS") == nullptr)
{
exit(1);
}
}
else if (current_thread_num * process_num < max_thread_num && local_rank==0)
{
// only output info in local rank 0
std::cerr << "WARNING: Total thread number on this node mismatches with hardware availability. "
"This may cause poor performance."<< std::endl;
std::cerr << "Info: Local MPI proc number: " << process_num << ","
<< "OpenMP thread number: " << current_thread_num << ","
<< "Total thread number: " << current_thread_num * process_num << ","
@@ -336,25 +349,25 @@ void Parallel_Global::divide_pools(void)
{
std::cout<<"\n NPROC=" << GlobalV::NPROC << " KPAR=" << GlobalV::KPAR;
std::cout<<"Error : Too many pools !"<<std::endl;
exit(0);
exit(1);
}

// (1) per process in each stogroup
if(GlobalV::NPROC%GlobalV::NSTOGROUP!=0)
{
std::cout<<"\n Error! NPROC="<<GlobalV::NPROC
<<" must be divided evenly by BNDPAR="<<GlobalV::NSTOGROUP<<std::endl;
exit(0);
exit(1);
}
GlobalV::NPROC_IN_STOGROUP = GlobalV::NPROC/GlobalV::NSTOGROUP;
GlobalV::MY_STOGROUP = int(GlobalV::MY_RANK / GlobalV::NPROC_IN_STOGROUP);
GlobalV::RANK_IN_STOGROUP = GlobalV::MY_RANK%GlobalV::NPROC_IN_STOGROUP;
if (GlobalV::NPROC_IN_STOGROUP < GlobalV::KPAR)
{
std::cout<<"\n Error! NPROC_IN_BNDGROUP=" << GlobalV::NPROC_IN_STOGROUP
std::cout<<"\n Error! NPROC_IN_BNDGROUP=" << GlobalV::NPROC_IN_STOGROUP
<<" is smaller than"<< " KPAR=" << GlobalV::KPAR<<std::endl;
std::cout<<" Please reduce KPAR or reduce BNDPAR"<<std::endl;
exit(0);
exit(1);
}

// (2) per process in each pool
@@ -370,7 +383,7 @@ void Parallel_Global::divide_pools(void)
GlobalV::MY_POOL = int( (GlobalV::RANK_IN_STOGROUP-GlobalV::NPROC_IN_STOGROUP%GlobalV::KPAR) / GlobalV::NPROC_IN_POOL);
GlobalV::RANK_IN_POOL = (GlobalV::RANK_IN_STOGROUP-GlobalV::NPROC_IN_STOGROUP%GlobalV::KPAR)%GlobalV::NPROC_IN_POOL;
}




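To make the pool arithmetic above concrete, here is a standalone sketch with hypothetical values (NPROC = 8, NSTOGROUP = 2, KPAR = 2) covering only the evenly divisible case; the remainder handling of the real divide_pools() is omitted:

```cpp
#include <cstdio>

int main()
{
    const int NPROC = 8;     // hypothetical world size
    const int NSTOGROUP = 2; // hypothetical number of stochastic (BNDPAR) groups
    const int KPAR = 2;      // hypothetical number of k-point pools

    const int nproc_in_stogroup = NPROC / NSTOGROUP;
    const int nproc_in_pool = nproc_in_stogroup / KPAR;

    for (int my_rank = 0; my_rank < NPROC; ++my_rank)
    {
        const int my_stogroup = my_rank / nproc_in_stogroup;
        const int rank_in_stogroup = my_rank % nproc_in_stogroup;
        const int my_pool = rank_in_stogroup / nproc_in_pool;
        const int rank_in_pool = rank_in_stogroup % nproc_in_pool;
        std::printf("rank %d -> stogroup %d, pool %d, rank_in_pool %d\n",
                    my_rank, my_stogroup, my_pool, rank_in_pool);
    }
    return 0;
}
```
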
7 changes: 4 additions & 3 deletions source/module_cell/read_atoms.cpp
@@ -1027,6 +1027,7 @@ void UnitCell::print_tau(void) const
bool direct = (Coordinate == "Direct");
std::string table;
table += direct? "DIRECT COORDINATES\n": FmtCore::format("CARTESIAN COORDINATES ( UNIT = %20.12f Bohr ).\n", lat0);
const std::string redundant_header = direct? "taud_": "tauc_";
table += FmtCore::format("%8s%20s%20s%20s%8s%20s%20s%20s\n", "atom", "x", "y", "z", "mag", "vx", "vy", "vz");
for(int it = 0; it < ntype; it++)
{
Expand All @@ -1035,9 +1036,9 @@ void UnitCell::print_tau(void) const
const double& x = direct? atoms[it].taud[ia].x: atoms[it].tau[ia].x;
const double& y = direct? atoms[it].taud[ia].y: atoms[it].tau[ia].y;
const double& z = direct? atoms[it].taud[ia].z: atoms[it].tau[ia].z;
table += FmtCore::format("%3s%-5d%20.10f%20.10f%20.10f%8.4f%20.10f%20.10f%20.10f\n",
atoms[it].label, ia+1, x, y, z, atoms[it].mag[ia],
atoms[it].vel[ia].x, atoms[it].vel[ia].y, atoms[it].vel[ia].z);
table += FmtCore::format("%5s%-s%-5d%20.10f%20.10f%20.10f%8.4f%20.10f%20.10f%20.10f\n", // I dont know why there must be a redundant "tau[c|d]_" in the output. So ugly, it should be removed!
redundant_header, atoms[it].label, ia+1, x, y, z, atoms[it].mag[ia],
atoms[it].vel[ia].x, atoms[it].vel[ia].y, atoms[it].vel[ia].z);
}
}
table += "\n";