Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: GPU Memory Recorder #5471

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 201 additions & 17 deletions source/module_base/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ int Memory::n_memory = 1000;
int Memory::n_now = 0;
bool Memory::init_flag = false;

#if defined(__CUDA) || defined(__ROCM)

double Memory::total_gpu = 0.0;
int Memory::n_now_gpu = 0;
bool Memory::init_flag_gpu = false;

std::string *Memory::name_gpu;
std::string *Memory::class_name_gpu;
double *Memory::consume_gpu;

#endif


std::string *Memory::name;
std::string *Memory::class_name;
double *Memory::consume;
Expand Down Expand Up @@ -208,6 +221,126 @@ void Memory::record
return;
}

#if defined(__CUDA) || defined(__ROCM)

/**
 * @brief Record (overwrite) the GPU memory consumed by one named quantity.
 *
 * Lazily allocates the GPU bookkeeping tables on first call. Looks the
 * quantity up by name; a new entry is appended if absent. The consumption
 * is recomputed from (n_in, type) via calculate_mem and stored (not
 * accumulated).
 *
 * @param class_name_in Name of the owning class.
 * @param name_in       Name of the quantity (lookup key).
 * @param n_in          Number of elements of the quantity.
 * @param type          Data-type tag understood by calculate_mem.
 * @param accumulate    Unused in this overload; kept for interface parity.
 * @return The recorded consumption in MB, or 0.0 if the table is full.
 */
double Memory::record_gpu
(
	const std::string &class_name_in,
	const std::string &name_in,
	const long &n_in,
	const std::string &type,
	const bool accumulate
)
{
	// First use: allocate and zero the GPU-side record tables.
	if(!Memory::init_flag_gpu)
	{
		name_gpu = new std::string[n_memory];
		class_name_gpu = new std::string[n_memory];
		consume_gpu = new double[n_memory];
		for(int i=0;i<n_memory;i++)
		{
			consume_gpu[i] = 0.0;
		}
		Memory::init_flag_gpu = true;
	}

	// Linear search for an existing record with this name.
	int find = 0;
	for(find = 0; find < n_now_gpu; find++)
	{
		if( name_in == name_gpu[find] )
		{
			break;
		}
	}

	// find == n_now_gpu : this is a new record.
	if(find == n_now_gpu)
	{
		// Capacity must be checked BEFORE appending: writing
		// name_gpu[find] first would index one past the end of the
		// n_memory-sized arrays when the table is already full.
		if(n_now_gpu >= n_memory)
		{
			std::cout<<" Error! Too many gpu memories required.";
			return 0.0;
		}
		n_now_gpu++;
		name_gpu[find] = name_in;
		class_name_gpu[find] = class_name_in;
	}

	consume_gpu[find] = Memory::calculate_mem(n_in,type);

	// Warn about any single allocation larger than 5 MB.
	// NOTE(review): print(find) appears to report the CPU-side record at
	// this index, not the GPU one — confirm against Memory::print.
	if(consume_gpu[find] > 5)
	{
		print(find);
	}
	return consume_gpu[find];
}

/**
 * @brief Record GPU memory consumed by one named quantity, given raw bytes.
 *
 * Lazily allocates the GPU bookkeeping tables on first call. Looks the
 * quantity up by name; a new entry is appended if absent. With
 * accumulate == true the size is added to the entry and to the global
 * total; otherwise the entry is only raised to the new high-water mark
 * (never lowered), and the difference is added to the total.
 *
 * @param name_in    Name of the quantity (lookup key).
 * @param n_in       Size in bytes.
 * @param accumulate Add to the existing record instead of tracking the peak.
 */
void Memory::record_gpu
(
	const std::string &name_in,
	const size_t &n_in,
	const bool accumulate
)
{
	// First use: allocate and zero the GPU-side record tables.
	if(!Memory::init_flag_gpu)
	{
		name_gpu = new std::string[n_memory];
		class_name_gpu = new std::string[n_memory];
		consume_gpu = new double[n_memory];
		for(int i=0;i<n_memory;i++)
		{
			consume_gpu[i] = 0.0;
		}
		Memory::init_flag_gpu = true;
	}

	// Linear search for an existing record with this name.
	int find = 0;
	for(find = 0; find < n_now_gpu; find++)
	{
		if( name_in == name_gpu[find] )
		{
			break;
		}
	}

	// find == n_now_gpu : this is a new record.
	if(find == n_now_gpu)
	{
		// Capacity must be checked BEFORE appending: writing
		// name_gpu[find] first would index one past the end of the
		// n_memory-sized arrays when the table is already full.
		if(n_now_gpu >= n_memory)
		{
			std::cout<<" Error! Too many gpu memories has been recorded.";
			return;
		}
		n_now_gpu++;
		name_gpu[find] = name_in;
		class_name_gpu[find] = "";
	}

	// Convert bytes to MB.
	const double factor = 1.0/1024.0/1024.0;
	const double size_mb = n_in * factor;

	if(accumulate)
	{
		consume_gpu[find] += size_mb;
		Memory::total_gpu += size_mb;
	}
	else
	{
		// Peak tracking: only grow the per-entry record, and add the
		// delta to the running total so it stays consistent.
		if(consume_gpu[find] < size_mb)
		{
			Memory::total_gpu += size_mb - consume_gpu[find];
			consume_gpu[find] = size_mb;
			// Warn about any single allocation larger than 5 MB.
			// NOTE(review): print(find) appears to report the CPU-side
			// record at this index — confirm against Memory::print.
			if(consume_gpu[find] > 5)
			{
				print(find);
			}
		}
	}
	return;
}

#endif

void Memory::print(const int find)
{
GlobalV::ofs_running <<"\n Warning_Memory_Consuming allocated: "
Expand All @@ -226,19 +359,34 @@ void Memory::finish(std::ofstream &ofs)
delete[] consume;
init_flag = false;
}
#if defined(__CUDA) || defined(__ROCM)
if(init_flag_gpu)
{
delete[] name_gpu;
delete[] class_name_gpu;
delete[] consume_gpu;
}
#endif
return;
}

void Memory::print_all(std::ofstream &ofs)
{
if(!init_flag)
if(!init_flag
#if defined(__CUDA) || defined(__ROCM)
&& !init_flag_gpu
#endif
)
{
return;
}

const double small = 1.0; // unit is MB
#ifdef __MPI
Parallel_Reduce::reduce_all(Memory::total);
#if defined(__CUDA) || defined(__ROCM)
Parallel_Reduce::reduce_all(Memory::total_gpu);
#endif
#endif
ofs <<"\n NAME-------------------------|MEMORY(MB)--------" << std::endl;
ofs <<std::setw(30)<< "total" << std::setw(15) <<std::setprecision(4)<< Memory::total << std::endl;
Expand All @@ -254,23 +402,7 @@ void Memory::print_all(std::ofstream &ofs)

for (int i=0; i<n_memory; i++)
{
// int k = 0;
// double tmp = -1.0;
// for(int j=0; j<n_memory; j++)
// {
// if(print_flag[j])
// {
// continue;
// }
// else if(tmp < consume[j])
// {
// k = j;
// tmp = consume[j];
// }
// }
// print_flag[k] = true;
#ifdef __MPI
// Parallel_Reduce::reduce_all(consume[k]);
Parallel_Reduce::reduce_all(consume[i]);
#endif
}
Expand Down Expand Up @@ -304,6 +436,58 @@ void Memory::print_all(std::ofstream &ofs)

}

#if defined(__CUDA) || defined(__ROCM)
ofs <<"\n NAME-------------------------|GPU MEMORY(MB)----" << std::endl;
ofs <<std::setw(30)<< "total" << std::setw(15) <<std::setprecision(4)<< Memory::total_gpu << std::endl;

assert(n_memory>0);

bool *print_flag_gpu = new bool[n_memory];

for(int i=0; i<n_memory; i++)
{
print_flag_gpu[i] = false;
}

for (int i=0; i<n_memory; i++)
{
#ifdef __MPI
Parallel_Reduce::reduce_all(consume_gpu[i]);
#endif
}

for (int i=0; i<n_memory; i++) // Xiaoyang fix memory record sum bug 2023/10/25
{
int k = 0;
double tmp = -1.0;
for(int j=0; j<n_memory; j++)
{
if(print_flag_gpu[j])
{
continue;
}
else if(tmp < consume_gpu[j])
{
k = j;
tmp = consume_gpu[j];
}
}
print_flag_gpu[k] = true;
if ( consume_gpu[k] < small )
{
continue;
}
else
{
ofs << std::setw(30) << name_gpu[k]
<< std::setw(15) << consume_gpu[k] << std::endl;
}

}

delete[] print_flag_gpu;
#endif

ofs<<" ------------- < 1.0 MB has been ignored ----------------"<<std::endl;
ofs<<" ----------------------------------------------------------"<<std::endl;

Expand Down
42 changes: 42 additions & 0 deletions source/module_base/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,39 @@ class Memory
const bool accumulate = false
);

#if defined(__CUDA) || defined(__ROCM)

/**
* @brief Record memory consumed on gpu during computation
*
* @param class_name The name of a class
* @param name The name of a quantity
* @param n The number of the quantity
* @param type The type of data
* @param accumulate Useless, always set false
* @return double
*/
static double record_gpu(const std::string &class_name,
const std::string &name,
const long &n,
const std::string &type,
const bool accumulate = false);

/**
* @brief Record memory consumed on gpu during computation
*
* @param name The name of a quantity
* @param n The number of the quantity
* @param accumulate Useless, always set false
*/
static void record_gpu(
const std::string &name_in,
const size_t &n_in,
const bool accumulate = false
);

#endif

static double &get_total(void)
{
return total;
Expand Down Expand Up @@ -84,6 +117,15 @@ class Memory
static int n_now;
static bool init_flag;

#if defined(__CUDA) || defined(__ROCM)
static double total_gpu;
static std::string *name_gpu;
static std::string *class_name_gpu;
static double *consume_gpu;
static int n_now_gpu;
static bool init_flag_gpu;
#endif

static int complex_matrix_memory; //(16 Byte)
static int double_memory; //(8 Byte)
static int int_memory; //(4 Byte)
Expand Down
15 changes: 15 additions & 0 deletions source/module_base/module_device/cuda/memory_op.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "module_base/module_device/memory_op.h"
#include "module_base/memory.h"

#include <base/macros/macros.h>
#include <cuda_runtime.h>
Expand Down Expand Up @@ -61,6 +62,20 @@ void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
}
cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE) * size));
std::string record_string;
if (record_in != nullptr)
{
record_string = record_in;
}
else
{
record_string = "no_record";
}

if (record_string != "no_record")
{
ModuleBase::Memory::record_gpu(record_string, sizeof(FPTYPE) * size);
}
}

template <typename FPTYPE>
Expand Down
Loading