Block global kron cpu (#687)
* block-global kronmult for the cpu
mkstoyanov authored Jun 25, 2024
1 parent ad62a76 commit 5c37468
Showing 16 changed files with 1,301 additions and 48 deletions.
14 changes: 13 additions & 1 deletion CMakeLists.txt
@@ -113,13 +113,24 @@ option (ASGARD_BUILD_DOCS "Build the documentation." OFF)

if (ASGARD_USE_MPI)
option (KRON_MODE_GLOBAL "Global or local Kronecker products" OFF)
option (KRON_MODE_GLOBAL_BLOCK "Use block variant for global kron" OFF)
else()
option (KRON_MODE_GLOBAL "Global or local Kronecker products" ON)
if (ASGARD_USE_CUDA)
option (KRON_MODE_GLOBAL "Global or local Kronecker products" ON)
option (KRON_MODE_GLOBAL_BLOCK "Use block variant for global kron" OFF)
else()
option (KRON_MODE_GLOBAL "Global or local Kronecker products" ON)
option (KRON_MODE_GLOBAL_BLOCK "Use block variant for global kron" ON)
endif()
endif()
if (KRON_MODE_GLOBAL_BLOCK AND NOT KRON_MODE_GLOBAL)
message (FATAL_ERROR "KRON_MODE_GLOBAL_BLOCK=ON requires KRON_MODE_GLOBAL=ON")
endif()
if (ASGARD_USE_MPI AND KRON_MODE_GLOBAL)
message (FATAL_ERROR "Global Kronecker is not yet implemented for MPI")
endif()


if (ASGARD_RECOMMENDED_DEFAULTS)
# add compiler flags we always want to use
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wshadow")
@@ -332,6 +343,7 @@ target_sources (libasgard
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_spkronmult.cpp>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_spkronmult_cpu.cpp>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_glkronmult_cpu.cpp>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_glkronmult_bcpu.cpp>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_glkronmult_gpu.cpp>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/device/asgard_preconditioner_gpu.cpp>
)
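
Note for readers: the two KRON_MODE options above surface in the C++ sources as preprocessor macros of the same names (see the #ifdef KRON_MODE_GLOBAL_BLOCK guards in the kronmult sources below). A minimal sketch of the resulting compile-time dispatch, assuming only the macro names visible in this diff:

#if defined(KRON_MODE_GLOBAL_BLOCK) && !defined(KRON_MODE_GLOBAL)
#error "KRON_MODE_GLOBAL_BLOCK requires KRON_MODE_GLOBAL"
#endif

#ifdef KRON_MODE_GLOBAL
#ifdef KRON_MODE_GLOBAL_BLOCK
// block-global cpu variant (asgard_glkronmult_bcpu.cpp)
#else
// flat global variant (asgard_glkronmult_cpu.cpp / asgard_glkronmult_gpu.cpp)
#endif
#else
// local kronmult variant
#endif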
220 changes: 201 additions & 19 deletions src/asgard_kronmult_matrix.cpp
@@ -7,10 +7,6 @@
#include <cuda_runtime.h>
#endif

#ifdef ASGARD_USE_OPENMP
#include <omp.h>
#endif

#include <cstdlib>
#include <limits.h>
#include <mutex>
@@ -41,6 +37,15 @@ std::vector<int> get_used_terms(PDE<precision> const &pde, options const &opts,
}
}

template<typename precision>
vector2d<int> get_cells(int num_dimensions, adapt::distributed_grid<precision> const &dis_grid)
{
auto const &grid = dis_grid.get_subgrid(get_rank());
int const *const asg_idx = dis_grid.get_table().get_active_table().data();
int const num_cells = grid.col_stop - grid.col_start + 1;
return asg2tsg_convert(num_dimensions, num_cells, asg_idx);
}

#ifndef KRON_MODE_GLOBAL

void check_available_memory(int64_t baseline_memory, int64_t available_MB)
@@ -361,8 +366,7 @@ make_kronmult_sparse(PDE<precision> const &pde,
int const num_1d = spcache.cells1d.num_connections();

#ifdef ASGARD_USE_CUDA
int const tensor_size = kronmult_matrix<precision>::compute_tensor_size(
num_dimensions, kron_size);
int const tensor_size = fm::ipow<int>(kron_size, num_dimensions);
#endif

// storing the 1D operator matrices by 1D row and column
@@ -839,8 +843,7 @@ compute_mem_usage(PDE<P> const &pde,
// first we compute the size of the state vectors (x and y) and then we
// add the size of the coefficients (based on sparse/dense mode)
int64_t base_line_entries =
(num_rows + num_cols) *
kronmult_matrix<P>::compute_tensor_size(num_dimensions, kron_size);
(num_rows + num_cols) * fm::ipow<int64_t>(kron_size, num_dimensions);

if (program_options.kmode == kronmult_mode::dense and not force_sparse)
{
@@ -1191,7 +1194,7 @@ bool check_identity_term(PDE<precision> const &pde, int term_id, int dim)
}

template<typename precision>
bool get_flux_direction(PDE<precision> const &pde, int term_id)
int get_flux_direction(PDE<precision> const &pde, int term_id)
{
for (int d = 0; d < pde.num_dims(); d++)
for (auto const &pt : pde.get_terms()[term_id][d].get_partial_terms())
@@ -1319,21 +1322,18 @@ make_global_kron_matrix(PDE<precision> const &pde,
std::vector<int> active_dirs(num_dimensions);
for (int t = 0; t < num_terms; t++)
{
int const flux_dir = get_flux_direction(pde, t);

active_dirs.clear();
for (int d = 0; d < num_dimensions; d++)
if (not check_identity_term(pde, t, d))
{
active_dirs.push_back(d);
if (d == flux_dir and active_dirs.size() > 1)
std::swap(active_dirs.front(), active_dirs.back());
}

int const num_active = static_cast<int>(active_dirs.size());
if (num_active > 1)
{
int const flux_dir = get_flux_direction(pde, t);
if (flux_dir > -1 and flux_dir != active_dirs[0]) // make the flux direction first
std::swap(active_dirs[0], active_dirs[flux_dir]);
}

permutations.push_back(kronmult::permutes(num_active));
permutations.back().remap_directions(active_dirs);
permutations.push_back(kronmult::permutes(active_dirs));
}

return global_kron_matrix<precision>(
@@ -1600,6 +1600,153 @@ void global_kron_matrix<precision>::apply(
}
#endif

#ifdef KRON_MODE_GLOBAL_BLOCK

template<typename precision>
template<resource rec>
void block_global_kron_matrix<precision>::apply(
matrix_entry etype, precision alpha, precision const *x,
precision beta, precision *y) const
{
int const imex = flag2int(etype);

std::vector<int> const &used_terms = term_groups_[imex];

if (beta == 0)
kronmult::set_buffer_to_zero<rec>(num_active_, y);
else
lib_dispatch::scal<resource::host>(num_active_, beta, y, 1);

if (used_terms.size() == 0)
return;

std::copy_n(x, num_active_, workspace_->x.begin());
std::fill_n(workspace_->y.begin(), num_padded_, precision{0});

kronmult::global_cpu(num_dimensions_, blockn_, block_size_, ilist_, dsort_,
perms_, flux_dir_, conn_volumes_, conn_full_,
gvals_, used_terms, workspace_->x.data(),
workspace_->y.data(), *workspace_);

precision const *py = workspace_->y.data();
#pragma omp parallel for
for (int64_t i = 0; i < num_active_; i++)
y[i] += alpha * py[i];
}
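
The apply() above follows the BLAS convention y = alpha * A * x + beta * y; beta == 0 is special-cased so that a possibly uninitialized y is never read. A minimal sketch of the scale-then-accumulate pattern, with the kronmult kernel elided and all names illustrative rather than the library API:

#include <algorithm>
#include <cstdint>
#include <vector>

void apply_pattern(std::int64_t num_active, double alpha, double const *x,
                   double beta, double *y,
                   std::vector<double> &wx, std::vector<double> &wy)
{
  if (beta == 0) // never read y here, it may be uninitialized
    std::fill_n(y, num_active, 0.0);
  else
    for (std::int64_t i = 0; i < num_active; i++)
      y[i] *= beta;

  std::copy_n(x, num_active, wx.begin()); // active entries; the tail is padding
  std::fill(wy.begin(), wy.end(), 0.0);   // zero the padded output

  // ... the global kronmult kernel accumulates A * wx into wy here ...

  for (std::int64_t i = 0; i < num_active; i++) // write back the active part
    y[i] += alpha * wy[i];
}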

template<typename precision>
block_global_kron_matrix<precision>
make_block_global_kron_matrix(PDE<precision> const &pde,
adapt::distributed_grid<precision> const &dis_grid,
options const &program_options,
kronmult::block_global_workspace<precision> *workspace)
{
int const porder = pde.get_dimensions()[0].get_degree() - 1;
int const pterms = porder + 1; // poly degrees of freedom
int const max_level = (program_options.do_adapt_levels) ? program_options.max_level : pde.max_level();

int const num_dimensions = pde.num_dims();
int const num_terms = pde.num_terms();

int64_t block_size = fm::ipow(pterms, num_dimensions);

connect_1d fluxes(max_level, connect_1d::hierarchy::full);
connect_1d volumes(max_level, connect_1d::hierarchy::volume);

vector2d<int> cells = get_cells(num_dimensions, dis_grid);
int const num_cells = cells.num_strips();

indexset padded = compute_ancestry_completion(make_index_set(cells), volumes);
std::cout << " number of padding cells = " << padded.num_indexes() << '\n';

if (padded.num_indexes() > 0)
cells.append(padded[0], padded.num_indexes());

dimension_sort dsort(cells);

// figure out the permutation patterns
std::vector<int> flux_dir(num_terms, -1);
std::vector<kronmult::permutes> permutations;
permutations.reserve(num_terms);
std::vector<int> active_dirs(num_dimensions);
for (int t = 0; t < num_terms; t++)
{
flux_dir[t] = get_flux_direction(pde, t);

active_dirs.clear();
// add only the dimensions that are not identity
// make sure that the flux direction comes first
for (int d = 0; d < num_dimensions; d++)
if (not check_identity_term(pde, t, d))
{
active_dirs.push_back(d);
if (d == flux_dir[t] and active_dirs.size() > 1)
std::swap(active_dirs.front(), active_dirs.back());
}

permutations.emplace_back(active_dirs);
}

int64_t num_padded = cells.num_strips() * block_size;
workspace->x.resize(num_padded);
std::fill_n(workspace->x.begin(), num_padded, precision{0});
workspace->y.resize(num_padded);
workspace->w1.resize(num_padded);
workspace->w2.resize(num_padded);

return block_global_kron_matrix<precision>(
num_cells * block_size, num_padded,
num_dimensions, pterms, block_size,
std::move(cells), std::move(dsort), std::move(permutations),
std::move(flux_dir), std::move(volumes), std::move(fluxes),
workspace);
}
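
Two details above deserve a note. First, block_size = pterms^num_dimensions counts the degrees of freedom per sparse-grid cell, e.g. a quadratic basis (pterms = 3) in three dimensions yields blocks of 27 entries. Second, the term loop orders the non-identity dimensions so that the flux direction, when present, comes first. A standalone sketch of that ordering, with is_identity and flux_dir as hypothetical stand-ins for check_identity_term() and get_flux_direction():

#include <iostream>
#include <utility>
#include <vector>

int main()
{
  int const num_dimensions = 3;
  std::vector<bool> const is_identity = {false, true, false}; // dim 1 is identity
  int const flux_dir = 2; // the flux partial term acts in dimension 2

  std::vector<int> active_dirs;
  for (int d = 0; d < num_dimensions; d++)
    if (not is_identity[d])
    {
      active_dirs.push_back(d);
      if (d == flux_dir and active_dirs.size() > 1)
        std::swap(active_dirs.front(), active_dirs.back());
    }

  for (int d : active_dirs)
    std::cout << d << ' '; // prints "2 0", flux dimension first
  std::cout << '\n';
}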

template<typename precision>
void set_specific_mode(PDE<precision> const &pde,
adapt::distributed_grid<precision> const &dis_grid,
options const &program_options, imex_flag const imex,
block_global_kron_matrix<precision> &mat)
{
int const imex_indx = block_global_kron_matrix<precision>::flag2int(imex);

mat.term_groups_[imex_indx] = get_used_terms(pde, program_options, imex);

std::vector<int> const &used_terms = mat.term_groups_[imex_indx];

int const n = mat.blockn_;

int const num_dimensions = pde.num_dims();

for (int const t : used_terms)
{
for (int d = 0; d < num_dimensions; d++)
{
if (not check_identity_term(pde, t, d))
{
fk::matrix<precision> const &ops = pde.get_coefficients(t, d);

connect_1d const &conn = (mat.flux_dir_[t] == d) ? mat.conn_full_ : mat.conn_volumes_;

mat.gvals_[t * num_dimensions + d].resize(n * n * conn.num_connections());

precision *A = mat.gvals_[t * num_dimensions + d].data();
for (int r = 0; r < conn.num_rows(); r++)
for (int j = conn.row_begin(r); j < conn.row_end(r); j++)
for (int k = 0; k < n; k++)
A = std::copy_n(ops.data(r * n, conn[j] * n + k), n, A);
}
}
}

if (imex == imex_flag::imex_implicit or program_options.use_implicit_stepping)
// prepare a preconditioner
build_preconditioner(pde, mat.ilist_.num_strips() * mat.block_size_, dis_grid,
used_terms, mat.pre_con_);
}

#endif // KRON_MODE_GLOBAL_BLOCK
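
For reference, the innermost copy in set_specific_mode() gathers one n-by-n block of the dense 1D operator per stored connection, column by column, so every (term, dimension) pair ends up with contiguous coefficients. A simplified sketch, assuming a column-major operator with leading dimension ld and a plain (row, column) connection list standing in for connect_1d:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<double>
gather_blocks(std::vector<double> const &ops, int ld, int n,
              std::vector<std::pair<int, int>> const &conns)
{
  std::vector<double> gvals(static_cast<std::size_t>(n) * n * conns.size());
  double *A = gvals.data();
  for (auto const &[r, c] : conns)  // block-row r connects to block-column c
    for (int k = 0; k < n; k++)     // copy column k of the (r, c) block
      A = std::copy_n(ops.data() + (c * n + k) * ld + r * n, n, A);
  return gvals;
}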

#ifdef ASGARD_ENABLE_DOUBLE
template std::vector<int> get_used_terms(PDE<double> const &pde, options const &opts,
imex_flag const imex);
@@ -1624,6 +1771,23 @@ template void global_kron_matrix<double>::apply<resource::host>(
template void global_kron_matrix<double>::apply<resource::device>(
matrix_entry, double, double const *, double, double *) const;
#endif

#ifdef KRON_MODE_GLOBAL_BLOCK
template class block_global_kron_matrix<double>;
template void block_global_kron_matrix<double>::apply<resource::host>(
matrix_entry, double, double const *, double, double *) const;

template block_global_kron_matrix<double>
make_block_global_kron_matrix<double>(PDE<double> const &,
adapt::distributed_grid<double> const &,
options const &,
kronmult::block_global_workspace<double> *workspace);
template void set_specific_mode<double>(PDE<double> const &,
adapt::distributed_grid<double> const &,
options const &, imex_flag const,
block_global_kron_matrix<double> &);
#endif

#else // KRON_MODE_GLOBAL
template kronmult_matrix<double>
make_kronmult_matrix<double>(PDE<double> const &,
@@ -1666,6 +1830,24 @@ template void global_kron_matrix<float>::apply<resource::host>(
template void global_kron_matrix<float>::apply<resource::device>(
matrix_entry, float, float const *, float, float *) const;
#endif

#ifdef KRON_MODE_GLOBAL_BLOCK
template class block_global_kron_matrix<float>;

template void block_global_kron_matrix<float>::apply<resource::host>(
matrix_entry, float, float const *, float, float *) const;

template block_global_kron_matrix<float>
make_block_global_kron_matrix<float>(PDE<float> const &,
adapt::distributed_grid<float> const &,
options const &,
kronmult::block_global_workspace<float> *workspace);
template void set_specific_mode<float>(PDE<float> const &,
adapt::distributed_grid<float> const &,
options const &, imex_flag const,
block_global_kron_matrix<float> &);
#endif

#else // KRON_MODE_GLOBAL
template kronmult_matrix<float>
make_kronmult_matrix<float>(PDE<float> const &,