Skip to content

Commit

Permalink
cuda/hip batch changes
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Nov 21, 2024
1 parent ed888c0 commit 09c777a
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 74 deletions.
4 changes: 2 additions & 2 deletions common/cuda_hip/solver/batch_bicgstab_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ void launch_apply_kernel(

#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
mat_t, log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_bicgstab::storage_config& sconf, \
const settings<remove_complex<device_type<_vtype>>>& settings, \
const settings<remove_complex<_vtype>>& settings, \
log_t<gko::remove_complex<device_type<_vtype>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
Expand Down
26 changes: 13 additions & 13 deletions common/cuda_hip/solver/batch_cg_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,19 @@ void launch_apply_kernel(
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_cg::storage_config& sconf, \
const settings<remove_complex<_vtype>>& settings, \
log_t<device_type<gko::remove_complex<device_type<_vtype>>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
log_t, pre_t, stop_t) \
void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_cg::storage_config& sconf, \
const settings<remove_complex<_vtype>>& settings, \
log_t<gko::remove_complex<device_type<_vtype>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...) \
Expand Down
22 changes: 11 additions & 11 deletions cuda/solver/batch_bicgstab_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,58 +71,58 @@ public:
// Template parameters launch_apply_kernel<ValueType, n_shared,
// prec_shared, StopType>
if (sconf.prec_shared) {
launch_apply_kernel<cuda_value_type, 9, true, StopType>(
launch_apply_kernel<ValueType, 9, true, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
} else {
switch (sconf.n_shared) {
case 0:
launch_apply_kernel<cuda_value_type, 0, false, StopType>(
launch_apply_kernel<ValueType, 0, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 1:
launch_apply_kernel<cuda_value_type, 1, false, StopType>(
launch_apply_kernel<ValueType, 1, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 2:
launch_apply_kernel<cuda_value_type, 2, false, StopType>(
launch_apply_kernel<ValueType, 2, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 3:
launch_apply_kernel<cuda_value_type, 3, false, StopType>(
launch_apply_kernel<ValueType, 3, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 4:
launch_apply_kernel<cuda_value_type, 4, false, StopType>(
launch_apply_kernel<ValueType, 4, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 5:
launch_apply_kernel<cuda_value_type, 5, false, StopType>(
launch_apply_kernel<ValueType, 5, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 6:
launch_apply_kernel<cuda_value_type, 6, false, StopType>(
launch_apply_kernel<ValueType, 6, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 7:
launch_apply_kernel<cuda_value_type, 7, false, StopType>(
launch_apply_kernel<ValueType, 7, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 8:
launch_apply_kernel<cuda_value_type, 8, false, StopType>(
launch_apply_kernel<ValueType, 8, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 9:
launch_apply_kernel<cuda_value_type, 9, false, StopType>(
launch_apply_kernel<ValueType, 9, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
Expand Down
31 changes: 16 additions & 15 deletions cuda/solver/batch_bicgstab_launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ template <typename StopType, typename PrecType, typename LogType,
int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
const int num_rows);

#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_num_threads_per_block< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec, \
const int num_rows)
#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_num_threads_per_block< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<cuda_type<_vtype>>>, \
mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>( \
std::shared_ptr<const DefaultExecutor> exec, const int num_rows)

#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF( \
GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)

#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK \
Expand All @@ -52,15 +52,16 @@ template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);

#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_max_dynamic_shared_memory< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_max_dynamic_shared_memory< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<cuda_type<_vtype>>>, \
mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>( \
std::shared_ptr<const DefaultExecutor> exec)

#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF( \
GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, \
__VA_ARGS__)

Expand Down
14 changes: 7 additions & 7 deletions cuda/solver/batch_cg_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,38 +72,38 @@ public:
// Template parameters launch_apply_kernel<ValueType, n_shared,
// prec_shared, StopType>
if (sconf.prec_shared) {
launch_apply_kernel<cuda_value_type, 5, true, StopType>(
launch_apply_kernel<ValueType, 5, true, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
} else {
switch (sconf.n_shared) {
case 0:
launch_apply_kernel<cuda_value_type, 0, false, StopType>(
launch_apply_kernel<ValueType, 0, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 1:
launch_apply_kernel<cuda_value_type, 1, false, StopType>(
launch_apply_kernel<ValueType, 1, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 2:
launch_apply_kernel<cuda_value_type, 2, false, StopType>(
launch_apply_kernel<ValueType, 2, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 3:
launch_apply_kernel<cuda_value_type, 3, false, StopType>(
launch_apply_kernel<ValueType, 3, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 4:
launch_apply_kernel<cuda_value_type, 4, false, StopType>(
launch_apply_kernel<ValueType, 4, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 5:
launch_apply_kernel<cuda_value_type, 5, false, StopType>(
launch_apply_kernel<ValueType, 5, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
Expand Down
17 changes: 9 additions & 8 deletions cuda/solver/batch_cg_launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
std::shared_ptr<const DefaultExecutor> exec, const int num_rows)

#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF( \
GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__)

#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK \
Expand All @@ -51,16 +51,17 @@ template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);

#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_max_dynamic_shared_memory< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)
#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_max_dynamic_shared_memory< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<cuda_type<_vtype>>>, \
mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>( \
std::shared_ptr<const DefaultExecutor> exec)


#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS_WITH_HALF( \
GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, __VA_ARGS__)

#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY \
Expand Down
22 changes: 11 additions & 11 deletions hip/solver/batch_bicgstab_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,58 +95,58 @@ class kernel_caller {
// Template parameters launch_apply_kernel<ValueType, n_shared,
// prec_shared, StopType>
if (sconf.prec_shared) {
launch_apply_kernel<hip_value_type, 9, true, StopType>(
launch_apply_kernel<ValueType, 9, true, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
} else {
switch (sconf.n_shared) {
case 0:
launch_apply_kernel<hip_value_type, 0, false, StopType>(
launch_apply_kernel<ValueType, 0, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 1:
launch_apply_kernel<hip_value_type, 1, false, StopType>(
launch_apply_kernel<ValueType, 1, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 2:
launch_apply_kernel<hip_value_type, 2, false, StopType>(
launch_apply_kernel<ValueType, 2, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 3:
launch_apply_kernel<hip_value_type, 3, false, StopType>(
launch_apply_kernel<ValueType, 3, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 4:
launch_apply_kernel<hip_value_type, 4, false, StopType>(
launch_apply_kernel<ValueType, 4, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 5:
launch_apply_kernel<hip_value_type, 5, false, StopType>(
launch_apply_kernel<ValueType, 5, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 6:
launch_apply_kernel<hip_value_type, 6, false, StopType>(
launch_apply_kernel<ValueType, 6, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 7:
launch_apply_kernel<hip_value_type, 7, false, StopType>(
launch_apply_kernel<ValueType, 7, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 8:
launch_apply_kernel<hip_value_type, 8, false, StopType>(
launch_apply_kernel<ValueType, 8, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 9:
launch_apply_kernel<hip_value_type, 9, false, StopType>(
launch_apply_kernel<ValueType, 9, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
Expand Down
Loading

0 comments on commit 09c777a

Please sign in to comment.