From 970f4a660c993e2911c41642145eaa1083209293 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Fri, 16 Sep 2022 15:28:34 -0700
Subject: [PATCH 1/2] ParallelFor with compile time optimization of kernels
 with run time parameters

Branches inside ParallelFor can be very expensive. If a branch uses a lot
of resources (e.g., registers), it can significantly affect performance
even if the branch is never executed at run time, because it lowers the
GPU occupancy. For CPUs, it can also hinder vectorization of the kernel.

The new ParallelFor functions use C++17 fold expressions to generate
kernel launches for all run time variants. The kernel function can use
constexpr if to discard unused code blocks for better run time
performance. Here are two examples of how to use them.

    int runtime_option = ...;
    enum All_options : int { A0, A1, A2, A3 };
    // Four ParallelFors will be generated.
    ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>>{},
                {runtime_option},
                box, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control)
    {
        ...
        if constexpr (control.value == A0) {
            ...
        } else if constexpr (control.value == A1) {
            ...
        } else if constexpr (control.value == A2) {
            ...
        } else {
            ...
        }
        ...
    });

and

    int A_runtime_option = ...;
    int B_runtime_option = ...;
    enum A_options : int { A0, A1, A2, A3 };
    enum B_options : int { B0, B1 };
    // 4*2=8 ParallelFors will be generated.
    ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                         CompileTimeOptions<B0,B1>>{},
                {A_runtime_option, B_runtime_option},
                N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
    {
        ...
        if constexpr (A_control.value == A0) {
            ...
        } else if constexpr (A_control.value == A1) {
            ...
        } else if constexpr (A_control.value == A2) {
            ...
        } else {
            ...
        }
        if constexpr (A_control.value != A3 && B_control.value == B1) {
            ...
        }
        ...
    });

Note that due to a limitation of CUDA's extended device lambda, the
constexpr if block cannot be the one that captures a variable first. If
nvcc complains about it, you will have to manually capture it outside
constexpr if. The data type for the parameters is int.

Thanks to Maikel Nadolski and Alex Sinn for showing us the
meta-programming techniques used here.
---
 Src/Base/AMReX_CTOParallelForImpl.H | 324 ++++++++++++++++++++++++++++
 Src/Base/AMReX_GpuLaunch.H          |   2 +
 Src/Base/CMakeLists.txt             |   1 +
 Src/Base/Make.package               |   1 +
 Tests/CMakeLists.txt                |   2 +-
 Tests/CTOParFor/CMakeLists.txt      |   7 +
 Tests/CTOParFor/GNUmakefile         |  20 ++
 Tests/CTOParFor/Make.package        |   4 +
 Tests/CTOParFor/main.cpp            |  64 ++++++
 9 files changed, 424 insertions(+), 1 deletion(-)
 create mode 100644 Src/Base/AMReX_CTOParallelForImpl.H
 create mode 100644 Tests/CTOParFor/CMakeLists.txt
 create mode 100644 Tests/CTOParFor/GNUmakefile
 create mode 100644 Tests/CTOParFor/Make.package
 create mode 100644 Tests/CTOParFor/main.cpp

diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H
new file mode 100644
index 00000000000..33cdf50da83
--- /dev/null
+++ b/Src/Base/AMReX_CTOParallelForImpl.H
@@ -0,0 +1,324 @@
+#ifndef AMREX_CTO_PARALLEL_FOR_H_
+#define AMREX_CTO_PARALLEL_FOR_H_
+
+/* This header is not for the users to include directly.  It's meant to be
+ * included in AMReX_GpuLaunch.H, which has included the headers needed
+ * here. */
+
+/* Thanks to Maikel Nadolski and Alex Sinn for the techniques used here! */
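+
+/* A minimal sketch of how the dispatch works (illustrative comment only;
+ * run_opt and KERNEL_WITH below are stand-ins, not real API).  Each run
+ * time parameter is compared against every value in its
+ * CompileTimeOptions list, so
+ *
+ *     ParallelFor(TypeList<CompileTimeOptions<A0,A1>>{}, {run_opt}, N, f);
+ *
+ * behaves like
+ *
+ *     if (run_opt == A0) { launch KERNEL_WITH(std::integral_constant<int,A0>); }
+ *     if (run_opt == A1) { launch KERNEL_WITH(std::integral_constant<int,A1>); }
+ *
+ * Each kernel is compiled separately with the non-matching constexpr
+ * branches discarded, so an expensive unused branch no longer costs
+ * registers or occupancy. */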
+
+namespace amrex {
+
+template <int... ctr>
+struct CompileTimeOptions {
+    // TypeList is defined in AMReX_Tuple.H
+    using list_type = TypeList<std::integral_constant<int, ctr>...>;
+};
+
+#if (__cplusplus >= 201703L)
+
+namespace meta
+{
+    template <typename... As, typename... Bs>
+    constexpr auto operator+ (TypeList<As...>, TypeList<Bs...>) {
+        return TypeList<As..., Bs...>{};
+    }
+
+    template <typename... Ls, typename A>
+    constexpr auto single_product (TypeList<Ls...>, A) {
+        return TypeList<decltype(Ls{} + TypeList<A>{})...>{};
+    }
+
+    template <typename LLs, typename... As>
+    constexpr auto operator* (LLs, TypeList<As...>) {
+        return (TypeList<>{} + ... + single_product(LLs{}, As{}));
+    }
+
+    template <typename... Ls>
+    constexpr auto cartesian_product_n (TypeList<Ls...>) {
+        return (TypeList<TypeList<>>{} * ... * Ls{});
+    }
+}
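+
+/* A worked example of the fold expressions above (illustrative comment
+ * only; I<n> abbreviates std::integral_constant<int,n>):
+ *
+ *     cartesian_product_n(TypeList<TypeList<I<0>,I<1>>,
+ *                                  TypeList<I<4>,I<5>>>{})
+ *
+ * evaluates to the list of all combinations,
+ *
+ *     TypeList<TypeList<I<0>,I<4>>, TypeList<I<1>,I<4>>,
+ *              TypeList<I<0>,I<5>>, TypeList<I<1>,I<5>>>
+ *
+ * which drives one kernel instantiation per combination below. */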
+
+namespace detail
+{
+    template <int MT, typename T, class F, typename... As>
+    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value, bool>
+    ParallelFor_helper2 (T const& N, F&& f, TypeList<As...>,
+                         std::array<int,sizeof...(As)> const& runtime_options)
+    {
+        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
+            if constexpr (std::is_integral<T>::value) {
+                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
+                {
+                    f(i, As{}...);
+                });
+            } else {
+                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+                {
+                    f(i, j, k, As{}...);
+                });
+            }
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    template <int MT, typename T, class F, typename... As>
+    std::enable_if_t<std::is_integral<T>::value, bool>
+    ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList<As...>,
+                         std::array<int,sizeof...(As)> const& runtime_options)
+    {
+        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
+            ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
+            {
+                f(i, j, k, n, As{}...);
+            });
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    template <int MT, typename T, class F, typename... PPs, typename RO>
+    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value>
+    ParallelFor_helper1 (T const& N, F&& f, TypeList<PPs...>,
+                         RO const& runtime_options)
+    {
+        bool found_option = (false || ... ||
+                             ParallelFor_helper2<MT>(N, std::forward<F>(f),
+                                                     PPs{}, runtime_options));
+        amrex::ignore_unused(found_option);
+        AMREX_ASSERT(found_option);
+    }
+
+    template <int MT, typename T, class F, typename... PPs, typename RO>
+    std::enable_if_t<std::is_integral<T>::value>
+    ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList<PPs...>,
+                         RO const& runtime_options)
+    {
+        bool found_option = (false || ... ||
+                             ParallelFor_helper2<MT>(box, ncomp, std::forward<F>(f),
+                                                     PPs{}, runtime_options));
+        amrex::ignore_unused(found_option);
+        AMREX_ASSERT(found_option);
+    }
+}
+
+#endif
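+
+/* detail::ParallelFor_helper1 above folds over every compile time
+ * combination, and ParallelFor_helper2 launches the kernel whose
+ * combination matches the run time options and reports whether it
+ * matched.  The overloads just below take the GPU launch bound MT as an
+ * explicit template parameter; the user-facing overloads documented
+ * further down forward to them with MT = AMREX_GPU_MAX_THREADS. */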
+
+template <int MT, typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+             std::array<int,sizeof...(CTOs)> const& runtime_options,
+             T N, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(N, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
+
+template <int MT, class F, typename... CTOs>
+void ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+                  std::array<int,sizeof...(CTOs)> const& runtime_options,
+                  Box const& box, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(box, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
+
+template <int MT, typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+             std::array<int,sizeof...(CTOs)> const& runtime_options,
+             Box const& box, T ncomp, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(box, ncomp, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses fold expressions to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3 };
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it
+ * outside constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param N an integer specifying the 1D for loop's range.
+ * \param f a callable object taking an integer and working on that iteration.
+ */
+template <typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> ctos,
+             std::array<int,sizeof...(CTOs)> const& option,
+             T N, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, N, std::forward<F>(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses fold expressions to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3 };
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                            auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it
+ * outside constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the 3D for loop's range.
+ * \param f a callable object taking three integers and working on the given cell.
+ */
+template <class F, typename... CTOs>
+void ParallelFor (TypeList<CTOs...> ctos,
+                  std::array<int,sizeof...(CTOs)> const& option,
+                  Box const& box, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, std::forward<F>(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses fold expressions to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3 };
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
+                                                   auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it
+ * outside constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the iteration in 3D space.
+ * \param ncomp an integer specifying the range for iteration over components.
+ * \param f a callable object taking four integers and working on the given cell.
+ */
+template <typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> ctos,
+             std::array<int,sizeof...(CTOs)> const& option,
+             Box const& box, T ncomp, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, ncomp, std::forward<F>(f));
+}
+
+}
+
+#endif
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index d1a9e352336..39fac18835e 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -443,4 +443,6 @@ namespace Gpu {
 
 #endif
 
+#include <AMReX_CTOParallelForImpl.H>
+
 #endif
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index c47fdcae706..38d45d4d4dc 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -223,6 +223,7 @@ target_sources( amrex
    AMReX_MFParallelForC.H
    AMReX_MFParallelForG.H
    AMReX_TagParallelFor.H
+   AMReX_CTOParallelForImpl.H
    AMReX_ParReduce.H
    # CUDA --------------------------------------------------------------------
    AMReX_CudaGraph.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index 79085ae70a1..5b1a0e7e267 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -100,6 +100,7 @@ C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H
 C$(AMREX_BASE)_headers += AMReX_MFParallelForG.H
 
 C$(AMREX_BASE)_headers += AMReX_TagParallelFor.H
+C$(AMREX_BASE)_headers += AMReX_CTOParallelForImpl.H
 
 C$(AMREX_BASE)_headers += AMReX_ParReduce.H
 
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 50cc2bb8cb2..8d318f918b8 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # List of subdirectories to search for CMakeLists.
 #
-set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser)
+set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser CTOParFor)
 
 if (AMReX_PARTICLES)
    list(APPEND AMREX_TESTS_SUBDIRS Particles)
diff --git a/Tests/CTOParFor/CMakeLists.txt b/Tests/CTOParFor/CMakeLists.txt
new file mode 100644
index 00000000000..57c1e7715e2
--- /dev/null
+++ b/Tests/CTOParFor/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(_sources main.cpp)
+set(_input_files)
+
+setup_test(_sources _input_files)
+
+unset(_sources)
+unset(_input_files)
diff --git a/Tests/CTOParFor/GNUmakefile b/Tests/CTOParFor/GNUmakefile
new file mode 100644
index 00000000000..0dbc65578af
--- /dev/null
+++ b/Tests/CTOParFor/GNUmakefile
@@ -0,0 +1,20 @@
+AMREX_HOME = ../../
+
+DEBUG = FALSE
+DIM = 3
+COMP = gcc
+
+USE_MPI = FALSE
+USE_OMP = FALSE
+USE_CUDA = FALSE
+
+TINY_PROFILE = FALSE
+
+CXXSTD = c++17
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/CTOParFor/Make.package b/Tests/CTOParFor/Make.package
new file mode 100644
index 00000000000..4497b0e25b9
--- /dev/null
+++ b/Tests/CTOParFor/Make.package
@@ -0,0 +1,4 @@
+CEXE_sources += main.cpp
+
+
+
diff --git a/Tests/CTOParFor/main.cpp b/Tests/CTOParFor/main.cpp
new file mode 100644
index 00000000000..0cf1d7ea35a
--- /dev/null
+++ b/Tests/CTOParFor/main.cpp
@@ -0,0 +1,64 @@
+#include <AMReX.H>
+#include <AMReX_IArrayBox.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc,argv);
+#if (__cplusplus >= 201703L)
+    {
+        enum A_options: int {
+            A0 = 0, A1
+        };
+
+        enum B_options: int {
+            B0 = 0, B1, B2
+        };
+
+        Box box(IntVect(0),IntVect(7));
+        IArrayBox fab(box,2);
+        fab.setVal<RunOn::Device>(-10);
+
+        auto const& arr = fab.array();
+
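+        // For each run time pair {ia, ib}, the ParallelFor below selects
+        // the matching one of the 2*3 = 6 generated kernels; the constexpr
+        // branches not taken were discarded at compile time.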
+        for (int ia = 0; ia < 2; ++ia) {
+        for (int ib = 0; ib < 3; ++ib) {
+            ParallelFor(TypeList<CompileTimeOptions<A0,A1>,
+                                 CompileTimeOptions<B0,B1,B2>>{},
+                        {ia, ib},
+                        box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                                   auto A_control,
+                                                   auto B_control)
+            {
+                auto const& larr = arr;
+                int a, b;
+                if constexpr (A_control.value == 0) {
+                    a = 0;
+                } else if constexpr (A_control.value == 1) {
+                    a = 1;
+                } else {
+                    a = -1;
+                }
+                if constexpr (B_control.value == 0) {
+                    b = 0;
+                } else if constexpr (B_control.value == 1) {
+                    b = 1;
+                } else if constexpr (B_control.value == 2) {
+                    b = 2;
+                } else if constexpr (B_control.value == 3) {
+                    b = 3;
+                }
+                larr(i,j,k) = a*10 + b;
+            });
+
+            auto s = fab.sum<RunOn::Device>(0);
+            AMREX_ALWAYS_ASSERT(s == box.numPts()*(ia*10+ib));
+        }
+        }
+    }
+#else
+    amrex::Print() << "This test requires C++17." << std::endl;
+#endif
+    amrex::Finalize();
+}

From 78e3061a44f361f9e419b046951ac040a829561a Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Sat, 15 Oct 2022 14:03:32 -0700
Subject: [PATCH 2/2] Improve Include Stability

---
 Src/Base/AMReX_CTOParallelForImpl.H | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H
index 33cdf50da83..f4dd41ca0c8 100644
--- a/Src/Base/AMReX_CTOParallelForImpl.H
+++ b/Src/Base/AMReX_CTOParallelForImpl.H
@@ -1,6 +1,13 @@
 #ifndef AMREX_CTO_PARALLEL_FOR_H_
 #define AMREX_CTO_PARALLEL_FOR_H_
 
+#include <AMReX_BLassert.H>
+#include <AMReX_Box.H>
+#include <AMReX_Tuple.H>
+
+#include <array>
+#include <type_traits>
+
 /* This header is not for the users to include directly.  It's meant to be
  * included in AMReX_GpuLaunch.H, which has included the headers needed
  * here. */