From cfe9c48577676c7a0fb3548c38f82ccc47652c3f Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Sat, 13 Apr 2024 00:44:13 +0200 Subject: [PATCH 01/11] refactor AnyFFT --- .../FFTPoissonSolverDirichlet.H | 6 +- .../FFTPoissonSolverDirichlet.cpp | 33 +-- .../FFTPoissonSolverPeriodic.H | 4 +- .../FFTPoissonSolverPeriodic.cpp | 39 +-- src/fields/fft_poisson_solver/fft/AnyFFT.H | 122 ++------- .../fft_poisson_solver/fft/CMakeLists.txt | 5 - .../fft_poisson_solver/fft/WrapCuFFT.cpp | 219 ++++++++++------ .../fft_poisson_solver/fft/WrapFFTW.cpp | 176 +++++++++---- .../fft_poisson_solver/fft/WrapRocFFT.cpp | 243 ++++++++++++------ .../{fft => fft_old}/AnyDST.H | 0 .../fft_poisson_solver/fft_old/AnyFFT.H | 120 +++++++++ .../fft_poisson_solver/fft_old/CMakeLists.txt | 28 ++ .../{fft => fft_old}/CuFFTUtils.H | 0 .../{fft => fft_old}/CuFFTUtils.cpp | 0 .../{fft => fft_old}/RocFFTUtils.H | 0 .../{fft => fft_old}/RocFFTUtils.cpp | 0 .../{fft => fft_old}/WrapCuDST.cpp | 0 .../fft_poisson_solver/fft_old/WrapCuFFT.cpp | 95 +++++++ .../{fft => fft_old}/WrapDSTW.cpp | 0 .../fft_poisson_solver/fft_old/WrapFFTW.cpp | 72 ++++++ .../{fft => fft_old}/WrapRocDST.cpp | 0 .../fft_poisson_solver/fft_old/WrapRocFFT.cpp | 102 ++++++++ src/laser/MultiLaser.H | 72 +----- src/laser/MultiLaser.cpp | 142 +--------- 24 files changed, 915 insertions(+), 563 deletions(-) rename src/fields/fft_poisson_solver/{fft => fft_old}/AnyDST.H (100%) create mode 100644 src/fields/fft_poisson_solver/fft_old/AnyFFT.H create mode 100644 src/fields/fft_poisson_solver/fft_old/CMakeLists.txt rename src/fields/fft_poisson_solver/{fft => fft_old}/CuFFTUtils.H (100%) rename src/fields/fft_poisson_solver/{fft => fft_old}/CuFFTUtils.cpp (100%) rename src/fields/fft_poisson_solver/{fft => fft_old}/RocFFTUtils.H (100%) rename src/fields/fft_poisson_solver/{fft => fft_old}/RocFFTUtils.cpp (100%) rename src/fields/fft_poisson_solver/{fft => fft_old}/WrapCuDST.cpp (100%) create mode 100644 src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp rename src/fields/fft_poisson_solver/{fft => fft_old}/WrapDSTW.cpp (100%) create mode 100644 src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp rename src/fields/fft_poisson_solver/{fft => fft_old}/WrapRocDST.cpp (100%) create mode 100644 src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H index bc4dc76d1c..53e002ab09 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H @@ -8,7 +8,7 @@ #ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ #define FFT_POISSON_SOLVER_DIRICHLET_H_ -#include "fields/fft_poisson_solver/fft/AnyDST.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "FFTPoissonSolver.H" #include @@ -64,7 +64,9 @@ private: /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; /** DST plans */ - AnyDST::DSTplans m_plan; + AnyFFT m_forward_fft; + AnyFFT m_backward_fft; + amrex::Gpu::DeviceVector m_fft_work_area; }; #endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp index 3fb2bc93aa..f61404886e 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp @@ -7,7 +7,7 @@ * License: BSD-3-Clause-LBNL */ #include "FFTPoissonSolverDirichlet.H" -#include "fft/AnyDST.H" +#include "fft/AnyFFT.H" #include "fields/Fields.H" #include "utils/Constants.H" #include "utils/GPUUtil.H" @@ -81,17 +81,16 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, } // Allocate and initialize the FFT plans - m_plan = AnyDST::DSTplans(a_realspace_ba, dm); - // Loop over boxes and allocate the corresponding plan - // for each box owned by the local MPI proc - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Note: the size of the real-space box and spectral-space box - // differ when using real-to-complex FFT. When initializing - // the FFT plan, the valid dimensions are those of the real-space box. - amrex::IntVect fft_size = fft_box.length(); - m_plan[mfi] = AnyDST::CreatePlan( - fft_size, &m_stagingArea[mfi], &m_tmpSpectralField[mfi]); - } + amrex::IntVect fft_size = m_stagingArea[0].box().length(); + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); } @@ -100,10 +99,7 @@ FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverDirichlet::SolvePoissonEquation()"); - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from the staging area to `tmpSpectralField` - AnyDST::Execute(m_plan[mfi], AnyDST::direction::forward); - } + m_forward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -120,10 +116,7 @@ FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from `tmpSpectralField` to the staging area - AnyDST::Execute(m_plan[mfi], AnyDST::direction::backward); - } + m_backward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H index e570e70d15..15b2195562 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H @@ -70,7 +70,9 @@ private: /** Multifab containing 1/(kx^2 + ky^2), to solve Poisson equation. */ amrex::MultiFab m_inv_k2; /** FFT plans */ - AnyFFT::FFTplans m_forward_plan, m_backward_plan; + AnyFFT m_forward_fft; + AnyFFT m_backward_fft; + amrex::Gpu::DeviceVector m_fft_work_area; }; #endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp index 54045693c4..26ff0ebb6d 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp @@ -93,25 +93,16 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba, } // Allocate and initialize the FFT plans - m_forward_plan = AnyFFT::FFTplans(spectralspace_ba, dm); - m_backward_plan = AnyFFT::FFTplans(spectralspace_ba, dm); - // Loop over boxes and allocate the corresponding plan - // for each box owned by the local MPI proc - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Note: the size of the real-space box and spectral-space box - // differ when using real-to-complex FFT. When initializing - // the FFT plan, the valid dimensions are those of the real-space box. - amrex::IntVect fft_size = m_stagingArea[mfi].box().length(); - m_forward_plan[mfi] = AnyFFT::CreatePlan( - fft_size, m_stagingArea[mfi].dataPtr(), - reinterpret_cast( m_tmpSpectralField[mfi].dataPtr()), - AnyFFT::direction::R2C); - - m_backward_plan[mfi] = AnyFFT::CreatePlan( - fft_size, m_stagingArea[mfi].dataPtr(), - reinterpret_cast( m_tmpSpectralField[mfi].dataPtr()), - AnyFFT::direction::C2R); - } + amrex::IntVect fft_size = m_stagingArea[0].box().length(); + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2C_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2R_2D, fft_size[0], fft_size[1]); + + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); } @@ -120,10 +111,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverPeriodic::SolvePoissonEquation()"); - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from the staging area to `tmpSpectralField` - AnyFFT::Execute(m_forward_plan[mfi]); - } + m_forward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -139,10 +127,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from `tmpSpectralField` to the staging area - AnyFFT::Execute(m_backward_plan[mfi]); - } + m_backward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index cbbd077954..c98e695234 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -1,120 +1,38 @@ -/* Copyright 2020-2021 +/* Copyright 2024 * * This file is part of HiPACE++. * - * Authors: Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs - * WeiqunZhang - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. + * Authors: AlexanderSinn * * License: BSD-3-Clause-LBNL */ #ifndef ANYFFT_H_ #define ANYFFT_H_ -#include - -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - -#include - -/** - * \brief Wrapper around multiple FFT libraries. - * - * The header file defines the API and the base types - * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is - * done in different cpp files. This wrapper only depends on the underlying FFT library - * AND on AMReX (There is no dependence on WarpX). - */ -namespace AnyFFT -{ - // First, define library-dependent types (complex, FFT plan) - - /** Complex type for FFT, depends on FFT library */ -#ifdef AMREX_USE_CUDA -# ifdef AMREX_USE_FLOAT - using Complex = cuComplex; -# else - using Complex = cuDoubleComplex; -# endif -#elif defined(AMREX_USE_HIP) -# ifdef AMREX_USE_FLOAT - using Complex = float2; -# else - using Complex = double2; -# endif -#else -# ifdef AMREX_USE_FLOAT - using Complex = fftwf_complex; -# else - using Complex = fftw_complex; -# endif -#endif - - /** Library-dependent FFT plans type, which holds one fft plan per box - * (plans are only initialized for the boxes that are owned by the local MPI rank). - */ -#ifdef AMREX_USE_CUDA - using VendorFFTPlan = cufftHandle; -#elif defined(AMREX_USE_HIP) - using VendorFFTPlan = rocfft_plan; -#else -# ifdef AMREX_USE_FLOAT - using VendorFFTPlan = fftwf_plan; -# else - using VendorFFTPlan = fftw_plan; -# endif -#endif +struct VendorPlan; - // Second, define library-independent API +enum struct FFTType { + C2C_2D_fwd, + C2C_2D_bkw, + C2R_2D, + R2C_2D, + R2R_2D, + C2R_1D_batched, + R2C_1D_batched +}; - /** Direction in which the FFT is performed. */ - enum struct direction {R2C, C2R}; +struct AnyFFT { - /** \brief This struct contains the vendor FFT plan and additional metadata - */ - struct FFTplan - { - amrex::Real* m_real_array; /**< pointer to real array */ - Complex* m_complex_array; /**< pointer to complex array */ - VendorFFTPlan m_plan; /**< Vendor FFT plan */ - direction m_dir; /**< direction (C2R or R2C) */ - }; + std::size_t Initialize (FFTType type, int nx, int ny); - /** Collection of FFT plans, one FFTplan per box */ - using FFTplans = amrex::LayoutData; + void SetBuffers (void* in, void* out, void* work_area); - /** \brief create FFT plan for the backend FFT library. - * \param[in] real_size Size of the real array, along each dimension. - * \param[out] real_array Real array from/to where R2C/C2R FFT is performed - * \param[out] complex_array Complex array to/from where R2C/C2R FFT is performed - * \param[in] dir direction, either R2C or C2R - */ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir); + void Execute (); - /** \brief Destroy library FFT plan. - * \param[out] fft_plan plan to destroy - */ - void DestroyPlan (FFTplan& fft_plan); + ~AnyFFT (); - /** \brief Perform FFT with backend library. - * \param[out] fft_plan plan for which the FFT is performed - */ - void Execute (FFTplan& fft_plan); -} +private: + VendorPlan* m_plan = nullptr; +}; #endif // ANYFFT_H_ diff --git a/src/fields/fft_poisson_solver/fft/CMakeLists.txt b/src/fields/fft_poisson_solver/fft/CMakeLists.txt index e89ecd84f3..e8ef1c1fd7 100644 --- a/src/fields/fft_poisson_solver/fft/CMakeLists.txt +++ b/src/fields/fft_poisson_solver/fft/CMakeLists.txt @@ -9,20 +9,15 @@ if (HiPACE_COMPUTE STREQUAL CUDA) target_sources(HiPACE PRIVATE WrapCuFFT.cpp - WrapCuDST.cpp - CuFFTUtils.cpp ) elseif(HiPACE_COMPUTE STREQUAL HIP) target_sources(HiPACE PRIVATE WrapRocFFT.cpp - WrapRocDST.cpp - RocFFTUtils.cpp ) else() target_sources(HiPACE PRIVATE WrapFFTW.cpp - WrapDSTW.cpp ) endif() diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index 739ab2cdcb..bd0f8eb8f4 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -1,95 +1,170 @@ -/* Copyright 2020-2022 +/* Copyright 2024 * * This file is part of HiPACE++. * - * Authors: MaxThevenet, Remi Lehe, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. + * Authors: AlexanderSinn * * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "CuFFTUtils.H" #include "utils/HipaceProfilerWrapper.H" -namespace AnyFFT -{ +#include + +#include "CuFFTUtils.H" + +#include +#include #ifdef AMREX_USE_FLOAT - cufftType VendorR2C = CUFFT_R2C; - cufftType VendorC2R = CUFFT_C2R; +static constexpr bool use_float = true; #else - cufftType VendorR2C = CUFFT_D2Z; - cufftType VendorC2R = CUFFT_Z2D; +static constexpr bool use_float = false; #endif - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; +struct VendorPlan { + cufftHandle m_cufftplan; + int m_direction = 0; + void* m_in; + void* m_out; +}; - if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), - "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " - "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); - } +std::string cufftErrorToString (const cufftResult& err) { + const auto res2string = std::map{ + {CUFFT_SUCCESS, "CUFFT_SUCCESS"}, + {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"}, + {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"}, + {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"}, + {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"}, + {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"}, + {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"}, + {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"}, + {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"}, + {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"} + }; - // Initialize fft_plan.m_plan with the vendor fft plan. - cufftResult result; - if (dir == direction::R2C){ - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorR2C); - } else { - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorC2R); - } - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - - return fft_plan; + const auto it = res2string.find(err); + if(it != res2string.end()){ + return it->second; + } else { + return std::to_string(err) + " (unknown error code)"; } +} - void DestroyPlan (FFTplan& fft_plan) - { - cufftDestroy( fft_plan.m_plan ); +void assert_cufft_status (std::string const& name, const cufftResult& status) { + if (status != CUFFT_SUCCESS) { + amrex::Abort(name + " failed! Error: " + cufftErrorToString(status)); } +} + +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://docs.nvidia.com/cuda/cufft/index.html#cufft-api-reference + m_plan = new VendorPlan; + + cufftType transform_type; + int rank = 0; + long long int n[2] = {0, 0}; + long long int batch = 0; + + switch (type) { + case FFTType::C2C_2D_fwd: + transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; + m_plan->m_direction = cuFFTFORWARD; + rank = 2; + n[0] = nx; + n[1] = ny; + batch = 1; + break; + case FFTType::C2C_2D_bkw: + transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; + m_plan->m_direction = cuFFTINVERSE; + rank = 2; + n[0] = nx; + n[1] = ny; + batch = 1; + break; + case FFTType::C2R_2D: + transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D; + rank = 2; + n[0] = nx; + n[1] = ny; + batch = 1; + break; + case FFTType::R2C_2D: + transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z; + rank = 2; + n[0] = nx; + n[1] = ny; + batch = 1; + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + return 0; + case FFTType::C2R_1D_batched: + transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D; + rank = 1; + n[0] = nx; + batch = ny; + break; + case FFTType::R2C_1D_batched: + transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z; + rank = 1; + n[0] = nx; + batch = ny; + break; + } + + cufftResult result; + + result = cufftCreate(&(m_plan->m_cufftplan)); + assert_cufft_status("cufftCreate", result); + + result = cufftSetAutoAllocation(m_plan->m_cufftplan, 0); + assert_cufft_status("cufftSetAutoAllocation", result); + + std::size_t workSize = 0; + + result = cufftMakePlanMany64( + m_plan->m_cufftplan, + rank, + n, + nullptr, 0, 0, + nullptr, 0, 0, + transform_type, + batch, + &workSize); + assert_cufft_status("cufftMakePlanMany64", result); - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("AnyFFT::Execute()"); - // make sure that this is done on the same GPU stream as the above copy - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( fft_plan.m_plan, stream); + result = cufftSetStream(m_plan->m_cufftplan, amrex::Gpu::Device::cudaStream(()); + assert_cufft_status("cufftSetStream", result); + + return workSize; +} + +void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { + m_plan->m_in = in; + m_plan->m_out = out; + + cufftResult result; + + result = cufftSetWorkArea(m_plan->m_cufftplan, work_area); + assert_cufft_status("cufftSetWorkArea", result); +} + +void AnyFFT::Execute () { + cufftResult result; + + result = cufftXtExec(m_plan->m_cufftplan, m_plan->m_in, m_plan->m_out, m_plan->m_direction); + assert_cufft_status("cufftXtExec", result); +} + +AnyFFT::~AnyFFT () { + if (m_plan) { cufftResult result; - if (fft_plan.m_dir == direction::R2C){ -#ifdef AMREX_USE_FLOAT - result = cufftExecR2C(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#else - result = cufftExecD2Z(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#endif - } else if (fft_plan.m_dir == direction::C2R){ -#ifdef AMREX_USE_FLOAT - result = cufftExecC2R(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#else - result = cufftExecZ2D(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#endif - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); - } - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } + + result = cufftDestroy(m_plan->m_cufftplan); + assert_cufft_status("cufftDestroy", result); + + delete m_plan; } } diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 8a6565dbed..5cd9e32f60 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -1,72 +1,148 @@ -/* Copyright 2020 +/* Copyright 2024 * * This file is part of HiPACE++. * - * Authors: MaxThevenet, Remi Lehe - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. + * Authors: AlexanderSinn * * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" #include "utils/HipaceProfilerWrapper.H" -namespace AnyFFT -{ +#include + +#include + +#include + #ifdef AMREX_USE_FLOAT - const auto VendorCreatePlanR2C3D = fftwf_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftwf_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftwf_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftwf_plan_dft_c2r_2d; +static constexpr bool use_float = true; #else - const auto VendorCreatePlanR2C3D = fftw_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftw_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftw_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftw_plan_dft_c2r_2d; +static constexpr bool use_float = false; #endif - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; +struct VendorPlan { + std::conditional_t m_fftwplan; + AnyFFT::FFTType m_type; + int m_nx; + int m_ny; +}; - // Initialize fft_plan.m_plan with the vendor fft plan. - // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order - if (dir == direction::R2C){ - fft_plan.m_plan = VendorCreatePlanR2C2D( - real_size[1], real_size[0], real_array, complex_array, FFTW_ESTIMATE); - } else if (dir == direction::C2R){ - fft_plan.m_plan = VendorCreatePlanC2R2D( - real_size[1], real_size[0], complex_array, real_array, FFTW_ESTIMATE); - } +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://www.fftw.org/fftw3_doc/FFTW-Reference.html + m_plan = new VendorPlan; - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; + m_plan->m_type = type; + m_plan->m_nx = nx; + m_plan->m_ny = ny; + return 0; +} - return fft_plan; +void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { + if constexpr (use_float) { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + m_plan->m_fftwplan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_FORWARD, FFTW_MEASURE); + break; + case FFTType::C2C_2D_bkw: + m_plan->m_fftwplan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_BACKWARD, FFTW_MEASURE); + break; + case FFTType::C2R_2D: + m_plan->m_fftwplan = fftwf_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + break; + case FFTType::R2C_2D: + m_plan->m_fftwplan = fftwf_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + break; + case FFTType::R2R_2D: + m_plan->m_fftwplan = fftwf_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); + break; + case FFTType::C2R_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftwplan = fftwf_plan_many_dft_c2r( + 1, n, m_plan->m_ny, + in, nullptr, 1, m_plan->m_nx/2+1, + out, nullptr, 1, m_plan->m_nx, + FFTW_MEASURE); + } + break; + case FFTType::R2C_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftwplan = fftwf_plan_many_dft_r2c( + 1, n, m_plan->m_ny, + in, nullptr, 1, m_plan->m_nx, + out, nullptr, 1, m_plan->m_nx/2+1, + FFTW_MEASURE); + } + break; + } + } else { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + m_plan->m_fftwplan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_FORWARD, FFTW_MEASURE); + break; + case FFTType::C2C_2D_bkw: + m_plan->m_fftwplan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_BACKWARD, FFTW_MEASURE); + break; + case FFTType::C2R_2D: + m_plan->m_fftwplan = fftw_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + break; + case FFTType::R2C_2D: + m_plan->m_fftwplan = fftw_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + break; + case FFTType::R2R_2D: + m_plan->m_fftwplan = fftw_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + break; + case FFTType::C2R_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftwplan = fftw_plan_many_dft_c2r( + 1, n, m_plan->m_ny, + in, nullptr, 1, m_plan->m_nx/2+1, + out, nullptr, 1, m_plan->m_nx, + FFTW_MEASURE); + } + break; + case FFTType::R2C_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftwplan = fftw_plan_many_dft_r2c( + 1, n, m_plan->m_ny, + in, nullptr, 1, m_plan->m_nx, + out, nullptr, 1, m_plan->m_nx/2+1, + FFTW_MEASURE); + } + break; + } } +} - void DestroyPlan (FFTplan& fft_plan) - { -# ifdef AMREX_USE_FLOAT - fftwf_destroy_plan( fft_plan.m_plan ); -# else - fftw_destroy_plan( fft_plan.m_plan ); -# endif +void AnyFFT::Execute () { + if constexpr (use_float) { + fftwf_execute(m_plan->m_fftwplan); + } else { + fftw_execute(m_plan->m_fftwplan); } +} - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("Execute_FFTplan()"); -# ifdef AMREX_USE_FLOAT - fftwf_execute( fft_plan.m_plan ); -# else - fftw_execute( fft_plan.m_plan ); -# endif +AnyFFT::~AnyFFT () { + if (m_plan) { + if constexpr (use_float) { + fftwf_destroy_plan(m_plan->m_fftwplan); + } else { + fftw_destroy_plan(m_plan->m_fftwplan); + } + delete m_plan; } } diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 0ba7a2c7db..46bb72e489 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -1,102 +1,177 @@ -/* Copyright 2021 +/* Copyright 2024 * * This file is part of HiPACE++. * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. + * Authors: AlexanderSinn * * License: BSD-3-Clause-LBNL */ - #include "AnyFFT.H" -#include "RocFFTUtils.H" #include "utils/HipaceProfilerWrapper.H" -namespace AnyFFT -{ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - const int dim = 2; - FFTplan fft_plan; - - const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(real_size[0]), - std::size_t(real_size[1]), - std::size_t(real_size[2]))}; - // Initialize fft_plan.m_plan with the vendor fft plan. - rocfft_status result = rocfft_plan_create(&(fft_plan.m_plan), - rocfft_placement_notinplace, - (dir == direction::R2C) - ? rocfft_transform_type_real_forward - : rocfft_transform_type_real_inverse, +#include + +#if __has_include() // ROCm 5.3+ +#include +#else +#include +#endif + +#include + #ifdef AMREX_USE_FLOAT - rocfft_precision_single, +static constexpr bool use_float = true; #else - rocfft_precision_double, +static constexpr bool use_float = false; #endif - dim, lengths, - 1, // number of transforms, - nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - //fft_plan.m_dim = dim; - - return fft_plan; + +struct VendorPlan { + rocfft_plan m_rocfftplan; + rocfft_execution_info m_execinfo; + std::size_t m_work_size = 0; + void* m_in; + void* m_out; +}; + +std::string rocfftErrorToString (const rocfft_status& err) { + if (err == rocfft_status_success) { + return std::string("rocfft_status_success"); + } else if (err == rocfft_status_failure) { + return std::string("rocfft_status_failure"); + } else if (err == rocfft_status_invalid_arg_value) { + return std::string("rocfft_status_invalid_arg_value"); + } else if (err == rocfft_status_invalid_dimensions) { + return std::string("rocfft_status_invalid_dimensions"); + } else if (err == rocfft_status_invalid_array_type) { + return std::string("rocfft_status_invalid_array_type"); + } else if (err == rocfft_status_invalid_strides) { + return std::string("rocfft_status_invalid_strides"); + } else if (err == rocfft_status_invalid_distance) { + return std::string("rocfft_status_invalid_distance"); + } else if (err == rocfft_status_invalid_offset) { + return std::string("rocfft_status_invalid_offset"); + } else { + return std::to_string(err) + " (unknown error code)"; } +} - void DestroyPlan (FFTplan& fft_plan) - { - rocfft_plan_destroy( fft_plan.m_plan ); +void assert_rocfft_status (std::string const& name, const rocfft_status& status) { + if (status != rocfft_status_success) { + amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status)); } +} + +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://rocm.docs.amd.com/projects/rocFFT/en/latest/allapi.html + m_plan = new VendorPlan; + + rocfft_transform_type transform_type; + std::size_t dimensions = 0; + std::size_t lengths[2] = {0, 0}; + std::size_t number_of_transforms = 0; + + switch (type) { + case FFTType::C2C_2D_fwd: + transform_type = rocfft_transform_type_complex_forward; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::C2C_2D_bkw: + transform_type = rocfft_transform_type_complex_inverse; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::C2R_2D: + transform_type = rocfft_transform_type_real_inverse; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::R2C_2D: + transform_type = rocfft_transform_type_real_forward; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by rocfft"); + return 0; + case FFTType::C2R_1D_batched: + transform_type = rocfft_transform_type_real_inverse; + dimensions = 1; + lengths[0] = nx; + number_of_transforms = ny; + break; + case FFTType::R2C_1D_batched: + transform_type = rocfft_transform_type_real_forward; + dimensions = 1; + lengths[0] = nx; + number_of_transforms = ny; + break; + } + + rocfft_status status; + + status = rocfft_plan_create( + &(m_plan->m_rocfftplan), + rocfft_placement_notinplace, + transform_type, + use_float ? rocfft_precision_single : rocfft_precision_double, + dimensions, + lengths, + number_of_transforms, + nullptr); + assert_rocfft_status("rocfft_plan_create", status); + + status = rocfft_plan_get_work_buffer_size(m_plan->m_rocfftplan, &(m_plan->m_work_size)); + assert_rocfft_status("rocfft_plan_get_work_buffer_size", status); + + return m_plan->m_work_size; +} + +void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { + m_plan->m_in = in; + m_plan->m_out = out; + + rocfft_status status; + + status = rocfft_execution_info_create(&(m_plan->m_execinfo)); + assert_rocfft_status("rocfft_execution_info_create", status); + + status = rocfft_execution_info_set_work_buffer( + m_plan->m_execinfo, work_area, m_plan->m_work_size); + assert_rocfft_status("rocfft_execution_info_set_work_buffer", status); + + status = rocfft_execution_info_set_stream(m_plan->m_execinfo, amrex::Gpu::gpuStream()); + assert_rocfft_status("rocfft_execution_info_set_stream", status); +} + +void AnyFFT::Execute () { + rocfft_status status; + + void* in_arr[2] = {m_plan->m_in, nullptr}; + void* out_arr[2] = {m_plan->m_out, nullptr}; + + status = rocfft_execute(m_plan->m_rocfftplan, in_arr, out_arr, m_plan->m_execinfo); + assert_rocfft_status("rocfft_execute", status); +} + +AnyFFT::~AnyFFT () { + if (m_plan) { + rocfft_status status; + + status = rocfft_execution_info_destroy(m_plan->m_execinfo); + assert_rocfft_status("rocfft_execution_info_destroy", status); + + status = rocfft_plan_destroy(m_plan->m_rocfftplan); + assert_rocfft_status("rocfft_plan_destroy", status); - void Execute (FFTplan& fft_plan) - { - HIPACE_PROFILE("AnyFFT::Execute()"); - rocfft_execution_info execinfo = NULL; - rocfft_status result = rocfft_execution_info_create(&execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - std::size_t buffersize = 0; - result = rocfft_plan_get_work_buffer_size(fft_plan.m_plan, &buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - void* buffer = amrex::The_Arena()->alloc(buffersize); - result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - if (fft_plan.m_dir == direction::R2C) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_real_array), // in - (void**)&(fft_plan.m_complex_array), // out - execinfo); - } else if (fft_plan.m_dir == direction::C2R) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_complex_array), // in - (void**)&(fft_plan.m_real_array), // out - execinfo); - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); - } - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - - amrex::Gpu::streamSynchronize(); - - amrex::The_Arena()->free(buffer); - - result = rocfft_execution_info_destroy(execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); + delete m_plan; } } diff --git a/src/fields/fft_poisson_solver/fft/AnyDST.H b/src/fields/fft_poisson_solver/fft_old/AnyDST.H similarity index 100% rename from src/fields/fft_poisson_solver/fft/AnyDST.H rename to src/fields/fft_poisson_solver/fft_old/AnyDST.H diff --git a/src/fields/fft_poisson_solver/fft_old/AnyFFT.H b/src/fields/fft_poisson_solver/fft_old/AnyFFT.H new file mode 100644 index 0000000000..cbbd077954 --- /dev/null +++ b/src/fields/fft_poisson_solver/fft_old/AnyFFT.H @@ -0,0 +1,120 @@ +/* Copyright 2020-2021 + * + * This file is part of HiPACE++. + * + * Authors: Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs + * WeiqunZhang + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. + * + * License: BSD-3-Clause-LBNL + */ +#ifndef ANYFFT_H_ +#define ANYFFT_H_ + +#include + +#ifdef AMREX_USE_CUDA +# include +#elif defined(AMREX_USE_HIP) +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +#else +# include +#endif + +#include + +/** + * \brief Wrapper around multiple FFT libraries. + * + * The header file defines the API and the base types + * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is + * done in different cpp files. This wrapper only depends on the underlying FFT library + * AND on AMReX (There is no dependence on WarpX). + */ +namespace AnyFFT +{ + // First, define library-dependent types (complex, FFT plan) + + /** Complex type for FFT, depends on FFT library */ +#ifdef AMREX_USE_CUDA +# ifdef AMREX_USE_FLOAT + using Complex = cuComplex; +# else + using Complex = cuDoubleComplex; +# endif +#elif defined(AMREX_USE_HIP) +# ifdef AMREX_USE_FLOAT + using Complex = float2; +# else + using Complex = double2; +# endif +#else +# ifdef AMREX_USE_FLOAT + using Complex = fftwf_complex; +# else + using Complex = fftw_complex; +# endif +#endif + + /** Library-dependent FFT plans type, which holds one fft plan per box + * (plans are only initialized for the boxes that are owned by the local MPI rank). + */ +#ifdef AMREX_USE_CUDA + using VendorFFTPlan = cufftHandle; +#elif defined(AMREX_USE_HIP) + using VendorFFTPlan = rocfft_plan; +#else +# ifdef AMREX_USE_FLOAT + using VendorFFTPlan = fftwf_plan; +# else + using VendorFFTPlan = fftw_plan; +# endif +#endif + + // Second, define library-independent API + + /** Direction in which the FFT is performed. */ + enum struct direction {R2C, C2R}; + + /** \brief This struct contains the vendor FFT plan and additional metadata + */ + struct FFTplan + { + amrex::Real* m_real_array; /**< pointer to real array */ + Complex* m_complex_array; /**< pointer to complex array */ + VendorFFTPlan m_plan; /**< Vendor FFT plan */ + direction m_dir; /**< direction (C2R or R2C) */ + }; + + /** Collection of FFT plans, one FFTplan per box */ + using FFTplans = amrex::LayoutData; + + /** \brief create FFT plan for the backend FFT library. + * \param[in] real_size Size of the real array, along each dimension. + * \param[out] real_array Real array from/to where R2C/C2R FFT is performed + * \param[out] complex_array Complex array to/from where R2C/C2R FFT is performed + * \param[in] dir direction, either R2C or C2R + */ + FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, + Complex * const complex_array, const direction dir); + + /** \brief Destroy library FFT plan. + * \param[out] fft_plan plan to destroy + */ + void DestroyPlan (FFTplan& fft_plan); + + /** \brief Perform FFT with backend library. + * \param[out] fft_plan plan for which the FFT is performed + */ + void Execute (FFTplan& fft_plan); +} + +#endif // ANYFFT_H_ diff --git a/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt b/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt new file mode 100644 index 0000000000..e89ecd84f3 --- /dev/null +++ b/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright 2020-2021 +# +# This file is part of HiPACE++. +# +# Authors: Axel Huebl, MaxThevenet, Remi Lehe +# License: BSD-3-Clause-LBNL + +if (HiPACE_COMPUTE STREQUAL CUDA) + target_sources(HiPACE + PRIVATE + WrapCuFFT.cpp + WrapCuDST.cpp + CuFFTUtils.cpp + ) +elseif(HiPACE_COMPUTE STREQUAL HIP) + target_sources(HiPACE + PRIVATE + WrapRocFFT.cpp + WrapRocDST.cpp + RocFFTUtils.cpp + ) +else() + target_sources(HiPACE + PRIVATE + WrapFFTW.cpp + WrapDSTW.cpp + ) +endif() diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.H b/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H similarity index 100% rename from src/fields/fft_poisson_solver/fft/CuFFTUtils.H rename to src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp b/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp similarity index 100% rename from src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp rename to src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.H b/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H similarity index 100% rename from src/fields/fft_poisson_solver/fft/RocFFTUtils.H rename to src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp b/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp similarity index 100% rename from src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp rename to src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp diff --git a/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp b/src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp similarity index 100% rename from src/fields/fft_poisson_solver/fft/WrapCuDST.cpp rename to src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp diff --git a/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp new file mode 100644 index 0000000000..739ab2cdcb --- /dev/null +++ b/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp @@ -0,0 +1,95 @@ +/* Copyright 2020-2022 + * + * This file is part of HiPACE++. + * + * Authors: MaxThevenet, Remi Lehe, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. + * + * License: BSD-3-Clause-LBNL + */ +#include "AnyFFT.H" +#include "CuFFTUtils.H" +#include "utils/HipaceProfilerWrapper.H" + +namespace AnyFFT +{ + +#ifdef AMREX_USE_FLOAT + cufftType VendorR2C = CUFFT_R2C; + cufftType VendorC2R = CUFFT_C2R; +#else + cufftType VendorR2C = CUFFT_D2Z; + cufftType VendorC2R = CUFFT_Z2D; +#endif + + FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, + Complex * const complex_array, const direction dir) + { + HIPACE_PROFILE("AnyFFT::CreatePlan()"); + FFTplan fft_plan; + + if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), + "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " + "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); + } + + // Initialize fft_plan.m_plan with the vendor fft plan. + cufftResult result; + if (dir == direction::R2C){ + result = cufftPlan2d( + &(fft_plan.m_plan), real_size[1], real_size[0], VendorR2C); + } else { + result = cufftPlan2d( + &(fft_plan.m_plan), real_size[1], real_size[0], VendorC2R); + } + + if ( result != CUFFT_SUCCESS ) { + amrex::Print() << " cufftplan failed! Error: " << + CuFFTUtils::cufftErrorToString(result) << "\n"; + } + + // Store meta-data in fft_plan + fft_plan.m_real_array = real_array; + fft_plan.m_complex_array = complex_array; + fft_plan.m_dir = dir; + + return fft_plan; + } + + void DestroyPlan (FFTplan& fft_plan) + { + cufftDestroy( fft_plan.m_plan ); + } + + void Execute (FFTplan& fft_plan){ + HIPACE_PROFILE("AnyFFT::Execute()"); + // make sure that this is done on the same GPU stream as the above copy + cudaStream_t stream = amrex::Gpu::Device::cudaStream(); + cufftSetStream ( fft_plan.m_plan, stream); + cufftResult result; + if (fft_plan.m_dir == direction::R2C){ +#ifdef AMREX_USE_FLOAT + result = cufftExecR2C(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); +#else + result = cufftExecD2Z(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); +#endif + } else if (fft_plan.m_dir == direction::C2R){ +#ifdef AMREX_USE_FLOAT + result = cufftExecC2R(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); +#else + result = cufftExecZ2D(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); +#endif + } else { + amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); + } + if ( result != CUFFT_SUCCESS ) { + amrex::Print() << " forward transform using cufftExec failed ! Error: " << + CuFFTUtils::cufftErrorToString(result) << "\n"; + } + } +} diff --git a/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp b/src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp similarity index 100% rename from src/fields/fft_poisson_solver/fft/WrapDSTW.cpp rename to src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp diff --git a/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp new file mode 100644 index 0000000000..8a6565dbed --- /dev/null +++ b/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp @@ -0,0 +1,72 @@ +/* Copyright 2020 + * + * This file is part of HiPACE++. + * + * Authors: MaxThevenet, Remi Lehe + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. + * + * License: BSD-3-Clause-LBNL + */ +#include "AnyFFT.H" +#include "utils/HipaceProfilerWrapper.H" + +namespace AnyFFT +{ +#ifdef AMREX_USE_FLOAT + const auto VendorCreatePlanR2C3D = fftwf_plan_dft_r2c_3d; + const auto VendorCreatePlanC2R3D = fftwf_plan_dft_c2r_3d; + const auto VendorCreatePlanR2C2D = fftwf_plan_dft_r2c_2d; + const auto VendorCreatePlanC2R2D = fftwf_plan_dft_c2r_2d; +#else + const auto VendorCreatePlanR2C3D = fftw_plan_dft_r2c_3d; + const auto VendorCreatePlanC2R3D = fftw_plan_dft_c2r_3d; + const auto VendorCreatePlanR2C2D = fftw_plan_dft_r2c_2d; + const auto VendorCreatePlanC2R2D = fftw_plan_dft_c2r_2d; +#endif + + FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, + Complex * const complex_array, const direction dir) + { + HIPACE_PROFILE("AnyFFT::CreatePlan()"); + FFTplan fft_plan; + + // Initialize fft_plan.m_plan with the vendor fft plan. + // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order + if (dir == direction::R2C){ + fft_plan.m_plan = VendorCreatePlanR2C2D( + real_size[1], real_size[0], real_array, complex_array, FFTW_ESTIMATE); + } else if (dir == direction::C2R){ + fft_plan.m_plan = VendorCreatePlanC2R2D( + real_size[1], real_size[0], complex_array, real_array, FFTW_ESTIMATE); + } + + // Store meta-data in fft_plan + fft_plan.m_real_array = real_array; + fft_plan.m_complex_array = complex_array; + fft_plan.m_dir = dir; + + return fft_plan; + } + + void DestroyPlan (FFTplan& fft_plan) + { +# ifdef AMREX_USE_FLOAT + fftwf_destroy_plan( fft_plan.m_plan ); +# else + fftw_destroy_plan( fft_plan.m_plan ); +# endif + } + + void Execute (FFTplan& fft_plan){ + HIPACE_PROFILE("Execute_FFTplan()"); +# ifdef AMREX_USE_FLOAT + fftwf_execute( fft_plan.m_plan ); +# else + fftw_execute( fft_plan.m_plan ); +# endif + } +} diff --git a/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp b/src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp similarity index 100% rename from src/fields/fft_poisson_solver/fft/WrapRocDST.cpp rename to src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp diff --git a/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp new file mode 100644 index 0000000000..0ba7a2c7db --- /dev/null +++ b/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp @@ -0,0 +1,102 @@ +/* Copyright 2021 + * + * This file is part of HiPACE++. + * + * Authors: Axel Huebl + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. + * + * License: BSD-3-Clause-LBNL + */ + +#include "AnyFFT.H" +#include "RocFFTUtils.H" +#include "utils/HipaceProfilerWrapper.H" + +namespace AnyFFT +{ + FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, + Complex * const complex_array, const direction dir) + { + HIPACE_PROFILE("AnyFFT::CreatePlan()"); + const int dim = 2; + FFTplan fft_plan; + + const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(real_size[0]), + std::size_t(real_size[1]), + std::size_t(real_size[2]))}; + // Initialize fft_plan.m_plan with the vendor fft plan. + rocfft_status result = rocfft_plan_create(&(fft_plan.m_plan), + rocfft_placement_notinplace, + (dir == direction::R2C) + ? rocfft_transform_type_real_forward + : rocfft_transform_type_real_inverse, +#ifdef AMREX_USE_FLOAT + rocfft_precision_single, +#else + rocfft_precision_double, +#endif + dim, lengths, + 1, // number of transforms, + nullptr); + RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); + + // Store meta-data in fft_plan + fft_plan.m_real_array = real_array; + fft_plan.m_complex_array = complex_array; + fft_plan.m_dir = dir; + //fft_plan.m_dim = dim; + + return fft_plan; + } + + void DestroyPlan (FFTplan& fft_plan) + { + rocfft_plan_destroy( fft_plan.m_plan ); + } + + void Execute (FFTplan& fft_plan) + { + HIPACE_PROFILE("AnyFFT::Execute()"); + rocfft_execution_info execinfo = NULL; + rocfft_status result = rocfft_execution_info_create(&execinfo); + RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(fft_plan.m_plan, &buffersize); + RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); + + if (fft_plan.m_dir == direction::R2C) { + result = rocfft_execute(fft_plan.m_plan, + (void**)&(fft_plan.m_real_array), // in + (void**)&(fft_plan.m_complex_array), // out + execinfo); + } else if (fft_plan.m_dir == direction::C2R) { + result = rocfft_execute(fft_plan.m_plan, + (void**)&(fft_plan.m_complex_array), // in + (void**)&(fft_plan.m_real_array), // out + execinfo); + } else { + amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); + } + + RocFFTUtils::assert_rocfft_status("rocfft_execute", result); + + amrex::Gpu::streamSynchronize(); + + amrex::The_Arena()->free(buffer); + + result = rocfft_execution_info_destroy(execinfo); + RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); + } +} diff --git a/src/laser/MultiLaser.H b/src/laser/MultiLaser.H index 0495fa68fb..3eff5e6a3b 100644 --- a/src/laser/MultiLaser.H +++ b/src/laser/MultiLaser.H @@ -13,59 +13,13 @@ #include "Laser.H" #include "fields/Fields.H" #include "mg_solver/HpMultiGrid.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include #include #include #include -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - -namespace LaserFFT { -#ifdef AMREX_USE_CUDA - using VendorFFT = cufftHandle; - const auto VendorCreate = cufftPlan2d; - const auto VendorDestroy = cufftDestroy; -# ifdef AMREX_USE_FLOAT - const auto VendorExecute = cufftExecC2C; - const auto cufft_type = CUFFT_C2C; - using cufftComplex = cuComplex; -# else - const auto VendorExecute = cufftExecZ2Z; - const auto cufft_type = CUFFT_Z2Z; - using cufftComplex = cuDoubleComplex; -# endif -#elif defined(AMREX_USE_HIP) - using VendorFFT = rocfft_plan; - const auto VendorDestroy = rocfft_plan_destroy; -// TODO -#else -# ifdef AMREX_USE_FLOAT - using VendorFFT = fftwf_plan; - using FFTWComplex = fftwf_complex; - const auto VendorCreate = fftwf_plan_dft_2d; - const auto VendorExecute = fftwf_execute; - const auto VendorDestroy = fftwf_destroy_plan; -# else - using VendorFFT = fftw_plan; - using FFTWComplex = fftw_complex; - const auto VendorCreate = fftw_plan_dft_2d; - const auto VendorExecute = fftw_execute; - const auto VendorDestroy = fftw_destroy_plan; -# endif -#endif -} - /** \brief describes which slice with respect to the currently calculated is used */ namespace WhichLaserSlice { // n stands for the time step, j for the longitudinal slice. @@ -108,15 +62,6 @@ public: ReadParameters(); } - ~MultiLaser () - { - if (!m_use_laser) return; - if (m_solver_type == "fft") { - LaserFFT::VendorDestroy( m_plan_fwd ); - LaserFFT::VendorDestroy( m_plan_bkw ); - } - } - void ReadParameters (); /** get function for the 2D slices */ @@ -265,26 +210,17 @@ private: /** hpmg solver for the envelope solver */ std::unique_ptr m_mg; - // Elements for the FFT-based laser envelope solver - // This could belong to AnyFFT etc., with 2 caveats: - // - This solver operates on a FArrayBox instead of a MultiFab, which is more adequate - // - The array in position space must be Complex rather than real, which takes up quite some - // rewriting, see https://github.com/MaxThevenet/hipace/tree/laser_solve, - // not sure what the best way to proceed is. /** FFTW plan for forward C2C transform to solve Complex Poisson equation */ - LaserFFT::VendorFFT m_plan_fwd; + AnyFFT m_forward_fft; /** FFTW plan for backward C2C transform to solve Complex Poisson equation */ - LaserFFT::VendorFFT m_plan_bkw; + AnyFFT m_backward_fft; + amrex::Gpu::DeviceVector m_fft_work_area; /** Complex FAB to store the solution (e.g. laser envelope on current slice) */ SpectralFieldLoc m_sol; /** Complex FAB to store the RHS in position space */ SpectralFieldLoc m_rhs; /** Complex FAB to store the RHS in Fourier space */ SpectralFieldLoc m_rhs_fourier; -#ifdef AMREX_USE_CUDA - cufftResult m_result_fwd; - cufftResult m_result_bkw; -#endif // Data for in-situ diagnostics: /** Number of real laser properties for in-situ per-slice reduced diagnostics. */ diff --git a/src/laser/MultiLaser.cpp b/src/laser/MultiLaser.cpp index fbcbd97f66..ec0828d153 100644 --- a/src/laser/MultiLaser.cpp +++ b/src/laser/MultiLaser.cpp @@ -13,11 +13,7 @@ #include "utils/HipaceProfilerWrapper.H" #include "utils/DeprecatedInput.H" #include "utils/InsituUtil.H" -#ifdef AMREX_USE_CUDA -# include "fields/fft_poisson_solver/fft/CuFFTUtils.H" -#elif defined(AMREX_USE_HIP) -# include "fields/fft_poisson_solver/fft/RocFFTUtils.H" -#endif +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "particles/particles_utils/ShapeFactors.H" #ifdef HIPACE_USE_OPENPMD # include @@ -25,18 +21,6 @@ #include -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - void MultiLaser::ReadParameters () { @@ -111,62 +95,13 @@ MultiLaser::InitData (const amrex::BoxArray& slice_ba, // Create FFT plans amrex::IntVect fft_size = m_slice_box.length(); -#ifdef AMREX_USE_CUDA - cufftResult result; - // Forward FFT plan - result = LaserFFT::VendorCreate( - &(m_plan_fwd), fft_size[1], fft_size[0], LaserFFT::cufft_type); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - // Backward FFT plan - result = LaserFFT::VendorCreate( - &(m_plan_bkw), fft_size[1], fft_size[0], LaserFFT::cufft_type); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - const std::size_t lengths[] = { - static_cast(fft_size[0]), static_cast(fft_size[1]) - }; - - rocfft_status result; - // Forward FFT plan - result = rocfft_plan_create(&(m_plan_fwd), rocfft_placement_notinplace, - rocfft_transform_type_complex_forward, -#ifdef AMREX_USE_FLOAT - rocfft_precision_single, -#else - rocfft_precision_double, -#endif - 2, lengths, 1, nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - // Backward FFT plan - result = rocfft_plan_create(&(m_plan_bkw), rocfft_placement_notinplace, - rocfft_transform_type_complex_inverse, -#ifdef AMREX_USE_FLOAT - rocfft_precision_single, -#else - rocfft_precision_double, -#endif - 2, lengths, 1, nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); -#else - // Forward FFT plan - m_plan_fwd = LaserFFT::VendorCreate( - fft_size[1], fft_size[0], - reinterpret_cast(m_rhs.dataPtr()), - reinterpret_cast(m_rhs_fourier.dataPtr()), - FFTW_FORWARD, FFTW_ESTIMATE); - // Backward FFT plan - m_plan_bkw = LaserFFT::VendorCreate( - fft_size[1], fft_size[0], - reinterpret_cast(m_rhs_fourier.dataPtr()), - reinterpret_cast(m_sol.dataPtr()), - FFTW_BACKWARD, FFTW_ESTIMATE); -#endif + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::C2C_2D_fwd, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2C_2D_bkw, fft_size[0], fft_size[1]); + + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_rhs.dataPtr(), m_rhs_fourier.dataPtr(), m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_rhs_fourier.dataPtr(), m_sol.dataPtr(), m_fft_work_area.dataPtr()); } if (m_laser_from_file) { @@ -924,36 +859,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste }); // Transform rhs to Fourier space -#ifdef AMREX_USE_CUDA - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream(m_plan_fwd, stream); - cufftResult result; - result = LaserFFT::VendorExecute( - m_plan_fwd, - reinterpret_cast( m_rhs.dataPtr() ), - reinterpret_cast( m_rhs_fourier.dataPtr() ), - CUFFT_FORWARD); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - rocfft_execution_info execinfo_fwd = nullptr; - rocfft_status result = rocfft_execution_info_create(&execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - result = rocfft_execution_info_set_stream(execinfo_fwd, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - void* in_buffer_fwd[2] = {(void*)m_rhs.dataPtr(), nullptr}; - void* out_bufferfwd[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr}; - - result = rocfft_execute(m_plan_fwd, in_buffer_fwd, out_bufferfwd, execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - result = rocfft_execution_info_destroy(execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); -#else - LaserFFT::VendorExecute( m_plan_fwd ); -#endif + m_forward_fft.Execute(); // Multiply by appropriate factors in Fourier space amrex::Real dkx = 2.*MathConst::pi/m_laser_geom_3D.ProbLength(0); @@ -975,35 +881,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste }); // Transform rhs to Fourier space to get solution in sol -#ifdef AMREX_USE_CUDA - cudaStream_t stream_bkw = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( m_plan_bkw, stream_bkw); - result = LaserFFT::VendorExecute( - m_plan_bkw, - reinterpret_cast( m_rhs_fourier.dataPtr() ), - reinterpret_cast( m_sol.dataPtr() ), - CUFFT_INVERSE); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - rocfft_execution_info execinfo_bkw = nullptr; - result = rocfft_execution_info_create(&execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - result = rocfft_execution_info_set_stream(execinfo_bkw, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - void* in_buffer_bkw[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr}; - void* out_buffer_bkw[2] = {(void*)m_sol.dataPtr(), nullptr}; - - result = rocfft_execute(m_plan_bkw, in_buffer_bkw, out_buffer_bkw, execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - result = rocfft_execution_info_destroy(execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); -#else - LaserFFT::VendorExecute( m_plan_bkw ); -#endif + m_backward_fft.Execute(); // Normalize and store solution in np1j00[0]. Guard cells are filled with 0s. amrex::Box grown_bx = bx; From a8511c7a3d3c954697b75cdcc3a94645d9cc264b Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Sun, 14 Apr 2024 01:42:48 +0200 Subject: [PATCH 02/11] fix fftw and cufft --- src/fields/fft_poisson_solver/CMakeLists.txt | 2 + .../FFTPoissonSolverDirichletExpanded.H | 76 ++++++++ .../FFTPoissonSolverDirichletExpanded.cpp | 173 ++++++++++++++++++ .../FFTPoissonSolverDirichletFast.H | 72 ++++++++ .../FFTPoissonSolverDirichletFast.cpp | 136 ++++++++++++++ src/fields/fft_poisson_solver/fft/AnyFFT.H | 2 + .../fft_poisson_solver/fft/WrapCuFFT.cpp | 24 +-- .../fft_poisson_solver/fft/WrapFFTW.cpp | 99 ++++++---- 8 files changed, 533 insertions(+), 51 deletions(-) create mode 100644 src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H create mode 100644 src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp create mode 100644 src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H create mode 100644 src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp diff --git a/src/fields/fft_poisson_solver/CMakeLists.txt b/src/fields/fft_poisson_solver/CMakeLists.txt index 847506f194..d0292fb2aa 100644 --- a/src/fields/fft_poisson_solver/CMakeLists.txt +++ b/src/fields/fft_poisson_solver/CMakeLists.txt @@ -10,6 +10,8 @@ target_sources(HiPACE FFTPoissonSolver.cpp FFTPoissonSolverPeriodic.cpp FFTPoissonSolverDirichlet.cpp + FFTPoissonSolverDirichletFast.cpp + FFTPoissonSolverDirichletExpanded.cpp MGPoissonSolverDirichlet.cpp ) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H new file mode 100644 index 0000000000..d559e2ca5f --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H @@ -0,0 +1,76 @@ +/* Copyright 2020-2021 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_H_ + +#include "fields/fft_poisson_solver/fft/AnyFFT.H" +#include "FFTPoissonSolver.H" + +#include +#include + +/** + * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves. + * + * For a given source S, it solves equation Laplacian(F) = S and returns F. + * Once an instance is created, a typical use consists in: + * 1. Compute S directly in FFTPoissonSolver::m_stagingArea + * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS + * in the staging area and return the LHS in mf. + */ +class FFTPoissonSolverDirichletExpanded final : public FFTPoissonSolver +{ +public: + /** Constructor */ + FFTPoissonSolverDirichletExpanded ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** virtual destructor */ + virtual ~FFTPoissonSolverDirichletExpanded () override final {} + + /** + * \brief Define real space and spectral space boxes and multifabs, Dirichlet + * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans. + * Currently only works with a single box, i.e., serial FFT. + * + * \param[in] realspace_ba BoxArray on which the FFT is executed. + * \param[in] dm DistributionMapping for the BoxArray. + * \param[in] gm Geometry, contains the box dimensions. + */ + void define ( amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** + * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call. + * + * \param[in] lhs_mf Destination array, where the result is stored. + */ + virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final; + + /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */ + virtual amrex::Real BoundaryOffset() override final { return 1.; } + virtual amrex::Real BoundaryFactor() override final { return 1.; } + +private: + /** Spectral fields, contains (real) field in Fourier space */ + amrex::MultiFab m_tmpSpectralField; + /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ + amrex::MultiFab m_eigenvalue_matrix; + + amrex::FArrayBox m_expanded_position_array; + + amrex::BaseFab> m_expanded_fourier_array; + /** DST plans */ + AnyFFT m_forward_fft; + AnyFFT m_backward_fft; + amrex::Gpu::DeviceVector m_fft_work_area; +}; + +#endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp new file mode 100644 index 0000000000..11f3bdf90b --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp @@ -0,0 +1,173 @@ +/* Copyright 2020-2022 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs + * + * License: BSD-3-Clause-LBNL + */ +#include "FFTPoissonSolverDirichletExpanded.H" +#include "fft/AnyFFT.H" +#include "fields/Fields.H" +#include "utils/Constants.H" +#include "utils/GPUUtil.H" +#include "utils/HipaceProfilerWrapper.H" + +FFTPoissonSolverDirichletExpanded::FFTPoissonSolverDirichletExpanded ( + amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + define(realspace_ba, dm, gm); +} + +void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) +{ + const amrex::Box bx = src.box(); + const int nx = bx.length(0); + const int ny = bx.length(1); + const amrex::IntVect lo = bx.smallEnd(); + Array2 const src_array = src.const_array(); + Array2 const dst_array = dst.array(); + + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE(int i, int j, int) + { + /* upper left quadrant */ + dst_array(i+1,j+1) = src_array(i, j); + /* lower left quadrant */ + dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); + /* upper right quadrant */ + dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); + /* lower right quadrant */ + dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); + }); +} + +void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) +{ + const amrex::Box bx = dst.box(); + Array2 const> const src_array = src.const_array(); + Array2 const dst_array = dst.array(); + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE(int i, int j, int) + { + /* upper left quadrant */ + dst_array(i,j) = -src_array(i+1, j+1).real(); + }); +} + +void +FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + using namespace amrex::literals; + + // If we are going to support parallel FFT, the constructor needs to take a communicator. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet"); + + // Allocate temporary arrays - in real space and spectral space + // These arrays will store the data just before/after the FFT + // The stagingArea is also created from 0 to nx, because the real space array may have + // an offset for levels > 0 + m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required + m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards); + + // This must be true even for parallel FFT. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1, + "There should be only one box locally."); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1, + "There should be only one box locally."); + + const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const auto dx = gm.CellSizeArray(); + const amrex::Real dxsquared = dx[0]*dx[0]; + const amrex::Real dysquared = dx[1]*dx[1]; + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_size[0] + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_size[1] + 1 )); + + // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) + // This normalization is used regardless of the sine transform library + const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_size[0] + 1 ) + *( fft_size[1] + 1 ))); + + // Calculate the array of m_eigenvalue_matrix + for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + amrex::IntVect lo = fft_box.smallEnd(); + amrex::ParallelFor( + fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept + { + /* fast poisson solver diagonal x coeffs */ + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); + /* fast poisson solver diagonal y coeffs */ + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); + + if ((sinex_sq!=0) && (siney_sq!=0)) { + eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); + } else { + // Avoid division by 0 + eigenvalue_matrix(i,j) = 0._rt; + } + }); + } + + // Allocate and initialize the FFT plans + amrex::IntVect fft_size = m_stagingArea[0].box().length(); + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); +} + + +void +FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::SolvePoissonEquation()"); + + m_forward_fft.Execute(); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Solve Poisson equation in Fourier space: + // Multiply `tmpSpectralField` by eigenvalue_matrix + Array2 tmp_cmplx_arr = m_tmpSpectralField.array(mfi); + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + + amrex::ParallelFor( mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j); + }); + } + + m_backward_fft.Execute(); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Copy from the staging area to output array (and normalize) + Array2 tmp_real_arr = m_stagingArea.array(mfi); + Array2 lhs_arr = lhs_mf.array(mfi); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1, + "Slice MFs must be defined on one box only"); + amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + // Copy field + lhs_arr(i,j) = tmp_real_arr(i,j); + }); + } +} diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H new file mode 100644 index 0000000000..6022b0e7df --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H @@ -0,0 +1,72 @@ +/* Copyright 2020-2021 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_H_ + +#include "fields/fft_poisson_solver/fft/AnyFFT.H" +#include "FFTPoissonSolver.H" + +#include +#include + +/** + * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves. + * + * For a given source S, it solves equation Laplacian(F) = S and returns F. + * Once an instance is created, a typical use consists in: + * 1. Compute S directly in FFTPoissonSolver::m_stagingArea + * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS + * in the staging area and return the LHS in mf. + */ +class FFTPoissonSolverDirichletFast final : public FFTPoissonSolver +{ +public: + /** Constructor */ + FFTPoissonSolverDirichletFast ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** virtual destructor */ + virtual ~FFTPoissonSolverDirichletFast () override final {} + + /** + * \brief Define real space and spectral space boxes and multifabs, Dirichlet + * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans. + * Currently only works with a single box, i.e., serial FFT. + * + * \param[in] realspace_ba BoxArray on which the FFT is executed. + * \param[in] dm DistributionMapping for the BoxArray. + * \param[in] gm Geometry, contains the box dimensions. + */ + void define ( amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** + * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call. + * + * \param[in] lhs_mf Destination array, where the result is stored. + */ + virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final; + + /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */ + virtual amrex::Real BoundaryOffset() override final { return 1.; } + virtual amrex::Real BoundaryFactor() override final { return 1.; } + +private: + /** Spectral fields, contains (real) field in Fourier space */ + amrex::MultiFab m_tmpSpectralField; + /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ + amrex::MultiFab m_eigenvalue_matrix; + /** DST plans */ + AnyFFT m_forward_fft; + AnyFFT m_backward_fft; + amrex::Gpu::DeviceVector m_fft_work_area; +}; + +#endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp new file mode 100644 index 0000000000..9d05318593 --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -0,0 +1,136 @@ +/* Copyright 2020-2022 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs + * + * License: BSD-3-Clause-LBNL + */ +#include "FFTPoissonSolverDirichletFast.H" +#include "fft/AnyFFT.H" +#include "fields/Fields.H" +#include "utils/Constants.H" +#include "utils/GPUUtil.H" +#include "utils/HipaceProfilerWrapper.H" + +FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast ( + amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + define(realspace_ba, dm, gm); +} + +void +FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + using namespace amrex::literals; + + // If we are going to support parallel FFT, the constructor needs to take a communicator. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet"); + + // Allocate temporary arrays - in real space and spectral space + // These arrays will store the data just before/after the FFT + // The stagingArea is also created from 0 to nx, because the real space array may have + // an offset for levels > 0 + m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required + m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards); + + // This must be true even for parallel FFT. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1, + "There should be only one box locally."); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1, + "There should be only one box locally."); + + const amrex::Box fft_box = m_stagingArea[0].box(); + const auto dx = gm.CellSizeArray(); + const amrex::Real dxsquared = dx[0]*dx[0]; + const amrex::Real dysquared = dx[1]*dx[1]; + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_box.length(0) + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_box.length(1) + 1 )); + + // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) + // This normalization is used regardless of the sine transform library + const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_box.length(0) + 1 ) + *( fft_box.length(1) + 1 ))); + + // Calculate the array of m_eigenvalue_matrix + for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + amrex::IntVect lo = fft_box.smallEnd(); + amrex::ParallelFor( + fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept + { + /* fast poisson solver diagonal x coeffs */ + amrex::Real sinex_sq = sin(( i - lo[0] + 1 ) * sine_x_factor) * sin(( i - lo[0] + 1 ) * sine_x_factor); + /* fast poisson solver diagonal y coeffs */ + amrex::Real siney_sq = sin(( j - lo[1] + 1 ) * sine_y_factor) * sin(( j - lo[1] + 1 ) * sine_y_factor); + + if ((sinex_sq!=0) && (siney_sq!=0)) { + eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); + } else { + // Avoid division by 0 + eigenvalue_matrix(i,j) = 0._rt; + } + }); + } + + // Allocate and initialize the FFT plans + amrex::IntVect fft_size = m_stagingArea[0].box().length(); + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); +} + + +void +FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletFast::SolvePoissonEquation()"); + + m_forward_fft.Execute(); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Solve Poisson equation in Fourier space: + // Multiply `tmpSpectralField` by eigenvalue_matrix + Array2 tmp_cmplx_arr = m_tmpSpectralField.array(mfi); + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + + amrex::ParallelFor( mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j); + }); + } + + m_backward_fft.Execute(); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Copy from the staging area to output array (and normalize) + Array2 tmp_real_arr = m_stagingArea.array(mfi); + Array2 lhs_arr = lhs_mf.array(mfi); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1, + "Slice MFs must be defined on one box only"); + amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + // Copy field + lhs_arr(i,j) = tmp_real_arr(i,j); + }); + } +} diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index c98e695234..d4014c5f31 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -9,6 +9,8 @@ #ifndef ANYFFT_H_ #define ANYFFT_H_ +#include + struct VendorPlan; enum struct FFTType { diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index bd0f8eb8f4..37943ec33b 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -11,7 +11,7 @@ #include -#include "CuFFTUtils.H" +#include #include #include @@ -69,32 +69,32 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { switch (type) { case FFTType::C2C_2D_fwd: transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; - m_plan->m_direction = cuFFTFORWARD; + m_plan->m_direction = CUFFT_FORWARD; rank = 2; - n[0] = nx; - n[1] = ny; + n[0] = ny; + n[1] = nx; batch = 1; break; case FFTType::C2C_2D_bkw: transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; - m_plan->m_direction = cuFFTINVERSE; + m_plan->m_direction = CUFFT_INVERSE; rank = 2; - n[0] = nx; - n[1] = ny; + n[0] = ny; + n[1] = nx; batch = 1; break; case FFTType::C2R_2D: transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D; rank = 2; - n[0] = nx; - n[1] = ny; + n[0] = ny; + n[1] = nx; batch = 1; break; case FFTType::R2C_2D: transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z; rank = 2; - n[0] = nx; - n[1] = ny; + n[0] = ny; + n[1] = nx; batch = 1; break; case FFTType::R2R_2D: @@ -135,7 +135,7 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { &workSize); assert_cufft_status("cufftMakePlanMany64", result); - result = cufftSetStream(m_plan->m_cufftplan, amrex::Gpu::Device::cudaStream(()); + result = cufftSetStream(m_plan->m_cufftplan, amrex::Gpu::Device::cudaStream()); assert_cufft_status("cufftSetStream", result); return workSize; diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 5cd9e32f60..204314a3e5 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -22,8 +22,9 @@ static constexpr bool use_float = false; #endif struct VendorPlan { - std::conditional_t m_fftwplan; - AnyFFT::FFTType m_type; + fftwf_plan m_fftwf_plan; + fftw_plan m_fftw_plan; + FFTType m_type; int m_nx; int m_ny; }; @@ -38,46 +39,56 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { return 0; } -void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { +void AnyFFT::SetBuffers (void* in, void* out, [[maybe_unused]] void* work_area) { if constexpr (use_float) { switch (m_plan->m_type) { case FFTType::C2C_2D_fwd: - m_plan->m_fftwplan = fftwf_plan_dft_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_FORWARD, FFTW_MEASURE); + m_plan->m_fftwf_plan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_FORWARD, FFTW_MEASURE); break; case FFTType::C2C_2D_bkw: - m_plan->m_fftwplan = fftwf_plan_dft_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_BACKWARD, FFTW_MEASURE); + m_plan->m_fftwf_plan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_BACKWARD, FFTW_MEASURE); break; case FFTType::C2R_2D: - m_plan->m_fftwplan = fftwf_plan_dft_c2r_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + m_plan->m_fftwf_plan = fftwf_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); break; case FFTType::R2C_2D: - m_plan->m_fftwplan = fftwf_plan_dft_r2c_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + m_plan->m_fftwf_plan = fftwf_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); break; case FFTType::R2R_2D: - m_plan->m_fftwplan = fftwf_plan_r2r_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); + m_plan->m_fftwf_plan = fftwf_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); break; case FFTType::C2R_1D_batched: { int n[1] = {m_plan->m_nx}; - m_plan->m_fftwplan = fftwf_plan_many_dft_c2r( + m_plan->m_fftwf_plan = fftwf_plan_many_dft_c2r( 1, n, m_plan->m_ny, - in, nullptr, 1, m_plan->m_nx/2+1, - out, nullptr, 1, m_plan->m_nx, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx, FFTW_MEASURE); } break; case FFTType::R2C_1D_batched: { int n[1] = {m_plan->m_nx}; - m_plan->m_fftwplan = fftwf_plan_many_dft_r2c( + m_plan->m_fftwf_plan = fftwf_plan_many_dft_r2c( 1, n, m_plan->m_ny, - in, nullptr, 1, m_plan->m_nx, - out, nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx/2+1, FFTW_MEASURE); } break; @@ -85,42 +96,52 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { } else { switch (m_plan->m_type) { case FFTType::C2C_2D_fwd: - m_plan->m_fftwplan = fftw_plan_dft_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_FORWARD, FFTW_MEASURE); + m_plan->m_fftw_plan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_FORWARD, FFTW_MEASURE); break; case FFTType::C2C_2D_bkw: - m_plan->m_fftwplan = fftw_plan_dft_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_BACKWARD, FFTW_MEASURE); + m_plan->m_fftw_plan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_BACKWARD, FFTW_MEASURE); break; case FFTType::C2R_2D: - m_plan->m_fftwplan = fftw_plan_dft_c2r_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + m_plan->m_fftw_plan = fftw_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); break; case FFTType::R2C_2D: - m_plan->m_fftwplan = fftw_plan_dft_r2c_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + m_plan->m_fftw_plan = fftw_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); break; case FFTType::R2R_2D: - m_plan->m_fftwplan = fftw_plan_r2r_2d( - m_plan->m_ny, m_plan->m_nx, in, out, FFTW_MEASURE); + m_plan->m_fftw_plan = fftw_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); break; case FFTType::C2R_1D_batched: { int n[1] = {m_plan->m_nx}; - m_plan->m_fftwplan = fftw_plan_many_dft_c2r( + m_plan->m_fftw_plan = fftw_plan_many_dft_c2r( 1, n, m_plan->m_ny, - in, nullptr, 1, m_plan->m_nx/2+1, - out, nullptr, 1, m_plan->m_nx, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx, FFTW_MEASURE); } break; case FFTType::R2C_1D_batched: { int n[1] = {m_plan->m_nx}; - m_plan->m_fftwplan = fftw_plan_many_dft_r2c( + m_plan->m_fftw_plan = fftw_plan_many_dft_r2c( 1, n, m_plan->m_ny, - in, nullptr, 1, m_plan->m_nx, - out, nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx/2+1, FFTW_MEASURE); } break; @@ -130,18 +151,18 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { void AnyFFT::Execute () { if constexpr (use_float) { - fftwf_execute(m_plan->m_fftwplan); + fftwf_execute(m_plan->m_fftwf_plan); } else { - fftw_execute(m_plan->m_fftwplan); + fftw_execute(m_plan->m_fftw_plan); } } AnyFFT::~AnyFFT () { if (m_plan) { if constexpr (use_float) { - fftwf_destroy_plan(m_plan->m_fftwplan); + fftwf_destroy_plan(m_plan->m_fftwf_plan); } else { - fftw_destroy_plan(m_plan->m_fftwplan); + fftw_destroy_plan(m_plan->m_fftw_plan); } delete m_plan; } From 50b2f6a913b3241d54f3816bea5854e491657612 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Wed, 17 Apr 2024 04:00:28 +0200 Subject: [PATCH 03/11] add Expanded and Fast solvers --- src/fields/Fields.H | 2 +- src/fields/Fields.cpp | 32 +- src/fields/fft_poisson_solver/CMakeLists.txt | 4 +- ...et.H => FFTPoissonSolverDirichletDirect.H} | 14 +- ...pp => FFTPoissonSolverDirichletDirect.cpp} | 30 +- .../FFTPoissonSolverDirichletExpanded.H | 9 +- .../FFTPoissonSolverDirichletExpanded.cpp | 48 +- .../FFTPoissonSolverDirichletFast.H | 13 +- .../FFTPoissonSolverDirichletFast.cpp | 206 ++++++++- .../FFTPoissonSolverPeriodic.cpp | 1 + .../MGPoissonSolverDirichlet.cpp | 1 + .../fft_poisson_solver/fft/WrapCuFFT.cpp | 2 +- .../fft_poisson_solver/fft/WrapFFTW.cpp | 1 - .../fft_poisson_solver/fft/WrapRocFFT.cpp | 2 +- .../fft_poisson_solver/fft_old/AnyDST.H | 86 ---- .../fft_poisson_solver/fft_old/AnyFFT.H | 120 ----- .../fft_poisson_solver/fft_old/CMakeLists.txt | 28 -- .../fft_poisson_solver/fft_old/CuFFTUtils.H | 27 -- .../fft_poisson_solver/fft_old/CuFFTUtils.cpp | 37 -- .../fft_poisson_solver/fft_old/RocFFTUtils.H | 33 -- .../fft_old/RocFFTUtils.cpp | 45 -- .../fft_poisson_solver/fft_old/WrapCuDST.cpp | 429 ------------------ .../fft_poisson_solver/fft_old/WrapCuFFT.cpp | 95 ---- .../fft_poisson_solver/fft_old/WrapDSTW.cpp | 84 ---- .../fft_poisson_solver/fft_old/WrapFFTW.cpp | 72 --- .../fft_poisson_solver/fft_old/WrapRocDST.cpp | 421 ----------------- .../fft_poisson_solver/fft_old/WrapRocFFT.cpp | 102 ----- 27 files changed, 292 insertions(+), 1652 deletions(-) rename src/fields/fft_poisson_solver/{FFTPoissonSolverDirichlet.H => FFTPoissonSolverDirichletDirect.H} (83%) rename src/fields/fft_poisson_solver/{FFTPoissonSolverDirichlet.cpp => FFTPoissonSolverDirichletDirect.cpp} (81%) delete mode 100644 src/fields/fft_poisson_solver/fft_old/AnyDST.H delete mode 100644 src/fields/fft_poisson_solver/fft_old/AnyFFT.H delete mode 100644 src/fields/fft_poisson_solver/fft_old/CMakeLists.txt delete mode 100644 src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H delete mode 100644 src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H delete mode 100644 src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp delete mode 100644 src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp diff --git a/src/fields/Fields.H b/src/fields/Fields.H index 45132492a7..96e6b485a4 100644 --- a/src/fields/Fields.H +++ b/src/fields/Fields.H @@ -507,7 +507,7 @@ private: /** Vector over levels of all required fields to compute current slice */ amrex::Vector m_slices; /** Type of poisson solver to use */ - std::string m_poisson_solver_str = "FFTDirichlet"; + std::string m_poisson_solver_str = ""; /** Class to handle transverse FFT Poisson solver on 1 slice */ amrex::Vector> m_poisson_solver; /** Stores temporary values for z interpolation in Fields::Copy */ diff --git a/src/fields/Fields.cpp b/src/fields/Fields.cpp index 3d1ea83b18..07917f2511 100644 --- a/src/fields/Fields.cpp +++ b/src/fields/Fields.cpp @@ -8,7 +8,9 @@ */ #include "Fields.H" #include "fft_poisson_solver/FFTPoissonSolverPeriodic.H" -#include "fft_poisson_solver/FFTPoissonSolverDirichlet.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletDirect.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletFast.H" #include "fft_poisson_solver/MGPoissonSolverDirichlet.H" #include "Hipace.H" #include "OpenBoundary.H" @@ -29,6 +31,11 @@ Fields::Fields (const int nlev) { amrex::ParmParse ppf("fields"); DeprecatedInput("fields", "do_dirichlet_poisson", "poisson_solver", ""); +#ifdef AMREX_USE_GPU + m_poisson_solver_str = "FFTDirichletFast"; +#else + m_poisson_solver_str = "FFTDirichletDirect"; +#endif queryWithParser(ppf, "poisson_solver", m_poisson_solver_str); queryWithParser(ppf, "extended_solve", m_extended_solve); queryWithParser(ppf, "open_boundary", m_open_boundary); @@ -178,11 +185,21 @@ Fields::AllocData ( // The Poisson solver operates on transverse slices only. // The constructor takes the BoxArray and the DistributionMap of a slice, // so the FFTPlans are built on a slice. - if (m_poisson_solver_str == "FFTDirichlet"){ - m_poisson_solver.push_back(std::unique_ptr( - new FFTPoissonSolverDirichlet(getSlices(lev).boxArray(), - getSlices(lev).DistributionMap(), - geom)) ); + if (m_poisson_solver_str == "FFTDirichletDirect"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletDirect(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); + } else if (m_poisson_solver_str == "FFTDirichletExpanded"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletExpanded(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); + } else if (m_poisson_solver_str == "FFTDirichletFast"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletFast(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); } else if (m_poisson_solver_str == "FFTPeriodic") { m_poisson_solver.push_back(std::unique_ptr( new FFTPoissonSolverPeriodic(getSlices(lev).boxArray(), @@ -195,7 +212,8 @@ Fields::AllocData ( geom)) ); } else { amrex::Abort("Unknown poisson solver '" + m_poisson_solver_str + - "', must be 'FFTDirichlet', 'FFTPeriodic' or 'MGDirichlet'"); + "', must be 'FFTDirichletDirect', 'FFTDirichletExpanded', 'FFTDirichletFast', " + + "'FFTPeriodic' or 'MGDirichlet'"); } if (lev == 0 && m_insitu_period > 0) { diff --git a/src/fields/fft_poisson_solver/CMakeLists.txt b/src/fields/fft_poisson_solver/CMakeLists.txt index d0292fb2aa..46e480ec2f 100644 --- a/src/fields/fft_poisson_solver/CMakeLists.txt +++ b/src/fields/fft_poisson_solver/CMakeLists.txt @@ -9,9 +9,9 @@ target_sources(HiPACE PRIVATE FFTPoissonSolver.cpp FFTPoissonSolverPeriodic.cpp - FFTPoissonSolverDirichlet.cpp - FFTPoissonSolverDirichletFast.cpp + FFTPoissonSolverDirichletDirect.cpp FFTPoissonSolverDirichletExpanded.cpp + FFTPoissonSolverDirichletFast.cpp MGPoissonSolverDirichlet.cpp ) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H similarity index 83% rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H index 53e002ab09..86a3ab914f 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H @@ -5,8 +5,8 @@ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs * License: BSD-3-Clause-LBNL */ -#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ -#define FFT_POISSON_SOLVER_DIRICHLET_H_ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_ #include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "FFTPoissonSolver.H" @@ -23,16 +23,16 @@ * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS * in the staging area and return the LHS in mf. */ -class FFTPoissonSolverDirichlet final : public FFTPoissonSolver +class FFTPoissonSolverDirichletDirect final : public FFTPoissonSolver { public: /** Constructor */ - FFTPoissonSolverDirichlet ( amrex::BoxArray const& a_realspace_ba, - amrex::DistributionMapping const& dm, - amrex::Geometry const& gm); + FFTPoissonSolverDirichletDirect ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); /** virtual destructor */ - virtual ~FFTPoissonSolverDirichlet () override final {} + virtual ~FFTPoissonSolverDirichletDirect () override final {} /** * \brief Define real space and spectral space boxes and multifabs, Dirichlet diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp similarity index 81% rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp index f61404886e..bec25bd937 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp @@ -6,14 +6,14 @@ * * License: BSD-3-Clause-LBNL */ -#include "FFTPoissonSolverDirichlet.H" +#include "FFTPoissonSolverDirichletDirect.H" #include "fft/AnyFFT.H" #include "fields/Fields.H" #include "utils/Constants.H" #include "utils/GPUUtil.H" #include "utils/HipaceProfilerWrapper.H" -FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet ( +FFTPoissonSolverDirichletDirect::FFTPoissonSolverDirichletDirect ( amrex::BoxArray const& realspace_ba, amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) @@ -22,10 +22,11 @@ FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet ( } void -FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, - amrex::DistributionMapping const& dm, - amrex::Geometry const& gm ) +FFTPoissonSolverDirichletDirect::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. @@ -48,16 +49,18 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, "There should be only one box locally."); const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; const auto dx = gm.CellSizeArray(); const amrex::Real dxsquared = dx[0]*dx[0]; const amrex::Real dysquared = dx[1]*dx[1]; - const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_box.length(0) + 1 )); - const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_box.length(1) + 1 )); + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) // This normalization is used regardless of the sine transform library - const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_box.length(0) + 1 ) - *( fft_box.length(1) + 1 ))); + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); // Calculate the array of m_eigenvalue_matrix for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ @@ -67,9 +70,9 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept { /* fast poisson solver diagonal x coeffs */ - amrex::Real sinex_sq = sin(( i - lo[0] + 1 ) * sine_x_factor) * sin(( i - lo[0] + 1 ) * sine_x_factor); + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); /* fast poisson solver diagonal y coeffs */ - amrex::Real siney_sq = sin(( j - lo[1] + 1 ) * sine_y_factor) * sin(( j - lo[1] + 1 ) * sine_y_factor); + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); if ((sinex_sq!=0) && (siney_sq!=0)) { eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); @@ -81,7 +84,6 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, } // Allocate and initialize the FFT plans - amrex::IntVect fft_size = m_stagingArea[0].box().length(); std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); @@ -95,9 +97,9 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, void -FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +FFTPoissonSolverDirichletDirect::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { - HIPACE_PROFILE("FFTPoissonSolverDirichlet::SolvePoissonEquation()"); + HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::SolvePoissonEquation()"); m_forward_fft.Execute(); diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H index d559e2ca5f..79b6b79a8c 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H @@ -5,8 +5,8 @@ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs * License: BSD-3-Clause-LBNL */ -#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ -#define FFT_POISSON_SOLVER_DIRICHLET_H_ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_ #include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "FFTPoissonSolver.H" @@ -67,9 +67,8 @@ private: amrex::FArrayBox m_expanded_position_array; amrex::BaseFab> m_expanded_fourier_array; - /** DST plans */ - AnyFFT m_forward_fft; - AnyFFT m_backward_fft; + /** DST plan */ + AnyFFT m_fft; amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp index 11f3bdf90b..0831a5e12f 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp @@ -62,6 +62,7 @@ FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. @@ -85,16 +86,17 @@ FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba const amrex::Box fft_box = m_stagingArea[0].box(); const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; const auto dx = gm.CellSizeArray(); const amrex::Real dxsquared = dx[0]*dx[0]; const amrex::Real dysquared = dx[1]*dx[1]; - const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_size[0] + 1 )); - const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_size[1] + 1 )); + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) // This normalization is used regardless of the sine transform library - const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_size[0] + 1 ) - *( fft_size[1] + 1 ))); + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); // Calculate the array of m_eigenvalue_matrix for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ @@ -117,17 +119,27 @@ FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba }); } + // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) + // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) + amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; + amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; + // shift box to match rest of fields + expanded_position_box += fft_box.smallEnd(); + expanded_fourier_box += fft_box.smallEnd(); + + m_expanded_position_array.resize(expanded_position_box); + m_expanded_fourier_array.resize(expanded_fourier_box); + + m_expanded_position_array.setVal(0._rt); + // Allocate and initialize the FFT plans - amrex::IntVect fft_size = m_stagingArea[0].box().length(); - std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); - std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t wrok_size = m_fft.Initialize(FFTType::R2C_2D, expanded_position_box.length(0), + expanded_position_box.length(1)); - m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + m_fft_work_area.resize(wrok_size); - m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), - m_fft_work_area.dataPtr()); - m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), - m_fft_work_area.dataPtr()); + m_fft.SetBuffers(m_expanded_position_array.dataPtr(), m_expanded_fourier_array.dataPtr(), + m_fft_work_area.dataPtr()); } @@ -136,7 +148,11 @@ FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf { HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::SolvePoissonEquation()"); - m_forward_fft.Execute(); + ExpandR2R(m_expanded_position_array, m_stagingArea[0]); + + m_fft.Execute(); + + ShrinkC2R(m_tmpSpectralField[0], m_expanded_fourier_array); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -153,7 +169,11 @@ FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf }); } - m_backward_fft.Execute(); + ExpandR2R(m_expanded_position_array, m_tmpSpectralField[0]); + + m_fft.Execute(); + + ShrinkC2R(m_stagingArea[0], m_expanded_fourier_array); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H index 6022b0e7df..8e47b367ae 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H @@ -5,8 +5,8 @@ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs * License: BSD-3-Clause-LBNL */ -#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ -#define FFT_POISSON_SOLVER_DIRICHLET_H_ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_FAST_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_FAST_H_ #include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "FFTPoissonSolver.H" @@ -63,9 +63,14 @@ private: amrex::MultiFab m_tmpSpectralField; /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; + + amrex::Gpu::DeviceVector m_position_array; + + amrex::Gpu::DeviceVector> m_fourier_array; + /** DST plans */ - AnyFFT m_forward_fft; - AnyFFT m_backward_fft; + AnyFFT m_x_fft; + AnyFFT m_y_fft; amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp index 9d05318593..7476eba77c 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -21,11 +21,142 @@ FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast ( define(realspace_ba, dm, gm); } +/** \brief Make Complex array out of Real array to prepare for fft. + * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with + * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1] + * + * \param[in] in input real array + * \param[out] out output complex array + * \param[in] n_data number of (contiguous) rows in position matrix + * \param[in] n_batch number of (strided) columns in position matrix + */ +void ToComplex (const amrex::Real* const AMREX_RESTRICT in, + amrex::GpuComplex* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ + const int n_half = (n_data+1)/2; + if((n_data%2 == 1)) { + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int stride_in = n_data*j; + const int i_is_zero = (i==0); + const int i_is_n_half = (i==n_half); + const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) + +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); + const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] + *!i_is_zero*!i_is_n_half; + out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); + }); + } else { + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int stride_in = n_data*j; + const int i_is_zero = (i==0); + const int i_is_n_half = (i==n_half); + const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) + +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; + const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; + out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); + }); + } +} + +/** \brief Make Sine-space Real array out of array from fft. + * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/ + * (2*sin((idx+1)*pi/(n_data+1)))) for each column + * + * \param[in] in input real array + * \param[out] out output real array + * \param[in] n_data number of (contiguous) rows in position matrix + * \param[in] n_batch number of (strided) columns in position matrix + */ +void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ + using namespace amrex::literals; + + const amrex::Real n_1_real_inv = 1._rt / (n_data+1._rt); + const int n_1 = n_data+1; + amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int i_rev = n_1-i; + const int stride_in = n_1*j; + const int stride_out = n_data*j; + const amrex::Real in_a = in[i + stride_in]; + const amrex::Real in_b = in[i_rev + stride_in]; + out[i - 1 + stride_out] = + 0.5_rt*(in_b - in_a + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i * n_1_real_inv))); + out[i_rev - 1 + stride_out] = + 0.5_rt*(in_a - in_b + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i_rev * n_1_real_inv))); + }); +} + +/** \brief Transpose input matrix + * out[idy][idx] = in[idx][idy] + * + * \param[in] in input real array + * \param[out] out output real array + * \param[in] n_data number of (contiguous) rows in input matrix + * \param[in] n_batch number of (strided) columns in input matrix + */ +void Transpose (const amrex::Real* const AMREX_RESTRICT in, + amrex::Real* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + constexpr int tile_dim = 32; //must be power of 2 + constexpr int block_rows = 8; + const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; + const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; + amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, + tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE() noexcept + { + amrex::Gpu::SharedMemory gsm; + amrex::Real* const tile = gsm.dataPtr(); + + const int thread_x = threadIdx.x&(tile_dim-1); + const int thread_y = threadIdx.x/tile_dim; + const int block_y = blockIdx.x/num_blocks_x; + const int block_x = blockIdx.x - block_y*num_blocks_x; + int mat_x = block_x * tile_dim + thread_x; + int mat_y = block_y * tile_dim + thread_y; + + for (int i = 0; i < tile_dim; i += block_rows) { + if(mat_x < n_data && (mat_y+i) < n_batch) { + tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; + } + } + + __syncthreads(); + + mat_x = block_y * tile_dim + thread_x; + mat_y = block_x * tile_dim + thread_y; + + for (int i = 0; i < tile_dim; i += block_rows) { + if(mat_x < n_batch && (mat_y+i) < n_data) { + out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; + } + } + }); +#else + amrex::ParallelFor({{0,0,0}, {n_batch-1, n_data-1, 0}}, + [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept + { + out[i + n_batch*j] = in[j + n_data*i]; + }); +#endif +} + void FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverDirichletFast::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. @@ -48,16 +179,18 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, "There should be only one box locally."); const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; const auto dx = gm.CellSizeArray(); const amrex::Real dxsquared = dx[0]*dx[0]; const amrex::Real dysquared = dx[1]*dx[1]; - const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_box.length(0) + 1 )); - const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_box.length(1) + 1 )); + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) // This normalization is used regardless of the sine transform library - const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_box.length(0) + 1 ) - *( fft_box.length(1) + 1 ))); + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); // Calculate the array of m_eigenvalue_matrix for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ @@ -67,9 +200,9 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept { /* fast poisson solver diagonal x coeffs */ - amrex::Real sinex_sq = sin(( i - lo[0] + 1 ) * sine_x_factor) * sin(( i - lo[0] + 1 ) * sine_x_factor); + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); /* fast poisson solver diagonal y coeffs */ - amrex::Real siney_sq = sin(( j - lo[1] + 1 ) * sine_y_factor) * sin(( j - lo[1] + 1 ) * sine_y_factor); + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); if ((sinex_sq!=0) && (siney_sq!=0)) { eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); @@ -80,17 +213,22 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, }); } + // Allocate 1d Array for 2d data or 2d transpose data + const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); + const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); + m_position_array.resize(real_1d_size); + m_fourier_array.resize(complex_1d_size); + // Allocate and initialize the FFT plans - amrex::IntVect fft_size = m_stagingArea[0].box().length(); - std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); - std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t fft_x_area = m_x_fft.Initialize(FFTType::C2R_1D_batched, nx+1, ny); + std::size_t fft_y_area = m_y_fft.Initialize(FFTType::C2R_1D_batched, ny+1, nx); - m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + m_fft_work_area.resize(std::max(fft_x_area, fft_y_area)); - m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), - m_fft_work_area.dataPtr()); - m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), - m_fft_work_area.dataPtr()); + m_x_fft.SetBuffers(m_position_array.dataPtr(), m_fourier_array.dataPtr(), + m_fft_work_area.dataPtr()); + m_y_fft.SetBuffers(m_position_array.dataPtr(), m_fourier_array.dataPtr(), + m_fft_work_area.dataPtr()); } @@ -99,7 +237,29 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverDirichletFast::SolvePoissonEquation()"); - m_forward_fft.Execute(); + const int nx = m_stagingArea[0].box().length(0); // initially contiguous + const int ny = m_stagingArea[0].box().length(1); // contiguous after transpose + + amrex::Real* const pos_arr = m_stagingArea[0].dataPtr(); + amrex::Real* const fourier_arr = m_tmpSpectralField[0].dataPtr(); + amrex::Real* const real_arr = m_position_array.dataPtr(); + amrex::GpuComplex* comp_arr = m_fourier_array.dataPtr(); + + ToComplex(pos_arr, comp_arr, nx, ny); + + m_x_fft.Execute(); + + ToSine(real_arr, pos_arr, nx, ny); + + Transpose(pos_arr, fourier_arr, nx, ny); + + ToComplex(fourier_arr, comp_arr, ny, nx); + + m_y_fft.Execute(); + + ToSine(real_arr, pos_arr, ny, nx); + + Transpose(pos_arr, fourier_arr, ny, nx); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -116,7 +276,21 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } - m_backward_fft.Execute(); + ToComplex(fourier_arr, comp_arr, nx, ny); + + m_x_fft.Execute(); + + ToSine(real_arr, fourier_arr, nx, ny); + + Transpose(fourier_arr, pos_arr, nx, ny); + + ToComplex(pos_arr, comp_arr, ny, nx); + + m_y_fft.Execute(); + + ToSine(real_arr, fourier_arr, ny, nx); + + Transpose(fourier_arr, pos_arr, ny, nx); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp index 26ff0ebb6d..21089be089 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp @@ -26,6 +26,7 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba, amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverPeriodic::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. diff --git a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp index c8aa30d600..9ce7aa34b2 100644 --- a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp +++ b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp @@ -17,6 +17,7 @@ MGPoissonSolverDirichlet::MGPoissonSolverDirichlet ( amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("MGPoissonSolverDirichlet::define()"); // need extra ghost cell for 2^n-1 HPMG m_stagingArea = amrex::MultiFab(ba, dm, 1, Fields::m_poisson_nguards + amrex::IntVect{1, 1, 0}); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ba.size() == 1, "Parallel MG not supported"); diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index 37943ec33b..e7c38920e7 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -7,9 +7,9 @@ * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "utils/HipaceProfilerWrapper.H" #include +#include #include diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 204314a3e5..746d58c308 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -7,7 +7,6 @@ * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "utils/HipaceProfilerWrapper.H" #include diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 46bb72e489..29d0ab3d09 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -7,9 +7,9 @@ * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "utils/HipaceProfilerWrapper.H" #include +#include #if __has_include() // ROCm 5.3+ #include diff --git a/src/fields/fft_poisson_solver/fft_old/AnyDST.H b/src/fields/fft_poisson_solver/fft_old/AnyDST.H deleted file mode 100644 index 7997e7c3a6..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/AnyDST.H +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#ifndef ANYDST_H_ -#define ANYDST_H_ - -#include "AnyFFT.H" - -#include -#include - -#include - -/** - * \brief Wrapper around multiple FFT libraries. - * - * The header file defines the API and the base types - * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is - * done in different cpp files. This wrapper only depends on the underlying FFT library - * AND on AMReX (There is no dependence on WarpX). - */ -namespace AnyDST -{ - - /** Direction in which the FFT is performed. */ - enum struct direction {forward, backward}; - - /** \brief This struct contains the vendor FFT plan and additional metadata - */ - struct DSTplan - { - - /** pointer to array in position space */ - amrex::FArrayBox* m_position_array; - /** pointer to array in Fourier space */ - amrex::FArrayBox* m_fourier_array; - - /** Expanded data in position space, only for Cuda */ - std::unique_ptr m_expanded_position_array; - /** Expanded data in Fourier, only for Cuda */ - std::unique_ptr>> m_expanded_fourier_array; - - /** Vendor FFT plan */ - AnyFFT::VendorFFTPlan m_plan; - /** Vendor FFT plan for the transpose transform. Used for use_small_dst and DSTW */ - AnyFFT::VendorFFTPlan m_plan_b; - - /** Use large R2C or small C2R dst */ - bool use_small_dst; - -#if defined(AMREX_USE_HIP) - /** execution info for rocFFT */ - rocfft_execution_info m_execinfo; - /** work buffer for rocFFT */ - void* m_buffer; -#endif - }; - - /** Collection of FFT plans, one FFTplan per box */ - using DSTplans = amrex::LayoutData; - - /** \brief create FFT plan for the backend FFT library. - * \param[in] real_size Size of the real array, along each dimension. - * \param[out] position_array Real array from/to where R2R DST is performed - * \param[out] fourier_array Real array to/from where R2R DST is performed - */ - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array); - - /** \brief Destroy library FFT plan. - * \param[out] dst_plan plan to destroy - */ - void DestroyPlan (DSTplan& dst_plan); - - /** \brief Perform FFT with backend library. - * \param[out] dst_plan plan for which the FFT is performed - * \param[in] d direction of the FFT - */ - void Execute (DSTplan& dst_plan, AnyDST::direction d); -} - -#endif // ANYDST_H_ diff --git a/src/fields/fft_poisson_solver/fft_old/AnyFFT.H b/src/fields/fft_poisson_solver/fft_old/AnyFFT.H deleted file mode 100644 index cbbd077954..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/AnyFFT.H +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2020-2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs - * WeiqunZhang - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. - * - * License: BSD-3-Clause-LBNL - */ -#ifndef ANYFFT_H_ -#define ANYFFT_H_ - -#include - -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - -#include - -/** - * \brief Wrapper around multiple FFT libraries. - * - * The header file defines the API and the base types - * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is - * done in different cpp files. This wrapper only depends on the underlying FFT library - * AND on AMReX (There is no dependence on WarpX). - */ -namespace AnyFFT -{ - // First, define library-dependent types (complex, FFT plan) - - /** Complex type for FFT, depends on FFT library */ -#ifdef AMREX_USE_CUDA -# ifdef AMREX_USE_FLOAT - using Complex = cuComplex; -# else - using Complex = cuDoubleComplex; -# endif -#elif defined(AMREX_USE_HIP) -# ifdef AMREX_USE_FLOAT - using Complex = float2; -# else - using Complex = double2; -# endif -#else -# ifdef AMREX_USE_FLOAT - using Complex = fftwf_complex; -# else - using Complex = fftw_complex; -# endif -#endif - - /** Library-dependent FFT plans type, which holds one fft plan per box - * (plans are only initialized for the boxes that are owned by the local MPI rank). - */ -#ifdef AMREX_USE_CUDA - using VendorFFTPlan = cufftHandle; -#elif defined(AMREX_USE_HIP) - using VendorFFTPlan = rocfft_plan; -#else -# ifdef AMREX_USE_FLOAT - using VendorFFTPlan = fftwf_plan; -# else - using VendorFFTPlan = fftw_plan; -# endif -#endif - - // Second, define library-independent API - - /** Direction in which the FFT is performed. */ - enum struct direction {R2C, C2R}; - - /** \brief This struct contains the vendor FFT plan and additional metadata - */ - struct FFTplan - { - amrex::Real* m_real_array; /**< pointer to real array */ - Complex* m_complex_array; /**< pointer to complex array */ - VendorFFTPlan m_plan; /**< Vendor FFT plan */ - direction m_dir; /**< direction (C2R or R2C) */ - }; - - /** Collection of FFT plans, one FFTplan per box */ - using FFTplans = amrex::LayoutData; - - /** \brief create FFT plan for the backend FFT library. - * \param[in] real_size Size of the real array, along each dimension. - * \param[out] real_array Real array from/to where R2C/C2R FFT is performed - * \param[out] complex_array Complex array to/from where R2C/C2R FFT is performed - * \param[in] dir direction, either R2C or C2R - */ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir); - - /** \brief Destroy library FFT plan. - * \param[out] fft_plan plan to destroy - */ - void DestroyPlan (FFTplan& fft_plan); - - /** \brief Perform FFT with backend library. - * \param[out] fft_plan plan for which the FFT is performed - */ - void Execute (FFTplan& fft_plan); -} - -#endif // ANYFFT_H_ diff --git a/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt b/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt deleted file mode 100644 index e89ecd84f3..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2020-2021 -# -# This file is part of HiPACE++. -# -# Authors: Axel Huebl, MaxThevenet, Remi Lehe -# License: BSD-3-Clause-LBNL - -if (HiPACE_COMPUTE STREQUAL CUDA) - target_sources(HiPACE - PRIVATE - WrapCuFFT.cpp - WrapCuDST.cpp - CuFFTUtils.cpp - ) -elseif(HiPACE_COMPUTE STREQUAL HIP) - target_sources(HiPACE - PRIVATE - WrapRocFFT.cpp - WrapRocDST.cpp - RocFFTUtils.cpp - ) -else() - target_sources(HiPACE - PRIVATE - WrapFFTW.cpp - WrapDSTW.cpp - ) -endif() diff --git a/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H b/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H deleted file mode 100644 index 01daf5c2b9..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.H +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2020-2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl, MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#ifndef CUFFTUTILS_H_ -#define CUFFTUTILS_H_ - -#include - -#include - - -namespace CuFFTUtils -{ - /** \brief This method converts a cufftResult - * into the corresponding string - * - * @param[in] err a cufftResult - * @return an std::string - */ - std::string cufftErrorToString (const cufftResult& err); -} - -#endif // CUFFTUTILS_H_ diff --git a/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp b/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp deleted file mode 100644 index 090dcf2b9c..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/CuFFTUtils.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2020 - * - * This file is part of HiPACE++. - * - * Authors: MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#include "CuFFTUtils.H" - -#include - -namespace CuFFTUtils -{ - std::string cufftErrorToString (const cufftResult& err) - { - const auto res2string = std::map{ - {CUFFT_SUCCESS, "CUFFT_SUCCESS"}, - {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"}, - {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"}, - {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"}, - {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"}, - {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"}, - {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"}, - {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"}, - {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"}, - {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"}}; - - const auto it = res2string.find(err); - if(it != res2string.end()){ - return it->second; - } - else{ - return std::to_string(err) + - " (unknown error code)"; - } - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H b/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H deleted file mode 100644 index 0180b0a97f..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.H +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -#ifndef ROCFFTUTILS_H_ -#define ROCFFTUTILS_H_ - -#if __has_include() // ROCm 5.3+ -# include -#else -# include -#endif - -#include - - -namespace RocFFTUtils -{ - void assert_rocfft_status (std::string const& name, rocfft_status status); - - /** \brief This method converts a cufftResult - * into the corresponding string - * - * @param[in] err a cufftResult - * @return an std::string - */ - std::string rocfftErrorToString (const rocfft_status err); -} - -#endif // ROCFFTUTILS_H_ diff --git a/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp b/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp deleted file mode 100644 index 5c6f3d68ed..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/RocFFTUtils.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -#include "RocFFTUtils.H" -#include - -#include -#include - -namespace RocFFTUtils -{ - void assert_rocfft_status (std::string const& name, rocfft_status status) - { - if (status != rocfft_status_success) { - amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status)); - } - } - - std::string rocfftErrorToString (const rocfft_status err) - { - if (err == rocfft_status_success) { - return std::string("rocfft_status_success"); - } else if (err == rocfft_status_failure) { - return std::string("rocfft_status_failure"); - } else if (err == rocfft_status_invalid_arg_value) { - return std::string("rocfft_status_invalid_arg_value"); - } else if (err == rocfft_status_invalid_dimensions) { - return std::string("rocfft_status_invalid_dimensions"); - } else if (err == rocfft_status_invalid_array_type) { - return std::string("rocfft_status_invalid_array_type"); - } else if (err == rocfft_status_invalid_strides) { - return std::string("rocfft_status_invalid_strides"); - } else if (err == rocfft_status_invalid_distance) { - return std::string("rocfft_status_invalid_distance"); - } else if (err == rocfft_status_invalid_offset) { - return std::string("rocfft_status_invalid_offset"); - } else { - return std::to_string(err) + " (unknown error code)"; - } - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp b/src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp deleted file mode 100644 index 571e402010..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapCuDST.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "CuFFTUtils.H" -#include "utils/GPUUtil.H" -#include "utils/HipaceProfilerWrapper.H" -#include "utils/Parser.H" - -#include - -namespace AnyDST -{ -#ifdef AMREX_USE_FLOAT - cufftType VendorR2C = CUFFT_R2C; - cufftType VendorC2R = CUFFT_C2R; -#else - cufftType VendorR2C = CUFFT_D2Z; - cufftType VendorC2R = CUFFT_Z2D; -#endif - - /** \brief Extend src into a symmetrized larger array dst - * - * \param[in,out] dst destination array, odd symmetry around 0 and the middle points in x and y - * \param[in] src source array - */ - void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = src.box(); - const int nx = bx.length(0); - const int ny = bx.length(1); - const amrex::IntVect lo = bx.smallEnd(); - Array2 const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i+1,j+1) = src_array(i, j); - /* lower left quadrant */ - dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); - /* upper right quadrant */ - dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); - /* lower right quadrant */ - dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); - } - ); - }; - - /** \brief Extract symmetrical src array into smaller array dst - * - * \param[in,out] dst destination array - * \param[in] src destination array, symmetric in x and y - */ - void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = dst.box(); - Array2 const> const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i,j) = -src_array(i+1, j+1).real(); - } - ); - }; - - /** \brief Make Complex array out of Real array to prepare for fft. - * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with - * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1] - * - * \param[in] in input real array - * \param[out] out output complex array - * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix - */ - void ToComplex (const amrex::Real* const AMREX_RESTRICT in, - amrex::GpuComplex* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - const int n_half = (n_data+1)/2; - if((n_data%2 == 1)) { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); - const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] - *!i_is_zero*!i_is_n_half; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } else { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; - const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } - }; - - /** \brief Complex to Real fft for every column of the input matrix. - * The output Matrix has its indexes reversed compared to some other libraries - * - * \param[in] plan cuda fft plan for transformation - * \param[in] in input complex array - * \param[out] out output real array - */ - void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex* AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out) - { - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream(plan, stream); - cufftResult result; - -#ifdef AMREX_USE_FLOAT - result = cufftExecC2R(plan, reinterpret_cast(in), out); -#else - result = cufftExecZ2D(plan, reinterpret_cast(in), out); -#endif - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - }; - - /** \brief Make Sine-space Real array out of array from fft. - * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/ - * (2*sin((idx+1)*pi/(n_data+1)))) for each column - * - * \param[in] in input real array - * \param[out] out output real array - * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix - */ - void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - using namespace amrex::literals; - - const amrex::Real n_1_real = n_data+1._rt; - const int n_1 = n_data+1; - amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int i_rev = n_1-i; - const int stride_in = n_1*j; - const int stride_out = n_data*j; - const amrex::Real in_a = in[i+stride_in]; - const amrex::Real in_b = in[i_rev+stride_in]; -#ifdef AMREX_USE_FLOAT - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpif(i_rev/n_1_real))); -#else - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpi(i_rev/n_1_real))); -#endif - }); - }; - - /** \brief Transpose input matrix - * out[idy][idx] = in[idx][idy] - * - * \param[in] in input real array - * \param[out] out output real array - * \param[in] n_data number of (contiguous) rows in input matrix - * \param[in] n_batch number of (strided) columns in input matrix - */ - void Transpose (const amrex::Real* const AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - constexpr int tile_dim = 32; //must be power of 2 - constexpr int block_rows = 8; - const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; - const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; - amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, - tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE() noexcept - { - amrex::Gpu::SharedMemory gsm; - amrex::Real* const tile = gsm.dataPtr(); - - const int thread_x = threadIdx.x&(tile_dim-1); - const int thread_y = threadIdx.x/tile_dim; - const int block_y = blockIdx.x/num_blocks_x; - const int block_x = blockIdx.x - block_y*num_blocks_x; - int mat_x = block_x * tile_dim + thread_x; - int mat_y = block_y * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_data && (mat_y+i) < n_batch) { - tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; - } - } - - __syncthreads(); - - mat_x = block_y * tile_dim + thread_x; - mat_y = block_x * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_batch && (mat_y+i) < n_data) { - out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; - } - } - }); - }; - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - HIPACE_PROFILE("AnyDST::CreatePlan()"); - DSTplan dst_plan; - - amrex::ParmParse pp("hipace"); - dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511); - queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst); - - if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), - "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " - "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); - } - - if(!dst_plan.use_small_dst) { - const int nx = real_size[0]; - const int ny = real_size[1]; - - // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) - // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) - amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; - amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; - // shift box to match rest of fields - expanded_position_box += position_array->box().smallEnd(); - expanded_fourier_box += fourier_array->box().smallEnd(); - dst_plan.m_expanded_position_array = - std::make_unique( - expanded_position_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - expanded_fourier_box, 1); - - // setting the initial values to 0 - // we don't set the expanded Fourier array, because it will be initialized by the FFT - dst_plan.m_expanded_position_array->setVal(0., - dst_plan.m_expanded_position_array->box(), 0, - dst_plan.m_expanded_position_array->nComp()); - - const amrex::IntVect& expanded_size = expanded_position_box.length(); - - // Initialize fft_plan.m_plan with the vendor fft plan. - cufftResult result; - result = cufftPlan2d( - &(dst_plan.m_plan), expanded_size[1], expanded_size[0], VendorR2C); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - std::size_t buffersize = 0; - cufftGetSize(dst_plan.m_plan, &buffersize); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using R2C cuFFT of size " << expanded_size[0] << " * " - << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - else { - const int nx = real_size[0]; // contiguous - const int ny = real_size[1]; // not contiguous - - // Allocate 1d Array for 2d data or 2d transpose data - const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); - const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); - amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}}; - amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}}; - dst_plan.m_expanded_position_array = - std::make_unique( - real_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - complex_box, 1); - - // Initialize fft_plan.m_plan with the vendor fft plan. - int s_1 = nx+1; - cufftResult result; - result = cufftPlanMany( - &(dst_plan.m_plan), 1, &s_1, NULL, 1, (nx+1)/2+1, NULL, 1, nx+1, VendorC2R, ny); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - // Initialize transposed fft_plan.m_plan_b with the vendor fft plan. - int s_2 = ny+1; - cufftResult resultb; - resultb = cufftPlanMany( - &(dst_plan.m_plan_b), 1, &s_2, NULL, 1, (ny+1)/2+1, NULL, 1, ny+1, VendorC2R, nx); - - if ( resultb != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(resultb) << "\n"; - } - - std::size_t buffersize = 0; - std::size_t buffersize_b = 0; - cufftGetSize(dst_plan.m_plan, &buffersize); - cufftGetSize(dst_plan.m_plan_b, &buffersize_b); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using C2R cuFFT of sizes " << s_1 << " and " - << s_2 << " with " << (buffersize+buffersize_b+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - } - - void DestroyPlan (DSTplan& dst_plan) - { - cufftDestroy( dst_plan.m_plan ); - cufftDestroy( dst_plan.m_plan_b ); - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - - if(!dst_plan.use_small_dst) { - // Swap position and fourier space based on execute direction - amrex::FArrayBox* position_array = - (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array; - amrex::FArrayBox* fourier_array = - (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array; - - // Expand in position space m_position_array -> m_expanded_position_array - ExpandR2R(*dst_plan.m_expanded_position_array, *position_array); - - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( dst_plan.m_plan, stream); - cufftResult result; - - // R2C FFT m_expanded_position_array -> m_expanded_fourier_array -#ifdef AMREX_USE_FLOAT - result = cufftExecR2C( - dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(), - reinterpret_cast(dst_plan.m_expanded_fourier_array->dataPtr())); -#else - result = cufftExecD2Z( - dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(), - reinterpret_cast(dst_plan.m_expanded_fourier_array->dataPtr())); -#endif - // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array - ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - } - else { - const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous - const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose - - amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr(); - amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr(); - amrex::GpuComplex* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr(); - amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr(); - - // Swap position and fourier space based on execute direction - amrex::Real* const pos_arr = - (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr; - amrex::Real* const fourier_arr = - (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr; - - ToComplex(pos_arr, comp_arr, nx, ny); - - C2Rfft(dst_plan.m_plan, comp_arr, real_arr); - - ToSine(real_arr, pos_arr, nx, ny); - - Transpose(pos_arr, fourier_arr, nx, ny); - - ToComplex(fourier_arr, comp_arr, ny, nx); - - C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr); - - ToSine(real_arr, pos_arr, ny, nx); - - Transpose(pos_arr, fourier_arr, ny, nx); - } - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp deleted file mode 100644 index 739ab2cdcb..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapCuFFT.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: MaxThevenet, Remi Lehe, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. - * - * License: BSD-3-Clause-LBNL - */ -#include "AnyFFT.H" -#include "CuFFTUtils.H" -#include "utils/HipaceProfilerWrapper.H" - -namespace AnyFFT -{ - -#ifdef AMREX_USE_FLOAT - cufftType VendorR2C = CUFFT_R2C; - cufftType VendorC2R = CUFFT_C2R; -#else - cufftType VendorR2C = CUFFT_D2Z; - cufftType VendorC2R = CUFFT_Z2D; -#endif - - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; - - if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), - "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " - "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); - } - - // Initialize fft_plan.m_plan with the vendor fft plan. - cufftResult result; - if (dir == direction::R2C){ - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorR2C); - } else { - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorC2R); - } - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - - return fft_plan; - } - - void DestroyPlan (FFTplan& fft_plan) - { - cufftDestroy( fft_plan.m_plan ); - } - - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("AnyFFT::Execute()"); - // make sure that this is done on the same GPU stream as the above copy - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( fft_plan.m_plan, stream); - cufftResult result; - if (fft_plan.m_dir == direction::R2C){ -#ifdef AMREX_USE_FLOAT - result = cufftExecR2C(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#else - result = cufftExecD2Z(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#endif - } else if (fft_plan.m_dir == direction::C2R){ -#ifdef AMREX_USE_FLOAT - result = cufftExecC2R(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#else - result = cufftExecZ2D(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#endif - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); - } - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp b/src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp deleted file mode 100644 index ca14a5e358..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapDSTW.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "utils/HipaceProfilerWrapper.H" - -#ifdef AMREX_USE_OMP -# include -#endif - -namespace AnyDST -{ -#ifdef AMREX_USE_FLOAT - const auto VendorCreatePlanR2R2D = fftwf_plan_r2r_2d; -#else - const auto VendorCreatePlanR2R2D = fftw_plan_r2r_2d; -#endif - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - DSTplan dst_plan; - const int nx = real_size[0]; - const int ny = real_size[1]; - -#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) - if (nx > 32 && ny > 32) { -# ifdef AMREX_USE_FLOAT - fftwf_init_threads(); - fftwf_plan_with_nthreads(omp_get_max_threads()); -# else - fftw_init_threads(); - fftw_plan_with_nthreads(omp_get_max_threads()); -# endif - } -#endif - - // Initialize fft_plan.m_plan with the vendor fft plan. - // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order - dst_plan.m_plan = VendorCreatePlanR2R2D( - ny, nx, position_array->dataPtr(), fourier_array->dataPtr(), - FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); - - // Initialize fft_plan.m_plan_b with the vendor fft plan. - // Swap arrays: now for backward direction. - dst_plan.m_plan_b = VendorCreatePlanR2R2D( - ny, nx, fourier_array->dataPtr(), position_array->dataPtr(), - FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); - - amrex::Print() << "using R2R FFTW of size " << nx << " * " << ny << "\n"; - - // Store meta-data in fft_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - - void DestroyPlan (DSTplan& dst_plan) - { -# ifdef AMREX_USE_FLOAT - fftwf_destroy_plan( dst_plan.m_plan ); - fftwf_destroy_plan( dst_plan.m_plan_b ); -# else - fftw_destroy_plan( dst_plan.m_plan ); - fftw_destroy_plan( dst_plan.m_plan_b ); -# endif - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - // Swap position and fourier space based on execute direction - AnyFFT::VendorFFTPlan& plan = (d==direction::forward) ? dst_plan.m_plan : dst_plan.m_plan_b; -# ifdef AMREX_USE_FLOAT - fftwf_execute( plan ); -# else - fftw_execute( plan ); -# endif - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp deleted file mode 100644 index 8a6565dbed..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapFFTW.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2020 - * - * This file is part of HiPACE++. - * - * Authors: MaxThevenet, Remi Lehe - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. - * - * License: BSD-3-Clause-LBNL - */ -#include "AnyFFT.H" -#include "utils/HipaceProfilerWrapper.H" - -namespace AnyFFT -{ -#ifdef AMREX_USE_FLOAT - const auto VendorCreatePlanR2C3D = fftwf_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftwf_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftwf_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftwf_plan_dft_c2r_2d; -#else - const auto VendorCreatePlanR2C3D = fftw_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftw_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftw_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftw_plan_dft_c2r_2d; -#endif - - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; - - // Initialize fft_plan.m_plan with the vendor fft plan. - // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order - if (dir == direction::R2C){ - fft_plan.m_plan = VendorCreatePlanR2C2D( - real_size[1], real_size[0], real_array, complex_array, FFTW_ESTIMATE); - } else if (dir == direction::C2R){ - fft_plan.m_plan = VendorCreatePlanC2R2D( - real_size[1], real_size[0], complex_array, real_array, FFTW_ESTIMATE); - } - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - - return fft_plan; - } - - void DestroyPlan (FFTplan& fft_plan) - { -# ifdef AMREX_USE_FLOAT - fftwf_destroy_plan( fft_plan.m_plan ); -# else - fftw_destroy_plan( fft_plan.m_plan ); -# endif - } - - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("Execute_FFTplan()"); -# ifdef AMREX_USE_FLOAT - fftwf_execute( fft_plan.m_plan ); -# else - fftw_execute( fft_plan.m_plan ); -# endif - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp b/src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp deleted file mode 100644 index ae96334494..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapRocDST.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* Copyright 2021-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs - * - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "RocFFTUtils.H" -#include "utils/GPUUtil.H" -#include "utils/HipaceProfilerWrapper.H" -#include "utils/Parser.H" - -#include - -namespace AnyDST -{ - // see WrapCuDST for documentation - void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = src.box(); - const int nx = bx.length(0); - const int ny = bx.length(1); - const amrex::IntVect lo = bx.smallEnd(); - Array2 const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i+1,j+1) = src_array(i, j); - /* lower left quadrant */ - dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); - /* upper right quadrant */ - dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); - /* lower right quadrant */ - dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); - } - ); - } - - // see WrapCuDST for documentation - void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = dst.box(); - Array2 const> const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i,j) = -src_array(i+1, j+1).real(); - } - ); - } - - // see WrapCuDST for documentation - void ToComplex (const amrex::Real* const AMREX_RESTRICT in, - amrex::GpuComplex* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - const int n_half = (n_data+1)/2; - if((n_data%2 == 1)) { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); - const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] - *!i_is_zero*!i_is_n_half; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } else { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; - const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } - } - - // see WrapCuDST for documentation - void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex* AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, rocfft_execution_info execinfo) - { - rocfft_status result; - - void* in_arr[2] = {in, nullptr}; - void* out_arr[2] = {out, nullptr}; - result = rocfft_execute(plan, in_arr, out_arr, execinfo); - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - } - - // see WrapCuDST for documentation - void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - using namespace amrex::literals; - - const amrex::Real n_1_real = n_data+1._rt; - const int n_1 = n_data+1; - amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int i_rev = n_1-i; - const int stride_in = n_1*j; - const int stride_out = n_data*j; - const amrex::Real in_a = in[i+stride_in]; - const amrex::Real in_b = in[i_rev+stride_in]; -#ifdef AMREX_USE_FLOAT - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpif(i_rev/n_1_real))); -#else - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpi(i_rev/n_1_real))); -#endif - }); - } - - // see WrapCuDST for documentation - void Transpose (const amrex::Real* const AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - constexpr int tile_dim = 32; //must be power of 2 - constexpr int block_rows = 8; - const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; - const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; - amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, - tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE() noexcept - { - amrex::Gpu::SharedMemory gsm; - amrex::Real* const tile = gsm.dataPtr(); - - const int thread_x = threadIdx.x&(tile_dim-1); - const int thread_y = threadIdx.x/tile_dim; - const int block_y = blockIdx.x/num_blocks_x; - const int block_x = blockIdx.x - block_y*num_blocks_x; - int mat_x = block_x * tile_dim + thread_x; - int mat_y = block_y * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_data && (mat_y+i) < n_batch) { - tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; - } - } - - __syncthreads(); - - mat_x = block_y * tile_dim + thread_x; - mat_y = block_x * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_batch && (mat_y+i) < n_data) { - out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; - } - } - }); - } - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - HIPACE_PROFILE("AnyDST::CreatePlan()"); - DSTplan dst_plan; - - amrex::ParmParse pp("hipace"); - dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511); - queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst); - - if(!dst_plan.use_small_dst) { - const int nx = real_size[0]; - const int ny = real_size[1]; - int dim = 2; - - // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) - // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) - amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; - amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; - // shift box to match rest of fields - expanded_position_box += position_array->box().smallEnd(); - expanded_fourier_box += fourier_array->box().smallEnd(); - dst_plan.m_expanded_position_array = - std::make_unique( - expanded_position_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - expanded_fourier_box, 1); - - // setting the initial values to 0 - // we don't set the expanded Fourier array, because it will be initialized by the FFT - dst_plan.m_expanded_position_array->setVal(0., - dst_plan.m_expanded_position_array->box(), 0, - dst_plan.m_expanded_position_array->nComp()); - - // check for type of expanded size, should be const size_t * - const amrex::IntVect& expanded_size = expanded_position_box.length(); - const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(expanded_size[0]), - std::size_t(expanded_size[1]), - std::size_t(expanded_size[2]))}; - - #ifdef AMREX_USE_FLOAT - rocfft_precision precision = rocfft_precision_single; - #else - rocfft_precision precision = rocfft_precision_double; - #endif - - // Initialize fft_plan.m_plan with the vendor fft plan. - rocfft_status result; - result = rocfft_plan_create(&(dst_plan.m_plan), - rocfft_placement_notinplace, - rocfft_transform_type_real_forward, - precision, - dim, - lengths, - 1, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - std::size_t buffersize = 0; - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_execution_info_create(&(dst_plan.m_execinfo)); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize); - result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer, - buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using R2C rocFFT of size " << expanded_size[0] << " * " - << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - else { - const int nx = real_size[0]; // contiguous - const int ny = real_size[1]; // not contiguous - - // Allocate 1d Array for 2d data or 2d transpose data - const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); - const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); - amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}}; - amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}}; - dst_plan.m_expanded_position_array = - std::make_unique( - real_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - complex_box, 1); - -#ifdef AMREX_USE_FLOAT - rocfft_precision precision = rocfft_precision_single; -#else - rocfft_precision precision = rocfft_precision_double; -#endif - - rocfft_status result; - - const std::size_t s_1[3] = {std::size_t(nx+1) ,0u ,0u}; - - result = rocfft_plan_create(&(dst_plan.m_plan), - rocfft_placement_notinplace, - rocfft_transform_type_real_inverse, - precision, - 1, - s_1, - ny, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - const std::size_t s_2[3] = {std::size_t(ny+1) ,0u ,0u}; - - result = rocfft_plan_create(&(dst_plan.m_plan_b), - rocfft_placement_notinplace, - rocfft_transform_type_real_inverse, - precision, - 1, - s_2, - nx, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - std::size_t work_size = 0; - std::size_t work_size_b = 0; - - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &work_size); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan_b, &work_size_b); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_execution_info_create(&(dst_plan.m_execinfo)); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - std::size_t buffersize = std::max(work_size, work_size_b); - dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize); - - result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer, - buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using C2R rocFFT of sizes " << s_1[0] << " and " - << s_2[0] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - } - - void DestroyPlan (DSTplan& dst_plan) - { - rocfft_plan_destroy( dst_plan.m_plan ); - rocfft_plan_destroy( dst_plan.m_plan_b ); - - amrex::The_Arena()->free(dst_plan.m_buffer); - rocfft_execution_info_destroy(dst_plan.m_execinfo); - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - - if(!dst_plan.use_small_dst) { - // Swap position and fourier space based on execute direction - amrex::FArrayBox* position_array = - (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array; - amrex::FArrayBox* fourier_array = - (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array; - - // Expand in position space m_position_array -> m_expanded_position_array - ExpandR2R(*dst_plan.m_expanded_position_array, *position_array); - - rocfft_status result; - - // R2C FFT m_expanded_position_array -> m_expanded_fourier_array - void* in[2] = {dst_plan.m_expanded_position_array->dataPtr(), nullptr}; - void* out[2] = {dst_plan.m_expanded_fourier_array->dataPtr(), nullptr}; - result = rocfft_execute(dst_plan.m_plan, in, out, dst_plan.m_execinfo); - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - - // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array - ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array); - } - else { - const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous - const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose - - amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr(); - amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr(); - amrex::GpuComplex* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr(); - amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr(); - - // Swap position and fourier space based on execute direction - amrex::Real* const pos_arr = - (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr; - amrex::Real* const fourier_arr = - (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr; - - ToComplex(pos_arr, comp_arr, nx, ny); - - C2Rfft(dst_plan.m_plan, comp_arr, real_arr, dst_plan.m_execinfo); - - ToSine(real_arr, pos_arr, nx, ny); - - Transpose(pos_arr, fourier_arr, nx, ny); - - ToComplex(fourier_arr, comp_arr, ny, nx); - - C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr, dst_plan.m_execinfo); - - ToSine(real_arr, pos_arr, ny, nx); - - Transpose(pos_arr, fourier_arr, ny, nx); - } - } -} diff --git a/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp deleted file mode 100644 index 0ba7a2c7db..0000000000 --- a/src/fields/fft_poisson_solver/fft_old/WrapRocFFT.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -/* Copyright 2019-2020 - * - * This file is part of WarpX. - * - * License: BSD-3-Clause-LBNL - */ - -#include "AnyFFT.H" -#include "RocFFTUtils.H" -#include "utils/HipaceProfilerWrapper.H" - -namespace AnyFFT -{ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - const int dim = 2; - FFTplan fft_plan; - - const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(real_size[0]), - std::size_t(real_size[1]), - std::size_t(real_size[2]))}; - // Initialize fft_plan.m_plan with the vendor fft plan. - rocfft_status result = rocfft_plan_create(&(fft_plan.m_plan), - rocfft_placement_notinplace, - (dir == direction::R2C) - ? rocfft_transform_type_real_forward - : rocfft_transform_type_real_inverse, -#ifdef AMREX_USE_FLOAT - rocfft_precision_single, -#else - rocfft_precision_double, -#endif - dim, lengths, - 1, // number of transforms, - nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - //fft_plan.m_dim = dim; - - return fft_plan; - } - - void DestroyPlan (FFTplan& fft_plan) - { - rocfft_plan_destroy( fft_plan.m_plan ); - } - - void Execute (FFTplan& fft_plan) - { - HIPACE_PROFILE("AnyFFT::Execute()"); - rocfft_execution_info execinfo = NULL; - rocfft_status result = rocfft_execution_info_create(&execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - std::size_t buffersize = 0; - result = rocfft_plan_get_work_buffer_size(fft_plan.m_plan, &buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - void* buffer = amrex::The_Arena()->alloc(buffersize); - result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - if (fft_plan.m_dir == direction::R2C) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_real_array), // in - (void**)&(fft_plan.m_complex_array), // out - execinfo); - } else if (fft_plan.m_dir == direction::C2R) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_complex_array), // in - (void**)&(fft_plan.m_real_array), // out - execinfo); - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); - } - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - - amrex::Gpu::streamSynchronize(); - - amrex::The_Arena()->free(buffer); - - result = rocfft_execution_info_destroy(execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); - } -} From 27b3a95f14f64ce94ee53ec24a81e2226513a386 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Wed, 17 Apr 2024 22:01:47 +0200 Subject: [PATCH 04/11] fix cufft --- .../FFTPoissonSolverDirichletFast.cpp | 4 +- .../fft_poisson_solver/fft/WrapCuFFT.cpp | 98 +++++++++++++++++-- 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp index 7476eba77c..587f773804 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -225,9 +225,9 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, m_fft_work_area.resize(std::max(fft_x_area, fft_y_area)); - m_x_fft.SetBuffers(m_position_array.dataPtr(), m_fourier_array.dataPtr(), + m_x_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), m_fft_work_area.dataPtr()); - m_y_fft.SetBuffers(m_position_array.dataPtr(), m_fourier_array.dataPtr(), + m_y_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), m_fft_work_area.dataPtr()); } diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index e7c38920e7..3a28fb5759 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include @@ -24,7 +24,7 @@ static constexpr bool use_float = false; struct VendorPlan { cufftHandle m_cufftplan; - int m_direction = 0; + FFTType m_type; void* m_in; void* m_out; }; @@ -61,6 +61,7 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { // https://docs.nvidia.com/cuda/cufft/index.html#cufft-api-reference m_plan = new VendorPlan; + m_plan->m_type = type; cufftType transform_type; int rank = 0; long long int n[2] = {0, 0}; @@ -69,7 +70,6 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { switch (type) { case FFTType::C2C_2D_fwd: transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; - m_plan->m_direction = CUFFT_FORWARD; rank = 2; n[0] = ny; n[1] = nx; @@ -77,7 +77,6 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { break; case FFTType::C2C_2D_bkw: transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; - m_plan->m_direction = CUFFT_INVERSE; rank = 2; n[0] = ny; n[1] = nx; @@ -154,8 +153,95 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { void AnyFFT::Execute () { cufftResult result; - result = cufftXtExec(m_plan->m_cufftplan, m_plan->m_in, m_plan->m_out, m_plan->m_direction); - assert_cufft_status("cufftXtExec", result); + if constexpr (use_float) { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + result = cufftExecC2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_FORWARD); + assert_cufft_status("cufftExecC2C", result); + break; + case FFTType::C2C_2D_bkw: + result = cufftExecC2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_INVERSE); + assert_cufft_status("cufftExecC2C", result); + break; + case FFTType::C2R_2D: + result = cufftExecC2R(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecC2R", result); + break; + case FFTType::R2C_2D: + result = cufftExecR2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecR2C", result); + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + break; + case FFTType::C2R_1D_batched: + result = cufftExecC2R(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecC2R", result); + break; + case FFTType::R2C_1D_batched: + result = cufftExecR2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecR2C", result); + break; + } + } else { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + result = cufftExecZ2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_FORWARD); + assert_cufft_status("cufftExecZ2Z", result); + break; + case FFTType::C2C_2D_bkw: + result = cufftExecZ2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_INVERSE); + assert_cufft_status("cufftExecZ2Z", result); + break; + case FFTType::C2R_2D: + result = cufftExecZ2D(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecZ2D", result); + break; + case FFTType::R2C_2D: + result = cufftExecD2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecD2Z", result); + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + break; + case FFTType::C2R_1D_batched: + result = cufftExecZ2D(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecZ2D", result); + break; + case FFTType::R2C_1D_batched: + result = cufftExecD2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecD2Z", result); + break; + } + } } AnyFFT::~AnyFFT () { From 14222bed757bdcb56aa859b838cabe9df583c6cc Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Thu, 18 Apr 2024 15:50:00 +0200 Subject: [PATCH 05/11] add setup and cleanup --- src/Hipace.H | 2 + src/Hipace.cpp | 7 +++ src/fields/fft_poisson_solver/fft/AnyFFT.H | 7 ++- .../fft_poisson_solver/fft/WrapCuFFT.cpp | 22 ++------- .../fft_poisson_solver/fft/WrapFFTW.cpp | 48 ++++++++++--------- .../fft_poisson_solver/fft/WrapRocFFT.cpp | 22 ++++++--- 6 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/Hipace.H b/src/Hipace.H index 7e46ea4dc4..7fc843f03c 100644 --- a/src/Hipace.H +++ b/src/Hipace.H @@ -73,6 +73,8 @@ public: * and initialize longitudinal and transverse MPI communicators */ Hipace (); + ~Hipace (); + /** Get singleton instance */ static Hipace& GetInstance (); diff --git a/src/Hipace.cpp b/src/Hipace.cpp index faca043427..f5c71fc6c2 100644 --- a/src/Hipace.cpp +++ b/src/Hipace.cpp @@ -17,6 +17,7 @@ #include "utils/GPUUtil.H" #include "particles/pusher/GetAndSetPosition.H" #include "mg_solver/HpMultiGrid.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include #include @@ -53,6 +54,7 @@ Hipace_early_init::Hipace_early_init (Hipace* instance) int max_level = 0; queryWithParser(pp_amr, "max_level", max_level); m_N_level = max_level + 1; + AnyFFT::setup(); } Hipace& @@ -171,6 +173,11 @@ Hipace::Hipace () : } } +Hipace::~Hipace() +{ + AnyFFT::cleanup(); +} + void Hipace::InitData () { diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index d4014c5f31..9613c4f7b2 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -19,8 +19,7 @@ enum struct FFTType { C2R_2D, R2C_2D, R2R_2D, - C2R_1D_batched, - R2C_1D_batched + C2R_1D_batched }; struct AnyFFT { @@ -33,6 +32,10 @@ struct AnyFFT { ~AnyFFT (); + static void setup (); + + static void cleanup (); + private: VendorPlan* m_plan = nullptr; }; diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index 3a28fb5759..c5691451cc 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -105,12 +105,6 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { n[0] = nx; batch = ny; break; - case FFTType::R2C_1D_batched: - transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z; - rank = 1; - n[0] = nx; - batch = ny; - break; } cufftResult result; @@ -190,12 +184,6 @@ void AnyFFT::Execute () { reinterpret_cast(m_plan->m_out)); assert_cufft_status("cufftExecC2R", result); break; - case FFTType::R2C_1D_batched: - result = cufftExecR2C(m_plan->m_cufftplan, - reinterpret_cast(m_plan->m_in), - reinterpret_cast(m_plan->m_out)); - assert_cufft_status("cufftExecR2C", result); - break; } } else { switch (m_plan->m_type) { @@ -234,12 +222,6 @@ void AnyFFT::Execute () { reinterpret_cast(m_plan->m_out)); assert_cufft_status("cufftExecZ2D", result); break; - case FFTType::R2C_1D_batched: - result = cufftExecD2Z(m_plan->m_cufftplan, - reinterpret_cast(m_plan->m_in), - reinterpret_cast(m_plan->m_out)); - assert_cufft_status("cufftExecD2Z", result); - break; } } } @@ -254,3 +236,7 @@ AnyFFT::~AnyFFT () { delete m_plan; } } + +void AnyFFT::setup () {} + +void AnyFFT::cleanup () {} diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 746d58c308..79c8a69dbd 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -10,9 +10,11 @@ #include -#include +#ifdef AMREX_USE_OMP +#include +#endif -#include +#include #ifdef AMREX_USE_FLOAT static constexpr bool use_float = true; @@ -81,16 +83,6 @@ void AnyFFT::SetBuffers (void* in, void* out, [[maybe_unused]] void* work_area) FFTW_MEASURE); } break; - case FFTType::R2C_1D_batched: - { - int n[1] = {m_plan->m_nx}; - m_plan->m_fftwf_plan = fftwf_plan_many_dft_r2c( - 1, n, m_plan->m_ny, - reinterpret_cast(in), nullptr, 1, m_plan->m_nx, - reinterpret_cast(out), nullptr, 1, m_plan->m_nx/2+1, - FFTW_MEASURE); - } - break; } } else { switch (m_plan->m_type) { @@ -134,16 +126,6 @@ void AnyFFT::SetBuffers (void* in, void* out, [[maybe_unused]] void* work_area) FFTW_MEASURE); } break; - case FFTType::R2C_1D_batched: - { - int n[1] = {m_plan->m_nx}; - m_plan->m_fftw_plan = fftw_plan_many_dft_r2c( - 1, n, m_plan->m_ny, - reinterpret_cast(in), nullptr, 1, m_plan->m_nx, - reinterpret_cast(out), nullptr, 1, m_plan->m_nx/2+1, - FFTW_MEASURE); - } - break; } } } @@ -166,3 +148,25 @@ AnyFFT::~AnyFFT () { delete m_plan; } } + +void AnyFFT::setup () { +#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) + if constexpr (use_float) { + fftwf_init_threads(); + fftwf_plan_with_nthreads(omp_get_max_threads()); + } else { + fftw_init_threads(); + fftw_plan_with_nthreads(omp_get_max_threads()); + } +#endif +} + +void AnyFFT::cleanup () { +#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) + if constexpr (use_float) { + fftwf_cleanup_threads(); + } else { + fftw_cleanup_threads(); + } +#endif +} diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 29d0ab3d09..b448dc1000 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -62,7 +62,7 @@ void assert_rocfft_status (std::string const& name, const rocfft_status& status) } std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { - // https://rocm.docs.amd.com/projects/rocFFT/en/latest/allapi.html + // https://rocm.docs.amd.com/projects/rocFFT/en/latest/reference/allapi.html# m_plan = new VendorPlan; rocfft_transform_type transform_type; @@ -108,12 +108,6 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { lengths[0] = nx; number_of_transforms = ny; break; - case FFTType::R2C_1D_batched: - transform_type = rocfft_transform_type_real_forward; - dimensions = 1; - lengths[0] = nx; - number_of_transforms = ny; - break; } rocfft_status status; @@ -175,3 +169,17 @@ AnyFFT::~AnyFFT () { delete m_plan; } } + +void AnyFFT::setup () { + rocfft_status status; + + status = rocfft_setup(); + assert_rocfft_status("rocfft_setup", status); +} + +void AnyFFT::cleanup () { + rocfft_status status; + + status = rocfft_cleanup(); + assert_rocfft_status("rocfft_cleanup", status); +} From dafd788455ed087925f8cf00c272364f912f39e5 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 22 Apr 2024 03:41:55 +0200 Subject: [PATCH 06/11] add doc --- docs/source/run/parameters.rst | 30 ++++++++++++++----- src/Hipace.H | 5 ++-- src/Hipace.cpp | 10 +++---- src/fields/Fields.cpp | 1 + .../FFTPoissonSolverDirichletDirect.H | 4 ++- .../FFTPoissonSolverDirichletDirect.cpp | 3 +- .../FFTPoissonSolverDirichletExpanded.H | 7 +++-- .../FFTPoissonSolverDirichletExpanded.cpp | 3 +- .../FFTPoissonSolverDirichletFast.H | 9 +++--- .../FFTPoissonSolverDirichletFast.cpp | 5 ++++ .../FFTPoissonSolverPeriodic.H | 4 ++- .../FFTPoissonSolverPeriodic.cpp | 1 + src/fields/fft_poisson_solver/fft/AnyFFT.H | 21 +++++++++++++ .../fft_poisson_solver/fft/WrapCuFFT.cpp | 3 ++ .../fft_poisson_solver/fft/WrapFFTW.cpp | 2 ++ .../fft_poisson_solver/fft/WrapRocFFT.cpp | 1 + src/laser/MultiLaser.H | 1 + src/laser/MultiLaser.cpp | 1 + 18 files changed, 86 insertions(+), 25 deletions(-) diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst index 1506f8bee6..3956633f23 100644 --- a/docs/source/run/parameters.rst +++ b/docs/source/run/parameters.rst @@ -255,15 +255,31 @@ The default is to use the explicit solver. **We strongly recommend to use the ex Which solver to use. Possible values: ``explicit`` and ``predictor-corrector``. -* ``fields.poisson_solver`` (`string`) optional (default `FFTDirichlet`) +* ``fields.poisson_solver`` (`string`) optional (default CPU: `FFTDirichletDirect`, GPU: `FFTDirichletFast`) Which Poisson solver to use for ``Psi``, ``Ez`` and ``Bz``. The ``predictor-corrector`` BxBy - solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are - ``FFTDirichlet``, ``FFTPeriodic`` and ``MGDirichlet``. + solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are: -* ``hipace.use_small_dst`` (`bool`) optional (default `0` or `1`) - Whether to use a large R2C or a small C2R fft in the dst of the Poisson solver. - The small dst is quicker for simulations with :math:`\geq 511` transverse grid points. - The default is set accordingly. + * ``FFTDirichletDirect`` Use the discrete sine transformation that is directly implemented + by FFTW to solve the Poisson equation with Dirichlet boundary conditions. + This option is only available when compiling for CPUs with FFTW. + Preferred resolution: :math:`2^N-1`. + + * ``FFTDirichletExpanded`` Perform the discrete sine transformation by symmetrically + expanding the field to twice its size. + Preferred resolution: :math:`2^N-1`. + + * ``FFTDirichletFast`` Perform the discrete sine transformation using a fast sine transform + algorithm that uses FFTs of the same size as the fields. + Preferred resolution: :math:`2^N-1`. + + * ``MGDirichlet`` Use the HiPACE++ multigrid solver to solve the Poisson equation with + Dirichlet boundary conditions. + Preferred resolution: :math:`2^N` and :math:`2^N-1`. + + * ``FFTPeriodic`` Use FFTs to solve the Poisson equation with Periodic boundary conditions. + Note that this does not work with features that change the boundary values, + like mesh refinement or open boundaries. + Preferred resolution: :math:`2^N`. * ``fields.extended_solve`` (`bool`) optional (default `0`) Extends the area of the FFT Poisson solver to the ghost cells. This can reduce artifacts diff --git a/src/Hipace.H b/src/Hipace.H index 7fc843f03c..ad83ba1365 100644 --- a/src/Hipace.H +++ b/src/Hipace.H @@ -44,6 +44,9 @@ struct Hipace_early_init * and Parser Constants */ Hipace_early_init (Hipace* instance); + /** Destructor for FFT cleanup */ + ~Hipace_early_init (); + /** Struct containing physical constants (which values depends on the unit system, determined * at runtime): SI or normalized units. */ PhysConst m_phys_const; @@ -73,8 +76,6 @@ public: * and initialize longitudinal and transverse MPI communicators */ Hipace (); - ~Hipace (); - /** Get singleton instance */ static Hipace& GetInstance (); diff --git a/src/Hipace.cpp b/src/Hipace.cpp index f5c71fc6c2..71164a11e7 100644 --- a/src/Hipace.cpp +++ b/src/Hipace.cpp @@ -57,6 +57,11 @@ Hipace_early_init::Hipace_early_init (Hipace* instance) AnyFFT::setup(); } +Hipace_early_init::~Hipace_early_init () +{ + AnyFFT::cleanup(); +} + Hipace& Hipace::GetInstance () { @@ -173,11 +178,6 @@ Hipace::Hipace () : } } -Hipace::~Hipace() -{ - AnyFFT::cleanup(); -} - void Hipace::InitData () { diff --git a/src/fields/Fields.cpp b/src/fields/Fields.cpp index 07917f2511..b222db5294 100644 --- a/src/fields/Fields.cpp +++ b/src/fields/Fields.cpp @@ -31,6 +31,7 @@ Fields::Fields (const int nlev) { amrex::ParmParse ppf("fields"); DeprecatedInput("fields", "do_dirichlet_poisson", "poisson_solver", ""); + // set default Poisson solver based on the platform #ifdef AMREX_USE_GPU m_poisson_solver_str = "FFTDirichletFast"; #else diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H index 86a3ab914f..78f3d5dc38 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H @@ -63,9 +63,11 @@ private: amrex::MultiFab m_tmpSpectralField; /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; - /** DST plans */ + /** forward DST plan */ AnyFFT m_forward_fft; + /** backward DST plan */ AnyFFT m_backward_fft; + /** work area for both DST plans */ amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp index bec25bd937..70da5ac1d8 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp @@ -87,6 +87,7 @@ FFTPoissonSolverDirichletDirect::define (amrex::BoxArray const& a_realspace_ba, std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + // Allocate work area for both FFTs m_fft_work_area.resize(std::max(fwd_area, bkw_area)); m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), @@ -101,7 +102,7 @@ FFTPoissonSolverDirichletDirect::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::SolvePoissonEquation()"); - m_forward_fft.Execute(); + m_forward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H index 79b6b79a8c..d54393dd72 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H @@ -63,12 +63,13 @@ private: amrex::MultiFab m_tmpSpectralField; /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; - + /** Twice expanded field that gets filled symmetrically */ amrex::FArrayBox m_expanded_position_array; - + /** Twice expanded Complex field */ amrex::BaseFab> m_expanded_fourier_array; - /** DST plan */ + /** FFT plan */ AnyFFT m_fft; + /** work area for FFT plan */ amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp index 0831a5e12f..82aef98292 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp @@ -132,10 +132,11 @@ FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba m_expanded_position_array.setVal(0._rt); - // Allocate and initialize the FFT plans + // Allocate and initialize the FFT plan std::size_t wrok_size = m_fft.Initialize(FFTType::R2C_2D, expanded_position_box.length(0), expanded_position_box.length(1)); + // Allocate work area for the FFT m_fft_work_area.resize(wrok_size); m_fft.SetBuffers(m_expanded_position_array.dataPtr(), m_expanded_fourier_array.dataPtr(), diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H index 8e47b367ae..c981181c7f 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H @@ -63,14 +63,15 @@ private: amrex::MultiFab m_tmpSpectralField; /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; - + /** Real array for the FFTs */ amrex::Gpu::DeviceVector m_position_array; - + /** Complex array for the FFTs */ amrex::Gpu::DeviceVector> m_fourier_array; - - /** DST plans */ + /** FFT plan in x direction */ AnyFFT m_x_fft; + /** FFT plan in y direction */ AnyFFT m_y_fft; + /** work area for both DST plans */ amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp index 587f773804..50f1547bca 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -223,6 +223,7 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, std::size_t fft_x_area = m_x_fft.Initialize(FFTType::C2R_1D_batched, nx+1, ny); std::size_t fft_y_area = m_y_fft.Initialize(FFTType::C2R_1D_batched, ny+1, nx); + // Allocate work area for both FFTs m_fft_work_area.resize(std::max(fft_x_area, fft_y_area)); m_x_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), @@ -245,6 +246,7 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) amrex::Real* const real_arr = m_position_array.dataPtr(); amrex::GpuComplex* comp_arr = m_fourier_array.dataPtr(); + // 1D DST in x ToComplex(pos_arr, comp_arr, nx, ny); m_x_fft.Execute(); @@ -253,6 +255,7 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) Transpose(pos_arr, fourier_arr, nx, ny); + // 1D DST in y ToComplex(fourier_arr, comp_arr, ny, nx); m_y_fft.Execute(); @@ -276,6 +279,7 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } + // 1D DST in x ToComplex(fourier_arr, comp_arr, nx, ny); m_x_fft.Execute(); @@ -284,6 +288,7 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) Transpose(fourier_arr, pos_arr, nx, ny); + // 1D DST in y ToComplex(pos_arr, comp_arr, ny, nx); m_y_fft.Execute(); diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H index 15b2195562..e18098764f 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H @@ -69,9 +69,11 @@ private: SpectralField m_tmpSpectralField; /** Multifab containing 1/(kx^2 + ky^2), to solve Poisson equation. */ amrex::MultiFab m_inv_k2; - /** FFT plans */ + /** forward FFT plan */ AnyFFT m_forward_fft; + /** backward FFT plan */ AnyFFT m_backward_fft; + /** work area for both FFT plans */ amrex::Gpu::DeviceVector m_fft_work_area; }; diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp index 21089be089..b37c13d353 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp @@ -98,6 +98,7 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba, std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2C_2D, fft_size[0], fft_size[1]); std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2R_2D, fft_size[0], fft_size[1]); + // Allocate work area for both FFTs m_fft_work_area.resize(std::max(fwd_area, bkw_area)); m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index 9613c4f7b2..7044682416 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -24,19 +24,40 @@ enum struct FFTType { struct AnyFFT { + /** \brief Initialize an FFT plan for the requested transform type using a Vendor FFT library. + * For 1D batched transforms, ny represents the number of batches to calculate at once. + * This function returns the number of bytes of the work area needed for the FFT. The work area + * has to be allocated by the function that uses the FFT and passed into SetBuffers. + * + * \param[in] type Type of FFT to perform + * \param[in] nx Size of the contiguous dimension of the FFT + * \param[in] ny Size of the second dimension of the FFT + */ std::size_t Initialize (FFTType type, int nx, int ny); + /** \brief Set the pointers to the input, output and work area of the FFT. + * This function has to be called after Initialize and before Execute. + * + * \param[in] in Pointer to the input of the FFT + * \param[in] out Pointer to the output of the FFT + * \param[in] work_area Pointer to the work area for the FFT + */ void SetBuffers (void* in, void* out, void* work_area); + /** \brief Perform the initialized FFT */ void Execute (); + /** \brief Destructor to destroy the FFT plan */ ~AnyFFT (); + /** \brief Setup function that has to be called before any FFT plan is initialized. */ static void setup (); + /** \brief Cleanup function that has to be called at the end of the program. */ static void cleanup (); private: + /** Vendor specific data for the FFT */ VendorPlan* m_plan = nullptr; }; diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index c5691451cc..f5615a9088 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -64,6 +64,7 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { m_plan->m_type = type; cufftType transform_type; int rank = 0; + // n is in C order long long int n[2] = {0, 0}; long long int batch = 0; @@ -147,6 +148,8 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { void AnyFFT::Execute () { cufftResult result; + // There is also cufftXtExec that could replace all of these specific Exec calls, + // however in testing it doesn't work if constexpr (use_float) { switch (m_plan->m_type) { case FFTType::C2C_2D_fwd: diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 79c8a69dbd..24b2e8009f 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -37,6 +37,8 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { m_plan->m_type = type; m_plan->m_nx = nx; m_plan->m_ny = ny; + // fftw doesn't allow for the manual allocation of work area, additionally the input and output + // arrays have to be provided when planing, so we do all the work in the SetBuffers function. return 0; } diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index b448dc1000..95aa27246e 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -67,6 +67,7 @@ std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { rocfft_transform_type transform_type; std::size_t dimensions = 0; + // lengths is in FORTRAN order std::size_t lengths[2] = {0, 0}; std::size_t number_of_transforms = 0; diff --git a/src/laser/MultiLaser.H b/src/laser/MultiLaser.H index 3eff5e6a3b..715c2ef9ad 100644 --- a/src/laser/MultiLaser.H +++ b/src/laser/MultiLaser.H @@ -214,6 +214,7 @@ private: AnyFFT m_forward_fft; /** FFTW plan for backward C2C transform to solve Complex Poisson equation */ AnyFFT m_backward_fft; + /** work area for both FFT plans */ amrex::Gpu::DeviceVector m_fft_work_area; /** Complex FAB to store the solution (e.g. laser envelope on current slice) */ SpectralFieldLoc m_sol; diff --git a/src/laser/MultiLaser.cpp b/src/laser/MultiLaser.cpp index ec0828d153..de5ef83305 100644 --- a/src/laser/MultiLaser.cpp +++ b/src/laser/MultiLaser.cpp @@ -98,6 +98,7 @@ MultiLaser::InitData (const amrex::BoxArray& slice_ba, std::size_t fwd_area = m_forward_fft.Initialize(FFTType::C2C_2D_fwd, fft_size[0], fft_size[1]); std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2C_2D_bkw, fft_size[0], fft_size[1]); + // Allocate work area for both FFTs m_fft_work_area.resize(std::max(fwd_area, bkw_area)); m_forward_fft.SetBuffers(m_rhs.dataPtr(), m_rhs_fourier.dataPtr(), m_fft_work_area.dataPtr()); From 4ee03e99d97c4ee1f2a6115bf1d97c8169b3e968 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 22 Apr 2024 03:46:50 +0200 Subject: [PATCH 07/11] fix doc --- src/fields/fft_poisson_solver/fft/AnyFFT.H | 4 ++-- src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index 7044682416..5ed8736703 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -27,7 +27,7 @@ struct AnyFFT { /** \brief Initialize an FFT plan for the requested transform type using a Vendor FFT library. * For 1D batched transforms, ny represents the number of batches to calculate at once. * This function returns the number of bytes of the work area needed for the FFT. The work area - * has to be allocated by the function that uses the FFT and passed into SetBuffers. + * has to be allocated by the function that uses the FFT and passed into SetBuffers. * * \param[in] type Type of FFT to perform * \param[in] nx Size of the contiguous dimension of the FFT @@ -36,7 +36,7 @@ struct AnyFFT { std::size_t Initialize (FFTType type, int nx, int ny); /** \brief Set the pointers to the input, output and work area of the FFT. - * This function has to be called after Initialize and before Execute. + * This function has to be called after Initialize and before Execute. * * \param[in] in Pointer to the input of the FFT * \param[in] out Pointer to the output of the FFT diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index f5615a9088..69d166c4d4 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -148,7 +148,7 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { void AnyFFT::Execute () { cufftResult result; - // There is also cufftXtExec that could replace all of these specific Exec calls, + // There is also cufftXtExec that could replace all of these specific Exec calls, // however in testing it doesn't work if constexpr (use_float) { switch (m_plan->m_type) { From c9c983fc0bbba54dcb6c8f74da342356a0b4b4d3 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 22 Apr 2024 17:00:25 +0200 Subject: [PATCH 08/11] fix index bug --- src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp index b37c13d353..c83e4af64c 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp @@ -118,7 +118,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf) #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) #endif - for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + for ( amrex::MFIter mfi(m_tmpSpectralField, DfltMfiTlng); mfi.isValid(); ++mfi ){ // Solve Poisson equation in Fourier space: // Multiply `tmpSpectralField` by inv_k2 Array2> tmp_cmplx_arr = m_tmpSpectralField.array(mfi); From db8c077c5f3cb0b9a0d5fbfe4f8564ebb226b738 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 22 Apr 2024 19:25:56 +0200 Subject: [PATCH 09/11] fix rocfft workbuffer --- src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 95aa27246e..0fd52045f7 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -50,7 +50,9 @@ std::string rocfftErrorToString (const rocfft_status& err) { return std::string("rocfft_status_invalid_distance"); } else if (err == rocfft_status_invalid_offset) { return std::string("rocfft_status_invalid_offset"); - } else { + } else if (err == rocfft_status_invalid_work_buffer) { + return std::string("rocfft_status_invalid_work_buffer"); + }else { return std::to_string(err) + " (unknown error code)"; } } @@ -139,9 +141,11 @@ void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { status = rocfft_execution_info_create(&(m_plan->m_execinfo)); assert_rocfft_status("rocfft_execution_info_create", status); - status = rocfft_execution_info_set_work_buffer( - m_plan->m_execinfo, work_area, m_plan->m_work_size); - assert_rocfft_status("rocfft_execution_info_set_work_buffer", status); + if (m_plan->m_work_size > 0) { + status = rocfft_execution_info_set_work_buffer( + m_plan->m_execinfo, work_area, m_plan->m_work_size); + assert_rocfft_status("rocfft_execution_info_set_work_buffer", status); + } status = rocfft_execution_info_set_stream(m_plan->m_execinfo, amrex::Gpu::gpuStream()); assert_rocfft_status("rocfft_execution_info_set_stream", status); From 3264e3c43c5be8b0c1172b7b582528a55a6bc92f Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 22 Apr 2024 19:37:26 +0200 Subject: [PATCH 10/11] fix FFTDirichletExpanded for rocfft --- .../fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp index 82aef98292..5b1c3ad258 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp @@ -148,6 +148,9 @@ void FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::SolvePoissonEquation()"); + using namespace amrex::literals; + + m_expanded_position_array.setVal(0._rt); ExpandR2R(m_expanded_position_array, m_stagingArea[0]); @@ -170,6 +173,8 @@ FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf }); } + m_expanded_position_array.setVal(0._rt); + ExpandR2R(m_expanded_position_array, m_tmpSpectralField[0]); m_fft.Execute(); From 7a77d22c1f635e02472c41e194499a6735517915 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Thu, 2 May 2024 20:52:43 +0200 Subject: [PATCH 11/11] add old headers back --- src/fields/fft_poisson_solver/fft/AnyFFT.H | 10 ++++++++-- src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp | 9 +++++++-- src/fields/fft_poisson_solver/fft/WrapFFTW.cpp | 9 +++++++-- src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp | 9 +++++++-- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index 5ed8736703..eb07cc1cb4 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -1,8 +1,14 @@ -/* Copyright 2024 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: AlexanderSinn + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs + * WeiqunZhang + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. * * License: BSD-3-Clause-LBNL */ diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index 69d166c4d4..84471c665c 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -1,8 +1,13 @@ -/* Copyright 2024 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: AlexanderSinn + * Authors: AlexanderSinn, MaxThevenet, Remi Lehe, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. * * License: BSD-3-Clause-LBNL */ diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 24b2e8009f..e3dc010111 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -1,8 +1,13 @@ -/* Copyright 2024 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: AlexanderSinn + * Authors: AlexanderSinn, MaxThevenet, Remi Lehe + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. * * License: BSD-3-Clause-LBNL */ diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 0fd52045f7..18cd7ca0d9 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -1,8 +1,13 @@ -/* Copyright 2024 +/* Copyright 2021-2024 * * This file is part of HiPACE++. * - * Authors: AlexanderSinn + * Authors: AlexanderSinn, Axel Huebl + * License: BSD-3-Clause-LBNL + */ +/* Copyright 2019-2020 + * + * This file is part of WarpX. * * License: BSD-3-Clause-LBNL */