From 50e25ec7ff2175a6377b777a5dcc060b7ad1c4d6 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 9 Dec 2024 11:41:57 -0800 Subject: [PATCH] AMReX Algebra: AlgVector and SpMatrix (#4259) This implements distributed vector and sparse matrix. The functionalities are limited so far. However, there are enough functions implemented for a simple matrix and vector based GMRES solver with a weighted Jacobi smoother. This is still experimental and is not ready for users yet. --- .../dependencies/dependencies_hip.sh | 3 +- .../dependencies/dependencies_nvcc.sh | 3 +- Src/Base/AMReX_TableData.H | 38 +- Src/LinearSolvers/AMReX_AlgPartition.H | 58 ++ Src/LinearSolvers/AMReX_AlgPartition.cpp | 102 +++ Src/LinearSolvers/AMReX_AlgVector.H | 453 ++++++++++++ Src/LinearSolvers/AMReX_Algebra.H | 9 + Src/LinearSolvers/AMReX_GMRES_MV.H | 160 +++++ Src/LinearSolvers/AMReX_Smoother_MV.H | 46 ++ Src/LinearSolvers/AMReX_SpMV.H | 192 ++++++ Src/LinearSolvers/AMReX_SpMatrix.H | 645 ++++++++++++++++++ Src/LinearSolvers/CMakeLists.txt | 8 + Src/LinearSolvers/Make.package | 11 + Tests/Algebra/GMRES/CMakeLists.txt | 9 + Tests/Algebra/GMRES/GNUmakefile | 17 + Tests/Algebra/GMRES/Make.package | 1 + Tests/Algebra/GMRES/main.cpp | 119 ++++ Tests/CMakeLists.txt | 2 +- Tools/CMake/AMReXParallelBackends.cmake | 23 +- Tools/GNUMake/Make.defs | 2 +- Tools/GNUMake/comps/hip.mak | 10 +- 21 files changed, 1883 insertions(+), 28 deletions(-) create mode 100644 Src/LinearSolvers/AMReX_AlgPartition.H create mode 100644 Src/LinearSolvers/AMReX_AlgPartition.cpp create mode 100644 Src/LinearSolvers/AMReX_AlgVector.H create mode 100644 Src/LinearSolvers/AMReX_Algebra.H create mode 100644 Src/LinearSolvers/AMReX_GMRES_MV.H create mode 100644 Src/LinearSolvers/AMReX_Smoother_MV.H create mode 100644 Src/LinearSolvers/AMReX_SpMV.H create mode 100644 Src/LinearSolvers/AMReX_SpMatrix.H create mode 100644 Tests/Algebra/GMRES/CMakeLists.txt create mode 100644 Tests/Algebra/GMRES/GNUmakefile create mode 100644 Tests/Algebra/GMRES/Make.package create mode 100644 Tests/Algebra/GMRES/main.cpp diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 6b69c5433a2..527379e7e83 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -58,7 +58,8 @@ sudo apt-get install -y --no-install-recommends \ rocprofiler-dev \ rocrand-dev \ rocfft-dev \ - rocprim-dev + rocprim-dev \ + rocsparse-dev # hiprand-dev is a new package that does not exist in old versions sudo apt-get install -y --no-install-recommends hiprand-dev || true diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh index 2578bd33fe7..14bae699d7e 100755 --- a/.github/workflows/dependencies/dependencies_nvcc.sh +++ b/.github/workflows/dependencies/dependencies_nvcc.sh @@ -36,5 +36,6 @@ sudo apt-get install -y \ cuda-nvml-dev-$VERSION_DASHED \ cuda-nvtx-$VERSION_DASHED \ libcufft-dev-$VERSION_DASHED \ - libcurand-dev-$VERSION_DASHED + libcurand-dev-$VERSION_DASHED \ + libcusparse-dev-$VERSION_DASHED sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index ee2471d36cb..7f58a6d6333 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -15,12 +15,12 @@ namespace amrex { -template +template struct Table1D { T* AMREX_RESTRICT p = nullptr; - int begin = 1; - int end = 0; + IDX begin = 1; + IDX end = 0; constexpr Table1D () noexcept = default; @@ -33,7 +33,7 @@ struct Table1D {} AMREX_GPU_HOST_DEVICE - constexpr Table1D (T* a_p, int a_begin, int a_end) noexcept + constexpr Table1D (T* a_p, IDX a_begin, IDX a_end) noexcept : p(a_p), begin(a_begin), end(a_end) @@ -44,7 +44,7 @@ struct Table1D template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - U& operator() (int i) const noexcept { + U& operator() (IDX i) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i); #endif @@ -53,14 +53,30 @@ struct Table1D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) AMREX_GPU_HOST_DEVICE inline - void index_assert (int i) const + void index_assert (IDX i) const { if (i < begin || i >= end) { - AMREX_IF_ON_DEVICE(( - AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); - )) + if constexpr (std::is_same_v) { + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + } else if constexpr (std::is_same_v) { + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%ld) is out of bound (%ld:%ld)\n", + i, begin, end-1); + amrex::Abort(); + )) + } else if constexpr (std::is_same_v) { + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%lld) is out of bound (%lld:%lld)\n", + i, begin, end-1); + amrex::Abort(); + )) + } else { + AMREX_IF_ON_DEVICE(( amrex::Abort(" Out of bound\n"); )) + } AMREX_IF_ON_HOST(( std::stringstream ss; ss << " (" << i << ") is out of bound (" diff --git a/Src/LinearSolvers/AMReX_AlgPartition.H b/Src/LinearSolvers/AMReX_AlgPartition.H new file mode 100644 index 00000000000..d10284cb80a --- /dev/null +++ b/Src/LinearSolvers/AMReX_AlgPartition.H @@ -0,0 +1,58 @@ +#ifndef AMREX_ALG_PARTITION_H_ +#define AMREX_ALG_PARTITION_H_ +#include + +#include +#include +#include + +#include + +namespace amrex { + +class AlgPartition +{ +public: + AlgPartition (); + explicit AlgPartition (Long global_size); + explicit AlgPartition (Vector const& rows); + explicit AlgPartition (Vector&& rows) noexcept; + + void define (Long global_size); + void define (Vector const& rows); + void define (Vector&& rows); + + [[nodiscard]] bool empty () const { return m_ref->m_row.empty(); } + + [[nodiscard]] Long operator[] (int i) const { return m_ref->m_row[i]; } + [[nodiscard]] Long numGlobalRows () const { return m_ref->m_row.back(); } + [[nodiscard]] int numActiveProcs () const { return m_ref->m_n_active_procs; } + + [[nodiscard]] Vector const& dataVector () const { return m_ref->m_row; } + + [[nodiscard]] bool operator== (AlgPartition const& rhs) const noexcept; + [[nodiscard]] bool operator!= (AlgPartition const& rhs) const noexcept; + +private: + struct Ref + { + friend class AlgPartition; + Ref () = default; + explicit Ref (Long global_size); + explicit Ref (Vector const& rows); + explicit Ref (Vector&& rows); + void define (Long global_size); + void define (Vector const& rows); + void define (Vector&& rows); + void update_n_active_procs (); + + Vector m_row; // size: nprocs + 1 + int m_n_active_procs = 0; + }; + + std::shared_ptr m_ref; +}; + +} + +#endif diff --git a/Src/LinearSolvers/AMReX_AlgPartition.cpp b/Src/LinearSolvers/AMReX_AlgPartition.cpp new file mode 100644 index 00000000000..766b38d0e90 --- /dev/null +++ b/Src/LinearSolvers/AMReX_AlgPartition.cpp @@ -0,0 +1,102 @@ +#include + +namespace amrex { + +AlgPartition::AlgPartition () + : m_ref(std::make_shared()) +{} + +AlgPartition::AlgPartition (Long global_size) + : m_ref(std::make_shared(global_size)) +{} + +AlgPartition::AlgPartition (Vector const& rows) + : m_ref(std::make_shared(rows)) +{} + +AlgPartition::AlgPartition (Vector&& rows) noexcept + : m_ref(std::make_shared(std::move(rows))) +{} + +void AlgPartition::define (Long global_size) +{ + m_ref->define(global_size); +} + +void AlgPartition::define (Vector const& rows) +{ + m_ref->define(rows); +} + +void AlgPartition::define (Vector&& rows) +{ + m_ref->define(std::move(rows)); +} + +bool AlgPartition::operator== (AlgPartition const& rhs) const noexcept +{ + return m_ref == rhs.m_ref || m_ref->m_row == rhs.m_ref->m_row; +} + +bool AlgPartition::operator!= (AlgPartition const& rhs) const noexcept +{ + return !operator==(rhs); +} + +AlgPartition::Ref::Ref (Long global_size) +{ + define(global_size); +} + +AlgPartition::Ref::Ref (Vector const& rows) + : m_row(rows) +{ + update_n_active_procs(); +} + +AlgPartition::Ref::Ref (Vector&& rows) + : m_row(std::move(rows)) +{ + update_n_active_procs(); +} + +void AlgPartition::Ref::define (Long global_size) +{ + auto nprocs = Long(ParallelDescriptor::NProcs()); + Long sz = global_size / nprocs; + Long extra = global_size - sz*nprocs; + m_row.resize(nprocs+1); + for (Long i = 0; i < nprocs; ++i) { + if (i < extra) { + m_row[i] = i*(sz+1); + } else { + m_row[i] = i*sz + extra; + } + } + m_row[nprocs] = global_size; + + update_n_active_procs(); +} + +void AlgPartition::Ref::define (Vector const& rows) +{ + m_row = rows; + update_n_active_procs(); +} + +void AlgPartition::Ref::define (Vector&& rows) +{ + m_row = std::move(rows); + update_n_active_procs(); +} + +void AlgPartition::Ref::update_n_active_procs () +{ + AMREX_ASSERT(m_row.size() == ParallelDescriptor::NProcs()+1); + m_n_active_procs = 0; + for (int i = 0, N = int(m_row.size())-1; i < N; ++i) { + if (m_row[i] < m_row[i+1]) { ++m_n_active_procs; } + } +} + +} diff --git a/Src/LinearSolvers/AMReX_AlgVector.H b/Src/LinearSolvers/AMReX_AlgVector.H new file mode 100644 index 00000000000..f2cb7b45b17 --- /dev/null +++ b/Src/LinearSolvers/AMReX_AlgVector.H @@ -0,0 +1,453 @@ +#ifndef AMREX_ALG_VECTOR_H_ +#define AMREX_ALG_VECTOR_H_ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace amrex { + +template > +class AlgVector +{ +public: + using value_type = T; + using allocator_type = Allocator; + + using Vec = PODVector; + + AlgVector () = default; + explicit AlgVector (Long global_size); + explicit AlgVector (AlgPartition partition); + + AlgVector (AlgVector const&) = delete; + AlgVector& operator= (AlgVector const&) = delete; + + AlgVector (AlgVector &&) noexcept = default; + AlgVector& operator= (AlgVector &&) noexcept = default; + + ~AlgVector () = default; + + void define (Long global_size); + void define (AlgPartition partition); + + [[nodiscard]] bool empty () const { return m_partition.empty(); } + + [[nodiscard]] AlgPartition const& partition () const { return m_partition; } + + [[nodiscard]] Long numLocalRows () const { return m_end - m_begin; } + [[ nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); } + + //! Inclusive global index begin. + [[nodiscard]] Long globalBegin () const { return m_begin; } + //! Exclusive global index end. + [[nodiscard]] Long globalEnd () const { return m_end; } + + [[nodiscard]] T const* data () const { return m_data.data(); } + [[nodiscard]] T * data () { return m_data.data(); } + + [[nodiscard]] AMREX_FORCE_INLINE + Table1D view () const { + return Table1D{m_data.data(), m_begin, m_end}; + } + + [[nodiscard]] AMREX_FORCE_INLINE + Table1D const_view () const { + return Table1D{m_data.data(), m_begin, m_end}; + } + + [[nodiscard]] AMREX_FORCE_INLINE + Table1D view () { + return Table1D{m_data.data(), m_begin, m_end}; + } + + void setVal (T val); + void setValAsync (T val); + + void copy (AlgVector const& rhs); + void copyAsync (AlgVector const& rhs); + + void plus (AlgVector const& rhs); + void plusAsync (AlgVector const& rhs); + + void scale (T scale_factor); + void scaleAsync (T scale_factor); + + [[nodiscard]] T sum (bool local = false) const; + + [[nodiscard]] T norminf (bool local = false) const; + [[nodiscard]] T norm2 (bool local = false) const; + + template ::value && + std::is_same_v, int> = 0> + void copyFrom (FabArray const& fa); + + template ::value && + std::is_same_v,int> = 0> + void copyTo (FabArray & fa) const; + + void printToFile (std::string const& file) const; + +private: + AlgPartition m_partition; + Long m_begin = 0; + Long m_end = 0; + Vec m_data; +}; + +template +AlgVector::AlgVector (Long global_size) + : m_partition(global_size), + m_begin(m_partition[ParallelDescriptor::MyProc()]), + m_end(m_partition[ParallelDescriptor::MyProc()+1]), + m_data(m_end-m_begin) +{ + static_assert(std::is_floating_point::value, "AlgVector is for floating point type only"); +} + +template +AlgVector::AlgVector (AlgPartition partition) + : m_partition(std::move(partition)), + m_begin(m_partition[ParallelDescriptor::MyProc()]), + m_end(m_partition[ParallelDescriptor::MyProc()+1]), + m_data(m_end-m_begin) +{ + static_assert(std::is_floating_point::value, "AlgVector is for floating point type only"); +} + +template +void AlgVector::define (Long global_size) +{ + m_partition.define(global_size); + m_begin = m_partition[ParallelDescriptor::MyProc()]; + m_end = m_partition[ParallelDescriptor::MyProc()+1]; + m_data.resize(m_end-m_begin); +} + +template +void AlgVector::define (AlgPartition partition) +{ + m_partition = std::move(partition); + m_begin = m_partition[ParallelDescriptor::MyProc()]; + m_end = m_partition[ParallelDescriptor::MyProc()+1]; + m_data.resize(m_end-m_begin); +} + +template +void AlgVector::setVal (T val) +{ + setValAsync(val); + Gpu::streamSynchronize(); +} + +template +void AlgVector::setValAsync (T val) +{ + Long n = m_data.size(); + T* p = m_data.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { p[i] = val; }); +} + +template +void AlgVector::copy (AlgVector const& rhs) +{ + copyAsync(rhs); + Gpu::streamSynchronize(); +} + +template +void AlgVector::copyAsync (AlgVector const& rhs) +{ + Long n = m_data.size(); + AMREX_ASSERT(m_data.size() == rhs.m_data.size()); + T* dst = m_data.data(); + T const* src = rhs.data(); +#ifdef AMREX_USE_GPU + Gpu::dtod_memcpy_async(dst, src, n*sizeof(T)); +#else + std::memcpy(dst, src, n*sizeof(T)); +#endif +} + +template +void AlgVector::plus (AlgVector const& rhs) +{ + plusAsync(rhs); + Gpu::streamSynchronize(); +} + +template +void AlgVector::plusAsync (AlgVector const& rhs) +{ + Long n = m_data.size(); + AMREX_ASSERT(m_data.size() == rhs.m_data.size()); + T* dst = m_data.data(); + T const* src = rhs.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { dst[i] += src[i]; }); +} + +template +void AlgVector::scale (T scale_factor) +{ + scaleAsync(scale_factor); + Gpu::streamSynchronize(); +} + +template +void AlgVector::scaleAsync (T scale_factor) +{ + Long n = m_data.size(); + T* p = m_data.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { p[i] *= scale_factor; }); +} + +template +T AlgVector::sum (bool local) const +{ + Long n = m_data.size(); + T const* p = m_data.data(); + T r = Reduce::Sum(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + return p[i]; + }); + if (!local) { + ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template +T AlgVector::norminf (bool local) const +{ + Long n = m_data.size(); + T const* p = m_data.data(); + T r = Reduce::Max(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + return amrex::Math::abs(p[i]); + }); + if (!local) { + ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template +T AlgVector::norm2 (bool local) const +{ + Long n = m_data.size(); + T const* p = m_data.data(); + T r = Reduce::Sum(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + return p[i]*p[i]; + }); + if (!local) { + ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub()); + } + return std::sqrt(r); +} + +template +template ::value && + std::is_same_v, int> > +void AlgVector::copyFrom (FabArray const& fa) +{ + AMREX_ASSERT(fa.is_cell_centered()); + + LayoutData dptrs(fa.boxArray(), fa.DistributionMap()); + T* p = m_data.data(); + for (MFIter mfi(fa); mfi.isValid(); ++mfi) { + dptrs[mfi] = p; + p += mfi.validbox().numPts(); + } + +#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU) +#pragma omp parallel +#endif + for (MFIter mfi(fa); mfi.isValid(); ++mfi) { + fa[mfi].template copyToMem(mfi.validbox(), 0, 1, dptrs[mfi]); + } +} + +template +template ::value && + std::is_same_v, int> > +void AlgVector::copyTo (FabArray & fa) const +{ + AMREX_ASSERT(fa.is_cell_centered()); + + LayoutData dptrs(fa.boxArray(), fa.DistributionMap()); + T const* p = m_data.data(); + for (MFIter mfi(fa); mfi.isValid(); ++mfi) { + dptrs[mfi] = p; + p += mfi.validbox().numPts(); + } + +#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU) +#pragma omp parallel +#endif + for (MFIter mfi(fa); mfi.isValid(); ++mfi) { + fa[mfi].template copyFromMem(mfi.validbox(), 0, 1, dptrs[mfi]); + } +} + +template +void AlgVector::printToFile (std::string const& file) const +{ + std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc())); + ofs << m_begin << " " << m_end << "\n"; +#ifdef AMREX_USE_GPU + Gpu::PinnedVector hv(m_data.size()); + Gpu::dtoh_memcpy_async(hv.data(), m_data.data(), m_data.size()*sizeof(T)); + Gpu::streamSynchronize(); + T const* p = hv.data(); +#else + T const* p = m_data; +#endif + for (Long i = 0, N = m_data.size(); i < N; ++i) { + ofs << i+m_begin << " " << p[i] << "\n"; + } +} + +template struct IsAlgVector : std::false_type {}; +// +template +struct IsAlgVector, + V> > > + : std::true_type {}; + +template +std::enable_if_t >::value> +ForEach (V1 & x, F const& f) +{ + Long n = x.numLocalRows(); + auto* px = x.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + f(px[i]); + }); +} + +template +std::enable_if_t >::value && + IsAlgVector >::value> +ForEach (V1 & x, V2 & y, F const& f) +{ + AMREX_ASSERT(x.numLocalRows() == y.numLocalRows()); + Long n = x.numLocalRows(); + auto* AMREX_RESTRICT px = x.data(); + auto* AMREX_RESTRICT py = y.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + f(px[i], py[i]); + }); +} + +template +std::enable_if_t >::value && + IsAlgVector >::value && + IsAlgVector >::value> +ForEach (V1 & x, V2 & y, V3 & z, F const& f) +{ + AMREX_ASSERT(x.numLocalRows() == y.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == z.numLocalRows()); + Long n = x.numLocalRows(); + auto* AMREX_RESTRICT px = x.data(); + auto* AMREX_RESTRICT py = y.data(); + auto* AMREX_RESTRICT pz = z.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + f(px[i], py[i], pz[i]); + }); +} + +template +std::enable_if_t >::value && + IsAlgVector >::value && + IsAlgVector >::value && + IsAlgVector >::value> +ForEach (V1 & x, V2 & y, V3 & z, V4 & a, F const& f) +{ + AMREX_ASSERT(x.numLocalRows() == y.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == z.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == a.numLocalRows()); + Long n = x.numLocalRows(); + auto* AMREX_RESTRICT px = x.data(); + auto* AMREX_RESTRICT py = y.data(); + auto* AMREX_RESTRICT pz = z.data(); + auto* AMREX_RESTRICT pa = a.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + f(px[i], py[i], pz[i], pa[i]); + }); +} + +template +std::enable_if_t >::value && + IsAlgVector >::value && + IsAlgVector >::value && + IsAlgVector >::value && + IsAlgVector >::value> +ForEach (V1 & x, V2 & y, V3 & z, V4 & a, V5 & b, F const& f) +{ + AMREX_ASSERT(x.numLocalRows() == y.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == z.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == a.numLocalRows()); + AMREX_ASSERT(x.numLocalRows() == b.numLocalRows()); + Long n = x.numLocalRows(); + auto* AMREX_RESTRICT px = x.data(); + auto* AMREX_RESTRICT py = y.data(); + auto* AMREX_RESTRICT pz = z.data(); + auto* AMREX_RESTRICT pa = a.data(); + auto* AMREX_RESTRICT pb = b.data(); + ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + f(px[i], py[i], pz[i], pa[i], pb[i]); + }); +} + +template +T Dot (AlgVector const& x, AlgVector const& y, bool local = false) +{ + AMREX_ASSERT(x.numLocalRows() == y.numLocalRows()); + Long n = x.numLocalRows(); + auto const* px = x.data(); + auto const* py = y.data(); + T r = Reduce::Sum(n, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + return px[i] * py[i]; + }); + if (!local) { + ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template +void Axpy (AlgVector& y, T a, AlgVector const& x, bool async = false) +{ + ForEach(y, x, [=] AMREX_GPU_DEVICE (T& yi, T const& xi) { yi += a*xi; }); + if (!async) { Gpu::streamSynchronize(); } +} + +template +void LinComb (AlgVector& y, T a, AlgVector const& xa, T b, AlgVector const& xb, bool async = false) +{ + ForEach(y, xa, xb, [=] AMREX_GPU_DEVICE (T& yi, T const& xai, T const& xbi) { + yi = a*xai + b*xbi; + }); + if (!async) { Gpu::streamSynchronize(); } +} + +} + +#endif diff --git a/Src/LinearSolvers/AMReX_Algebra.H b/Src/LinearSolvers/AMReX_Algebra.H new file mode 100644 index 00000000000..86c1d11f86c --- /dev/null +++ b/Src/LinearSolvers/AMReX_Algebra.H @@ -0,0 +1,9 @@ +#ifndef AMREX_ALGEBRA_H_ +#define AMREX_ALGEBRA_H_ + +#include +#include +#include +#include + +#endif diff --git a/Src/LinearSolvers/AMReX_GMRES_MV.H b/Src/LinearSolvers/AMReX_GMRES_MV.H new file mode 100644 index 00000000000..22eb2ad3957 --- /dev/null +++ b/Src/LinearSolvers/AMReX_GMRES_MV.H @@ -0,0 +1,160 @@ +#ifndef AMREX_GMRES_MV_H_ +#define AMREX_GMRES_MV_H_ + +#include +#include +#include +#include + +namespace amrex { + +template +class GMRES_MV +{ +public: + using RT = Real; // double or float + using VEC = AlgVector; + using MAT = SpMatrix; + using GM = amrex::GMRES>; + using PC = std::function; + + GMRES_MV (MAT const* a_mat); + + void setPrecond (PC a_pc) { m_pc = std::move(a_pc); } + + /** + * \brief Solve the linear system + * + * \param a_sol unknowns, i.e., x in A x = b. + * \param a_rhs RHS, i.e., b in A x = b. + * \param a_tol_rel relative tolerance. + * \param a_tol_abs absolute tolerance. + */ + void solve (VEC& a_sol, VEC const& a_rhs, T a_tol_rel, T a_tol_abs); + + //! Sets verbosity. + void setVerbose (int v) { m_gmres.setVerbose(v); } + + //! Get the GMRES object. + GM& getGMRES () { return m_gmres; } + + //! Make MultiFab without ghost cells + [[nodiscard]] VEC makeVecRHS () const; + + //! Make MultiFab with ghost cells and set ghost cells to zero + [[nodiscard]] VEC makeVecLHS () const; + + static T norm2 (VEC const& vec); + + static void scale (VEC& vec, T scale_factor); + + static T dotProduct (VEC const& vec1, VEC const& vec2); + + //! lhs = 0 + static void setToZero (VEC& lhs); + + //! lhs = rhs + static void assign (VEC& lhs, VEC const& rhs); + + //! lhs += a*rhs + static void increment (VEC& lhs, VEC const& rhs, T a); + + //! lhs = a*rhs_a + b*rhs_b + static void linComb (VEC& lhs, T a, VEC const& rhs_a, T b, VEC const& rhs_b); + + //! lhs = L(rhs) + void apply (VEC& lhs, VEC& rhs) const; + + void precond (VEC& lhs, VEC const& rhs) const; + +private: + GM m_gmres; + MAT const* m_mat = nullptr; + PC m_pc; +}; + +template +GMRES_MV::GMRES_MV (MAT const* a_mat) + : m_mat(a_mat) +{ + m_gmres.define(*this); +} + +template +void GMRES_MV::solve (VEC& a_sol, VEC const& a_rhs, T a_tol_rel, T a_tol_abs) +{ + m_gmres.solve(a_sol, a_rhs, a_tol_rel, a_tol_abs); +} + +template +auto GMRES_MV::makeVecRHS () const -> VEC +{ + return VEC(m_mat->partition()); +} + +template +auto GMRES_MV::makeVecLHS () const -> VEC +{ + return VEC(m_mat->partition()); +} + +template +T GMRES_MV::norm2 (VEC const& vec) +{ + return vec.norm2(); +} + +template +void GMRES_MV::scale (VEC& vec, T scale_factor) +{ + vec.scaleAsync(scale_factor); +} + +template +T GMRES_MV::dotProduct (VEC const& vec1, VEC const& vec2) +{ + return amrex::Dot(vec1,vec2); +} + +template +void GMRES_MV::setToZero (VEC& lhs) +{ + lhs.setValAsync(0); +} + +template +void GMRES_MV::assign (VEC& lhs, VEC const& rhs) +{ + lhs.copyAsync(rhs); +} + +template +void GMRES_MV::increment (VEC& lhs, VEC const& rhs, T a) +{ + amrex::Axpy(lhs, a, rhs, true); +} + +template +void GMRES_MV::linComb (VEC& lhs, T a, VEC const& rhs_a, T b, VEC const& rhs_b) +{ + amrex::LinComb(lhs, a, rhs_a, b, rhs_b, true); +} + +template +void GMRES_MV::apply (VEC& lhs, VEC& rhs) const +{ + amrex::SpMV(lhs, *m_mat, rhs); +} + +template +void GMRES_MV::precond (VEC& lhs, VEC const& rhs) const +{ + if (m_pc) { + m_pc(lhs, rhs); + } else { + lhs.copyAsync(rhs); + } +} + +} +#endif diff --git a/Src/LinearSolvers/AMReX_Smoother_MV.H b/Src/LinearSolvers/AMReX_Smoother_MV.H new file mode 100644 index 00000000000..fa65a385c4b --- /dev/null +++ b/Src/LinearSolvers/AMReX_Smoother_MV.H @@ -0,0 +1,46 @@ +#ifndef AMREX_SMOOTHER_MV_H_ +#define AMREX_SMOOTHER_MV_H_ + +#include +#include + +namespace amrex { + +template +class JacobiSmoother +{ +public: + explicit JacobiSmoother (SpMatrix const* a_A) : m_A(a_A) {} + + int setNumIters (int a_niters) { return std::exchange(m_niters, a_niters); } + + void operator() (AlgVector& xvec, AlgVector const& bvec) + { + auto const& diag = m_A->diagonalVector(); + AlgVector Axvec(xvec.partition()); + xvec.setVal(0); + for (int iter = 0; iter < m_niters; ++iter) { + if (iter == 0) { + Axvec.setVal(0); + } else { + SpMV(Axvec, *m_A, xvec); + } + ForEach(xvec, Axvec, bvec, diag, + [=] AMREX_GPU_DEVICE (T& x, T const& ax, T const& b, T const& d) + { + if (d != T(0)) { + x += (b-ax)/d * T(2./3.); // weighted Jacobi + } + }); + } + Gpu::streamSynchronize(); + } + +private: + SpMatrix const* m_A; + int m_niters = 4; +}; + +} + +#endif diff --git a/Src/LinearSolvers/AMReX_SpMV.H b/Src/LinearSolvers/AMReX_SpMV.H new file mode 100644 index 00000000000..7141dcf676e --- /dev/null +++ b/Src/LinearSolvers/AMReX_SpMV.H @@ -0,0 +1,192 @@ +#ifndef AMREX_SPMV_H_ +#define AMREX_SPMV_H_ +#include + +#include +#include +#include + +#if defined(AMREX_USE_CUDA) +# include +#elif defined(AMREX_USE_HIP) +# include +#elif defined(AMREX_USE_DPCPP) +# include +#endif + +namespace amrex { + +template +void SpMV (AlgVector& y, SpMatrix const& A, AlgVector const& x) +{ + // xxxxx TODOL We might want to cache the cusparse and rocsparse handles + + // xxxxx TODO: let's assume it's square matrix for now. + AMREX_ALWAYS_ASSERT(x.partition() == y.partition() && + x.partition() == A.partition()); + + const_cast&>(A).startComm(x); + + T * AMREX_RESTRICT py = y.data(); + T const* AMREX_RESTRICT px = x.data(); + T const* AMREX_RESTRICT mat = A.data(); + auto const* AMREX_RESTRICT col = A.columnIndex(); + auto const* AMREX_RESTRICT row = A.rowOffset(); + +#if defined(AMREX_USE_GPU) + + Long const nrows = A.numLocalRows(); + Long const ncols = x.numLocalRows(); + Long const nnz = A.numLocalNonZero(); + +#if defined(AMREX_USE_CUDA) + + cusparseHandle_t handle; + cusparseCreate(&handle); + cusparseSetStream(handle, Gpu::gpuStream()); + + cudaDataType data_type; + if constexpr (std::is_same_v) { + data_type = CUDA_R_32F; + } else if constexpr (std::is_same_v) { + data_type = CUDA_R_64F; + } else if constexpr (std::is_same_v>) { + data_type = CUDA_C_32F; + } else if constexpr (std::is_same_v>) { + data_type = CUDA_C_64F; + } else { + amrex::Abort("SpMV: unsupported data type"); + } + + cusparseIndexType_t index_type = CUSPARSE_INDEX_64I; + + cusparseSpMatDescr_t mat_descr; + cusparseCreateCsr(&mat_descr, nrows, ncols, nnz, (void*)row, (void*)col, (void*)mat, + index_type, index_type, CUSPARSE_INDEX_BASE_ZERO, data_type); + + cusparseDnVecDescr_t x_descr; + cusparseCreateDnVec(&x_descr, ncols, (void*)px, data_type); + + cusparseDnVecDescr_t y_descr; + cusparseCreateDnVec(&y_descr, nrows, (void*)py, data_type); + + T alpha = T(1); + T beta = T(0); + + std::size_t buffer_size; + cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_descr, x_descr, + &beta, y_descr, data_type, CUSPARSE_SPMV_ALG_DEFAULT, &buffer_size); + + auto* pbuffer = (void*)The_Arena()->alloc(buffer_size); + + cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_descr, x_descr, + &beta, y_descr, data_type, CUSPARSE_SPMV_ALG_DEFAULT, pbuffer); + + Gpu::streamSynchronize(); + + cusparseDestroySpMat(mat_descr); + cusparseDestroyDnVec(x_descr); + cusparseDestroyDnVec(y_descr); + cusparseDestroy(handle); + The_Arena()->free(pbuffer); + +#elif defined(AMREX_USE_HIP) + + rocsparse_handle handle; + rocsparse_create_handle(&handle); + rocsparse_set_stream(handle, Gpu::gpuStream()); + + rocsparse_datatype data_type; + if constexpr (std::is_same_v) { + data_type = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + data_type = rocsparse_datatype_f64_r; + } else if constexpr (std::is_same_v>) { + data_type = rocsparse_datatype_f32_c; + } else if constexpr (std::is_same_v>) { + data_type = rocsparse_datatype_f64_c; + } else { + amrex::Abort("SpMV: unsupported data type"); + } + + rocsparse_indextype index_type = rocsparse_indextype_i64; + + rocsparse_spmat_descr mat_descr; + rocsparse_create_csr_descr(&mat_descr, nrows, ncols, nnz, (void*)row, (void*)col, + (void*)mat, index_type, index_type, + rocsparse_index_base_zero, data_type); + + rocsparse_dnvec_descr x_descr; + rocsparse_create_dnvec_descr(&x_descr, ncols, (void*)px, data_type); + + rocsparse_dnvec_descr y_descr; + rocsparse_create_dnvec_descr(&y_descr, nrows, (void*)py, data_type); + + T alpha = T(1.0); + T beta = T(0.0); + + std::size_t buffer_size; + rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr, + &beta, y_descr, data_type, rocsparse_spmv_alg_default, +#if (HIP_VERSION_MAJOR >= 6) + rocsparse_spmv_stage_buffer_size, +#endif + &buffer_size, nullptr); + + void* pbuffer = (void*)The_Arena()->alloc(buffer_size); + +#if (HIP_VERSION_MAJOR >= 6) + rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr, + &beta, y_descr, data_type, rocsparse_spmv_alg_default, + rocsparse_spmv_stage_preprocess, &buffer_size, pbuffer); +#endif + + rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr, + &beta, y_descr, data_type, rocsparse_spmv_alg_default, +#if (HIP_VERSION_MAJOR >= 6) + rocsparse_spmv_stage_compute, +#endif + &buffer_size, pbuffer); + + Gpu::streamSynchronize(); + + rocsparse_destroy_spmat_descr(mat_descr); + rocsparse_destroy_dnvec_descr(x_descr); + rocsparse_destroy_dnvec_descr(y_descr); + rocsparse_destroy_handle(handle); + The_Arena()->free(pbuffer); + +#elif defined(AMREX_USE_DPCPP) + + mkl::sparse::matrix_handle_t handle{}; + mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle, nrows, ncols, mkl::index_base::zero, + (Long*)row, (Long*)col, (T*)mat); + mkl::sparse::gemv(Gpu::Device::streamQueue(), mkl::transpose::nontrans, + T(1), handle, px, T(0), py); + +#endif + + AMREX_GPU_ERROR_CHECK(); + +#else + + Long const ny = y.numLocalRows(); + for (Long i = 0; i < ny; ++i) { + T r = 0; +#ifdef AMREX_USE_OMP +#pragma omp parallel for reduction(+:r) +#endif + for (Long j = row[i]; j < row[i+1]; ++j) { + r += mat[j] * px[col[j]]; + } + py[i] = r; + } + +#endif + + const_cast&>(A).finishComm(y); +} + +} + +#endif diff --git a/Src/LinearSolvers/AMReX_SpMatrix.H b/Src/LinearSolvers/AMReX_SpMatrix.H new file mode 100644 index 00000000000..a380fffda48 --- /dev/null +++ b/Src/LinearSolvers/AMReX_SpMatrix.H @@ -0,0 +1,645 @@ +#ifndef AMREX_SP_MATRIX_H_ +#define AMREX_SP_MATRIX_H_ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace amrex { + +template class Allocator = DefaultAllocator> +class SpMatrix +{ +public: + using value_type = T; + + SpMatrix () = default; + + SpMatrix (AlgPartition partition, int nnz); + + SpMatrix (SpMatrix const&) = delete; + SpMatrix& operator= (SpMatrix const&) = delete; + + SpMatrix (SpMatrix &&) = default; + SpMatrix& operator= (SpMatrix &&) = default; + + ~SpMatrix () = default; + + void define (AlgPartition partition, int nnz); + + [[nodiscard]] AlgPartition const& partition () const { return m_partition; } + + [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; } + [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); } + [[nodiscard]] Long numLocalNonZero () const { return m_data.nnz; } + + //! Inclusive global index begin. + [[nodiscard]] Long globalRowBegin () const { return m_row_begin; } + //! Exclusive global index end. + [[nodiscard]] Long globalRowEnd () const { return m_row_end; } + + [[nodiscard]] T const* data () const { return m_data.mat.data(); } + [[nodiscard]] T * data () { return m_data.mat.data(); } + [[nodiscard]] Long const* columnIndex () const { return m_data.col_index.data(); } + [[nodiscard]] Long * columnIndex () { return m_data.col_index.data(); } + [[nodiscard]] Long const* rowOffset () const { return m_data.row_offset.data(); } + [[nodiscard]] Long * rowOffset () { return m_data.row_offset.data(); } + + void printToFile (std::string const& file) const; + + template + void setVal (F const& f); + + [[nodiscard]] AlgVector const& diagonalVector () const; + + template friend void SpMV(AlgVector& y, SpMatrix const& A, AlgVector const& x); + + //! Private function, but public for cuda + void define_doit (int nnz); + +#ifdef AMREX_USE_MPI + //! Private function, but public for cuda + void prepare_comm (); + void pack_buffer (AlgVector const& v); + void unpack_buffer (AlgVector& v); +#endif + +private: + + void startComm (AlgVector const& x); + void finishComm (AlgVector& y); + + template using DVec = PODVector >; + + template