diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh
index 6b69c5433a..527379e7e8 100755
--- a/.github/workflows/dependencies/dependencies_hip.sh
+++ b/.github/workflows/dependencies/dependencies_hip.sh
@@ -58,7 +58,8 @@ sudo apt-get install -y --no-install-recommends \
     rocprofiler-dev \
     rocrand-dev \
     rocfft-dev \
-    rocprim-dev
+    rocprim-dev \
+    rocsparse-dev
 
 # hiprand-dev is a new package that does not exist in old versions
 sudo apt-get install -y --no-install-recommends hiprand-dev || true
diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh
index 2578bd33fe..14bae699d7 100755
--- a/.github/workflows/dependencies/dependencies_nvcc.sh
+++ b/.github/workflows/dependencies/dependencies_nvcc.sh
@@ -36,5 +36,6 @@ sudo apt-get install -y \
     cuda-nvml-dev-$VERSION_DASHED \
     cuda-nvtx-$VERSION_DASHED \
     libcufft-dev-$VERSION_DASHED \
-    libcurand-dev-$VERSION_DASHED
+    libcurand-dev-$VERSION_DASHED \
+    libcusparse-dev-$VERSION_DASHED
 sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda
diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H
index ee2471d36c..7f58a6d633 100644
--- a/Src/Base/AMReX_TableData.H
+++ b/Src/Base/AMReX_TableData.H
@@ -15,12 +15,12 @@
 
 namespace amrex {
 
-template <typename T>
+template <typename T, typename IDX = int>
 struct Table1D
 {
     T* AMREX_RESTRICT p = nullptr;
-    int begin = 1;
-    int end = 0;
+    IDX begin = 1;
+    IDX end = 0;
 
     constexpr Table1D () noexcept = default;
 
@@ -33,7 +33,7 @@ struct Table1D
         {}
 
     AMREX_GPU_HOST_DEVICE
-    constexpr Table1D (T* a_p, int a_begin, int a_end) noexcept
+    constexpr Table1D (T* a_p, IDX a_begin, IDX a_end) noexcept
        : p(a_p),
          begin(a_begin),
          end(a_end)
@@ -44,7 +44,7 @@ struct Table1D
 
     template <class U=T, std::enable_if_t<!std::is_void<U>::value,int> = 0>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    U& operator() (int i) const noexcept {
+    U& operator() (IDX i) const noexcept {
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
         index_assert(i);
 #endif
@@ -53,14 +53,30 @@ struct Table1D
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
     AMREX_GPU_HOST_DEVICE inline
-    void index_assert (int i) const
+    void index_assert (IDX i) const
     {
         if (i < begin || i >= end) {
-            AMREX_IF_ON_DEVICE((
-                AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n",
-                                    i, begin, end-1);
-                amrex::Abort();
-            ))
+            if constexpr (std::is_same_v<IDX,int>) {
+                AMREX_IF_ON_DEVICE((
+                    AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n",
+                                        i, begin, end-1);
+                    amrex::Abort();
+                ))
+            } else if constexpr (std::is_same_v<IDX,long>) {
+                AMREX_IF_ON_DEVICE((
+                    AMREX_DEVICE_PRINTF(" (%ld) is out of bound (%ld:%ld)\n",
+                                        i, begin, end-1);
+                    amrex::Abort();
+                ))
+            } else if constexpr (std::is_same_v<IDX,long long>) {
+                AMREX_IF_ON_DEVICE((
+                    AMREX_DEVICE_PRINTF(" (%lld) is out of bound (%lld:%lld)\n",
+                                        i, begin, end-1);
+                    amrex::Abort();
+                ))
+            } else {
+                AMREX_IF_ON_DEVICE(( amrex::Abort(" Out of bound\n"); ))
+            }
             AMREX_IF_ON_HOST((
                 std::stringstream ss;
                 ss << " (" << i << ") is out of bound ("
                    << begin << ":" << end-1 << ")";
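Note on the `Table1D` change above: making the index type a template parameter (defaulting to `int`, so existing callers are unaffected) lets a table address index ranges beyond `INT_MAX`, which the linear-solver code below needs for 64-bit global row indices. A minimal host-side sketch of the new capability (the function and variable names here are illustrative, not part of the patch):

```cpp
#include <vector>
#include <AMReX_TableData.H>

void table1d_demo ()
{
    amrex::Long begin = amrex::Long(3000000000); // beyond INT_MAX
    amrex::Long end   = begin + 4;               // exclusive upper bound
    std::vector<double> local(end - begin, 0.0);
    // Table1D::operator() subtracts `begin`, so t(i) aliases local[i - begin].
    amrex::Table1D<double, amrex::Long> t{local.data(), begin, end};
    t(begin + 2) = 7.0; // writes local[2]
}
```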
diff --git a/Src/LinearSolvers/AMReX_AlgPartition.H b/Src/LinearSolvers/AMReX_AlgPartition.H
new file mode 100644
index 0000000000..d10284cb80
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_AlgPartition.H
@@ -0,0 +1,58 @@
+#ifndef AMREX_ALG_PARTITION_H_
+#define AMREX_ALG_PARTITION_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_INT.H>
+#include <AMReX_ParallelDescriptor.H>
+#include <AMReX_Vector.H>
+
+#include <memory>
+
+namespace amrex {
+
+class AlgPartition
+{
+public:
+    AlgPartition ();
+    explicit AlgPartition (Long global_size);
+    explicit AlgPartition (Vector<Long> const& rows);
+    explicit AlgPartition (Vector<Long>&& rows) noexcept;
+
+    void define (Long global_size);
+    void define (Vector<Long> const& rows);
+    void define (Vector<Long>&& rows);
+
+    [[nodiscard]] bool empty () const { return m_ref->m_row.empty(); }
+
+    [[nodiscard]] Long operator[] (int i) const { return m_ref->m_row[i]; }
+    [[nodiscard]] Long numGlobalRows () const { return m_ref->m_row.back(); }
+    [[nodiscard]] int numActiveProcs () const { return m_ref->m_n_active_procs; }
+
+    [[nodiscard]] Vector<Long> const& dataVector () const { return m_ref->m_row; }
+
+    [[nodiscard]] bool operator== (AlgPartition const& rhs) const noexcept;
+    [[nodiscard]] bool operator!= (AlgPartition const& rhs) const noexcept;
+
+private:
+    struct Ref
+    {
+        friend class AlgPartition;
+        Ref () = default;
+        explicit Ref (Long global_size);
+        explicit Ref (Vector<Long> const& rows);
+        explicit Ref (Vector<Long>&& rows);
+        void define (Long global_size);
+        void define (Vector<Long> const& rows);
+        void define (Vector<Long>&& rows);
+        void update_n_active_procs ();
+
+        Vector<Long> m_row; // size: nprocs + 1
+        int m_n_active_procs = 0;
+    };
+
+    std::shared_ptr<Ref> m_ref;
+};
+
+}
+
+#endif
diff --git a/Src/LinearSolvers/AMReX_AlgPartition.cpp b/Src/LinearSolvers/AMReX_AlgPartition.cpp
new file mode 100644
index 0000000000..766b38d0e9
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_AlgPartition.cpp
@@ -0,0 +1,102 @@
+#include <AMReX_AlgPartition.H>
+
+namespace amrex {
+
+AlgPartition::AlgPartition ()
+    : m_ref(std::make_shared<Ref>())
+{}
+
+AlgPartition::AlgPartition (Long global_size)
+    : m_ref(std::make_shared<Ref>(global_size))
+{}
+
+AlgPartition::AlgPartition (Vector<Long> const& rows)
+    : m_ref(std::make_shared<Ref>(rows))
+{}
+
+AlgPartition::AlgPartition (Vector<Long>&& rows) noexcept
+    : m_ref(std::make_shared<Ref>(std::move(rows)))
+{}
+
+void AlgPartition::define (Long global_size)
+{
+    m_ref->define(global_size);
+}
+
+void AlgPartition::define (Vector<Long> const& rows)
+{
+    m_ref->define(rows);
+}
+
+void AlgPartition::define (Vector<Long>&& rows)
+{
+    m_ref->define(std::move(rows));
+}
+
+bool AlgPartition::operator== (AlgPartition const& rhs) const noexcept
+{
+    return m_ref == rhs.m_ref || m_ref->m_row == rhs.m_ref->m_row;
+}
+
+bool AlgPartition::operator!= (AlgPartition const& rhs) const noexcept
+{
+    return !operator==(rhs);
+}
+
+AlgPartition::Ref::Ref (Long global_size)
+{
+    define(global_size);
+}
+
+AlgPartition::Ref::Ref (Vector<Long> const& rows)
+    : m_row(rows)
+{
+    update_n_active_procs();
+}
+
+AlgPartition::Ref::Ref (Vector<Long>&& rows)
+    : m_row(std::move(rows))
+{
+    update_n_active_procs();
+}
+
+void AlgPartition::Ref::define (Long global_size)
+{
+    auto nprocs = Long(ParallelDescriptor::NProcs());
+    Long sz = global_size / nprocs;
+    Long extra = global_size - sz*nprocs;
+    m_row.resize(nprocs+1);
+    for (Long i = 0; i < nprocs; ++i) {
+        if (i < extra) {
+            m_row[i] = i*(sz+1);
+        } else {
+            m_row[i] = i*sz + extra;
+        }
+    }
+    m_row[nprocs] = global_size;
+
+    update_n_active_procs();
+}
+
+void AlgPartition::Ref::define (Vector<Long> const& rows)
+{
+    m_row = rows;
+    update_n_active_procs();
+}
+
+void AlgPartition::Ref::define (Vector<Long>&& rows)
+{
+    m_row = std::move(rows);
+    update_n_active_procs();
+}
+
+void AlgPartition::Ref::update_n_active_procs ()
+{
+    AMREX_ASSERT(m_row.size() == ParallelDescriptor::NProcs()+1);
+    m_n_active_procs = 0;
+    for (int i = 0, N = int(m_row.size())-1; i < N; ++i) {
+        if (m_row[i] < m_row[i+1]) { ++m_n_active_procs; }
+    }
+}
+
+}
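For reference, `AlgPartition::Ref::define(Long)` above produces the usual balanced 1-D row partition: every rank gets `global_size/nprocs` rows and the first `global_size%nprocs` ranks get one extra. A standalone sketch of the same computation (a hypothetical helper, not part of the patch):

```cpp
#include <vector>

// Rank i owns global rows [row[i], row[i+1]); row has nprocs+1 entries.
std::vector<long> partition_rows (long nrows, long nprocs)
{
    std::vector<long> row(nprocs+1);
    long sz    = nrows / nprocs;
    long extra = nrows % nprocs;
    for (long i = 0; i < nprocs; ++i) {
        row[i] = (i < extra) ? i*(sz+1) : i*sz + extra;
    }
    row[nprocs] = nrows;
    return row; // e.g. nrows=10, nprocs=4 -> {0,3,6,8,10}
}
```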
diff --git a/Src/LinearSolvers/AMReX_AlgVector.H b/Src/LinearSolvers/AMReX_AlgVector.H
new file mode 100644
index 0000000000..f2cb7b45b1
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_AlgVector.H
@@ -0,0 +1,453 @@
+#ifndef AMREX_ALG_VECTOR_H_
+#define AMREX_ALG_VECTOR_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_AlgPartition.H>
+#include <AMReX_FabArray.H>
+#include <AMReX_INT.H>
+#include <AMReX_LayoutData.H>
+#include <AMReX_TableData.H>
+
+#include <fstream>
+#include <string>
+#include <type_traits>
+
+namespace amrex {
+
+template <typename T, typename Allocator = DefaultAllocator<T> >
+class AlgVector
+{
+public:
+    using value_type = T;
+    using allocator_type = Allocator;
+
+    using Vec = PODVector<T,Allocator>;
+
+    AlgVector () = default;
+    explicit AlgVector (Long global_size);
+    explicit AlgVector (AlgPartition partition);
+
+    AlgVector (AlgVector const&) = delete;
+    AlgVector& operator= (AlgVector const&) = delete;
+
+    AlgVector (AlgVector &&) noexcept = default;
+    AlgVector& operator= (AlgVector &&) noexcept = default;
+
+    ~AlgVector () = default;
+
+    void define (Long global_size);
+    void define (AlgPartition partition);
+
+    [[nodiscard]] bool empty () const { return m_partition.empty(); }
+
+    [[nodiscard]] AlgPartition const& partition () const { return m_partition; }
+
+    [[nodiscard]] Long numLocalRows () const { return m_end - m_begin; }
+    [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
+
+    //! Inclusive global index begin.
+    [[nodiscard]] Long globalBegin () const { return m_begin; }
+    //! Exclusive global index end.
+    [[nodiscard]] Long globalEnd () const { return m_end; }
+
+    [[nodiscard]] T const* data () const { return m_data.data(); }
+    [[nodiscard]] T * data () { return m_data.data(); }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    Table1D<T const, Long> view () const {
+        return Table1D<T const, Long>{m_data.data(), m_begin, m_end};
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    Table1D<T const, Long> const_view () const {
+        return Table1D<T const, Long>{m_data.data(), m_begin, m_end};
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    Table1D<T, Long> view () {
+        return Table1D<T, Long>{m_data.data(), m_begin, m_end};
+    }
+
+    void setVal (T val);
+    void setValAsync (T val);
+
+    void copy (AlgVector const& rhs);
+    void copyAsync (AlgVector const& rhs);
+
+    void plus (AlgVector const& rhs);
+    void plusAsync (AlgVector const& rhs);
+
+    void scale (T scale_factor);
+    void scaleAsync (T scale_factor);
+
+    [[nodiscard]] T sum (bool local = false) const;
+
+    [[nodiscard]] T norminf (bool local = false) const;
+    [[nodiscard]] T norm2 (bool local = false) const;
+
+    template <typename FAB,
+              std::enable_if_t<amrex::IsBaseFab<FAB>::value &&
+                               std::is_same_v<T,typename FAB::value_type>, int> = 0>
+    void copyFrom (FabArray<FAB> const& fa);
+
+    template <typename FAB,
+              std::enable_if_t<amrex::IsBaseFab<FAB>::value &&
+                               std::is_same_v<T,typename FAB::value_type>,int> = 0>
+    void copyTo (FabArray<FAB> & fa) const;
+
+    void printToFile (std::string const& file) const;
+
+private:
+    AlgPartition m_partition;
+    Long m_begin = 0;
+    Long m_end = 0;
+    Vec m_data;
+};
+
+template <typename T, typename Allocator>
+AlgVector<T,Allocator>::AlgVector (Long global_size)
+    : m_partition(global_size),
+      m_begin(m_partition[ParallelDescriptor::MyProc()]),
+      m_end(m_partition[ParallelDescriptor::MyProc()+1]),
+      m_data(m_end-m_begin)
+{
+    static_assert(std::is_floating_point<T>::value, "AlgVector is for floating point type only");
+}
+
+template <typename T, typename Allocator>
+AlgVector<T,Allocator>::AlgVector (AlgPartition partition)
+    : m_partition(std::move(partition)),
+      m_begin(m_partition[ParallelDescriptor::MyProc()]),
+      m_end(m_partition[ParallelDescriptor::MyProc()+1]),
+      m_data(m_end-m_begin)
+{
+    static_assert(std::is_floating_point<T>::value, "AlgVector is for floating point type only");
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::define (Long global_size)
+{
+    m_partition.define(global_size);
+    m_begin = m_partition[ParallelDescriptor::MyProc()];
+    m_end = m_partition[ParallelDescriptor::MyProc()+1];
+    m_data.resize(m_end-m_begin);
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::define (AlgPartition partition)
+{
+    m_partition = std::move(partition);
+    m_begin = m_partition[ParallelDescriptor::MyProc()];
+    m_end = m_partition[ParallelDescriptor::MyProc()+1];
+    m_data.resize(m_end-m_begin);
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::setVal (T val)
+{
+    setValAsync(val);
+    Gpu::streamSynchronize();
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::setValAsync (T val)
+{
+    Long n = m_data.size();
+    T* p = m_data.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { p[i] = val; });
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::copy (AlgVector<T,Allocator> const& rhs)
+{
+    copyAsync(rhs);
+    Gpu::streamSynchronize();
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::copyAsync (AlgVector<T,Allocator> const& rhs)
+{
+    Long n = m_data.size();
+    AMREX_ASSERT(m_data.size() == rhs.m_data.size());
+    T* dst = m_data.data();
+    T const* src = rhs.data();
+#ifdef AMREX_USE_GPU
+    Gpu::dtod_memcpy_async(dst, src, n*sizeof(T));
+#else
+    std::memcpy(dst, src, n*sizeof(T));
+#endif
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::plus (AlgVector<T,Allocator> const& rhs)
+{
+    plusAsync(rhs);
+    Gpu::streamSynchronize();
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::plusAsync (AlgVector<T,Allocator> const& rhs)
+{
+    Long n = m_data.size();
+    AMREX_ASSERT(m_data.size() == rhs.m_data.size());
+    T* dst = m_data.data();
+    T const* src = rhs.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { dst[i] += src[i]; });
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::scale (T scale_factor)
+{
+    scaleAsync(scale_factor);
+    Gpu::streamSynchronize();
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::scaleAsync (T scale_factor)
+{
+    Long n = m_data.size();
+    T* p = m_data.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept { p[i] *= scale_factor; });
+}
+
+template <typename T, typename Allocator>
+T AlgVector<T,Allocator>::sum (bool local) const
+{
+    Long n = m_data.size();
+    T const* p = m_data.data();
+    T r = Reduce::Sum<T>(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        return p[i];
+    });
+    if (!local) {
+        ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub());
+    }
+    return r;
+}
+
+template <typename T, typename Allocator>
+T AlgVector<T,Allocator>::norminf (bool local) const
+{
+    Long n = m_data.size();
+    T const* p = m_data.data();
+    T r = Reduce::Max<T>(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        return amrex::Math::abs(p[i]);
+    });
+    if (!local) {
+        ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub());
+    }
+    return r;
+}
+
+template <typename T, typename Allocator>
+T AlgVector<T,Allocator>::norm2 (bool local) const
+{
+    Long n = m_data.size();
+    T const* p = m_data.data();
+    T r = Reduce::Sum<T>(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        return p[i]*p[i];
+    });
+    if (!local) {
+        ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub());
+    }
+    return std::sqrt(r);
+}
+
+template <typename T, typename Allocator>
+template <typename FAB,
+          std::enable_if_t<amrex::IsBaseFab<FAB>::value &&
+                           std::is_same_v<T,typename FAB::value_type>, int> >
+void AlgVector<T,Allocator>::copyFrom (FabArray<FAB> const& fa)
+{
+    AMREX_ASSERT(fa.is_cell_centered());
+
+    LayoutData<T*> dptrs(fa.boxArray(), fa.DistributionMap());
+    T* p = m_data.data();
+    for (MFIter mfi(fa); mfi.isValid(); ++mfi) {
+        dptrs[mfi] = p;
+        p += mfi.validbox().numPts();
+    }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+#pragma omp parallel
+#endif
+    for (MFIter mfi(fa); mfi.isValid(); ++mfi) {
+        fa[mfi].template copyToMem<RunOn::Device>(mfi.validbox(), 0, 1, dptrs[mfi]);
+    }
+}
+
+template <typename T, typename Allocator>
+template <typename FAB,
+          std::enable_if_t<amrex::IsBaseFab<FAB>::value &&
+                           std::is_same_v<T,typename FAB::value_type>, int> >
+void AlgVector<T,Allocator>::copyTo (FabArray<FAB> & fa) const
+{
+    AMREX_ASSERT(fa.is_cell_centered());
+
+    LayoutData<T const*> dptrs(fa.boxArray(), fa.DistributionMap());
+    T const* p = m_data.data();
+    for (MFIter mfi(fa); mfi.isValid(); ++mfi) {
+        dptrs[mfi] = p;
+        p += mfi.validbox().numPts();
+    }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+#pragma omp parallel
+#endif
+    for (MFIter mfi(fa); mfi.isValid(); ++mfi) {
+        fa[mfi].template copyFromMem<RunOn::Device>(mfi.validbox(), 0, 1, dptrs[mfi]);
+    }
+}
+
+template <typename T, typename Allocator>
+void AlgVector<T,Allocator>::printToFile (std::string const& file) const
+{
+    std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc()));
+    ofs << m_begin << " " << m_end << "\n";
+#ifdef AMREX_USE_GPU
+    Gpu::PinnedVector<T> hv(m_data.size());
+    Gpu::dtoh_memcpy_async(hv.data(), m_data.data(), m_data.size()*sizeof(T));
+    Gpu::streamSynchronize();
+    T const* p = hv.data();
+#else
+    T const* p = m_data.data();
+#endif
+    for (Long i = 0, N = m_data.size(); i < N; ++i) {
+        ofs << i+m_begin << " " << p[i] << "\n";
+    }
+}
+
+template <typename V, typename Enable = void> struct IsAlgVector : std::false_type {};
+//
+template <typename V>
+struct IsAlgVector<V, std::enable_if_t<std::is_same_v<AlgVector<typename V::value_type,
+                                                                typename V::allocator_type>,
+                                                      V> > >
+    : std::true_type {};
+
+template <typename V1, typename F>
+std::enable_if_t<IsAlgVector<std::decay_t<V1> >::value>
+ForEach (V1 & x, F const& f)
+{
+    Long n = x.numLocalRows();
+    auto* px = x.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        f(px[i]);
+    });
+}
+
+template <typename V1, typename V2, typename F>
+std::enable_if_t<IsAlgVector<std::decay_t<V1> >::value &&
+                 IsAlgVector<std::decay_t<V2> >::value>
+ForEach (V1 & x, V2 & y, F const& f)
+{
+    AMREX_ASSERT(x.numLocalRows() == y.numLocalRows());
+    Long n = x.numLocalRows();
+    auto* AMREX_RESTRICT px = x.data();
+    auto* AMREX_RESTRICT py = y.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        f(px[i], py[i]);
+    });
+}
+
+template <typename V1, typename V2, typename V3, typename F>
+std::enable_if_t<IsAlgVector<std::decay_t<V1> >::value &&
+                 IsAlgVector<std::decay_t<V2> >::value &&
+                 IsAlgVector<std::decay_t<V3> >::value>
+ForEach (V1 & x, V2 & y, V3 & z, F const& f)
+{
+    AMREX_ASSERT(x.numLocalRows() == y.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == z.numLocalRows());
+    Long n = x.numLocalRows();
+    auto* AMREX_RESTRICT px = x.data();
+    auto* AMREX_RESTRICT py = y.data();
+    auto* AMREX_RESTRICT pz = z.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        f(px[i], py[i], pz[i]);
+    });
+}
+
+template <typename V1, typename V2, typename V3, typename V4, typename F>
+std::enable_if_t<IsAlgVector<std::decay_t<V1> >::value &&
+                 IsAlgVector<std::decay_t<V2> >::value &&
+                 IsAlgVector<std::decay_t<V3> >::value &&
+                 IsAlgVector<std::decay_t<V4> >::value>
+ForEach (V1 & x, V2 & y, V3 & z, V4 & a, F const& f)
+{
+    AMREX_ASSERT(x.numLocalRows() == y.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == z.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == a.numLocalRows());
+    Long n = x.numLocalRows();
+    auto* AMREX_RESTRICT px = x.data();
+    auto* AMREX_RESTRICT py = y.data();
+    auto* AMREX_RESTRICT pz = z.data();
+    auto* AMREX_RESTRICT pa = a.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        f(px[i], py[i], pz[i], pa[i]);
+    });
+}
+
+template <typename V1, typename V2, typename V3, typename V4, typename V5, typename F>
+std::enable_if_t<IsAlgVector<std::decay_t<V1> >::value &&
+                 IsAlgVector<std::decay_t<V2> >::value &&
+                 IsAlgVector<std::decay_t<V3> >::value &&
+                 IsAlgVector<std::decay_t<V4> >::value &&
+                 IsAlgVector<std::decay_t<V5> >::value>
+ForEach (V1 & x, V2 & y, V3 & z, V4 & a, V5 & b, F const& f)
+{
+    AMREX_ASSERT(x.numLocalRows() == y.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == z.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == a.numLocalRows());
+    AMREX_ASSERT(x.numLocalRows() == b.numLocalRows());
+    Long n = x.numLocalRows();
+    auto* AMREX_RESTRICT px = x.data();
+    auto* AMREX_RESTRICT py = y.data();
+    auto* AMREX_RESTRICT pz = z.data();
+    auto* AMREX_RESTRICT pa = a.data();
+    auto* AMREX_RESTRICT pb = b.data();
+    ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        f(px[i], py[i], pz[i], pa[i], pb[i]);
+    });
+}
+
+template <typename T, typename Allocator>
+T Dot (AlgVector<T,Allocator> const& x, AlgVector<T,Allocator> const& y, bool local = false)
+{
+    AMREX_ASSERT(x.numLocalRows() == y.numLocalRows());
+    Long n = x.numLocalRows();
+    auto const* px = x.data();
+    auto const* py = y.data();
+    T r = Reduce::Sum<T>(n, [=] AMREX_GPU_DEVICE (Long i) noexcept
+    {
+        return px[i] * py[i];
+    });
+    if (!local) {
+        ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub());
+    }
+    return r;
+}
+
+template <typename T, typename Allocator>
+void Axpy (AlgVector<T,Allocator>& y, T a, AlgVector<T,Allocator> const& x, bool async = false)
+{
+    ForEach(y, x, [=] AMREX_GPU_DEVICE (T& yi, T const& xi) { yi += a*xi; });
+    if (!async) { Gpu::streamSynchronize(); }
+}
+
+template <typename T, typename Allocator>
+void LinComb (AlgVector<T,Allocator>& y,
+              T a, AlgVector<T,Allocator> const& xa,
+              T b, AlgVector<T,Allocator> const& xb, bool async = false)
+{
+    ForEach(y, xa, xb, [=] AMREX_GPU_DEVICE (T& yi, T const& xai, T const& xbi) {
+        yi = a*xai + b*xbi;
+    });
+    if (!async) { Gpu::streamSynchronize(); }
+}
+
+}
+
+#endif
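The variadic `ForEach` overloads above fuse element-wise updates over up to five vectors into a single `ParallelFor` launch, and `Dot` does a local reduction followed by an all-reduce unless `local` is set. A usage sketch (the vector names are hypothetical):

```cpp
void foreach_demo ()
{
    amrex::AlgVector<amrex::Real> x(1000);          // balanced partition of 1000 rows
    amrex::AlgVector<amrex::Real> y(x.partition()); // same layout as x
    x.setVal(1.0);
    y.setVal(2.0);
    // y += 0.5*x in one fused kernel; ForEach launches asynchronously.
    amrex::ForEach(y, x, [=] AMREX_GPU_DEVICE (amrex::Real& yi, amrex::Real const& xi)
    {
        yi += amrex::Real(0.5)*xi;
    });
    amrex::Gpu::streamSynchronize();
    amrex::Real d = amrex::Dot(x, y); // global dot product across MPI ranks
    amrex::ignore_unused(d);
}
```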
diff --git a/Src/LinearSolvers/AMReX_Algebra.H b/Src/LinearSolvers/AMReX_Algebra.H
new file mode 100644
index 0000000000..86c1d11f86
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_Algebra.H
@@ -0,0 +1,9 @@
+#ifndef AMREX_ALGEBRA_H_
+#define AMREX_ALGEBRA_H_
+
+#include <AMReX_AlgPartition.H>
+#include <AMReX_AlgVector.H>
+#include <AMReX_SpMatrix.H>
+#include <AMReX_SpMV.H>
+
+#endif
diff --git a/Src/LinearSolvers/AMReX_GMRES_MV.H b/Src/LinearSolvers/AMReX_GMRES_MV.H
new file mode 100644
index 0000000000..22eb2ad395
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_GMRES_MV.H
@@ -0,0 +1,160 @@
+#ifndef AMREX_GMRES_MV_H_
+#define AMREX_GMRES_MV_H_
+
+#include <AMReX_AlgVector.H>
+#include <AMReX_GMRES.H>
+#include <AMReX_SpMatrix.H>
+#include <AMReX_SpMV.H>
+
+namespace amrex {
+
+template <typename T>
+class GMRES_MV
+{
+public:
+    using RT = Real; // double or float
+    using VEC = AlgVector<T>;
+    using MAT = SpMatrix<T>;
+    using GM = amrex::GMRES<VEC,GMRES_MV<T>>;
+    using PC = std::function<void(VEC&,VEC const&)>;
+
+    GMRES_MV (MAT const* a_mat);
+
+    void setPrecond (PC a_pc) { m_pc = std::move(a_pc); }
+
+    /**
+     * \brief Solve the linear system
+     *
+     * \param a_sol unknowns, i.e., x in A x = b.
+     * \param a_rhs RHS, i.e., b in A x = b.
+     * \param a_tol_rel relative tolerance.
+     * \param a_tol_abs absolute tolerance.
+     */
+    void solve (VEC& a_sol, VEC const& a_rhs, T a_tol_rel, T a_tol_abs);
+
+    //! Sets verbosity.
+    void setVerbose (int v) { m_gmres.setVerbose(v); }
+
+    //! Get the GMRES object.
+    GM& getGMRES () { return m_gmres; }
+
+    //! Make a RHS vector based on the matrix partition
+    [[nodiscard]] VEC makeVecRHS () const;
+
+    //! Make a LHS vector based on the matrix partition
+    [[nodiscard]] VEC makeVecLHS () const;
+
+    static T norm2 (VEC const& vec);
+
+    static void scale (VEC& vec, T scale_factor);
+
+    static T dotProduct (VEC const& vec1, VEC const& vec2);
+
+    //! lhs = 0
+    static void setToZero (VEC& lhs);
+
+    //! lhs = rhs
+    static void assign (VEC& lhs, VEC const& rhs);
+
+    //! lhs += a*rhs
+    static void increment (VEC& lhs, VEC const& rhs, T a);
+
+    //! lhs = a*rhs_a + b*rhs_b
+    static void linComb (VEC& lhs, T a, VEC const& rhs_a, T b, VEC const& rhs_b);
+
+    //! lhs = L(rhs)
+    void apply (VEC& lhs, VEC& rhs) const;
+
+    void precond (VEC& lhs, VEC const& rhs) const;
+
+private:
+    GM m_gmres;
+    MAT const* m_mat = nullptr;
+    PC m_pc;
+};
+
+template <typename T>
+GMRES_MV<T>::GMRES_MV (MAT const* a_mat)
+    : m_mat(a_mat)
+{
+    m_gmres.define(*this);
+}
+
+template <typename T>
+void GMRES_MV<T>::solve (VEC& a_sol, VEC const& a_rhs, T a_tol_rel, T a_tol_abs)
+{
+    m_gmres.solve(a_sol, a_rhs, a_tol_rel, a_tol_abs);
+}
+
+template <typename T>
+auto GMRES_MV<T>::makeVecRHS () const -> VEC
+{
+    return VEC(m_mat->partition());
+}
+
+template <typename T>
+auto GMRES_MV<T>::makeVecLHS () const -> VEC
+{
+    return VEC(m_mat->partition());
+}
+
+template <typename T>
+T GMRES_MV<T>::norm2 (VEC const& vec)
+{
+    return vec.norm2();
+}
+
+template <typename T>
+void GMRES_MV<T>::scale (VEC& vec, T scale_factor)
+{
+    vec.scaleAsync(scale_factor);
+}
+
+template <typename T>
+T GMRES_MV<T>::dotProduct (VEC const& vec1, VEC const& vec2)
+{
+    return amrex::Dot(vec1,vec2);
+}
+
+template <typename T>
+void GMRES_MV<T>::setToZero (VEC& lhs)
+{
+    lhs.setValAsync(0);
+}
+
+template <typename T>
+void GMRES_MV<T>::assign (VEC& lhs, VEC const& rhs)
+{
+    lhs.copyAsync(rhs);
+}
+
+template <typename T>
+void GMRES_MV<T>::increment (VEC& lhs, VEC const& rhs, T a)
+{
+    amrex::Axpy(lhs, a, rhs, true);
+}
+
+template <typename T>
+void GMRES_MV<T>::linComb (VEC& lhs, T a, VEC const& rhs_a, T b, VEC const& rhs_b)
+{
+    amrex::LinComb(lhs, a, rhs_a, b, rhs_b, true);
+}
+
+template <typename T>
+void GMRES_MV<T>::apply (VEC& lhs, VEC& rhs) const
+{
+    amrex::SpMV(lhs, *m_mat, rhs);
+}
+
+template <typename T>
+void GMRES_MV<T>::precond (VEC& lhs, VEC const& rhs) const
+{
+    if (m_pc) {
+        m_pc(lhs, rhs);
+    } else {
+        lhs.copyAsync(rhs);
+    }
+}
+
+}
+#endif
diff --git a/Src/LinearSolvers/AMReX_Smoother_MV.H b/Src/LinearSolvers/AMReX_Smoother_MV.H
new file mode 100644
index 0000000000..fa65a385c4
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_Smoother_MV.H
@@ -0,0 +1,46 @@
+#ifndef AMREX_SMOOTHER_MV_H_
+#define AMREX_SMOOTHER_MV_H_
+
+#include <AMReX_Algebra.H>
+#include <utility>
+
+namespace amrex {
+
+template <typename T>
+class JacobiSmoother
+{
+public:
+    explicit JacobiSmoother (SpMatrix<T> const* a_A) : m_A(a_A) {}
+
+    int setNumIters (int a_niters) { return std::exchange(m_niters, a_niters); }
+
+    void operator() (AlgVector<T>& xvec, AlgVector<T> const& bvec)
+    {
+        auto const& diag = m_A->diagonalVector();
+        AlgVector<T> Axvec(xvec.partition());
+        xvec.setVal(0);
+        for (int iter = 0; iter < m_niters; ++iter) {
+            if (iter == 0) {
+                Axvec.setVal(0);
+            } else {
+                SpMV(Axvec, *m_A, xvec);
+            }
+            ForEach(xvec, Axvec, bvec, diag,
+                    [=] AMREX_GPU_DEVICE (T& x, T const& ax, T const& b, T const& d)
+                    {
+                        if (d != T(0)) {
+                            x += (b-ax)/d * T(2./3.); // weighted Jacobi
+                        }
+                    });
+        }
+        Gpu::streamSynchronize();
+    }
+
+private:
+    SpMatrix<T> const* m_A;
+    int m_niters = 4;
+};
+
+}
+
+#endif
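Putting the pieces together: `GMRES_MV` supplies the vector/matrix adaptor that `amrex::GMRES` expects, and `JacobiSmoother` can serve as the preconditioner via `setPrecond`. A sketch, assuming `A` is an already-assembled `SpMatrix<amrex::Real>` (the setup is illustrative, not taken from the patch):

```cpp
void solve_demo (amrex::SpMatrix<amrex::Real>& A)
{
    amrex::GMRES_MV<amrex::Real> gmres(&A);
    amrex::JacobiSmoother<amrex::Real> jacobi(&A);
    gmres.setPrecond([&] (amrex::AlgVector<amrex::Real>& lhs,
                          amrex::AlgVector<amrex::Real> const& rhs)
    {
        jacobi(lhs, rhs); // a few weighted-Jacobi sweeps as the preconditioner
    });
    auto x = gmres.makeVecLHS(); // unknowns
    auto b = gmres.makeVecRHS(); // right-hand side
    b.setVal(1.0);
    x.setVal(0.0);
    gmres.solve(x, b, amrex::Real(1.e-8), amrex::Real(0.)); // rel/abs tolerances
}
```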
diff --git a/Src/LinearSolvers/AMReX_SpMV.H b/Src/LinearSolvers/AMReX_SpMV.H
new file mode 100644
index 0000000000..7141dcf676
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_SpMV.H
@@ -0,0 +1,192 @@
+#ifndef AMREX_SPMV_H_
+#define AMREX_SPMV_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_AlgVector.H>
+#include <AMReX_GpuUtility.H>
+#include <AMReX_SpMatrix.H>
+
+#if defined(AMREX_USE_CUDA)
+# include <cusparse.h>
+#elif defined(AMREX_USE_HIP)
+# include <rocsparse/rocsparse.h>
+#elif defined(AMREX_USE_DPCPP)
+# include <oneapi/mkl/spblas.hpp>
+#endif
+
+namespace amrex {
+
+template <typename T>
+void SpMV (AlgVector<T>& y, SpMatrix<T> const& A, AlgVector<T> const& x)
+{
+    // xxxxx TODO: We might want to cache the cusparse and rocsparse handles.
+
+    // xxxxx TODO: Let's assume it's a square matrix for now.
+    AMREX_ALWAYS_ASSERT(x.partition() == y.partition() &&
+                        x.partition() == A.partition());
+
+    const_cast<SpMatrix<T>&>(A).startComm(x);
+
+    T      * AMREX_RESTRICT py = y.data();
+    T const* AMREX_RESTRICT px = x.data();
+    T const* AMREX_RESTRICT mat = A.data();
+    auto const* AMREX_RESTRICT col = A.columnIndex();
+    auto const* AMREX_RESTRICT row = A.rowOffset();
+
+#if defined(AMREX_USE_GPU)
+
+    Long const nrows = A.numLocalRows();
+    Long const ncols = x.numLocalRows();
+    Long const nnz = A.numLocalNonZero();
+
+#if defined(AMREX_USE_CUDA)
+
+    cusparseHandle_t handle;
+    cusparseCreate(&handle);
+    cusparseSetStream(handle, Gpu::gpuStream());
+
+    cudaDataType data_type;
+    if constexpr (std::is_same_v<T,float>) {
+        data_type = CUDA_R_32F;
+    } else if constexpr (std::is_same_v<T,double>) {
+        data_type = CUDA_R_64F;
+    } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
+        data_type = CUDA_C_32F;
+    } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
+        data_type = CUDA_C_64F;
+    } else {
+        amrex::Abort("SpMV: unsupported data type");
+    }
+
+    cusparseIndexType_t index_type = CUSPARSE_INDEX_64I;
+
+    cusparseSpMatDescr_t mat_descr;
+    cusparseCreateCsr(&mat_descr, nrows, ncols, nnz, (void*)row, (void*)col, (void*)mat,
+                      index_type, index_type, CUSPARSE_INDEX_BASE_ZERO, data_type);
+
+    cusparseDnVecDescr_t x_descr;
+    cusparseCreateDnVec(&x_descr, ncols, (void*)px, data_type);
+
+    cusparseDnVecDescr_t y_descr;
+    cusparseCreateDnVec(&y_descr, nrows, (void*)py, data_type);
+
+    T alpha = T(1);
+    T beta = T(0);
+
+    std::size_t buffer_size;
+    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_descr, x_descr,
+                            &beta, y_descr, data_type, CUSPARSE_SPMV_ALG_DEFAULT, &buffer_size);
+
+    auto* pbuffer = (void*)The_Arena()->alloc(buffer_size);
+
+    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_descr, x_descr,
+                 &beta, y_descr, data_type, CUSPARSE_SPMV_ALG_DEFAULT, pbuffer);
+
+    Gpu::streamSynchronize();
+
+    cusparseDestroySpMat(mat_descr);
+    cusparseDestroyDnVec(x_descr);
+    cusparseDestroyDnVec(y_descr);
+    cusparseDestroy(handle);
+    The_Arena()->free(pbuffer);
+
+#elif defined(AMREX_USE_HIP)
+
+    rocsparse_handle handle;
+    rocsparse_create_handle(&handle);
+    rocsparse_set_stream(handle, Gpu::gpuStream());
+
+    rocsparse_datatype data_type;
+    if constexpr (std::is_same_v<T,float>) {
+        data_type = rocsparse_datatype_f32_r;
+    } else if constexpr (std::is_same_v<T,double>) {
+        data_type = rocsparse_datatype_f64_r;
+    } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
+        data_type = rocsparse_datatype_f32_c;
+    } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
+        data_type = rocsparse_datatype_f64_c;
+    } else {
+        amrex::Abort("SpMV: unsupported data type");
+    }
+
+    rocsparse_indextype index_type = rocsparse_indextype_i64;
+
+    rocsparse_spmat_descr mat_descr;
+    rocsparse_create_csr_descr(&mat_descr, nrows, ncols, nnz, (void*)row, (void*)col,
+                               (void*)mat, index_type, index_type,
+                               rocsparse_index_base_zero, data_type);
+
+    rocsparse_dnvec_descr x_descr;
+    rocsparse_create_dnvec_descr(&x_descr, ncols, (void*)px, data_type);
+
+    rocsparse_dnvec_descr y_descr;
+    rocsparse_create_dnvec_descr(&y_descr, nrows, (void*)py, data_type);
+
+    T alpha = T(1.0);
+    T beta = T(0.0);
+
+    std::size_t buffer_size;
+    rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr,
+                   &beta, y_descr, data_type, rocsparse_spmv_alg_default,
+#if (HIP_VERSION_MAJOR >= 6)
+                   rocsparse_spmv_stage_buffer_size,
+#endif
+                   &buffer_size, nullptr);
+
+    void* pbuffer = (void*)The_Arena()->alloc(buffer_size);
+
+#if (HIP_VERSION_MAJOR >= 6)
+    rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr,
+                   &beta, y_descr, data_type, rocsparse_spmv_alg_default,
+                   rocsparse_spmv_stage_preprocess, &buffer_size, pbuffer);
+#endif
+
+    rocsparse_spmv(handle, rocsparse_operation_none, &alpha, mat_descr, x_descr,
+                   &beta, y_descr, data_type, rocsparse_spmv_alg_default,
+#if (HIP_VERSION_MAJOR >= 6)
+                   rocsparse_spmv_stage_compute,
+#endif
+                   &buffer_size, pbuffer);
+
+    Gpu::streamSynchronize();
+
+    rocsparse_destroy_spmat_descr(mat_descr);
+    rocsparse_destroy_dnvec_descr(x_descr);
+    rocsparse_destroy_dnvec_descr(y_descr);
+    rocsparse_destroy_handle(handle);
+    The_Arena()->free(pbuffer);
+
+#elif defined(AMREX_USE_DPCPP)
+
+    mkl::sparse::matrix_handle_t handle{};
+    mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle, nrows, ncols,
+                              mkl::index_base::zero, (Long*)row, (Long*)col, (T*)mat);
+    mkl::sparse::gemv(Gpu::Device::streamQueue(), mkl::transpose::nontrans,
+                      T(1), handle, px, T(0), py);
+
+#endif
+
+    AMREX_GPU_ERROR_CHECK();
+
+#else
+
+    Long const ny = y.numLocalRows();
+#ifdef AMREX_USE_OMP
+#pragma omp parallel for
+#endif
+    for (Long i = 0; i < ny; ++i) {
+        T r = 0;
+        for (Long j = row[i]; j < row[i+1]; ++j) {
+            r += mat[j] * px[col[j]];
+        }
+        py[i] = r;
+    }
+
+#endif
+
+    const_cast<SpMatrix<T>&>(A).finishComm(y);
+}
+
+}
+
+#endif
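The CPU fallback at the end of `SpMV` is a plain CSR traversal; the same loop on a tiny matrix makes the `rowOffset`/`columnIndex` layout concrete (standalone sketch, values illustrative):

```cpp
#include <vector>

void csr_demo ()
{
    // A = [[4,1,0],
    //      [0,3,0],
    //      [2,0,5]] stored in CSR form:
    std::vector<double> mat = {4,1, 3, 2,5}; // nonzero values, row by row
    std::vector<long>   col = {0,1, 1, 0,2}; // column index of each value
    std::vector<long>   row = {0,2,3,5};     // row i spans mat[row[i]..row[i+1])
    std::vector<double> x = {1,1,1}, y(3);
    for (long i = 0; i < 3; ++i) {
        double r = 0;
        for (long j = row[i]; j < row[i+1]; ++j) { r += mat[j]*x[col[j]]; }
        y[i] = r; // y = {5, 3, 7}
    }
}
```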
diff --git a/Src/LinearSolvers/AMReX_SpMatrix.H b/Src/LinearSolvers/AMReX_SpMatrix.H
new file mode 100644
index 0000000000..a380fffda4
--- /dev/null
+++ b/Src/LinearSolvers/AMReX_SpMatrix.H
@@ -0,0 +1,645 @@
+#ifndef AMREX_SP_MATRIX_H_
+#define AMREX_SP_MATRIX_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_AlgPartition.H>
+#include <AMReX_AlgVector.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_Scan.H>
+#include <AMReX_TableData.H>
+
+#include <fstream>
+#include <string>
+#include <type_traits>
+
+namespace amrex {
+
+template <typename T, template<typename> class Allocator = DefaultAllocator>
+class SpMatrix
+{
+public:
+    using value_type = T;
+
+    SpMatrix () = default;
+
+    SpMatrix (AlgPartition partition, int nnz);
+
+    SpMatrix (SpMatrix const&) = delete;
+    SpMatrix& operator= (SpMatrix const&) = delete;
+
+    SpMatrix (SpMatrix &&) = default;
+    SpMatrix& operator= (SpMatrix &&) = default;
+
+    ~SpMatrix () = default;
+
+    void define (AlgPartition partition, int nnz);
+
+    [[nodiscard]] AlgPartition const& partition () const { return m_partition; }
+
+    [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; }
+    [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
+    [[nodiscard]] Long numLocalNonZero () const { return m_data.nnz; }
+
+    //! Inclusive global index begin.
+    [[nodiscard]] Long globalRowBegin () const { return m_row_begin; }
+    //! Exclusive global index end.
+    [[nodiscard]] Long globalRowEnd () const { return m_row_end; }
+
+    [[nodiscard]] T const* data () const { return m_data.mat.data(); }
+    [[nodiscard]] T * data () { return m_data.mat.data(); }
+    [[nodiscard]] Long const* columnIndex () const { return m_data.col_index.data(); }
+    [[nodiscard]] Long * columnIndex () { return m_data.col_index.data(); }
+    [[nodiscard]] Long const* rowOffset () const { return m_data.row_offset.data(); }
+    [[nodiscard]] Long * rowOffset () { return m_data.row_offset.data(); }
+
+    void printToFile (std::string const& file) const;
+
+    template <typename F>
+    void setVal (F const& f);
+
+    [[nodiscard]] AlgVector<T> const& diagonalVector () const;
+
+    template <typename U> friend void SpMV (AlgVector<U>& y, SpMatrix<U> const& A, AlgVector<U> const& x);
+
+    //! Private function, but public for cuda
+    void define_doit (int nnz);
+
+#ifdef AMREX_USE_MPI
+    //! Private function, but public for cuda
+    void prepare_comm ();
+    void pack_buffer (AlgVector<T> const& v);
+    void unpack_buffer (AlgVector<T>& v);
+#endif
+
+private:
+
+    void startComm (AlgVector<T> const& x);
+    void finishComm (AlgVector<T>& y);
+
+    template <typename U> using DVec = PODVector<U,Allocator<U> >;
+
+    template