From f236e57313c446db5b98ba6e18fb23b4068f8bd4 Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Tue, 10 May 2022 16:19:05 -0700 Subject: [PATCH 01/25] First draft. --- Src/Base/AMReX_BLProfiler.H | 38 ++++++++++++++- Src/Base/AMReX_BLProfiler.cpp | 85 ++++++++++++++++++++++++++++++--- Src/Base/AMReX_FabArray.H | 41 +++++++++++++++- Src/Base/AMReX_FabArrayCommI.H | 9 ++++ Src/Base/AMReX_TinyProfiler.H | 20 ++++++++ Src/Base/AMReX_TinyProfiler.cpp | 71 +++++++++++++++++++++++++++ 6 files changed, 254 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 8cde01a9ff4..24ce1c8dacc 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -161,6 +161,8 @@ class BLProfiler Real timeStamp; }; + + static std::map mFortProfs; // [fname, fortfunc] static Vector mFortProfsErrors; // [error string] static Vector mFortProfsInt; // [fortfuncindex] @@ -314,6 +316,24 @@ private: std::string regname; }; +class BLProfileSync { + + friend amrex::BLProfiler; + +public: + static void Sync() noexcept; + static void Sync(const std::string& name) noexcept; + static void Sync(const char* name) noexcept; + + static void StartSyncRegion() noexcept; + static void StartSyncRegion(const std::string& name) noexcept; + static void StartSyncRegion(const char* name) noexcept; + static void EndSyncRegion() noexcept; + +private: + static int sync_counter; + static int use_prof_syncs; +}; namespace BLProfilerUtils { void WriteHeader(std::ostream &os, const int colWidth, @@ -426,7 +446,11 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { bl_profiler_##rvname.start(); #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) bl_profiler_##rvname.stop(); \ amrex::BLProfiler::RegionStop(fname); - +#define BL_PROFILE_SYNC() amrex::BLProfileSync::Sync() +#define BL_PROFILE_SYNC_TIMED(fname) amrex::BLProfileSync::Sync() +#define BL_PROFILE_SYNC_START() amrex::BLProfileSync::StartSyncRegion() +#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::BLProfileSync::StartSyncRegion(fname) +#define BL_PROFILE_SYNC_STOP() amrex::BLProfileSync::EndSyncRegion() #define BL_PROFILE_TINY_FLUSH() #define BL_PROFILE_FLUSH() { amrex::BLProfiler::Finalize(true); } @@ -508,6 +532,13 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) + +#define BL_PROFILE_SYNC() amrex::TinyProfileSync::Sync() +#define BL_PROFILE_SYNC_TIMED(fname) amrex::TinyProfileSync::Sync() +#define BL_PROFILE_SYNC_START() amrex::TinyProfileSync::StartSyncRegion() +#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::TinyProfileSync::StartSyncRegion(fname) +#define BL_PROFILE_SYNC_STOP() amrex::TinyProfileSync::EndSyncRegion() + #define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true) #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() @@ -568,6 +599,11 @@ class BLProfiler #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) +#define BL_PROFILE_SYNC() +#define BL_PROFILE_SYNC_TIMED(fname) +#define BL_PROFILE_SYNC_START() +#define BL_PROFILE_SYNC_START_TIMED(fname) +#define BL_PROFILE_SYNC_STOP() #define BL_PROFILE_TINY_FLUSH() #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() diff --git a/Src/Base/AMReX_BLProfiler.cpp b/Src/Base/AMReX_BLProfiler.cpp index 516d0e11833..cc69bfc5784 100644 --- 
a/Src/Base/AMReX_BLProfiler.cpp +++ b/Src/Base/AMReX_BLProfiler.cpp @@ -271,13 +271,20 @@ void BLProfiler::Initialize() { void BLProfiler::InitParams() { - ParmParse pParse("blprofiler"); - pParse.queryAdd("prof_nfiles", nProfFiles); - pParse.queryAdd("prof_csflushsize", csFlushSize); - pParse.queryAdd("prof_traceflushsize", traceFlushSize); - pParse.queryAdd("prof_flushinterval", flushInterval); - pParse.queryAdd("prof_flushtimeinterval", flushTimeInterval); - pParse.queryAdd("prof_flushprint", bFlushPrint); + { + ParmParse pParse("blprofiler"); + pParse.queryAdd("prof_nfiles", nProfFiles); + pParse.queryAdd("prof_csflushsize", csFlushSize); + pParse.queryAdd("prof_traceflushsize", traceFlushSize); + pParse.queryAdd("prof_flushinterval", flushInterval); + pParse.queryAdd("prof_flushtimeinterval", flushTimeInterval); + pParse.queryAdd("prof_flushprint", bFlushPrint); + } + + { + ParmParse pParse("amrex"); + pParse.queryAdd("use_profiler_syncs", BLProfileSync::use_prof_syncs); + } } @@ -1546,6 +1553,70 @@ void BLProfiler::CommStats::UnFilter(CommFuncType cft) { } } +void +BLProfileSync::Sync () noexcept +{ + if (use_prof_syncs) + { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } +} + +void +BLProfileSync::Sync (const std::string& name) noexcept +{ + if (use_prof_syncs) { + TinyProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::Sync (const char* name) noexcept +{ + if (use_prof_syncs) { + BLProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::StartSyncRegion () noexcept +{ + if (use_prof_syncs) { + if (sync_counter == 0) { + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const std::string& name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BLProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const char* name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BLProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::EndSyncRegion () noexcept { + if (use_prof_syncs) { + sync_counter--; + } +} namespace { const int EOS(-1); diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 166f3d964ea..0dffc85d457 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -2588,11 +2588,13 @@ template void FabArray::FillBoundary (bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(0, nComp(), n_grow, Periodicity::NonPeriodic(), cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2600,11 +2602,13 @@ template void FabArray::FillBoundary (const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(0, nComp(), n_grow, period, cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2612,6 +2616,7 @@ template void FabArray::FillBoundary (const IntVect& nghost, const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(nghost.allLE(nGrowVect()), 
"FillBoundary: asked to fill more ghost cells than we have"); @@ -2619,6 +2624,7 @@ FabArray::FillBoundary (const IntVect& nghost, const Periodicity& period, b FillBoundary_nowait(0, nComp(), nghost, period, cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2626,11 +2632,13 @@ template void FabArray::FillBoundary (int scomp, int ncomp, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(scomp, ncomp, n_grow, Periodicity::NonPeriodic(), cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2638,11 +2646,13 @@ template void FabArray::FillBoundary (int scomp, int ncomp, const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(scomp, ncomp, n_grow, period, cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2651,6 +2661,7 @@ void FabArray::FillBoundary (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(nghost.allLE(nGrowVect()), "FillBoundary: asked to fill more ghost cells than we have"); @@ -2658,6 +2669,7 @@ FabArray::FillBoundary (int scomp, int ncomp, const IntVect& nghost, FillBoundary_nowait(scomp, ncomp, nghost, period, cross); FillBoundary_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2696,11 +2708,13 @@ template void FabArray::FillBoundaryAndSync (const Periodicity& period) { - BL_PROFILE("FAbArray::FillBoundaryAndSync()"); + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (n_grow.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(0, nComp(), n_grow, period); FillBoundaryAndSync_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2708,11 +2722,13 @@ void FabArray::FillBoundaryAndSync (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { - BL_PROFILE("FAbArray::FillBoundaryAndSync()"); + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (nghost.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(scomp, ncomp, nghost, period); FillBoundaryAndSync_finish(); } + BL_PROFILE_SYNC_STOP(); } template @@ -2727,6 +2743,7 @@ void FabArray::FillBoundaryAndSync_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FillBoundaryAndSync_nowait()"); FBEP_nowait(scomp, ncomp, nghost, period, false, false, true); } @@ -2737,34 +2754,40 @@ FabArray::FillBoundaryAndSync_finish () { BL_PROFILE("FillBoundaryAndSync_finish()"); FillBoundary_finish(); + BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync (const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FAbArray::OverrideSync()"); if (!is_cell_centered()) { OverrideSync_nowait(0, nComp(), period); OverrideSync_finish(); } + BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync (int scomp, int ncomp, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FAbArray::OverrideSync()"); if (!is_cell_centered()) { OverrideSync_nowait(scomp, ncomp, period); OverrideSync_finish(); } + BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync_nowait (const Periodicity& period) { + 
BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); OverrideSync_nowait(0, nComp(), period); } @@ -2772,6 +2795,7 @@ template void FabArray::OverrideSync_nowait (int scomp, int ncomp, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("OverrideSync_nowait()"); FBEP_nowait(scomp, ncomp, IntVect(0), period, false, false, true); } @@ -2782,6 +2806,7 @@ FabArray::OverrideSync_finish () { BL_PROFILE("OverrideSync_finish()"); FillBoundary_finish(); + BL_PROFILE_SYNC_STOP(); } template @@ -2809,10 +2834,12 @@ template void FabArray::SumBoundary (int scomp, int ncomp, IntVect const& src_nghost, IntVect const& dst_nghost, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::SumBoundary()"); SumBoundary_nowait(scomp, ncomp, src_nghost, dst_nghost, period); SumBoundary_finish(); + BL_PROFILE_SYNC_STOP(); } template @@ -2840,6 +2867,7 @@ template void FabArray::SumBoundary_nowait (int scomp, int ncomp, IntVect const& src_nghost, IntVect const& dst_nghost, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::SumBoundary_nowait()"); if ( n_grow == IntVect::TheZeroVector() && boxArray().ixType().cellCentered()) return; @@ -2867,28 +2895,33 @@ FabArray::SumBoundary_finish () FabArray* tmp = const_cast*> (this->pcd->src); this->ParallelCopy_finish(); delete tmp; + BL_PROFILE_SYNC_STOP(); } template void FabArray::EnforcePeriodicity (const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(0, nComp(), nGrowVect(), period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } + BL_PROFILE_SYNC_STOP(); } template void FabArray::EnforcePeriodicity (int scomp, int ncomp, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(scomp, ncomp, nGrowVect(), period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } + BL_PROFILE_SYNC_STOP(); } template @@ -2896,11 +2929,13 @@ void FabArray::EnforcePeriodicity (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(scomp, ncomp, nghost, period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } + BL_PROFILE_SYNC_STOP(); } template @@ -2908,6 +2943,7 @@ template void FabArray::FillBoundary_nowait (int scomp, int ncomp, const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nGrowVect(), period, cross); } @@ -2918,6 +2954,7 @@ void FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nghost, period, cross); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index ef57321ebf6..177ff50236a 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -240,6 +240,8 @@ FabArray::FillBoundary_finish () fbd.reset(); #endif + + BL_PROFILE_SYNC_STOP(); } template @@ -254,10 +256,12 @@ FabArray::ParallelCopy (const FabArray& src, CpOp op, const FabArrayBase::CPC * a_cpc) { + 
BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopy()"); ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, op, a_cpc); ParallelCopy_finish(); + BL_PROFILE_SYNC_STOP(); } template @@ -270,11 +274,13 @@ FabArray::ParallelCopyToGhost (const FabArray& src, const IntVect& dnghost, const Periodicity& period) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopyToGhost()"); ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, FabArrayBase::COPY, nullptr, true); ParallelCopy_finish(); + BL_PROFILE_SYNC_STOP(); } template @@ -312,6 +318,7 @@ FabArray::ParallelCopy_nowait (const FabArray& src, const FabArrayBase::CPC * a_cpc, bool to_ghost_cells_only) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopy_nowait()"); AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress."); @@ -559,6 +566,8 @@ FabArray::ParallelCopy_finish () #endif /*BL_USE_MPI*/ + BL_PROFILE_SYNC_STOP(); + } template diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 677b4448d3b..4c4a75a5bc1 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -111,5 +111,25 @@ private: TinyProfiler tprof; }; + +class TinyProfileSync { + + friend amrex::TinyProfiler; + +public: + static void Sync() noexcept; + static void Sync(const std::string& name) noexcept; + static void Sync(const char* name) noexcept; + + static void StartSyncRegion() noexcept; + static void StartSyncRegion(const std::string& name) noexcept; + static void StartSyncRegion(const char* name) noexcept; + static void EndSyncRegion() noexcept; + +private: + static int sync_counter; + static int use_prof_syncs; +}; + } #endif diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 3688e13032d..386a9e9c338 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -35,6 +35,8 @@ double TinyProfiler::t_init = std::numeric_limits::max(); int TinyProfiler::device_synchronize_around_region = 0; int TinyProfiler::n_print_tabs = 0; int TinyProfiler::verbose = 0; +int TinyProfileSync::use_prof_syncs = 0; +int TinyProfileSync::sync_counter = 0; namespace { std::set improperly_nested_timers; @@ -304,6 +306,10 @@ TinyProfiler::Initialize () noexcept pp.queryAdd("verbose", verbose); pp.queryAdd("v", verbose); } + { + amrex::ParmParse pp("amrex"); + pp.queryAdd("use_profiler_syncs", TinyProfileSync::use_prof_syncs); + } } void @@ -638,4 +644,69 @@ TinyProfiler::PrintCallStack (std::ostream& os) } } +void +TinyProfileSync::Sync () noexcept +{ + if (use_prof_syncs) + { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } +} + +void +TinyProfileSync::Sync (const std::string& name) noexcept +{ + if (use_prof_syncs) { + TinyProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +TinyProfileSync::Sync (const char* name) noexcept +{ + if (use_prof_syncs) { + TinyProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +TinyProfileSync::StartSyncRegion () noexcept +{ + if (use_prof_syncs) { + if (sync_counter == 0) { + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +TinyProfileSync::StartSyncRegion (const std::string& name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + TinyProfiler synctimer(name); + 
ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +TinyProfileSync::StartSyncRegion (const char* name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + TinyProfiler synctimer(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +TinyProfileSync::EndSyncRegion () noexcept { + if (use_prof_syncs) { + sync_counter--; + } +} + } From c6fd9257ef09c3fb26e555667ec9c557a84716be Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Tue, 10 May 2022 18:57:02 -0700 Subject: [PATCH 02/25] Base Profile Fixes. --- Src/Base/AMReX_BLProfiler.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Src/Base/AMReX_BLProfiler.cpp b/Src/Base/AMReX_BLProfiler.cpp index cc69bfc5784..e512f54b509 100644 --- a/Src/Base/AMReX_BLProfiler.cpp +++ b/Src/Base/AMReX_BLProfiler.cpp @@ -104,6 +104,9 @@ int BLProfiler::CallStats::cstatsVersion(1); Vector BLProfiler::callIndexStack; Vector BLProfiler::callIndexPatch; +int BLProfileSync::use_prof_syncs = 0; +int BLProfileSync::sync_counter = 0; + #ifdef BL_TRACE_PROFILING int BLProfiler::callStackDepth(-1); int BLProfiler::prevCallStackDepth(0); @@ -1564,7 +1567,7 @@ void BLProfileSync::Sync (const std::string& name) noexcept { if (use_prof_syncs) { - TinyProfiler synctimer(name); + BLProfiler synctimer(name); ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } } From 8bafe673da8df7baadc92fb050d16ebaf847fa8d Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Wed, 11 May 2022 11:32:52 -0700 Subject: [PATCH 03/25] Adjust timers and syncs for consistency. --- Src/Base/AMReX_FabArray.H | 39 ---------------------------------- Src/Base/AMReX_FabArrayCommI.H | 8 +++---- 2 files changed, 3 insertions(+), 44 deletions(-) diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 0dffc85d457..229ea3aa195 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -2588,13 +2588,11 @@ template void FabArray::FillBoundary (bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(0, nComp(), n_grow, Periodicity::NonPeriodic(), cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2602,13 +2600,11 @@ template void FabArray::FillBoundary (const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(0, nComp(), n_grow, period, cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2616,7 +2612,6 @@ template void FabArray::FillBoundary (const IntVect& nghost, const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(nghost.allLE(nGrowVect()), "FillBoundary: asked to fill more ghost cells than we have"); @@ -2624,7 +2619,6 @@ FabArray::FillBoundary (const IntVect& nghost, const Periodicity& period, b FillBoundary_nowait(0, nComp(), nghost, period, cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2632,13 +2626,11 @@ template void FabArray::FillBoundary (int scomp, int ncomp, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(scomp, ncomp, n_grow, Periodicity::NonPeriodic(), cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ 
-2646,13 +2638,11 @@ template void FabArray::FillBoundary (int scomp, int ncomp, const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); if ( n_grow.max() > 0 ) { FillBoundary_nowait(scomp, ncomp, n_grow, period, cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2661,7 +2651,6 @@ void FabArray::FillBoundary (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundary()"); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(nghost.allLE(nGrowVect()), "FillBoundary: asked to fill more ghost cells than we have"); @@ -2669,7 +2658,6 @@ FabArray::FillBoundary (int scomp, int ncomp, const IntVect& nghost, FillBoundary_nowait(scomp, ncomp, nghost, period, cross); FillBoundary_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2708,13 +2696,11 @@ template void FabArray::FillBoundaryAndSync (const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (n_grow.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(0, nComp(), n_grow, period); FillBoundaryAndSync_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2722,13 +2708,11 @@ void FabArray::FillBoundaryAndSync (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (nghost.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(scomp, ncomp, nghost, period); FillBoundaryAndSync_finish(); } - BL_PROFILE_SYNC_STOP(); } template @@ -2743,7 +2727,6 @@ void FabArray::FillBoundaryAndSync_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FillBoundaryAndSync_nowait()"); FBEP_nowait(scomp, ncomp, nghost, period, false, false, true); } @@ -2754,40 +2737,34 @@ FabArray::FillBoundaryAndSync_finish () { BL_PROFILE("FillBoundaryAndSync_finish()"); FillBoundary_finish(); - BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync (const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FAbArray::OverrideSync()"); if (!is_cell_centered()) { OverrideSync_nowait(0, nComp(), period); OverrideSync_finish(); } - BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync (int scomp, int ncomp, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FAbArray::OverrideSync()"); if (!is_cell_centered()) { OverrideSync_nowait(scomp, ncomp, period); OverrideSync_finish(); } - BL_PROFILE_SYNC_STOP(); } template void FabArray::OverrideSync_nowait (const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); OverrideSync_nowait(0, nComp(), period); } @@ -2795,7 +2772,6 @@ template void FabArray::OverrideSync_nowait (int scomp, int ncomp, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("OverrideSync_nowait()"); FBEP_nowait(scomp, ncomp, IntVect(0), period, false, false, true); } @@ -2806,7 +2782,6 @@ FabArray::OverrideSync_finish () { BL_PROFILE("OverrideSync_finish()"); FillBoundary_finish(); - BL_PROFILE_SYNC_STOP(); } template @@ -2834,12 +2809,10 @@ template void FabArray::SumBoundary (int scomp, int ncomp, IntVect const& src_nghost, IntVect const& dst_nghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); 
BL_PROFILE("FabArray::SumBoundary()"); SumBoundary_nowait(scomp, ncomp, src_nghost, dst_nghost, period); SumBoundary_finish(); - BL_PROFILE_SYNC_STOP(); } template @@ -2867,7 +2840,6 @@ template void FabArray::SumBoundary_nowait (int scomp, int ncomp, IntVect const& src_nghost, IntVect const& dst_nghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::SumBoundary_nowait()"); if ( n_grow == IntVect::TheZeroVector() && boxArray().ixType().cellCentered()) return; @@ -2895,33 +2867,28 @@ FabArray::SumBoundary_finish () FabArray* tmp = const_cast*> (this->pcd->src); this->ParallelCopy_finish(); delete tmp; - BL_PROFILE_SYNC_STOP(); } template void FabArray::EnforcePeriodicity (const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(0, nComp(), nGrowVect(), period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } - BL_PROFILE_SYNC_STOP(); } template void FabArray::EnforcePeriodicity (int scomp, int ncomp, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(scomp, ncomp, nGrowVect(), period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } - BL_PROFILE_SYNC_STOP(); } template @@ -2929,13 +2896,11 @@ void FabArray::EnforcePeriodicity (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::EnforcePeriodicity"); if (period.isAnyPeriodic()) { FBEP_nowait(scomp, ncomp, nghost, period, false, true); FillBoundary_finish(); // unsafe unless isAnyPeriodic() } - BL_PROFILE_SYNC_STOP(); } template @@ -2943,8 +2908,6 @@ template void FabArray::FillBoundary_nowait (int scomp, int ncomp, const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); - BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nGrowVect(), period, cross); } @@ -2954,8 +2917,6 @@ void FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); - BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nghost, period, cross); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 177ff50236a..b9accf9c7bc 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -10,6 +10,9 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, bool enforce_periodicity_only, bool override_sync) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE("FillBoundary_nowait()"); + AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress."); AMREX_ASSERT(!enforce_periodicity_only || !override_sync); @@ -256,12 +259,9 @@ FabArray::ParallelCopy (const FabArray& src, CpOp op, const FabArrayBase::CPC * a_cpc) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopy()"); - ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, op, a_cpc); ParallelCopy_finish(); - BL_PROFILE_SYNC_STOP(); } template @@ -274,13 +274,11 @@ FabArray::ParallelCopyToGhost (const FabArray& src, const IntVect& dnghost, const Periodicity& period) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopyToGhost()"); ParallelCopy_nowait(src, 
scomp, dcomp, ncomp, snghost, dnghost, period, FabArrayBase::COPY, nullptr, true); ParallelCopy_finish(); - BL_PROFILE_SYNC_STOP(); } template From bea31cbb9d533ff1148fa538247a5206e3b702e1 Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Wed, 11 May 2022 14:19:17 -0700 Subject: [PATCH 04/25] Extra lines. --- Src/Base/AMReX_BLProfiler.H | 2 -- 1 file changed, 2 deletions(-) diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 24ce1c8dacc..0cc488cca74 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -161,8 +161,6 @@ class BLProfiler Real timeStamp; }; - - static std::map mFortProfs; // [fname, fortfunc] static Vector mFortProfsErrors; // [error string] static Vector mFortProfsInt; // [fortfuncindex] From 088771c7c147fc58d8e10f7661ef091beb2199d7 Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Thu, 12 May 2022 16:11:47 -0700 Subject: [PATCH 05/25] Take into account leaving finishes early. --- Src/Base/AMReX_FabArrayCommI.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index b9accf9c7bc..40431a61557 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -171,7 +171,7 @@ FabArray::FillBoundary_finish () BL_PROFILE("FillBoundary_finish()"); - if (!fbd) { n_filled = IntVect::TheZeroVector(); return; } + if (!fbd) { n_filled = IntVect::TheZeroVector(); BL_PROFILE_SYNC_STOP(); return; } const FB* TheFB = fbd->fb; const int N_rcvs = TheFB->m_RcvTags->size(); @@ -499,7 +499,7 @@ FabArray::ParallelCopy_finish () BL_PROFILE("FabArray::ParallelCopy_finish()"); - if (!pcd) { return; } + if (!pcd) { BL_PROFILE_SYNC_STOP(); return; } const CPC* thecpc = pcd->cpc; From 08638fc8198a1344e44dc8c1f07a9ec249e58260 Mon Sep 17 00:00:00 2001 From: Jon Rood Date: Wed, 11 May 2022 12:39:13 -0600 Subject: [PATCH 06/25] Add HDF5 H5Z-ZFP support in CMake (#2753) --- .../source/BuildingAMReX.rst | 2 ++ Tools/CMake/AMReXConfig.cmake.in | 6 ++++ Tools/CMake/AMReXOptions.cmake | 4 +++ Tools/CMake/AMReXSetDefines.cmake | 1 + Tools/CMake/AMReXThirdPartyLibraries.cmake | 28 +++++++++++++++++-- Tools/CMake/AMReX_Config.H.in | 1 + 6 files changed, 39 insertions(+), 3 deletions(-) diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 47684fe305c..f5b8b263a4e 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -515,6 +515,8 @@ The list of available options is reported in the :ref:`table ` bel +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_HDF5 | Enable HDF5-based I/O | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_HDF5_ZFP | Enable compression with ZFP in HDF5-based I/O | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PLOTFILE_TOOLS | Build and install plotfile postprocessing tools| NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_ENABLE_TESTS | Enable CTest suite | NO | YES, NO | diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index d4be691c6dd..6fa60344f17 100644 --- 
a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -85,6 +85,7 @@ set(AMReX_HYPRE_FOUND @AMReX_HYPRE@) set(AMReX_PETSC_FOUND @AMReX_PETSC@) set(AMReX_SUNDIALS_FOUND @AMReX_SUNDIALS@) set(AMReX_HDF5_FOUND @AMReX_HDF5@) +set(AMReX_HDF5_ZFP_FOUND @AMReX_HDF5_ZFP@) # Compilation options set(AMReX_FPE_FOUND @AMReX_FPE@) @@ -131,6 +132,7 @@ set(AMReX_ASCENT @AMReX_ASCENT@) set(AMReX_HYPRE @AMReX_HYPRE@) set(AMReX_PETSC @AMReX_PETSC@) set(AMReX_HDF5 @AMReX_HDF5@) +set(AMReX_HDF5_ZFP @AMReX_HDF5_ZFP@) # Compilation options set(AMReX_FPE @AMReX_FPE@) @@ -200,6 +202,10 @@ if (@AMReX_HDF5@) find_dependency(HDF5 REQUIRED) endif () +if (@AMReX_HDF5_ZFP@) + find_dependency(H5Z-ZFP REQUIRED) +endif () + if (@AMReX_HYPRE@) find_dependency(HYPRE 2.20.0 REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 6b45fb62d0a..663fefa606b 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -303,6 +303,10 @@ if (AMReX_HDF5_ASYNC) message(FATAL_ERROR "\nAMReX_HDF5_ASYNC not yet supported\n") endif () +cmake_dependent_option(AMReX_HDF5_ZFP "Enable ZFP compression in HDF5-based IO" OFF + "AMReX_HDF5" OFF ) +print_option(AMReX_HDF5_ZFP) + # SUNDIALS option( AMReX_SUNDIALS "Enable SUNDIALS interfaces" OFF ) print_option( AMReX_SUNDIALS ) diff --git a/Tools/CMake/AMReXSetDefines.cmake b/Tools/CMake/AMReXSetDefines.cmake index cd4c12bf838..358f08db766 100644 --- a/Tools/CMake/AMReXSetDefines.cmake +++ b/Tools/CMake/AMReXSetDefines.cmake @@ -165,6 +165,7 @@ endif () # add_amrex_define(AMREX_USE_HDF5 NO_LEGACY IF AMReX_HDF5) add_amrex_define(AMREX_USE_HDF5_ASYNC NO_LEGACY IF AMReX_HDF5_ASYNC) +add_amrex_define(AMREX_USE_HDF5_ZFP NO_LEGACY IF AMReX_HDF5_ZFP) # diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index 9f1f771d9fb..e8be52dc173 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -2,11 +2,16 @@ # HDF5 -- here it would be best to create an imported target # if (AMReX_HDF5) - set(HDF5_PREFER_PARALLEL TRUE) + if (AMReX_MPI) + set(HDF5_PREFER_PARALLEL TRUE) + endif () find_package(HDF5 1.10.4 REQUIRED) if (AMReX_MPI AND (NOT HDF5_IS_PARALLEL)) - message(FATAL_ERROR "\nHDF5 library does not support parallel I/O") - endif () + message(FATAL_ERROR "\nHDF5 library does not support parallel I/O") + endif () + if (HDF5_IS_PARALLEL AND (NOT AMReX_MPI)) + message(FATAL_ERROR "\nMPI enabled in HDF5 but not in AMReX, which will likely fail to build") + endif () if (TARGET hdf5::hdf5) # CMake >= 3.19 target_link_libraries(amrex PUBLIC hdf5::hdf5) @@ -18,6 +23,23 @@ if (AMReX_HDF5) endif () +# +# H5Z-ZFP +# +if (AMReX_HDF5_ZFP) + set(H5Z_ZFP_USE_STATIC_LIBS ON) # Static ON means using as a library, or OFF as an HDF5 plugin + find_package(H5Z_ZFP 1.0.1 CONFIG) + if (NOT AMReX_HDF5) + message(FATAL_ERROR "\nHDF5 must be enabled for ZFP support in HDF5") + endif () + + if (TARGET h5z_zfp::h5z_zfp) # CMake >= 3.19 + target_link_libraries(amrex PUBLIC h5z_zfp::h5z_zfp) + else () # CMake < 3.19 -- Remove when minimum cmake version is bumped up + target_include_directories(amrex PUBLIC ${H5Z_ZFP_INCLUDE_DIR}) + target_link_libraries(amrex PUBLIC ${H5Z_ZFP_LIBRARY}) + endif () +endif () # # Sensei diff --git a/Tools/CMake/AMReX_Config.H.in b/Tools/CMake/AMReX_Config.H.in index c4cf03571f8..c38d6a4e9b2 100644 --- a/Tools/CMake/AMReX_Config.H.in +++ b/Tools/CMake/AMReX_Config.H.in @@ -50,6 +50,7 @@ #cmakedefine AMREX_PARTICLES 
#cmakedefine AMREX_USE_HDF5 #cmakedefine AMREX_USE_HDF5_ASYNC +#cmakedefine AMREX_USE_HDF5_ZFP #cmakedefine AMREX_USE_HYPRE #cmakedefine AMREX_USE_PETSC #cmakedefine AMREX_USE_SUNDIALS From cdc0daa1dd36bbf30209d80ca8403c5a360ad011 Mon Sep 17 00:00:00 2001 From: "Don E. Willcox" Date: Wed, 11 May 2022 14:02:03 -0700 Subject: [PATCH 07/25] add scomp and ncomp arguments to IntegratorOps functions. (#2759) --- Src/Base/AMReX_IntegratorBase.H | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/Src/Base/AMReX_IntegratorBase.H b/Src/Base/AMReX_IntegratorBase.H index 7e136dc5767..0b9c11c3ed9 100644 --- a/Src/Base/AMReX_IntegratorBase.H +++ b/Src/Base/AMReX_IntegratorBase.H @@ -97,23 +97,33 @@ struct IntegratorOps scomp={}, const Vector ncomp={}, bool Grow = true) { // Copy the contents of Other into Y const int size = Y.size(); + bool specify_components = scomp.size() > 0 && ncomp.size() == scomp.size(); for (int i = 0; i < size; ++i) { IntVect nGrow = Grow ? Other[i].nGrowVect() : IntVect(0); - amrex::MultiFab::Copy(Y[i], Other[i], 0, 0, Other[i].nComp(), nGrow); + const int iscomp = specify_components ? scomp[i] : 0; + const int incomp = specify_components ? ncomp[i] : Other[i].nComp(); + if (incomp > 0) { + amrex::MultiFab::Copy(Y[i], Other[i], iscomp, iscomp, incomp, nGrow); + } } } - static void Saxpy (T& Y, const amrex::Real a, const T& X, bool Grow = false) + static void Saxpy (T& Y, const amrex::Real a, const T& X, const Vector scomp={}, const Vector ncomp={}, bool Grow = false) { // Calculate Y += a * X const int size = Y.size(); + bool specify_components = scomp.size() > 0 && ncomp.size() == scomp.size(); for (int i = 0; i < size; ++i) { IntVect nGrow = Grow ? X[i].nGrowVect() : IntVect(0); - amrex::MultiFab::Saxpy(Y[i], a, X[i], 0, 0, X[i].nComp(), nGrow); + const int iscomp = specify_components ? scomp[i] : 0; + const int incomp = specify_components ? ncomp[i] : X[i].nComp(); + if (incomp > 0) { + amrex::MultiFab::Saxpy(Y[i], a, X[i], iscomp, iscomp, incomp, nGrow); + } } } @@ -130,18 +140,20 @@ struct IntegratorOps V.emplace_back(std::make_unique(Other.boxArray(), Other.DistributionMap(), Other.nComp(), nGrow)); } - static void Copy (T& Y, const T& Other, bool Grow = true) + static void Copy (T& Y, const T& Other, const int scomp=0, const int ncomp=-1, bool Grow = true) { // Copy the contents of Other into Y IntVect nGrow = Grow ? Other.nGrowVect() : IntVect(0); - amrex::MultiFab::Copy(Y, Other, 0, 0, Other.nComp(), nGrow); + const int mf_ncomp = ncomp > 0 ? ncomp : Other.nComp(); + amrex::MultiFab::Copy(Y, Other, scomp, scomp, mf_ncomp, nGrow); } - static void Saxpy (T& Y, const amrex::Real a, const T& X, bool Grow = false) + static void Saxpy (T& Y, const amrex::Real a, const T& X, const int scomp=0, const int ncomp=-1, bool Grow = false) { // Calculate Y += a * X IntVect nGrow = Grow ? X.nGrowVect() : IntVect(0); - amrex::MultiFab::Saxpy(Y, a, X, 0, 0, X.nComp(), nGrow); + const int mf_ncomp = ncomp > 0 ? ncomp : X.nComp(); + amrex::MultiFab::Saxpy(Y, a, X, scomp, scomp, mf_ncomp, nGrow); } }; From 102e93c56b23f793842b4b68a5f6fbcfe92c7fd5 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 11 May 2022 14:02:31 -0700 Subject: [PATCH 08/25] Avoid the use of null stream (#2754) The default stream (e.g., stream used outside MFIter) used to be the null stream for CUDA and HIP. By default, there is implicit synchronization between the null stream and other streams. To avoid that, the default stream in AMReX is now no longer the null stream. 
The behavior of Gpu::synchronize being device wide synchronization has not changed. However, for most of its use cases, it can be replaced by a new function Gpu::streamSynchronizeAll that will synchronize the activities on all AMReX streams without performing a device wide synchronization that could potentially interfere with other libraries (e.g., MPI). The behavior of [dtod|dtoh|htod]_memcpy has changed. For CUDA and HIP, these functions used to call the synchronous version of the memcpy. However, the exact synchronization behavior depends on the memory types. For SYCL/DPC++, there is no equivalent form because a queue (i.e., stream) must be specified. Furthermore, there is no guarantee of consistence across different vendor platforms. This has now changed to calling the asynchronous form using the current stream followed by a stream synchronization. --- Docs/sphinx_documentation/source/GPU.rst | 59 +++++++-- Src/Amr/AMReX_AmrLevel.cpp | 4 +- Src/Amr/AMReX_StateData.cpp | 4 +- Src/AmrCore/AMReX_FluxRegister.cpp | 2 +- Src/AmrCore/AMReX_TagBox.cpp | 5 +- Src/Base/AMReX.cpp | 2 +- Src/Base/AMReX_BlockMutex.cpp | 3 +- Src/Base/AMReX_FBI.H | 8 +- Src/Base/AMReX_FabArray.H | 2 +- Src/Base/AMReX_GpuAsyncArray.H | 2 +- Src/Base/AMReX_GpuBuffer.H | 6 - Src/Base/AMReX_GpuContainers.H | 10 +- Src/Base/AMReX_GpuDevice.H | 125 +++++------------- Src/Base/AMReX_GpuDevice.cpp | 40 +++--- Src/Base/AMReX_GpuLaunchFunctsG.H | 38 ++---- Src/Base/AMReX_GpuLaunchMacrosG.H | 6 - Src/Base/AMReX_GpuUtility.cpp | 9 +- Src/Base/AMReX_MFIter.cpp | 11 +- Src/Base/AMReX_NonLocalBCImpl.H | 4 +- Src/Base/AMReX_Partition.H | 2 +- Src/Base/AMReX_Random.cpp | 6 +- Src/Base/AMReX_Reduce.H | 8 +- Src/Base/AMReX_TagParallelFor.H | 2 +- Src/Base/AMReX_TinyProfiler.cpp | 6 +- Src/Base/AMReX_VisMF.cpp | 6 +- Src/EB/AMReX_EBMultiFabUtil.cpp | 2 - Src/EB/AMReX_EB_utils.cpp | 4 +- Src/Extern/HDF5/AMReX_ParticleHDF5.H | 20 +-- .../HDF5/AMReX_WriteBinaryParticleDataHDF5.H | 2 +- Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp | 2 +- Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp | 2 +- .../MLMG/AMReX_MLEBNodeFDLaplacian.cpp | 2 +- .../MLMG/AMReX_MLNodeLaplacian.cpp | 4 +- .../MLMG/AMReX_MLNodeLaplacian_misc.cpp | 4 +- .../MLMG/AMReX_MLNodeLaplacian_sten.cpp | 2 +- .../MLMG/AMReX_MLNodeTensorLaplacian.cpp | 4 +- Src/Particle/AMReX_DenseBins.H | 2 +- Src/Particle/AMReX_NeighborList.H | 5 +- Src/Particle/AMReX_NeighborParticlesGPUImpl.H | 17 ++- Src/Particle/AMReX_ParticleBufferMap.cpp | 7 +- Src/Particle/AMReX_ParticleCommunication.H | 22 +-- Src/Particle/AMReX_ParticleContainerI.H | 42 +++--- Src/Particle/AMReX_ParticleIO.H | 20 +-- Src/Particle/AMReX_ParticleInit.H | 102 +++++++------- Src/Particle/AMReX_ParticleLocator.H | 14 +- Src/Particle/AMReX_ParticleTile.H | 4 +- Src/Particle/AMReX_ParticleTransformation.H | 16 +-- Src/Particle/AMReX_SparseBins.H | 8 +- Src/Particle/AMReX_WriteBinaryParticleData.H | 18 +-- Tests/EB_CNS/Source/CNS.cpp | 16 ++- Tests/GPU/AnyOf/main.cpp | 2 +- Tests/GPU/AtomicIf/main.cpp | 5 +- Tests/GPU/RandomNumberGeneration/main.cpp | 4 +- Tests/GPU/Vector/main.cpp | 3 +- Tests/Particles/AsyncIO/main.cpp | 42 +++--- Tests/Particles/DenseBins/main.cpp | 4 +- Tests/Particles/Intersection/main.cpp | 2 +- .../NeighborParticles/MDParticleContainer.cpp | 22 +-- Tests/Particles/ParallelContext/main.cpp | 42 +++--- Tests/Particles/ParticleReduce/main.cpp | 14 +- .../ParticleTransformations/main.cpp | 14 +- Tests/Particles/Redistribute/main.cpp | 42 +++--- 62 files changed, 443 insertions(+), 464 deletions(-) diff --git 
a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 5fa2dc31919..5d64b3fadd4 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -18,7 +18,7 @@ for AMD and DPC++ for Intel. This will be designated with ``CUDA/HIP/DPC++`` throughout the documentation. However, application teams can also use OpenACC or OpenMP in their individual codes. -At this time, AMReX does not support cross-native language compliation +At this time, AMReX does not support cross-native language compilation (HIP for non-AMD systems and DPC++ for non Intel systems). It may work with a given version, but AMReX does not track or guarantee such functionality. @@ -1237,7 +1237,7 @@ the destructor of :cpp:`MFIter`. This ensures that all GPU work inside of an :cpp:`MFIter` loop will complete before code outside of the loop is executed. Any CUDA kernel launches made outside of an :cpp:`MFIter` loop must ensure appropriate device synchronization -occurs. This can be done by calling :cpp:`Gpu::synchronize()`. +occurs. This can be done by calling :cpp:`Gpu::streamSynchronize()`. CUDA supports multiple streams and kernels. Kernels launched in the same stream are executed sequentially, but different streams of kernel @@ -1342,6 +1342,48 @@ will show little improvement or even perform worse. So, this conditional stateme should be added to MFIter loops that contain GPU work, unless users specifically test the performance or are designing more complex workflows that require OpenMP. +.. _sec:gpu:stream + +Stream and Synchronization +========================== + +As mentioned in Section :ref:`sec:gpu:overview`, AMReX uses a number of GPU +streams that are either CUDA streams or HIP streams or SYCL queues. Many +GPU functions (e.g., :cpp:`ParallelFor` and :cpp:`Gpu::copyAsync`) are +asynchronous with respect to the host. To facilitate synchronization that +is sometimes necessary, AMReX provides :cpp:`Gpu::streamSynchronize()` and +:cpp:`Gpu::streamSynchronizeAll()` to synchronize the current stream and all +AMReX streams, respectively. For performance reasons, one should try to +minimize the number of synchronization calls. For example, + +.. highlight:: c++ + +:: + + // The synchronous version is NOT recommended + Gpu::copy(Gpu::deviceToHost, ....); + Gpu::copy(Gpu::deviceToHost, ....); + Gpu::copy(Gpu::deviceToHost, ....); + + // NOT recommended because of unnecessary synchronization + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + + // recommended + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + +In addition to stream synchronization, there is also +:cpp:`Gpu::synchronize()` that will perform a device wide synchronization. +However, a device wide synchronization is usually too excessive and it might +interfere with other libraries (e.g., MPI). + .. _sec:gpu:example: An Example of Migrating to GPU @@ -1437,8 +1479,8 @@ portable way. .. _sec:gpu:assertion: -Assertions, Error Checking and Synchronization -================================================ +Assertions and Error Checking +============================= To help debugging, we often use :cpp:`amrex::Assert` and :cpp:`amrex::Abort`. 
These functions are GPU safe and can be used in @@ -1461,10 +1503,11 @@ However, due to asynchronicity, determining the source of the error can be difficult. Even if GPU kernels launched earlier in the code result in a CUDA error, the error may not be output at a nearby call to :cpp:`AMREX_GPU_ERROR_CHECK()` by the CPU. When tracking down a CUDA -launch error, :cpp:`Gpu::synchronize()` and -:cpp:`Gpu::streamSynchronize()` can be used to synchronize -the device or the CUDA stream, respectively, and track down the specific -launch that causes the error. +launch error, :cpp:`Gpu::synchronize()`, +:cpp:`Gpu::streamSynchronize()`, or :cpp:`Gpu::streamSynchronizeAll()` can +be used to synchronize the device, the current GPU stream, or all GPU +streams, respectively, and track down the specific launch that causes the +error. .. =================================================================== diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp index ece300cfe81..a88489f9512 100644 --- a/Src/Amr/AMReX_AmrLevel.cpp +++ b/Src/Amr/AMReX_AmrLevel.cpp @@ -1495,13 +1495,13 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, dcomp, m_scomp, m_ncomp); - Gpu::synchronize(); // In case this runs on GPU + Gpu::streamSynchronize(); // In case this runs on GPU } if (m_FixUpCorners) { FixUpPhysCorners(fab,m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); - Gpu::synchronize(); // In case this runs on GPU + Gpu::streamSynchronize(); // In case this runs on GPU } } diff --git a/Src/Amr/AMReX_StateData.cpp b/Src/Amr/AMReX_StateData.cpp index 3f7fca2ec9b..92f8c791986 100644 --- a/Src/Amr/AMReX_StateData.cpp +++ b/Src/Amr/AMReX_StateData.cpp @@ -511,9 +511,9 @@ StateData::FillBoundary (FArrayBox& dest, } #ifdef AMREX_USE_GPU - // Add a synchronize here in case the user code launched kernels + // Add a streamSynchronize here in case the user code launched kernels // to handle the boundary fills. - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } diff --git a/Src/AmrCore/AMReX_FluxRegister.cpp b/Src/AmrCore/AMReX_FluxRegister.cpp index 308fc00ad9e..877d01258c7 100644 --- a/Src/AmrCore/AMReX_FluxRegister.cpp +++ b/Src/AmrCore/AMReX_FluxRegister.cpp @@ -721,7 +721,7 @@ FluxRegister::ClearInternalBorders (const Geometry& geom) } #ifdef AMREX_USE_GPU - // There is Gpu::synchronize in Parallelfor below internally. + // There is Gpu::streamSynchronize in Parallelfor below internally. 
ParallelFor(tags, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n, Array4BoxTag const& tag) { diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 798a552976d..6a989ffbbf1 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -503,10 +503,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const PODVector > dv_tags_offset(ntotblocks); int* dp_tags_offset = dv_tags_offset.data(); - Gpu::htod_memcpy(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); -#ifdef AMREX_USE_DPCPP - Gpu::synchronize(); -#endif + Gpu::htod_memcpy_async(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); PODVector > dv_tags(ntotaltags); IntVect* dp_tags = dv_tags.data(); diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 871343f173c..a6b6fea256a 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -603,7 +603,7 @@ void amrex::Finalize (amrex::AMReX* pamrex) { #ifdef AMREX_USE_GPU - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); #endif AMReX::erase(pamrex); diff --git a/Src/Base/AMReX_BlockMutex.cpp b/Src/Base/AMReX_BlockMutex.cpp index 32e6d8face1..6e82fd9c4aa 100644 --- a/Src/Base/AMReX_BlockMutex.cpp +++ b/Src/Base/AMReX_BlockMutex.cpp @@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept { amrex::ignore_unused(state,N); amrex::Abort("xxxxx DPCPP todo"); #else - amrex::launch((N+255)/256, 256, Gpu::nullStream(), + amrex::launch((N+255)/256, 256, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -38,4 +38,3 @@ BlockMutex::~BlockMutex () { #endif } - diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 6bbfcecbb0f..61ef452b601 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -775,11 +775,11 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); - // There is Gpu::synchronize in fab_to_fab. + // There is Gpu::streamSynchronize in fab_to_fab. if (pbuffer != send_data[0]) { Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); - Gpu::synchronize(); + Gpu::streamSynchronize(); The_Arena()->free(pbuffer); } } @@ -808,7 +808,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; pbuffer = (char*)The_Arena()->alloc(szbuffer); Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -885,7 +885,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, } } - // There is Gpu::synchronize in fab_to_fab. + // There is Gpu::streamSynchronize in fab_to_fab. 
if (pbuffer != recv_data[0]) { The_Arena()->free(pbuffer); diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 229ea3aa195..6eef7caa579 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -1807,7 +1807,7 @@ FabArray::define (const BoxArray& bxs, if(info.alloc) { AllocFabs(*m_factory, m_dallocator.m_arena, info.tags); - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); #ifdef BL_USE_TEAM ParallelDescriptor::MyTeam().MemoryBarrier(); #endif diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 98c94eb7e31..7e207c49b9b 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -124,7 +124,7 @@ public: #ifdef AMREX_USE_GPU if (d_data) { - Gpu::dtoh_memcpy_async(h_p, d_data, n*sizeof(T)); + Gpu::dtoh_memcpy(h_p, d_data, n*sizeof(T)); } else #endif diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H index e5b8bf4903d..6c497f2a206 100644 --- a/Src/Base/AMReX_GpuBuffer.H +++ b/Src/Base/AMReX_GpuBuffer.H @@ -33,9 +33,6 @@ public: { d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); -#ifdef AMREX_USE_DPCPP - if (Gpu::onNullStream()) Gpu::synchronize(); -#endif } #endif } @@ -55,9 +52,6 @@ public: { d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); -#ifdef AMREX_USE_DPCPP - if (Gpu::onNullStream()) Gpu::synchronize(); -#endif } #endif } diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index 0c980907680..cc68770ff3f 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -222,7 +222,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::hostToDevice, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::hostToDevice, a.begin(), a.end(), b.begin()); */ template void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept @@ -256,7 +256,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::deviceToHost, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::deviceToHost, a.begin(), a.end(), b.begin()); */ template void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept @@ -290,7 +290,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::deviceToDevice, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::deviceToDevice, a.begin(), a.end(), b.begin()); */ template void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept @@ -335,7 +335,7 @@ namespace Gpu { #endif #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -366,7 +366,7 @@ namespace Gpu { #endif #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } }} diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 4399279aa99..8a327704a1d 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -53,17 +53,13 @@ public: #if defined(AMREX_USE_GPU) static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; } - static gpuStream_t nullStream () noexcept { return gpu_default_stream; } #ifdef AMREX_USE_CUDA /** for backward compatibility */ static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; } #endif #ifdef AMREX_USE_DPCPP - static sycl::queue& nullQueue () noexcept { return *(gpu_default_stream.queue); } static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); } static sycl::queue& streamQueue (int i) 
noexcept { return *(gpu_stream_pool[i].queue); } - static bool onNullStream () noexcept { return gpu_stream[OpenMP::get_thread_num()] == gpu_default_stream; } - static bool onNullStream (gpuStream_t stream) noexcept { return stream == gpu_default_stream; } #endif #endif @@ -90,13 +86,16 @@ public: static void synchronize () noexcept; /** - * Halt execution of code until GPU stream has finished processing all + * Halt execution of code until the current AMReX GPU stream has finished processing all * previously requested tasks. */ static void streamSynchronize () noexcept; -#ifdef AMREX_USE_DPCPP - static void nonNullStreamSynchronize () noexcept; -#endif + + /** + * Halt execution of code until all AMReX GPU streams have finished processing all + * previously requested tasks. + */ + static void streamSynchronizeAll () noexcept; #if defined(__CUDACC__) /** Generic graph selection. These should be called by users. */ @@ -176,9 +175,12 @@ private: static dim3 numThreadsMin; static dim3 numBlocksOverride, numThreadsOverride; + // We build gpu_default_stream and gpu_stream_pool. + // The non-owning gpu_stream is used to store the current stream that will be used. + // gpu_stream is a vector so that it's thread safe to write to it. static gpuStream_t gpu_default_stream; - static Vector gpu_stream_pool; - static Vector gpu_stream; + static Vector gpu_stream_pool; // The size of this is max_gpu_stream + static Vector gpu_stream; // The size of this is omp_max_threads static gpuDeviceProp_t device_prop; static int memory_pools_supported; static unsigned int max_blocks_per_launch; @@ -198,12 +200,6 @@ gpuStream () noexcept { return Device::gpuStream(); } - -inline gpuStream_t -nullStream () noexcept -{ - return Device::nullStream(); -} #endif inline int @@ -224,84 +220,19 @@ streamSynchronize () noexcept Device::streamSynchronize(); } -#ifdef AMREX_USE_DPCPP inline void -nonNullStreamSynchronize () noexcept +streamSynchronizeAll () noexcept { - Device::nonNullStreamSynchronize(); + Device::streamSynchronizeAll(); } -#endif #ifdef AMREX_USE_GPU -inline void -htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q = Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("htod_memcpy: ")+ex.what()+"!!!!!"); - } - if (Device::onNullStream()) Gpu::synchronize(); -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_d, p_h, sz, hipMemcpyHostToDevice));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_d, p_h, sz, cudaMemcpyHostToDevice)); ) -#endif -} - -inline void -dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q = Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("dtoh_memcpy: ")+ex.what()+"!!!!!"); - } - Gpu::synchronize(); // To mimic cuda behavior -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_h, p_d, sz, hipMemcpyDeviceToHost));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_h, p_d, sz, cudaMemcpyDeviceToHost)); ) -#endif -} - -inline void -dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q 
= Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("dtod_memcpy: ")+ex.what()+"!!!!!"); - } -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice)); ) -#endif -} - inline void htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream()) Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); }); #else @@ -316,7 +247,6 @@ dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream()) Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); }); #else @@ -331,7 +261,6 @@ dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noe { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream()) Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); }); #else @@ -341,20 +270,30 @@ dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noe #endif } -#endif +inline void +htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept +{ + if (sz == 0) return; + htod_memcpy_async(p_d, p_h, sz); + Gpu::streamSynchronize(); +} -#ifdef AMREX_USE_DPCPP -inline bool -onNullStream () +inline void +dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept { - return Device::onNullStream(); + if (sz == 0) return; + dtoh_memcpy_async(p_h, p_d, sz); + Gpu::streamSynchronize(); } -inline bool -onNullStream (gpuStream_t stream) +inline void +dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept { - return Device::onNullStream(stream); + if (sz == 0) return; + dtod_memcpy_async(p_d_dst, p_d_src, sz); + Gpu::streamSynchronize(); } + #endif }} diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index c921a37d355..05790837111 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -403,7 +403,7 @@ Device::initialize_gpu () AMREX_HIP_SAFE_CALL(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeFourByte)); } - gpu_default_stream = 0; + AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_default_stream)); for (int i = 0; i < max_gpu_streams; ++i) { AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_stream_pool[i])); } @@ -424,7 +424,7 @@ Device::initialize_gpu () AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); } - gpu_default_stream = 0; + AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_default_stream)); for (int i = 0; i < max_gpu_streams; ++i) { AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i])); #ifdef AMREX_USE_ACC @@ -559,7 +559,7 @@ Device::numDevicesUsed () noexcept int Device::streamIndex (gpuStream_t s) noexcept { - if (s == nullStream()) { + if (s == gpu_default_stream) { return -1; } else { auto it = std::find(std::begin(gpu_stream_pool), std::end(gpu_stream_pool), s); @@ -611,12 +611,19 @@ void Device::synchronize () noexcept { #ifdef AMREX_USE_DPCPP - nonNullStreamSynchronize(); + auto& q = *(gpu_default_stream.queue); try { - gpu_default_stream.queue->wait_and_throw(); + 
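
For illustration only (not part of the patch; the helper name is hypothetical and a GPU build of AMReX is assumed): after this hunk the blocking Gpu::htod_memcpy/dtoh_memcpy/dtod_memcpy wrappers are just the corresponding *_async calls followed by Gpu::streamSynchronize(), so they drain only the calling thread's stream rather than the whole device. A minimal sketch of the resulting usage pattern:

#include <AMReX_Gpu.H>
#include <AMReX_GpuContainers.H>
#include <vector>

// Hypothetical helper illustrating the reworked copy semantics.
void copy_example ()
{
    constexpr int n = 256;
    amrex::Gpu::DeviceVector<amrex::Real> dv(n);
    std::vector<amrex::Real> hv(n, amrex::Real(1.0));

    // Blocking wrapper: queues an async copy on the current stream and then
    // waits on that stream only, not on the full device.
    amrex::Gpu::htod_memcpy(dv.data(), hv.data(), n*sizeof(amrex::Real));

    // Deferred variant: queue the copy now, synchronize once just before the
    // host buffer is actually read.
    amrex::Gpu::dtoh_memcpy_async(hv.data(), dv.data(), n*sizeof(amrex::Real));
    amrex::Gpu::streamSynchronize();
}
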
q.wait_and_throw(); } catch (sycl::exception const& ex) { amrex::Abort(std::string("synchronize: ")+ex.what()+"!!!!!"); } + for (auto const& s : gpu_stream_pool) { + try { + s.queue->wait_and_throw(); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("synchronize: ")+ex.what()+"!!!!!"); + } + } #else AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipDeviceSynchronize());, AMREX_CUDA_SAFE_CALL(cudaDeviceSynchronize()); ) @@ -634,24 +641,27 @@ Device::streamSynchronize () noexcept amrex::Abort(std::string("streamSynchronize: ")+ex.what()+"!!!!!"); } #else - AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_stream[OpenMP::get_thread_num()]));, - AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_stream[OpenMP::get_thread_num()])); ) + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpuStream()));, + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpuStream())); ) #endif } -#ifdef AMREX_USE_DPCPP void -Device::nonNullStreamSynchronize () noexcept +Device::streamSynchronizeAll () noexcept { +#ifdef AMREX_USE_GPU +#ifdef AMREX_USE_DPCPP + Device::synchronize(); +#else + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_default_stream));, + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_default_stream)); ) for (auto const& s : gpu_stream_pool) { - try { - s.queue->wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("nonNullStreamSynchronize: ")+ex.what()+"!!!!!"); - } + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(s));, + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); ) } -} #endif +#endif +} #if defined(__CUDACC__) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ec2bbe49b3a..12206f69b70 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -9,7 +9,6 @@ namespace amrex { template void single_task (gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -24,7 +23,6 @@ template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); int nthreads_total = nthreads_per_block * nblocks; std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); @@ -49,7 +47,6 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, template void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); int nthreads_total = nthreads_per_block * nblocks; auto& q = *(stream.queue); try { @@ -72,8 +69,6 @@ void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -149,8 +144,6 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = 
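
A sketch of the situation the new streamSynchronizeAll() above is meant for, illustrative only and not taken from the patch (the helper name and loop bounds are made up): once work has been spread across the stream pool, as MFIter and StreamIter do, waiting on the current stream alone is not enough.

#include <AMReX_Gpu.H>
#include <cstddef>

// Hypothetical helper: round-robin work over the pool streams, then wait on
// every stream (default plus pool), which is what streamSynchronizeAll does.
void multi_stream_example (amrex::Real* p, int nboxes, int npts_per_box)
{
    for (int b = 0; b < nboxes; ++b) {
        amrex::Gpu::Device::setStreamIndex(b);   // MFIter selects streams like this
        amrex::Real* pb = p + std::size_t(b)*npts_per_box;
        amrex::ParallelFor(npts_per_box, [=] AMREX_GPU_DEVICE (int i) noexcept {
            pb[i] = amrex::Real(1.0);
        });
    }
    amrex::Gpu::Device::resetStreamIndex();

    // streamSynchronize() would only drain the current stream; several pool
    // streams still have pending kernels, so wait on all of them.
    amrex::Gpu::streamSynchronizeAll();
}
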
Gpu::Device::streamQueue(); @@ -203,8 +196,6 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -269,8 +260,6 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -331,11 +320,9 @@ void ParallelForRNG (T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); - auto& q = Gpu::Device::nullQueue(); + auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { q.submit([&] (sycl::handler& h) { @@ -370,11 +357,9 @@ void ParallelForRNG (Box const& box, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); - auto& q = Gpu::Device::nullQueue(); + auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { q.submit([&] (sycl::handler& h) { @@ -416,8 +401,6 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); @@ -470,8 +453,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b const auto len1x = len1.x; const auto len2x = len2.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -533,8 +514,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len2x = len2.x; const auto len3x = len3.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -602,8 +581,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, 
const auto len1x = len1.x; const auto len2x = len2.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -673,8 +650,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len2x = len2.x; const auto len3x = len3.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -897,7 +872,7 @@ ParallelForRNG (T n, L&& f) noexcept randState_t* rand_state = getRandState(); const auto ec = Gpu::ExecutionConfig(n); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -905,6 +880,7 @@ ParallelForRNG (T n, L&& f) noexcept f(i,engine); } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } @@ -921,7 +897,7 @@ ParallelForRNG (Box const& box, L&& f) noexcept const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -935,6 +911,7 @@ ParallelForRNG (Box const& box, L&& f) noexcept f(i,j,k,engine); } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } @@ -951,7 +928,7 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -967,6 +944,7 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept } } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H index 827ee2265d0..89aa1f24bc9 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -9,7 +9,6 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -68,7 +67,6 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = 
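
For reference, a small usage sketch (not part of the patch; fill_random is a made-up helper) of ParallelForRNG, whose CUDA/HIP form above now launches on the current stream and then stream-synchronizes so that two streams never touch the device RNG states at the same time:

#include <AMReX_Gpu.H>
#include <AMReX_GpuContainers.H>
#include <AMReX_Random.H>

// Hypothetical helper: fill a device vector with uniform random numbers.
void fill_random (amrex::Gpu::DeviceVector<amrex::Real>& v)
{
    amrex::Real* p = v.data();
    amrex::ParallelForRNG(static_cast<int>(v.size()),
        [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept
        {
            p[i] = amrex::Random(engine);
        });
    // With this patch the RNG launch is followed by Gpu::streamSynchronize()
    // internally, so the per-thread RNG states are free again at this point.
}
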
amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -151,7 +149,6 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -243,7 +240,6 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -298,7 +294,6 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -371,7 +366,6 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ diff --git a/Src/Base/AMReX_GpuUtility.cpp b/Src/Base/AMReX_GpuUtility.cpp index 60f73265d35..d40a1aef31b 100644 --- a/Src/Base/AMReX_GpuUtility.cpp +++ b/Src/Base/AMReX_GpuUtility.cpp @@ -38,6 +38,12 @@ StreamIter::init() noexcept amrex::ignore_unused(m_threadsafe); amrex::ignore_unused(m_sync); #if defined(AMREX_USE_GPU) + if (m_sync) { +#ifdef AMREX_USE_OMP +#pragma omp single +#endif + Gpu::streamSynchronize(); + } Gpu::Device::setStreamIndex(m_i); #elif defined(AMREX_USE_OMP) int nthreads = omp_get_num_threads(); @@ -59,7 +65,7 @@ StreamIter::init() noexcept StreamIter::~StreamIter () { #ifdef AMREX_USE_GPU if (m_sync) { - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); } AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); @@ -79,4 +85,3 @@ StreamIter::operator++ () noexcept #endif }} - diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index c0ad9e8ce87..e8a97256d3d 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -222,7 +222,7 @@ MFIter::~MFIter () #endif #ifdef AMREX_USE_GPU - if (device_sync) Gpu::synchronize(); + if (device_sync) Gpu::streamSynchronizeAll(); #endif #ifdef AMREX_USE_GPU @@ -251,6 +251,15 @@ MFIter::Initialize () "Nested or multiple active MFIters is not supported by default. 
This can be changed by calling MFIter::allowMultipleMFIters(true)".); } +#ifdef AMREX_USE_GPU + if (device_sync) { +#ifdef AMREX_USE_OMP +#pragma omp single +#endif + Gpu::streamSynchronize(); + } +#endif + if (flags & AllBoxes) // a very special case { index_map = &(fabArray.IndexArray()); diff --git a/Src/Base/AMReX_NonLocalBCImpl.H b/Src/Base/AMReX_NonLocalBCImpl.H index dbae5ff9b9f..d2ac3b5a41c 100644 --- a/Src/Base/AMReX_NonLocalBCImpl.H +++ b/Src/Base/AMReX_NonLocalBCImpl.H @@ -296,7 +296,7 @@ unpack_recv_buffer_gpu (FabArray& mf, int scomp, int ncomp, szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; pbuffer = (char*)The_Arena()->alloc(szbuffer); Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -327,7 +327,7 @@ unpack_recv_buffer_gpu (FabArray& mf, int scomp, int ncomp, tag.dfab(i,j,k,scomp+n) = proj(tag.sfab, si ,n); }); - // There is Gpu::synchronize in ParallelFor above + // There is Gpu::streamSynchronize in ParallelFor above if (pbuffer != recv_data[0]) { The_Arena()->free(pbuffer); diff --git a/Src/Base/AMReX_Partition.H b/Src/Base/AMReX_Partition.H index b36a1a24273..bff8156537d 100644 --- a/Src/Base/AMReX_Partition.H +++ b/Src/Base/AMReX_Partition.H @@ -51,7 +51,7 @@ namespace detail { amrex::Swap(p[i], p[n2-1-i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } } diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index fce6096f714..b02ad2f1e8d 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -40,7 +40,7 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) #ifdef AMREX_USE_DPCPP rand_engine_descr = new dpcpp_rng_descr - (Gpu::Device::nullQueue(), sycl::range<1>(N), gpu_seed, 1); + (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -54,7 +54,7 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) }); #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } } #endif @@ -182,7 +182,7 @@ amrex::DeallocateRandomSeedDevArray () #ifdef AMREX_USE_DPCPP if (rand_engine_descr) { delete rand_engine_descr; - Gpu::synchronize(); + Gpu::streamSynchronize(); rand_engine_descr = nullptr; } #else diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 57145322033..9c07b7b4a2a 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -813,7 +813,7 @@ bool AnyOf (N n, T const* v, P&& pred) #ifdef AMREX_USE_DPCPP const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::nullStream(), + amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); if (gh.threadIdx() == 0) { *has_any = *dp; } @@ -834,7 +834,7 @@ bool AnyOf (N n, T const* v, P&& pred) } }); #else - amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, 0, + amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { __shared__ int has_any; if (threadIdx.x == 0) has_any = *dp; @@ -874,7 +874,7 @@ bool AnyOf (Box const& box, P&& pred) #ifdef AMREX_USE_DPCPP const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; const std::size_t shared_mem_bytes = num_ints*sizeof(int); - 
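
A short sketch (illustrative; set_to_one is a made-up name) of a standard MFIter loop under the reworked synchronization shown above: MFIter::Initialize now stream-synchronizes once up front, each iteration runs on a pool stream, and the destructor calls Gpu::streamSynchronizeAll() when device_sync is enabled.

#include <AMReX_MultiFab.H>
#include <AMReX_Gpu.H>

// Hypothetical helper: set every valid cell of a MultiFab to one on the GPU.
void set_to_one (amrex::MultiFab& mf)
{
    for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
        const amrex::Box& bx = mfi.validbox();
        auto const& a = mf.array(mfi);
        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = amrex::Real(1.0);
        });
    }
    // ~MFIter has already waited on all streams here (streamSynchronizeAll).
}
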
amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::nullStream(), + amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); if (gh.threadIdx() == 0) { *has_any = *dp; } @@ -899,7 +899,7 @@ bool AnyOf (Box const& box, P&& pred) } }); #else - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, 0, + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { __shared__ int has_any; if (threadIdx.x == 0) has_any = *dp; diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index fe5502583a3..be8af6d06b2 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -215,7 +215,7 @@ ParallelFor_doit (Vector const& tags, F && f) #endif }); - Gpu::synchronize(); + Gpu::streamSynchronize(); The_Pinned_Arena()->free(h_buffer); The_Arena()->free(d_buffer); } diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 386a9e9c338..7302c4c4f1d 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -97,7 +97,7 @@ TinyProfiler::start () noexcept #ifdef AMREX_USE_GPU if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #endif @@ -189,7 +189,7 @@ TinyProfiler::stop () noexcept #ifdef AMREX_USE_GPU if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #endif @@ -273,7 +273,7 @@ TinyProfiler::stop (unsigned boxUintID) noexcept } if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #ifdef AMREX_USE_CUDA diff --git a/Src/Base/AMReX_VisMF.cpp b/Src/Base/AMReX_VisMF.cpp index 9a99efa7446..c4983276d02 100644 --- a/Src/Base/AMReX_VisMF.cpp +++ b/Src/Base/AMReX_VisMF.cpp @@ -1022,7 +1022,7 @@ VisMF::Write (const FabArray& mf, fio.write_header(hss, fab, fab.nComp()); hLength = static_cast(hss.tellp()); auto tstr = hss.str(); - memcpy(afPtr, tstr.c_str(), hLength); // ---- the fab header + std::memcpy(afPtr, tstr.c_str(), hLength); // ---- the fab header } Real const* fabdata = fab.dataPtr(); #ifdef AMREX_USE_GPU @@ -1721,7 +1721,7 @@ VisMF::Read (FabArray &mf, RealDescriptor::convertToNativeFormat(fabdata, readDataItems, afPtr, hdr.m_writtenRD); } else { - memcpy(fabdata, afPtr, fab.nBytes()); + std::memcpy(fabdata, afPtr, fab.nBytes()); } currentOffset += readDataItems * hdr.m_writtenRD.numBytes(); #ifdef AMREX_USE_GPU @@ -2342,7 +2342,7 @@ VisMF::AsyncWriteDoit (const FabArray& mf, const std::string& mf_name if (strip_ghost) { new_fab.copy(mf[mfi], bx); } else { - Gpu::dtoh_memcpy(new_fab.dataPtr(), mf[mfi].dataPtr(), new_fab.size()*sizeof(Real)); + Gpu::dtoh_memcpy_async(new_fab.dataPtr(), mf[mfi].dataPtr(), new_fab.size()*sizeof(Real)); } } else #endif diff --git a/Src/EB/AMReX_EBMultiFabUtil.cpp b/Src/EB/AMReX_EBMultiFabUtil.cpp index fc80d1b4e6a..f215c2a1ae0 100644 --- a/Src/EB/AMReX_EBMultiFabUtil.cpp +++ b/Src/EB/AMReX_EBMultiFabUtil.cpp @@ -939,7 +939,6 @@ EB_interp_CC_to_FaceCentroid (const MultiFab& cc, { Gpu::copy(Gpu::hostToDevice, a_bcs.begin(), a_bcs.begin()+ncomp, dv_bcs.begin()); d_bcs = dv_bcs.dataPtr(); - Gpu::synchronize(); } else #endif @@ -1078,7 +1077,6 @@ EB_interp_CellCentroid_to_FaceCentroid (const MultiFab& phi_centroid, { Gpu::copy(Gpu::hostToDevice, a_bcs.begin(), a_bcs.begin()+ncomp, dv_bcs.begin()); 
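
The AnyOf hunks above move the launch from the old null stream to the current stream. A usage sketch for reference, matching the call pattern exercised in Tests/GPU/AnyOf later in this patch (has_negative is a hypothetical helper, not from the source):

#include <AMReX_Reduce.H>
#include <AMReX_GpuContainers.H>

// Hypothetical helper: true if any element of the device vector is negative.
bool has_negative (amrex::Gpu::DeviceVector<amrex::Real> const& v)
{
    return amrex::Reduce::AnyOf(static_cast<int>(v.size()), v.data(),
        [=] AMREX_GPU_DEVICE (amrex::Real x) noexcept -> int
        {
            return x < amrex::Real(0.0);
        });
}
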
d_bcs = dv_bcs.dataPtr(); - Gpu::synchronize(); } else #endif diff --git a/Src/EB/AMReX_EB_utils.cpp b/Src/EB/AMReX_EB_utils.cpp index 1bcdfe08e65..035375cbda5 100644 --- a/Src/EB/AMReX_EB_utils.cpp +++ b/Src/EB/AMReX_EB_utils.cpp @@ -213,7 +213,7 @@ namespace amrex { div(i,j,k,icomp+n) = divc(i,j,k,n) + optmp(i,j,k,n); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } // @@ -658,7 +658,7 @@ void FillSignedDistance (MultiFab& mf, EB2::Level const& ls_lev, fab(i,j,k) = (-fluid_sign) * usd; } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } else { amrex::ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept diff --git a/Src/Extern/HDF5/AMReX_ParticleHDF5.H b/Src/Extern/HDF5/AMReX_ParticleHDF5.H index d4585046ceb..d861a3684d8 100644 --- a/Src/Extern/HDF5/AMReX_ParticleHDF5.H +++ b/Src/Extern/HDF5/AMReX_ParticleHDF5.H @@ -1477,21 +1477,21 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H index 4716b5b28c6..4270000ba84 100644 --- a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H +++ b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H @@ -201,7 +201,7 @@ void WriteHDF5ParticleDataSync (PC const& pc, } } - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); if(pc.GetUsePrePost()) { diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp index 489f6f3f5ef..6f3d03fde03 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp @@ -675,7 +675,7 @@ HypreABecLap3::loadVectors (MultiFab& soln, const MultiFab& rhs) bp[0] = Real(0.0); } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } HYPRE_IJVectorSetValues(b, nrows, cell_id_vec[mfi].dataPtr(), rhs_diag[mfi].dataPtr()); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index af53bd1c0e8..8f6921950e7 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -1312,7 +1312,7 @@ MLCellLinOp::BndryCondLoc::setLOBndryConds (const Geometry& geom, const Real* dx } } Gpu::copyAsync(Gpu::hostToDevice, hv.begin(), hv.end(), 
bctl_dv.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); } void diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index bafd1860967..cfa7595b515 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -302,7 +302,7 @@ MLEBNodeFDLaplacian::prepareForSolve () }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); #if (AMREX_SPACEDIM == 2) if (m_rz) { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp index 543800f8286..eb21e4f2731 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp @@ -323,7 +323,7 @@ MLNodeLaplacian::restriction (int amrlev, int cmglev, MultiFab& crse, MultiFab& mlndlap_restriction_rap(i,j,k,pcrse_ma[box_no],fine_ma[box_no],st_ma[box_no],msk_ma[box_no]); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { @@ -451,7 +451,7 @@ MLNodeLaplacian::interpolation (int amrlev, int fmglev, MultiFab& fine, const Mu mlndlap_semi_interpadd_aa(i, j, k, fine_ma[box_no], crse_ma[box_no], sig_ma[box_no], msk_ma[box_no], idir); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp index e4a3bb65cc8..df5ab489d2f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp @@ -248,7 +248,7 @@ MLNodeLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i #endif }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { @@ -410,7 +410,7 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } - Gpu::synchronize(); + Gpu::streamSynchronize(); if (m_smooth_num_sweeps > 1) nodalSync(amrlev, mglev, sol); } else // cpu diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp index dcd7dea627c..e3caf4ecced 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp @@ -359,7 +359,7 @@ MLNodeLaplacian::buildStencil () }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); // This is only needed at the bottom. 
m_s0_norm0[0].back() = m_stencil[0].back()->norm0(0,0) * m_normalization_threshold; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp index 065ae048420..9f8b01a8ce9 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp @@ -222,7 +222,7 @@ MLNodeTensorLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const Multi { mlndtslap_adotx(i,j,k, out_a[box_no], in_a[box_no], dmsk_a[box_no], s); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } @@ -264,7 +264,7 @@ MLNodeTensorLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const Mult mlndtslap_gauss_seidel(i, j, k, sol_a[box_no], rhs_a[box_no], dmsk_a[box_no], s); } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } diff --git a/Src/Particle/AMReX_DenseBins.H b/Src/Particle/AMReX_DenseBins.H index e5e35f0e252..93c9415ad25 100644 --- a/Src/Particle/AMReX_DenseBins.H +++ b/Src/Particle/AMReX_DenseBins.H @@ -217,7 +217,7 @@ public: Gpu::exclusive_scan(m_counts.begin(), m_counts.end(), m_offsets.begin()); - Gpu::copy(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin()); + Gpu::copyAsync(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin()); index_type* pperm = m_perm.dataPtr(); constexpr index_type max_index = std::numeric_limits::max(); diff --git a/Src/Particle/AMReX_NeighborList.H b/Src/Particle/AMReX_NeighborList.H index 490d6a2b562..a015bede0d7 100644 --- a/Src/Particle/AMReX_NeighborList.H +++ b/Src/Particle/AMReX_NeighborList.H @@ -334,8 +334,9 @@ public: Gpu::HostVector host_nbor_offsets(m_nbor_offsets.size()); Gpu::HostVector host_nbor_list(m_nbor_list.size()); - Gpu::copy(Gpu::deviceToHost, m_nbor_offsets.begin(), m_nbor_offsets.end(), host_nbor_offsets.begin()); - Gpu::copy(Gpu::deviceToHost, m_nbor_list.begin(), m_nbor_list.end(), host_nbor_list.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_nbor_offsets.begin(), m_nbor_offsets.end(), host_nbor_offsets.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_nbor_list.begin(), m_nbor_list.end(), host_nbor_list.begin()); + Gpu::streamSynchronize(); for (int i = 0; i < numParticles(); ++i) { amrex::Print() << "Particle " << i << " could collide with: "; diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H index 10590fc400b..066318198fd 100644 --- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H +++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H @@ -101,15 +101,13 @@ buildNeighborMask () } m_code_array[grid].resize(h_code_arr.size()); - Gpu::copy(Gpu::hostToDevice, h_code_arr.begin(), h_code_arr.end(), + Gpu::copyAsync(Gpu::hostToDevice, h_code_arr.begin(), h_code_arr.end(), m_code_array[grid].begin()); m_isec_boxes[grid].resize(h_isec_boxes.size()); - Gpu::copy(Gpu::hostToDevice, h_isec_boxes.begin(), h_isec_boxes.end(), + Gpu::copyAsync(Gpu::hostToDevice, h_isec_boxes.begin(), h_isec_boxes.end(), m_isec_boxes[grid].begin()); - - - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); } RemoveDuplicates(neighbor_procs); @@ -186,7 +184,8 @@ buildNeighborCopyOp (bool use_boundary_neighbor) amrex::Gpu::exclusive_scan(counts.begin(), counts.end(), offsets.begin()); int num_copies; - Gpu::dtoh_memcpy(&num_copies, offsets.data()+np, sizeof(int)); + Gpu::dtoh_memcpy_async(&num_copies, offsets.data()+np, sizeof(int)); + Gpu::streamSynchronize(); neighbor_copy_op.resize(gid, lev, num_copies); @@ -267,11 +266,11 @@ 
updateNeighborsGPU (bool boundary_neighbors_only) } else { - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); pinned_snd_buffer.resize(snd_buffer.size()); Gpu::dtoh_memcpy_async(pinned_snd_buffer.dataPtr(), snd_buffer.dataPtr(), snd_buffer.size()); neighbor_copy_plan.buildMPIFinish(this->BufferMap()); - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); communicateParticlesStart(*this, neighbor_copy_plan, pinned_snd_buffer, pinned_rcv_buffer); rcv_buffer.resize(pinned_rcv_buffer.size()); unpackBuffer(*this, neighbor_copy_plan, snd_buffer, NeighborUnpackPolicy()); @@ -280,7 +279,7 @@ updateNeighborsGPU (bool boundary_neighbors_only) unpackRemotes(*this, neighbor_copy_plan, rcv_buffer, NeighborUnpackPolicy()); } - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); } template diff --git a/Src/Particle/AMReX_ParticleBufferMap.cpp b/Src/Particle/AMReX_ParticleBufferMap.cpp index 0921aa69056..693db092b39 100644 --- a/Src/Particle/AMReX_ParticleBufferMap.cpp +++ b/Src/Particle/AMReX_ParticleBufferMap.cpp @@ -104,9 +104,10 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb) d_lev_offsets.resize(0); d_lev_offsets.resize(m_lev_offsets.size()); - Gpu::copy(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin()); - Gpu::copy(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin()); - Gpu::copy(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin()); + Gpu::streamSynchronize(); } bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb) const diff --git a/Src/Particle/AMReX_ParticleCommunication.H b/Src/Particle/AMReX_ParticleCommunication.H index 3e2b15027df..a9b169cb21c 100644 --- a/Src/Particle/AMReX_ParticleCommunication.H +++ b/Src/Particle/AMReX_ParticleCommunication.H @@ -186,22 +186,24 @@ struct ParticleCopyPlan m_box_offsets.begin()); m_box_counts_h.resize(m_box_counts_d.size()); - Gpu::copy(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(), - m_box_counts_h.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(), + m_box_counts_h.begin()); m_snd_pad_correction_h.resize(0); m_snd_pad_correction_h.resize(ParallelContext::NProcsSub()+1, 0); m_snd_pad_correction_d.resize(m_snd_pad_correction_h.size()); - Gpu::copy(Gpu::hostToDevice, m_snd_pad_correction_h.begin(), m_snd_pad_correction_h.end(), - m_snd_pad_correction_d.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_snd_pad_correction_h.begin(), m_snd_pad_correction_h.end(), + m_snd_pad_correction_d.begin()); d_int_comp_mask.resize(int_comp_mask.size()); - Gpu::copy(Gpu::hostToDevice, int_comp_mask.begin(), int_comp_mask.end(), - d_int_comp_mask.begin()); + Gpu::copyAsync(Gpu::hostToDevice, int_comp_mask.begin(), int_comp_mask.end(), + d_int_comp_mask.begin()); d_real_comp_mask.resize(real_comp_mask.size()); - Gpu::copy(Gpu::hostToDevice, real_comp_mask.begin(), real_comp_mask.end(), - d_real_comp_mask.begin()); + Gpu::copyAsync(Gpu::hostToDevice, real_comp_mask.begin(), real_comp_mask.end(), + d_real_comp_mask.begin()); + + Gpu::streamSynchronize(); int NStructReal = PC::ParticleContainerType::NStructReal; int NStructInt = 
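
The ParticleBufferMap and ParticleCopyPlan hunks above follow the pattern this patch applies throughout the particle code: queue every host-device transfer with Gpu::copyAsync on the current stream and pay for a single Gpu::streamSynchronize() at the end of the batch. A condensed sketch with hypothetical helper and variable names:

#include <AMReX_Gpu.H>
#include <AMReX_GpuContainers.H>

// Hypothetical helper: upload two host containers with one synchronization.
void upload_batch (amrex::Gpu::HostVector<int> const& h_a,
                   amrex::Gpu::HostVector<int> const& h_b,
                   amrex::Gpu::DeviceVector<int>& d_a,
                   amrex::Gpu::DeviceVector<int>& d_b)
{
    d_a.resize(h_a.size());
    d_b.resize(h_b.size());
    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_a.begin(), h_a.end(), d_a.begin());
    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_b.begin(), h_b.end(), d_b.begin());
    amrex::Gpu::streamSynchronize();   // one wait for the whole batch
}
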
PC::ParticleContainerType::NStructInt; @@ -572,7 +574,7 @@ void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, Un Vector offsets; policy.resizeTiles(tiles, sizes, offsets); - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); int uindex = 0; int procindex = 0, rproc = plan.m_rcv_box_pids[0]; for (int i = 0, N = plan.m_rcv_box_counts.size(); i < N; ++i) @@ -604,7 +606,7 @@ void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, Un p_comm_real, p_comm_int); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } #else diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index cd6ae4c6a3d..82832e6e09e 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -864,7 +864,7 @@ ParticleContainer }); last_offset+=next_offset; - Gpu::synchronize(); + Gpu::streamSynchronize(); } // last_offset should equal virts.numParticles() @@ -876,10 +876,10 @@ ParticleContainer auto np = src_tile.numParticles(); virts.resize(virts_offset+np); virts_offset += filterAndTransformParticles(virts, src_tile, FilterVirt(assign_buffer_grid,plo,dxi,domain), TransformerVirt(),0,virts_offset); - Gpu::synchronize(); + Gpu::streamSynchronize(); } - //Resize implicitly includes a Gpu::synchronize() virts.resize(virts_offset); + Gpu::streamSynchronize(); } } @@ -966,8 +966,8 @@ ParticleContainer ghosts.resize(ghost_offset+np); ghost_offset += filterAndTransformParticles(ghosts, src_tile, AssignGridFilter(assign_grid,gid,level,nGrow), TransformerGhost(),0,ghost_offset); } - //Resize implicitly includes a Gpu::synchronize() ghosts.resize(ghost_offset); + Gpu::streamSynchronize(); } template ::So dst[i] = src.m_aos[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetArrayOfStructs()().swap(tmp_particles); } @@ -1142,7 +1142,7 @@ ParticleContainer::So dst[i] = src[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real); } @@ -1156,7 +1156,7 @@ ParticleContainer::So dst[i] = src[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetStructOfArrays().GetIntData(comp).swap(tmp_int); } @@ -1332,13 +1332,13 @@ ParticleContainer } else { - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); Gpu::PinnedVector pinned_snd_buffer; Gpu::PinnedVector pinned_rcv_buffer; pinned_snd_buffer.resize(snd_buffer.size()); Gpu::dtoh_memcpy_async(pinned_snd_buffer.dataPtr(), snd_buffer.dataPtr(), snd_buffer.size()); plan.buildMPIFinish(BufferMap()); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); communicateParticlesStart(*this, plan, pinned_snd_buffer, pinned_rcv_buffer); rcv_buffer.resize(pinned_rcv_buffer.size()); unpackBuffer(*this, plan, snd_buffer, RedistributeUnpackPolicy()); @@ -1347,7 +1347,7 @@ ParticleContainer unpackRemotes(*this, plan, rcv_buffer, RedistributeUnpackPolicy()); } - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); AMREX_ASSERT(numParticlesOutOfRange(*this, lev_min, lev_max, nGrow) == 0); #else amrex::ignore_unused(lev_min,lev_max,nGrow,local,remove_negative); @@ -1954,22 +1954,22 @@ RedistributeMPI (std::map >& not_ours, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int 
i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H index 9beb86feb8a..cedbdb329c4 100644 --- a/Src/Particle/AMReX_ParticleIO.H +++ b/Src/Particle/AMReX_ParticleIO.H @@ -995,21 +995,21 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Particle/AMReX_ParticleInit.H b/Src/Particle/AMReX_ParticleInit.H index 0f50370c4a1..ee8afc778e2 100644 --- a/Src/Particle/AMReX_ParticleInit.H +++ b/Src/Particle/AMReX_ParticleInit.H @@ -330,18 +330,19 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); if((host_real_attribs[lev][std::make_pair(grid, tile)]).size() > (long unsigned int) NArrayReal) for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); - } + Gpu::copyAsync(Gpu::hostToDevice, + 
host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + } } } + Gpu::streamSynchronize(); Redistribute(); } @@ -402,17 +403,18 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } } } + Gpu::streamSynchronize(); Redistribute(); @@ -836,10 +838,11 @@ InitFromBinaryFile (const std::string& file, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); } } + Gpu::streamSynchronize(); Redistribute(); @@ -1093,24 +1096,25 @@ InitRandom (Long icount, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } + Gpu::streamSynchronize(); AMREX_ASSERT(OK()); } @@ -1197,25 +1201,27 @@ InitRandom (Long icount, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - 
host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } - // Let Redistribute() sort out where the particles beLong. + Gpu::streamSynchronize(); + + // Let Redistribute() sort out where the particles belong. Redistribute(); } @@ -1399,7 +1405,7 @@ InitOnePerCell (Real x_off, Real y_off, Real z_off, const ParticleInitData& pdat m_particles[0][ind].resize(ptile_tmp.numParticles()); amrex::copyParticles(m_particles[0][ind], ptile_tmp); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); } Redistribute(); @@ -1521,27 +1527,27 @@ InitNRandomPerCell (int n_per_cell, const ParticleInitData& pdata) auto new_size = old_size + src_tid.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tid.begin(), src_tid.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tid.begin(), src_tid.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), - host_real_attribs[host_lev][std::make_pair(gid,tid)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), + host_real_attribs[host_lev][std::make_pair(gid,tid)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), - host_int_attribs[host_lev][std::make_pair(gid,tid)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), + host_int_attribs[host_lev][std::make_pair(gid,tid)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } - + Gpu::streamSynchronize(); } if (m_verbose > 1) diff --git a/Src/Particle/AMReX_ParticleLocator.H b/Src/Particle/AMReX_ParticleLocator.H index 4d5f2c83e14..f8e3da93163 100644 --- a/Src/Particle/AMReX_ParticleLocator.H +++ b/Src/Particle/AMReX_ParticleLocator.H @@ -116,7 +116,7 @@ public: for (int i = 0; i < num_boxes; ++i) m_host_boxes.push_back(ba[i]); m_device_boxes.resize(num_boxes); - Gpu::copy(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin()); // compute the lo, hi and the max box size in each direction ReduceOps)*num_levels); - Gpu::synchronize(); + Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(), + 
sizeof(AssignGrid)*num_levels); + Gpu::streamSynchronize(); #else for (int lev = 0; lev < num_levels; ++lev) { @@ -329,9 +329,9 @@ public: m_locators[lev].setGeometry(a_gdb->Geom(lev)); h_grid_assignors[lev] = m_locators[lev].getGridAssignor(); } - Gpu::htod_memcpy(m_grid_assignors.data(), h_grid_assignors.data(), - sizeof(AssignGrid)*num_levels); - Gpu::synchronize(); + Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(), + sizeof(AssignGrid)*num_levels); + Gpu::streamSynchronize(); #else for (int lev = 0; lev < num_levels; ++lev) { diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H index 1bb34058491..4374197d335 100644 --- a/Src/Particle/AMReX_ParticleTile.H +++ b/Src/Particle/AMReX_ParticleTile.H @@ -583,7 +583,7 @@ struct ParticleTile #ifdef AMREX_USE_GPU if ((h_runtime_r_ptrs.size() > 0) || (h_runtime_i_ptrs.size() > 0)) { - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -638,7 +638,7 @@ struct ParticleTile #ifdef AMREX_USE_GPU if ((h_runtime_r_cptrs.size() > 0) || (h_runtime_i_cptrs.size() > 0)) { - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif diff --git a/Src/Particle/AMReX_ParticleTransformation.H b/Src/Particle/AMReX_ParticleTransformation.H index 95e4a747ae2..8fe57749c4a 100644 --- a/Src/Particle/AMReX_ParticleTransformation.H +++ b/Src/Particle/AMReX_ParticleTransformation.H @@ -162,7 +162,7 @@ void copyParticles (DstTile& dst, const SrcTile& src, copyParticle(dst_data, src_data, src_start+i, dst_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -216,7 +216,7 @@ void transformParticles (DstTile& dst, const SrcTile& src, f(dst_data, src_data, src_start+i, dst_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -277,7 +277,7 @@ void transformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& src, f(dst1_data, dst2_data, src_data, src_start+i, dst1_start+i, dst2_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -338,7 +338,7 @@ Index filterParticles (DstTile& dst, const SrcTile& src, const Index* mask, if (mask[i]) copyParticle(dst_data, src_data, src_start+i, dst_start+p_offsets[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -434,7 +434,7 @@ Index filterAndTransformParticles (DstTile& dst, const SrcTile& src, Index* mask dst_start+p_offsets[src_start+i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -522,7 +522,7 @@ Index filterAndTransformParticles (DstTile1& dst1, DstTile2& dst2, if (mask[i]) f(dst_data1, dst_data2, src_data, i, p_offsets[i], p_offsets[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -627,7 +627,7 @@ void gatherParticles (PTile& dst, const PTile& src, N np, const Index* inds) copyParticle(dst_data, src_data, inds[i], i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -657,7 +657,7 @@ void scatterParticles (PTile& dst, const PTile& src, N np, const Index* inds) copyParticle(dst_data, src_data, i, inds[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } diff --git a/Src/Particle/AMReX_SparseBins.H b/Src/Particle/AMReX_SparseBins.H index 93635784822..d9743c8282e 100644 --- a/Src/Particle/AMReX_SparseBins.H +++ b/Src/Particle/AMReX_SparseBins.H @@ -139,13 +139,15 @@ public: } m_bins.resize(host_bins.size()); - Gpu::copy(Gpu::hostToDevice, host_bins.begin(), host_bins.end(), m_bins.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_bins.begin(), 
host_bins.end(), m_bins.begin()); m_offsets.resize(host_offsets.size()); - Gpu::copy(Gpu::hostToDevice, host_offsets.begin(), host_offsets.end(), m_offsets.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_offsets.begin(), host_offsets.end(), m_offsets.begin()); m_perm.resize(host_perm.size()); - Gpu::copy(Gpu::hostToDevice, host_perm.begin(), host_perm.end(), m_perm.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_perm.begin(), host_perm.end(), m_perm.begin()); + + Gpu::streamSynchronize(); } //! \brief the number of items in the container diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H index 2d09b114934..5a71d015763 100644 --- a/Src/Particle/AMReX_WriteBinaryParticleData.H +++ b/Src/Particle/AMReX_WriteBinaryParticleData.H @@ -184,11 +184,11 @@ packIOData (Vector& idata, Vector& rdata, const PC& pc, int l typename PC::IntVector write_int_comp_d(write_int_comp.size()); typename PC::IntVector write_real_comp_d(write_real_comp.size()); - Gpu::copy(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(), - write_int_comp_d.begin()); - Gpu::copy(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(), - write_real_comp_d.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(), + write_int_comp_d.begin()); + Gpu::copyAsync(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(), + write_real_comp_d.begin()); + Gpu::Device::streamSynchronize(); const auto write_int_comp_d_ptr = write_int_comp_d.data(); const auto write_real_comp_d_ptr = write_real_comp_d.data(); @@ -243,9 +243,9 @@ packIOData (Vector& idata, Vector& rdata, const PC& pc, int l poffset += num_copies; } - Gpu::copy(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin()); - Gpu::copy(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin()); + Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin()); + Gpu::Device::streamSynchronize(); } template @@ -375,7 +375,7 @@ void WriteBinaryParticleDataSync (PC const& pc, } } - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); if(pc.GetUsePrePost()) { diff --git a/Tests/EB_CNS/Source/CNS.cpp b/Tests/EB_CNS/Source/CNS.cpp index 9e6d78716de..ab289e557f8 100644 --- a/Tests/EB_CNS/Source/CNS.cpp +++ b/Tests/EB_CNS/Source/CNS.cpp @@ -340,7 +340,7 @@ CNS::errorEst (TagBoxArray& tags, int, int, Real /*time*/, int, int) } } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } if (level < refine_max_dengrad_lev) @@ -361,7 +361,7 @@ CNS::errorEst (TagBoxArray& tags, int, int, Real /*time*/, int, int) { cns_tag_denerror(i, j, k, tagma[box_no], rhoma[box_no], dengrad_threshold, tagval); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } @@ -424,7 +424,7 @@ CNS::read_params () if (!refine_boxes.empty()) { #ifdef AMREX_USE_GPU dp_refine_boxes = (RealBox*)The_Arena()->alloc(sizeof(RealBox)*refine_boxes.size()); - Gpu::htod_memcpy(dp_refine_boxes, refine_boxes.data(), sizeof(RealBox)*refine_boxes.size()); + Gpu::htod_memcpy_async(dp_refine_boxes, refine_boxes.data(), sizeof(RealBox)*refine_boxes.size()); #else dp_refine_boxes = refine_boxes.data(); #endif @@ -439,7 +439,7 @@ CNS::read_params () pp.query("T_S" , h_parm->T_S); h_parm->Initialize(); - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_parm, h_parm+1, d_parm); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_parm, 
h_parm+1, d_parm); // eb_weights_type: // 0 -- weights = 1 @@ -447,12 +447,16 @@ CNS::read_params () // 2 -- use_mass_as_eb_weights // 3 -- use_volfrac_as_eb_weights pp.query("eb_weights_type", eb_weights_type); - if (eb_weights_type < 0 || eb_weights_type > 3) + if (eb_weights_type < 0 || eb_weights_type > 3) { amrex::Abort("CNS: eb_weights_type must be 0,1,2 or 3"); + } pp.query("do_reredistribution", do_reredistribution); - if (do_reredistribution != 0 && do_reredistribution != 1) + if (do_reredistribution != 0 && do_reredistribution != 1) { amrex::Abort("CNS: do_reredistibution must be 0 or 1"); + } + + amrex::Gpu::streamSynchronize(); } void diff --git a/Tests/GPU/AnyOf/main.cpp b/Tests/GPU/AnyOf/main.cpp index 00ddb27c13a..20ee03d3982 100644 --- a/Tests/GPU/AnyOf/main.cpp +++ b/Tests/GPU/AnyOf/main.cpp @@ -54,7 +54,7 @@ void main_main () { BL_PROFILE("Vector AnyOf"); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); bool anyof_M = Reduce::AnyOf(nitem, vec.dataPtr(), [=] AMREX_GPU_DEVICE (Real item) noexcept -> int diff --git a/Tests/GPU/AtomicIf/main.cpp b/Tests/GPU/AtomicIf/main.cpp index 379efcdfdaf..843ae313a95 100644 --- a/Tests/GPU/AtomicIf/main.cpp +++ b/Tests/GPU/AtomicIf/main.cpp @@ -53,11 +53,11 @@ void test () }); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); std::vector v_h(N); Gpu::copyAsync(Gpu::deviceToHost, v_d.begin(), v_d.end(), v_h.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); // The first 4000 entries should all be 0.0 for (int i = 0; i < 4000; ++i) { @@ -81,4 +81,3 @@ int main (int argc, char* argv[]) test(); amrex::Finalize(); } - diff --git a/Tests/GPU/RandomNumberGeneration/main.cpp b/Tests/GPU/RandomNumberGeneration/main.cpp index 5eb913ab0ee..66746cc0d83 100644 --- a/Tests/GPU/RandomNumberGeneration/main.cpp +++ b/Tests/GPU/RandomNumberGeneration/main.cpp @@ -45,7 +45,7 @@ void RandomNumGen () z_d_ptr[i] = amrex::Random(engine); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } std::vector x_h(Ndraw); @@ -54,7 +54,7 @@ void RandomNumGen () Gpu::copyAsync(Gpu::deviceToHost, x_d.begin(), x_d.end(), x_h.begin()); Gpu::copyAsync(Gpu::deviceToHost, y_d.begin(), y_d.end(), y_h.begin()); Gpu::copyAsync(Gpu::deviceToHost, z_d.begin(), z_d.end(), z_h.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); Real xmean=0., ymean=0., zmean=0., xvar=0., yvar=0., zvar=0.; for (int i = 0; i < Ndraw; ++i) { diff --git a/Tests/GPU/Vector/main.cpp b/Tests/GPU/Vector/main.cpp index 00cec25474a..cfa44437fb5 100644 --- a/Tests/GPU/Vector/main.cpp +++ b/Tests/GPU/Vector/main.cpp @@ -94,7 +94,7 @@ void async_test() amrex::Print() << "Async Synching -- should print second." 
<< std::endl; #endif - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); } int main (int argc, char* argv[]) @@ -113,4 +113,3 @@ int main (int argc, char* argv[]) } amrex::Finalize(); } - diff --git a/Tests/Particles/AsyncIO/main.cpp b/Tests/Particles/AsyncIO/main.cpp index ce484e7b773..825f077f5d9 100644 --- a/Tests/Particles/AsyncIO/main.cpp +++ b/Tests/Particles/AsyncIO/main.cpp @@ -121,44 +121,44 @@ class MyParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } Redistribute(); diff --git a/Tests/Particles/DenseBins/main.cpp b/Tests/Particles/DenseBins/main.cpp index d1ed276fe9e..08f9049dc3f 100644 --- a/Tests/Particles/DenseBins/main.cpp +++ b/Tests/Particles/DenseBins/main.cpp @@ -60,8 +60,8 @@ void testGPU (int nbins, const amrex::Vector& items) { // copy to device Gpu::DeviceVector items_d(items.size()); - Gpu::copy(Gpu::hostToDevice, items.begin(), items.end(), items_d.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::hostToDevice, items.begin(), items.end(), items_d.begin()); + Gpu::Device::streamSynchronize(); amrex::DenseBins bins; bins.build(BinPolicy::GPU, items_d.size(), items_d.data(), nbins, [=] AMREX_GPU_DEVICE (int j) noexcept -> unsigned int { return j ; }); diff --git a/Tests/Particles/Intersection/main.cpp b/Tests/Particles/Intersection/main.cpp index ab760b171a2..04e2a491ddb 100644 --- a/Tests/Particles/Intersection/main.cpp +++ b/Tests/Particles/Intersection/main.cpp @@ -83,7 +83,7 @@ void testIntersection() int num_cells = host_cells.size(); Gpu::DeviceVector device_cells(num_cells); - Gpu::copy(Gpu::hostToDevice, host_cells.begin(), host_cells.end(), device_cells.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_cells.begin(), host_cells.end(), device_cells.begin()); Gpu::DeviceVector device_grids(num_cells); diff --git a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp 
b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp index b934f664f4e..d94528dc1c7 100644 --- a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp +++ b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp @@ -112,27 +112,27 @@ InitParticles(const IntVect& a_num_particles_per_cell, auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } amrex::PrintToFile("neighbor_test") << " Number of particles is " << this->TotalNumberOfParticles()<< " \n"; diff --git a/Tests/Particles/ParallelContext/main.cpp b/Tests/Particles/ParallelContext/main.cpp index 311c39a3bab..ff63697a049 100644 --- a/Tests/Particles/ParallelContext/main.cpp +++ b/Tests/Particles/ParallelContext/main.cpp @@ -126,44 +126,44 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } RedistributeLocal(); diff --git a/Tests/Particles/ParticleReduce/main.cpp b/Tests/Particles/ParticleReduce/main.cpp index c03c4b49240..e798f8b5edf 100644 --- 
a/Tests/Particles/ParticleReduce/main.cpp +++ b/Tests/Particles/ParticleReduce/main.cpp @@ -99,23 +99,23 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } } }; diff --git a/Tests/Particles/ParticleTransformations/main.cpp b/Tests/Particles/ParticleTransformations/main.cpp index e60a6762f93..cc13b24f32d 100644 --- a/Tests/Particles/ParticleTransformations/main.cpp +++ b/Tests/Particles/ParticleTransformations/main.cpp @@ -100,23 +100,23 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } } }; diff --git a/Tests/Particles/Redistribute/main.cpp b/Tests/Particles/Redistribute/main.cpp index d5a416ae598..2cbdbe955a7 100644 --- a/Tests/Particles/Redistribute/main.cpp +++ b/Tests/Particles/Redistribute/main.cpp @@ -138,44 +138,44 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + 
Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } RedistributeLocal(); From a4bf36ec53c4aaf54af7dfadad6284d163b0648f Mon Sep 17 00:00:00 2001 From: Andy Nonaka Date: Wed, 11 May 2022 17:31:24 -0700 Subject: [PATCH 09/25] multilevel version of writeplotfiletoascii (#2742) --- .../C_Src/WritePlotfileToASCII.cpp | 113 +++++++++--------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp b/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp index ca7ca6ab6bb..b5f7fac2add 100644 --- a/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp +++ b/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp @@ -37,13 +37,13 @@ main (int argc, PrintUsage(argv[0]); } - // plotfile names for the coarse, fine, and subtracted output + // plotfile name std::string iFile; // read in parameters from inputs file ParmParse pp; - // coarse MultiFab + // read in plotfile name pp.query("infile", iFile); if (iFile.empty()) amrex::Abort("You must specify `infile'"); @@ -51,16 +51,13 @@ main (int argc, int comp_in_line = 0; pp.query("comp_in_line", comp_in_line); - // single-level for now - // AMR comes later, where we iterate over each level in isolation - // for the Header - std::string iFile2 = iFile; - iFile2 += "/Header"; + std::string Header = iFile; + Header += "/Header"; // open header ifstream x; - x.open(iFile2.c_str(), ios::in); + x.open(Header.c_str(), ios::in); // read in first line of header string str; @@ -85,70 +82,72 @@ main (int argc, Abort(); } - // now read in the plotfile data - // check to see whether the user pointed to the plotfile base directory - // or the data itself - if (amrex::FileExists(iFile+"/Level_0/Cell_H")) { - iFile += "/Level_0/Cell"; - } - if (amrex::FileExists(iFile+"/Level_00/Cell_H")) { - iFile += "/Level_00/Cell"; - } + int lev = 0; - // storage for the input coarse and fine MultiFabs - MultiFab mf; + do { - // read in plotfiles, 'coarse' and 'fine' to MultiFabs - // note: fine could be the same resolution as coarse - VisMF::Read(mf, iFile); + if (lev > 9) { + Abort("Utility only works for 10 levels of refinement or less"); + } - ncomp = mf.nComp(); - Print() << "ncomp = " << ncomp << std::endl; + // storage for the MultiFab + MultiFab mf; - // check nodality - IntVect c_nodality = mf.ixType().toIntVect(); - Print() << "nodality " << c_nodality << std::endl; + std::string iFile_lev = iFile; - // get boxArray - BoxArray ba = mf.boxArray(); + std::string levX = "/Level_"+to_string(lev)+"/Cell"; + std::string levXX = "/Level_0"+to_string(lev)+"/Cell"; - // minimalBox() computes a single box to enclose all the boxes - // enclosedCells() converts it to a cell-centered Box - Box bx_onegrid = ba.minimalBox().enclosedCells(); + // now read in the plotfile 
data + // check to see whether the user pointed to the plotfile base directory + // or the data itself + if (amrex::FileExists(iFile+levX+"_H")) { + iFile_lev += levX; + } else if (amrex::FileExists(iFile+levXX+"_H")) { + iFile_lev += levXX; + } else { + break; // terminate while loop + } - // number of cells in the coarse domain - Print() << "npts in coarse domain = " << bx_onegrid.numPts() << std::endl; - long npts_coarsedomain = bx_onegrid.numPts(); + // read in plotfile to MultiFab + VisMF::Read(mf, iFile_lev); - // BoxArray, DistributionMapping, and MultiFab with one grid - BoxArray ba_onegrid(bx_onegrid); - DistributionMapping dmap_onegrid(ba_onegrid); - MultiFab mf_onegrid(ba_onegrid,dmap_onegrid,ncomp,0); + if (lev == 0) { + ncomp = mf.nComp(); + Print() << "Number of components in the plotfile = " << ncomp << std::endl; + Print() << "Nodality of plotfile = " << mf.ixType().toIntVect() << std::endl; + } - // copy data into MultiFab with one grid - mf_onegrid.ParallelCopy(mf,0,0,ncomp,0,0); + // get boxArray to compute number of grid points at the level + BoxArray ba = mf.boxArray(); + Print() << "Number of grid points at level " << lev << " = " << ba.numPts() << std::endl; - for ( MFIter mfi(mf_onegrid,false); mfi.isValid(); ++mfi ) { + for ( MFIter mfi(mf,false); mfi.isValid(); ++mfi ) { - const Box& bx = mfi.validbox(); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); - const Array4& mfdata = mf_onegrid.array(mfi); + const Array4& mfdata = mf.array(mfi); - if (comp_in_line == 1){ - std::cout << mf_onegrid[mfi]; - }else{ - for (auto n=0; n Date: Thu, 12 May 2022 08:38:26 -0700 Subject: [PATCH 10/25] Fix the Advection_AmrCore test (#2761) The time used for computing velocity in the non-subcycling mode is incorrect. Close #2725 --- Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp index 9b2452ad46c..515dbf75261 100644 --- a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp +++ b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp @@ -678,7 +678,7 @@ AmrCoreAdv::timeStepNoSubcycling (Real time, int iteration) } } - DefineVelocityAllLevels(time); + DefineVelocityAllLevels(time+0.5_rt*dt[0]); AdvancePhiAllLevels (time, dt[0], iteration); #ifdef AMREX_PARTICLES From 45f961762f149c7a3daa522f55bde1ba53c52330 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 12 May 2022 09:29:00 -0700 Subject: [PATCH 11/25] Time step in the AmrLevel test (#2763) Make the dt in the AmrLevel test consistent with that in the AmrCore Test. That is we use the velocity at t+0.5*dt (here dt is from the previous step) to estimate the dt for the next step. 
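A minimal sketch of the time-step estimate described in the commit message above; it is not part of the patch. estimate_dt, umax_at, cfl, and dx are hypothetical placeholders, and only the pred_time logic mirrors the change made to AmrLevelAdv::estTimeStep below.

#include <AMReX_REAL.H>

// Sketch: estimate the next dt from the velocity evaluated at the midpoint of
// the upcoming step, where dt_prev is the step size used for the previous step.
// 'umax_at' is a hypothetical callable returning the max |u| at a given time.
template <typename F>
amrex::Real estimate_dt (amrex::Real cur_time, amrex::Real dt_prev,
                         amrex::Real cfl, amrex::Real dx, F&& umax_at)
{
    amrex::Real pred_time = cur_time;
    if (cur_time > amrex::Real(0.0)) {        // the very first step has no previous dt
        pred_time += amrex::Real(0.5) * dt_prev;
    }
    return cfl * dx / umax_at(pred_time);     // standard CFL-style estimate
}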
--- Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp index ad4d186e607..db69749a85f 100644 --- a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp +++ b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp @@ -414,6 +414,10 @@ AmrLevelAdv::estTimeStep (Real) GpuArray prob_lo = geom.ProbLoArray(); const Real cur_time = state[Phi_Type].curTime(); const MultiFab& S_new = get_new_data(Phi_Type); + Real pred_time = cur_time; + if (cur_time > 0._rt) { + pred_time += 0.5_rt*parent->dtLevel(level); + } #ifdef AMREX_USE_OMP #pragma omp parallel reduction(min:dt_est) @@ -431,7 +435,7 @@ AmrLevelAdv::estTimeStep (Real) // Note: no need to set elixir on uface[i] temporary fabs since // norm kernel launch is blocking. - get_face_velocity(cur_time, + get_face_velocity(pred_time, AMREX_D_DECL(uface[0], uface[1], uface[2]), dx, prob_lo); From ec497e9e96695de202d7d6b00dd2c6a46c58b435 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 12 May 2022 15:21:46 -0700 Subject: [PATCH 12/25] Fix Wstringop-overflow warning in FabConv (#2767) On Perlmutter, `g++ -O3 -march=znver3` produces lots of stringop-overflow warnings in FabConv. These warnings are false positive because the compiler does not know sizeof(amrex::Real) is either 4 or 8. This commit fixes the warnings. Close #2750 --- Src/Base/AMReX_FabConv.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Src/Base/AMReX_FabConv.cpp b/Src/Base/AMReX_FabConv.cpp index b758dd94261..f45a4a26d33 100644 --- a/Src/Base/AMReX_FabConv.cpp +++ b/Src/Base/AMReX_FabConv.cpp @@ -375,15 +375,14 @@ _pd_extract_field (char const* in, // Byte reverse nitems words. Each word is nb bytes Long where nb is even. // +template static void -_pd_btrvout (char* out, - Long nb, - Long nitems) +_pd_btrvout (char* out, Long nitems) { - for (Long jl = 0, nbo2 = nb >> 1; jl < nbo2; jl++) + for (int jl = 0, nbo2 = NB >> 1; jl < nbo2; jl++) { - Long jh = nb - jl - 1; + int jh = NB - jl - 1; char* p1 = out + jh; char* p2 = out + jl; for (Long i = 0L; i < nitems; i++) @@ -391,8 +390,8 @@ _pd_btrvout (char* out, char tmp = *p1; *p1 = *p2; *p2 = tmp; - p1 += nb; - p2 += nb; + p1 += NB; + p2 += NB; } } } @@ -458,8 +457,13 @@ _pd_insert_field (Long in_long, // // Reorder the bytes appropriately. // - if (l_order == REVERSE_ORDER) - _pd_btrvout(in, l_bytes, 1L); + if (l_order == REVERSE_ORDER) { + if (l_bytes == 4) { + _pd_btrvout<4>(in, 1L); + } else { // It's either 4 or 8. There is an assertion in PD_fconvert. + _pd_btrvout<8>(in, 1L); + } + } // // Copy the remaining aligned bytes over. 
// @@ -622,6 +626,9 @@ PD_fconvert (void* out, int onescmp) { // BL_PROFILE("PD_fconvert"); + + AMREX_ASSERT(l_bytes == 4 || l_bytes == 8); // Otherwise, we need to update _pd_btrvout + Long i, expn, expn_max, hexpn, mant, DeltaBias, hmbo, hmbi; int nbits, inbytes, outbytes, sign; int indxin, indxout, inrem, outrem, dindx; From 6a8011c897d066324cd7415814cd0ff91be9c59c Mon Sep 17 00:00:00 2001 From: PaulMullowney <60452402+PaulMullowney@users.noreply.github.com> Date: Thu, 12 May 2022 16:33:31 -0600 Subject: [PATCH 13/25] this updates to recent Hypre API changes (#2765) --- Src/Base/AMReX.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index a6b6fea256a..f06806babcd 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -528,7 +528,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, HYPRE_Init(); #ifdef HYPRE_USING_CUDA -#if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22100) +#if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22400) #ifdef HYPRE_USING_DEVICE_POOL /* device pool allocator */ @@ -541,7 +541,9 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, HYPRE_SetGPUMemoryPoolSize( mempool_bin_growth, mempool_min_bin, mempool_max_bin, mempool_max_cached_bytes ); #endif - HYPRE_SetSpGemmUseCusparse(false); + /* This API below used to be HYPRE_SetSpGemmUseCusparse(). This was changed in commit + Hypre master commit dfdd1cd12f */ + HYPRE_SetSpGemmUseVendor(false); HYPRE_SetMemoryLocation(HYPRE_MEMORY_DEVICE); HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); HYPRE_SetUseGpuRand(true); From b6a4e64dfdb6478b5e6e31b469e1824c3d46eb05 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 12 May 2022 17:05:39 -0700 Subject: [PATCH 14/25] Fix maybe-uninitialized warning in calling mlock (#2768) --- Src/Base/AMReX_Arena.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index 59d62ad0fac..cccde1f200c 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -126,7 +126,14 @@ Arena::allocate_system (std::size_t nbytes) if (arena_info.use_cpu_memory) { p = std::malloc(nbytes); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif } else if (arena_info.device_use_hostalloc) { From 80a15e44f215bcf23b120510b566e7e9e96f6545 Mon Sep 17 00:00:00 2001 From: Andrew Myers Date: Thu, 12 May 2022 17:25:01 -0700 Subject: [PATCH 15/25] Update particle << operator after changes to id/cpu (#2769) --- Src/Particle/AMReX_Particle.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Particle/AMReX_Particle.H b/Src/Particle/AMReX_Particle.H index 50cec1bb647..0548886010c 100644 --- a/Src/Particle/AMReX_Particle.H +++ b/Src/Particle/AMReX_Particle.H @@ -432,7 +432,7 @@ operator<< (std::ostream& os, const Particle& p) for (int i = 0; i < NReal; i++) os << p.rdata(i) << ' '; - for (int i = 2; i < NInt; i++) + for (int i = 0; i < NInt; i++) os << p.idata(i) << ' '; if (!os.good()) @@ -470,7 +470,7 @@ operator<< (std::ostream& os, const Particle<0, NInt>& p) for (int i = 0; i < AMREX_SPACEDIM; i++) os << p.pos(i) << ' '; - for (int i = 2; i < NInt; i++) + for (int i = 0; i < NInt; i++) os << p.idata(i) << ' '; if (!os.good()) From 
92ace5741853473b9358dc89d76889fa5eb2d56f Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 16 May 2022 07:54:12 -0700 Subject: [PATCH 16/25] Fix: AmrCore Move (#2773) The move constructor and assignment operator for `AmrCore` with particles was broken. When moving `AmrParGDB`, its internal `m_amrcore` pointer needs to be updated, too. --- Src/AmrCore/AMReX_AmrCore.H | 4 ++-- Src/AmrCore/AMReX_AmrCore.cpp | 20 ++++++++++++++++++++ Src/AmrCore/AMReX_AmrParGDB.H | 2 ++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Src/AmrCore/AMReX_AmrCore.H b/Src/AmrCore/AMReX_AmrCore.H index 73e4d25ea8c..cd532bef63b 100644 --- a/Src/AmrCore/AMReX_AmrCore.H +++ b/Src/AmrCore/AMReX_AmrCore.H @@ -40,8 +40,8 @@ public: AmrCore (Geometry const& level_0_geom, AmrInfo const& amr_info); - AmrCore (AmrCore&& rhs) = default; - AmrCore& operator= (AmrCore&& rhs) = default; + AmrCore (AmrCore&& rhs); + AmrCore& operator= (AmrCore&& rhs); AmrCore (const AmrCore& rhs) = delete; AmrCore& operator= (const AmrCore& rhs) = delete; diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 59617ea4e75..0cf233ac2c2 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -46,6 +46,26 @@ AmrCore::AmrCore (Geometry const& level_0_geom, AmrInfo const& amr_info) #endif } +AmrCore::AmrCore (AmrCore&& rhs) + : AmrMesh(std::move(rhs)) +{ +#ifdef AMREX_PARTICLES + m_gdb = std::move(rhs.m_gdb); + m_gdb->m_amrcore = this; +#endif +} + +AmrCore& AmrCore::operator= (AmrCore&& rhs) +{ + AmrMesh::operator=(std::move(rhs)); +#ifdef AMREX_PARTICLES + m_gdb = std::move(rhs.m_gdb); + m_gdb->m_amrcore = this; +#endif + + return *this; +} + AmrCore::~AmrCore () { } diff --git a/Src/AmrCore/AMReX_AmrParGDB.H b/Src/AmrCore/AMReX_AmrParGDB.H index 35e6d666db8..95cd16a874b 100644 --- a/Src/AmrCore/AMReX_AmrParGDB.H +++ b/Src/AmrCore/AMReX_AmrParGDB.H @@ -10,6 +10,8 @@ namespace amrex { class AmrParGDB : public ParGDBBase { + friend AmrCore; + public: explicit AmrParGDB (AmrCore* amr) noexcept From 91124b18bedce3954471e9ba79622f351aed8b5b Mon Sep 17 00:00:00 2001 From: Max Katz Date: Mon, 16 May 2022 11:25:42 -0400 Subject: [PATCH 17/25] Add an optional volume weighting to AMRErrorTag (#2772) This allows, for example, refining based on the mass in a cell rather than only on its density. A function to obtain the cell volume at runtime given an IntVect, that can be run inside a ParallelFor, is added to Geometry. 
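A usage sketch for the new volume weighting; it is not part of the patch. The AMRErrorTagInfo setters (SetMaxLevel, SetVolumeWeighting) are taken from the header change below, while the AMRErrorTag constructor arguments, the GREATER test name, and the "density" field name are assumptions about the existing class.

#include <AMReX_ErrorList.H>

// Sketch: tag cells whose mass (density * cell volume) exceeds a threshold by
// enabling volume weighting on the tag's info object. The setters return the
// info object by reference, so they can be chained.
amrex::AMRErrorTag make_mass_tag (amrex::Real mass_threshold, int max_level)
{
    amrex::AMRErrorTagInfo info;
    info.SetMaxLevel(max_level)
        .SetVolumeWeighting(1);   // compare value * cell volume against the threshold
    return amrex::AMRErrorTag(mass_threshold, amrex::AMRErrorTag::GREATER,
                              "density", info);
}

With m_volume_weighting left at its default of 0, the cell volume factor is 1.0 and the comparison reduces to the previous value-only behavior.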
--- Src/AmrCore/AMReX_ErrorList.H | 5 +++ Src/AmrCore/AMReX_ErrorList.cpp | 8 +++-- Src/Base/AMReX_Geometry.H | 58 +++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/Src/AmrCore/AMReX_ErrorList.H b/Src/AmrCore/AMReX_ErrorList.H index 9011655a50d..90f49b02749 100644 --- a/Src/AmrCore/AMReX_ErrorList.H +++ b/Src/AmrCore/AMReX_ErrorList.H @@ -382,6 +382,7 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); int m_max_level = 1000; Real m_min_time = std::numeric_limits::lowest(); Real m_max_time = std::numeric_limits::max(); + int m_volume_weighting = 0; RealBox m_realbox; AMRErrorTagInfo& SetMaxLevel (int max_level) noexcept { @@ -400,6 +401,10 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); m_realbox = realbox; return *this; } + AMRErrorTagInfo& SetVolumeWeighting (int volume_weighting) noexcept { + m_volume_weighting = volume_weighting; + return *this; + } }; class AMRErrorTag diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index 5285cd4bfdc..1594ba740a9 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -291,6 +291,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { auto const& datma = mf->const_arrays(); auto threshold = m_value[level]; + auto const volume_weighting = m_info.m_volume_weighting; + auto geomdata = geom.data(); if (m_test == GRAD) { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept @@ -340,7 +342,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { - if (datma[bi](i,j,k) <= threshold) { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + if (datma[bi](i,j,k) * vol <= threshold) { tagma[bi](i,j,k) = tagval; } }); @@ -349,7 +352,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { - if (datma[bi](i,j,k) >= threshold) { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + if (datma[bi](i,j,k) * vol >= threshold) { tagma[bi](i,j,k) = tagval; } }); diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 773e6c69519..1a9910bc526 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -228,6 +228,64 @@ public: const BoxArray& grds, int idx, int grow) const; + + //! Return the volume of the specified cell. 
+ AMREX_GPU_HOST_DEVICE AMREX_INLINE + static Real Volume (const IntVect& point, const GeometryData& geomdata) + { + auto dx = geomdata.CellSize(); + + Real vol; + +#if AMREX_SPACEDIM == 1 + + auto coord = geomdata.Coord(); + + if (coord == CoordSys::cartesian) { + // Cartesian + + vol = dx[0]; + } + else { + // Spherical + + Real rl = geomdata.ProbLo()[0] + static_cast(point[0]) * dx[0]; + Real rr = rl + dx[0]; + + vol = (4.0_rt / 3.0_rt) * M_PI * dx[0] * (rl * rl + rl * rr + rr * rr); + } + +#elif AMREX_SPACEDIM == 2 + + auto coord = geomdata.Coord(); + + if (coord == CoordSys::cartesian) { + // Cartesian + + vol = dx[0] * dx[1]; + } + else { + // Cylindrical + + Real r_l = geomdata.ProbLo()[0] + static_cast(point[0]) * dx[0]; + Real r_r = geomdata.ProbLo()[0] + static_cast(point[0]+1) * dx[0]; + + vol = M_PI * (r_l + r_r) * dx[0] * dx[1]; + } + +#else + + amrex::ignore_unused(point); + + // Cartesian + + vol = dx[0] * dx[1] * dx[2]; + +#endif + + return vol; + } + /** * \brief Compute d(log(A))/dr at cell centers in given region and * stuff the results into the passed MultiFab. From 2ab21359fb503e9551b400670629f358ff78a167 Mon Sep 17 00:00:00 2001 From: Erik Date: Mon, 16 May 2022 12:05:06 -0400 Subject: [PATCH 18/25] Change repo html address to Ubuntu 20.04 (#2766) --- .github/workflows/dependencies/dependencies_nvcc11.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependencies/dependencies_nvcc11.sh b/.github/workflows/dependencies/dependencies_nvcc11.sh index c516948fa8d..79c8c6c31f6 100755 --- a/.github/workflows/dependencies/dependencies_nvcc11.sh +++ b/.github/workflows/dependencies/dependencies_nvcc11.sh @@ -20,7 +20,7 @@ sudo apt-get install -y \ wget sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" \ +echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" \ | sudo tee /etc/apt/sources.list.d/cuda.list sudo apt-get update sudo apt-get install -y \ From e04edae76586526792fed21677c4a4091161256f Mon Sep 17 00:00:00 2001 From: Erik Date: Mon, 16 May 2022 13:51:48 -0400 Subject: [PATCH 19/25] CI--HIP: wget gpg key from https instead of http (#2771) * CI--HIP: wget gpg key from https instead of http * change other http to https --- .github/workflows/dependencies/dependencies_hip.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 59e7400d690..11314f951e4 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -12,14 +12,15 @@ set -eu -o pipefail + # Ref.: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu -wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key \ +wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key \ | sudo apt-key add - -echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ ubuntu main' \ +echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ ubuntu main' \ | sudo tee /etc/apt/sources.list.d/rocm.list - echo 'export PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin:$PATH' \ | sudo tee -a /etc/profile.d/rocm.sh + # we should not need to export HIP_PATH=/opt/rocm/hip with those installs sudo apt-get update From 5df7ff4cc2f9b5c490a3bb4d792cfe4f1a3f7a95 Mon Sep 
17 00:00:00 2001 From: Erik Date: Mon, 16 May 2022 13:52:30 -0400 Subject: [PATCH 20/25] configure value of AMReX_GPU_RDC flag for use in cmake find_package(AMReX ...) (#2770) --- Tools/CMake/AMReXConfig.cmake.in | 1 + 1 file changed, 1 insertion(+) diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index 6fa60344f17..9df9dc0204e 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -114,6 +114,7 @@ set(AMReX_CUDA @AMReX_CUDA@) set(AMReX_SYCL @AMReX_DPCPP@) set(AMReX_HIP @AMReX_HIP@) set(AMReX_GPU_BACKEND @AMReX_GPU_BACKEND@) +set(AMReX_GPU_RDC @AMReX_GPU_RDC@) set(AMReX_PRECISION @AMReX_PRECISION@) set(AMReX_FORTRAN @AMReX_FORTRAN@) From 7bbe9facbf7f833637d49ad77d3ffcdd74905555 Mon Sep 17 00:00:00 2001 From: hengjiew <86926839+hengjiew@users.noreply.github.com> Date: Mon, 16 May 2022 12:56:52 -0500 Subject: [PATCH 21/25] Fix the bug in the CMake build with AMReX_BASE_PROFILE. (#2774) --- Src/Base/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 820c4b6f214..ab86c6dfcc6 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -260,7 +260,7 @@ target_sources( amrex # Profiling # this source file has zero symbols in default conditions, which creates # ranlib warnings, e.g., on macOS -if(AMREX_PROFILING OR AMReX_FORTRAN) +if(AMReX_BASE_PROFILE OR AMReX_FORTRAN) target_sources( amrex PRIVATE AMReX_BLProfiler.cpp From 32b1a0bb1bc34463ee6d111908739066a87623dc Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Tue, 17 May 2022 16:40:55 +0200 Subject: [PATCH 22/25] AmrCore: Include utility (#2778) Add the `` header for `std::move`. --- Src/AmrCore/AMReX_AmrCore.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 0cf233ac2c2..54ec31eded2 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -11,6 +11,7 @@ #endif #include +#include #include namespace amrex { From 899d9078950092d7eb8f0017d8486cb240d9dc6a Mon Sep 17 00:00:00 2001 From: "Don E. Willcox" Date: Wed, 18 May 2022 08:50:29 -0700 Subject: [PATCH 23/25] Add some timestep controls to the AMReX TimeIntegrator class for its integrate() driver function. 
(#2780) --- Src/Base/AMReX_TimeIntegrator.H | 35 +++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_TimeIntegrator.H b/Src/Base/AMReX_TimeIntegrator.H index e1f5a5f3d30..210f87d8ef8 100644 --- a/Src/Base/AMReX_TimeIntegrator.H +++ b/Src/Base/AMReX_TimeIntegrator.H @@ -25,6 +25,8 @@ template class TimeIntegrator { private: + amrex::Real time, timestep; + int step_number; std::unique_ptr > integrator_ptr; std::function post_timestep; @@ -75,6 +77,11 @@ private: // By default, do nothing set_rhs([](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}); set_fast_rhs([](T& /* S_rhs */, T& /* S_extra */, const T& /* S_data */, const amrex::Real /* time */){}); + + // By default, initialize time, timestep, step number to 0's + time = 0.0_rt; + timestep = 0.0_rt; + step_number = 0; } public: @@ -161,6 +168,26 @@ public: return integrator_ptr->get_fast_timestep(); } + int get_step_number () + { + return step_number; + } + + amrex::Real get_time () + { + return time; + } + + amrex::Real get_timestep () + { + return timestep; + } + + void set_timestep (amrex::Real dt) + { + timestep = dt; + } + std::function get_post_timestep () { return post_timestep; @@ -187,12 +214,12 @@ public: } void integrate (T& S_old, T& S_new, amrex::Real start_time, const amrex::Real start_timestep, - const amrex::Real end_time, const int nsteps) + const amrex::Real end_time, const int start_step, const int max_steps) { - amrex::Real time = start_time; - amrex::Real timestep = start_timestep; + time = start_time; + timestep = start_timestep; bool stop_advance = false; - for (int step_number = 0; step_number < nsteps && !stop_advance; ++step_number) + for (step_number = start_step; step_number < max_steps && !stop_advance; ++step_number) { if (end_time - time < timestep) { timestep = end_time - time; From a0f5b049a16a8f94c2526051032fe818e4fc4a61 Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Wed, 18 May 2022 21:28:00 -0700 Subject: [PATCH 24/25] Remove a Sync object. 
--- Src/Base/AMReX_BLProfiler.H | 85 ++++++++------- Src/Base/AMReX_BLProfiler.cpp | 178 +++++++++++++++++--------------- Src/Base/AMReX_TinyProfiler.H | 20 ---- Src/Base/AMReX_TinyProfiler.cpp | 81 +-------------- 4 files changed, 148 insertions(+), 216 deletions(-) diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 0cc488cca74..625dc0dad27 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -314,25 +314,6 @@ private: std::string regname; }; -class BLProfileSync { - - friend amrex::BLProfiler; - -public: - static void Sync() noexcept; - static void Sync(const std::string& name) noexcept; - static void Sync(const char* name) noexcept; - - static void StartSyncRegion() noexcept; - static void StartSyncRegion(const std::string& name) noexcept; - static void StartSyncRegion(const char* name) noexcept; - static void EndSyncRegion() noexcept; - -private: - static int sync_counter; - static int use_prof_syncs; -}; - namespace BLProfilerUtils { void WriteHeader(std::ostream &os, const int colWidth, const Real maxlen, const bool bwriteavg); @@ -406,7 +387,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { } #define BL_PROFILE_INITIALIZE() amrex::BLProfiler::Initialize(); -#define BL_PROFILE_INITPARAMS() amrex::BLProfiler::InitParams(); +#define BL_PROFILE_INITPARAMS() amrex::BLProfiler::InitParams(); amrex::BLProfileSync::InitParams() #define BL_PROFILE_FINALIZE() amrex::BLProfiler::Finalize(); #define BL_TINY_PROFILE_INITIALIZE() @@ -444,11 +425,6 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { bl_profiler_##rvname.start(); #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) bl_profiler_##rvname.stop(); \ amrex::BLProfiler::RegionStop(fname); -#define BL_PROFILE_SYNC() amrex::BLProfileSync::Sync() -#define BL_PROFILE_SYNC_TIMED(fname) amrex::BLProfileSync::Sync() -#define BL_PROFILE_SYNC_START() amrex::BLProfileSync::StartSyncRegion() -#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::BLProfileSync::StartSyncRegion(fname) -#define BL_PROFILE_SYNC_STOP() amrex::BLProfileSync::EndSyncRegion() #define BL_PROFILE_TINY_FLUSH() #define BL_PROFILE_FLUSH() { amrex::BLProfiler::Finalize(true); } @@ -497,7 +473,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_INITPARAMS() #define BL_PROFILE_FINALIZE() -#define BL_TINY_PROFILE_INITIALIZE() amrex::TinyProfiler::Initialize() +#define BL_TINY_PROFILE_INITIALIZE() amrex::TinyProfiler::Initialize(); BLProfileSync::InitParams() #define BL_TINY_PROFILE_FINALIZE() amrex::TinyProfiler::Finalize() #define BL_PROFILE(fname) BL_PROFILE_IMPL(fname, __COUNTER__) @@ -530,13 +506,6 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) - -#define BL_PROFILE_SYNC() amrex::TinyProfileSync::Sync() -#define BL_PROFILE_SYNC_TIMED(fname) amrex::TinyProfileSync::Sync() -#define BL_PROFILE_SYNC_START() amrex::TinyProfileSync::StartSyncRegion() -#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::TinyProfileSync::StartSyncRegion(fname) -#define BL_PROFILE_SYNC_STOP() amrex::TinyProfileSync::EndSyncRegion() - #define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true) #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() @@ -597,11 +566,6 @@ class BLProfiler #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) 
#define BL_PROFILE_REGION_VAR_STOP(fname, rvname) -#define BL_PROFILE_SYNC() -#define BL_PROFILE_SYNC_TIMED(fname) -#define BL_PROFILE_SYNC_START() -#define BL_PROFILE_SYNC_START_TIMED(fname) -#define BL_PROFILE_SYNC_STOP() #define BL_PROFILE_TINY_FLUSH() #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() @@ -633,6 +597,51 @@ class BLProfiler #endif +// ============================================================ +// Sync macros +// ============================================================ + +#if (defined(BL_PROFILING) || defined(AMREX_TINY_PROFILING)) + +namespace amrex { + + class BLProfileSync { + + public: + static void Sync() noexcept; + static void Sync(const std::string& name) noexcept; + static void Sync(const char* name) noexcept; + + static void InitParams() noexcept; + + static void StartSyncRegion() noexcept; + static void StartSyncRegion(const std::string& name) noexcept; + static void StartSyncRegion(const char* name) noexcept; + static void EndSyncRegion() noexcept; + + private: + static int sync_counter; + static int use_prof_syncs; + }; + +} + +#define BL_PROFILE_SYNC() amrex::BLProfileSync::Sync() +#define BL_PROFILE_SYNC_TIMED(fname) amrex::BLProfileSync::Sync(fname) +#define BL_PROFILE_SYNC_START() amrex::BLProfileSync::StartSyncRegion() +#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::BLProfileSync::StartSyncRegion(fname) +#define BL_PROFILE_SYNC_STOP() amrex::BLProfileSync::EndSyncRegion() + +#else + +#define BL_PROFILE_SYNC() +#define BL_PROFILE_SYNC_TIMED(fname) +#define BL_PROFILE_SYNC_START() +#define BL_PROFILE_SYNC_START_TIMED(fname) +#define BL_PROFILE_SYNC_STOP() + +#endif + // ============================================================ // Third party macros. // Mutually exclusive, including from BL_PROFILE. 
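A usage sketch for the sync macros declared above; it is not part of the patch. MyComms, SyncBeforeMyComms, and do_communication() are hypothetical names. The barriers are no-ops unless the amrex.use_profiler_syncs runtime parameter read in BLProfileSync::InitParams() below is set to a nonzero value.

#include <AMReX_BLProfiler.H>

void do_communication ();   // hypothetical MPI-heavy routine

void timed_comm_step ()
{
    // Barrier first so that load imbalance accumulated before the communication
    // shows up under the named sync timer rather than inside the comms themselves.
    BL_PROFILE_SYNC_START_TIMED("SyncBeforeMyComms");
    {
        BL_PROFILE("MyComms");
        do_communication();
    }
    BL_PROFILE_SYNC_STOP();   // decrements the sync region counter
}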
diff --git a/Src/Base/AMReX_BLProfiler.cpp b/Src/Base/AMReX_BLProfiler.cpp index e512f54b509..9d12b23708b 100644 --- a/Src/Base/AMReX_BLProfiler.cpp +++ b/Src/Base/AMReX_BLProfiler.cpp @@ -104,9 +104,6 @@ int BLProfiler::CallStats::cstatsVersion(1); Vector BLProfiler::callIndexStack; Vector BLProfiler::callIndexPatch; -int BLProfileSync::use_prof_syncs = 0; -int BLProfileSync::sync_counter = 0; - #ifdef BL_TRACE_PROFILING int BLProfiler::callStackDepth(-1); int BLProfiler::prevCallStackDepth(0); @@ -274,20 +271,13 @@ void BLProfiler::Initialize() { void BLProfiler::InitParams() { - { - ParmParse pParse("blprofiler"); - pParse.queryAdd("prof_nfiles", nProfFiles); - pParse.queryAdd("prof_csflushsize", csFlushSize); - pParse.queryAdd("prof_traceflushsize", traceFlushSize); - pParse.queryAdd("prof_flushinterval", flushInterval); - pParse.queryAdd("prof_flushtimeinterval", flushTimeInterval); - pParse.queryAdd("prof_flushprint", bFlushPrint); - } - - { - ParmParse pParse("amrex"); - pParse.queryAdd("use_profiler_syncs", BLProfileSync::use_prof_syncs); - } + ParmParse pParse("blprofiler"); + pParse.queryAdd("prof_nfiles", nProfFiles); + pParse.queryAdd("prof_csflushsize", csFlushSize); + pParse.queryAdd("prof_traceflushsize", traceFlushSize); + pParse.queryAdd("prof_flushinterval", flushInterval); + pParse.queryAdd("prof_flushtimeinterval", flushTimeInterval); + pParse.queryAdd("prof_flushprint", bFlushPrint); } @@ -1556,70 +1546,6 @@ void BLProfiler::CommStats::UnFilter(CommFuncType cft) { } } -void -BLProfileSync::Sync () noexcept -{ - if (use_prof_syncs) - { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } -} - -void -BLProfileSync::Sync (const std::string& name) noexcept -{ - if (use_prof_syncs) { - BLProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } -} - -void -BLProfileSync::Sync (const char* name) noexcept -{ - if (use_prof_syncs) { - BLProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } -} - -void -BLProfileSync::StartSyncRegion () noexcept -{ - if (use_prof_syncs) { - if (sync_counter == 0) { - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -BLProfileSync::StartSyncRegion (const std::string& name) noexcept { - if (use_prof_syncs) { - if (sync_counter == 0) { - BLProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -BLProfileSync::StartSyncRegion (const char* name) noexcept { - if (use_prof_syncs) { - if (sync_counter == 0) { - BLProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -BLProfileSync::EndSyncRegion () noexcept { - if (use_prof_syncs) { - sync_counter--; - } -} namespace { const int EOS(-1); @@ -1766,3 +1692,93 @@ BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTOP_CPP_INT,bl_proffortfuncstop_cpp_int) #endif #endif + +#if (defined(BL_PROFILING) || defined(AMREX_TINY_PROFILING)) + +#include +#include +#include +#include + +namespace amrex { + +int BLProfileSync::use_prof_syncs = 0; +int BLProfileSync::sync_counter = 0; + +void +BLProfileSync::Sync () noexcept +{ + if (use_prof_syncs) + { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } +} + +void +BLProfileSync::Sync (const std::string& name) noexcept +{ + if (use_prof_syncs) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::Sync (const 
char* name) noexcept +{ + if (use_prof_syncs) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::InitParams () noexcept +{ + ParmParse pParse("amrex"); + pParse.queryAdd("use_profiler_syncs", BLProfileSync::use_prof_syncs); + + sync_counter = 0; +} + +void +BLProfileSync::StartSyncRegion () noexcept +{ + if (use_prof_syncs) { + if (sync_counter == 0) { + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const std::string& name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const char* name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::EndSyncRegion () noexcept { + if (use_prof_syncs) { + sync_counter--; + } +} + +} + +#endif diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 4c4a75a5bc1..677b4448d3b 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -111,25 +111,5 @@ private: TinyProfiler tprof; }; - -class TinyProfileSync { - - friend amrex::TinyProfiler; - -public: - static void Sync() noexcept; - static void Sync(const std::string& name) noexcept; - static void Sync(const char* name) noexcept; - - static void StartSyncRegion() noexcept; - static void StartSyncRegion(const std::string& name) noexcept; - static void StartSyncRegion(const char* name) noexcept; - static void EndSyncRegion() noexcept; - -private: - static int sync_counter; - static int use_prof_syncs; -}; - } #endif diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 7302c4c4f1d..139e029f5bf 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -35,8 +35,6 @@ double TinyProfiler::t_init = std::numeric_limits::max(); int TinyProfiler::device_synchronize_around_region = 0; int TinyProfiler::n_print_tabs = 0; int TinyProfiler::verbose = 0; -int TinyProfileSync::use_prof_syncs = 0; -int TinyProfileSync::sync_counter = 0; namespace { std::set improperly_nested_timers; @@ -300,16 +298,10 @@ TinyProfiler::Initialize () noexcept regionstack.push_back(mainregion); t_init = amrex::second(); - { - amrex::ParmParse pp("tiny_profiler"); - pp.queryAdd("device_synchronize_around_region", device_synchronize_around_region); - pp.queryAdd("verbose", verbose); - pp.queryAdd("v", verbose); - } - { - amrex::ParmParse pp("amrex"); - pp.queryAdd("use_profiler_syncs", TinyProfileSync::use_prof_syncs); - } + amrex::ParmParse pp("tiny_profiler"); + pp.queryAdd("device_synchronize_around_region", device_synchronize_around_region); + pp.queryAdd("verbose", verbose); + pp.queryAdd("v", verbose); } void @@ -644,69 +636,4 @@ TinyProfiler::PrintCallStack (std::ostream& os) } } -void -TinyProfileSync::Sync () noexcept -{ - if (use_prof_syncs) - { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } -} - -void -TinyProfileSync::Sync (const std::string& name) noexcept -{ - if (use_prof_syncs) { - TinyProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } -} - -void -TinyProfileSync::Sync (const char* name) noexcept -{ - if (use_prof_syncs) { - TinyProfiler synctimer(name); - 
ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } -} - -void -TinyProfileSync::StartSyncRegion () noexcept -{ - if (use_prof_syncs) { - if (sync_counter == 0) { - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -TinyProfileSync::StartSyncRegion (const std::string& name) noexcept { - if (use_prof_syncs) { - if (sync_counter == 0) { - TinyProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -TinyProfileSync::StartSyncRegion (const char* name) noexcept { - if (use_prof_syncs) { - if (sync_counter == 0) { - TinyProfiler synctimer(name); - ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); - } - sync_counter++; - } -} - -void -TinyProfileSync::EndSyncRegion () noexcept { - if (use_prof_syncs) { - sync_counter--; - } -} - } From 42a189e69895e6262697397c27685f688e36df27 Mon Sep 17 00:00:00 2001 From: Kevin Gott Date: Wed, 18 May 2022 21:33:14 -0700 Subject: [PATCH 25/25] Back to before --- Src/Base/AMReX_TinyProfiler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 139e029f5bf..4c524a47104 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -298,10 +298,12 @@ TinyProfiler::Initialize () noexcept regionstack.push_back(mainregion); t_init = amrex::second(); - amrex::ParmParse pp("tiny_profiler"); - pp.queryAdd("device_synchronize_around_region", device_synchronize_around_region); - pp.queryAdd("verbose", verbose); - pp.queryAdd("v", verbose); + { + amrex::ParmParse pp("tiny_profiler"); + pp.queryAdd("device_synchronize_around_region", device_synchronize_around_region); + pp.queryAdd("verbose", verbose); + pp.queryAdd("v", verbose); + } } void