diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 59e7400d690..11314f951e4 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -12,14 +12,15 @@ set -eu -o pipefail + # Ref.: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu -wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key \ +wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key \ | sudo apt-key add - -echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ ubuntu main' \ +echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ ubuntu main' \ | sudo tee /etc/apt/sources.list.d/rocm.list - echo 'export PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin:$PATH' \ | sudo tee -a /etc/profile.d/rocm.sh + # we should not need to export HIP_PATH=/opt/rocm/hip with those installs sudo apt-get update diff --git a/.github/workflows/dependencies/dependencies_nvcc11.sh b/.github/workflows/dependencies/dependencies_nvcc11.sh index c516948fa8d..79c8c6c31f6 100755 --- a/.github/workflows/dependencies/dependencies_nvcc11.sh +++ b/.github/workflows/dependencies/dependencies_nvcc11.sh @@ -20,7 +20,7 @@ sudo apt-get install -y \ wget sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" \ +echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" \ | sudo tee /etc/apt/sources.list.d/cuda.list sudo apt-get update sudo apt-get install -y \ diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 47684fe305c..f5b8b263a4e 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -515,6 +515,8 @@ The list of available options is reported in the :ref:`table ` bel +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_HDF5 | Enable HDF5-based I/O | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_HDF5_ZFP | Enable compression with ZFP in HDF5-based I/O | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PLOTFILE_TOOLS | Build and install plotfile postprocessing tools| NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_ENABLE_TESTS | Enable CTest suite | NO | YES, NO | diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 5fa2dc31919..5d64b3fadd4 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -18,7 +18,7 @@ for AMD and DPC++ for Intel. This will be designated with ``CUDA/HIP/DPC++`` throughout the documentation. However, application teams can also use OpenACC or OpenMP in their individual codes. -At this time, AMReX does not support cross-native language compliation +At this time, AMReX does not support cross-native language compilation (HIP for non-AMD systems and DPC++ for non Intel systems). 
It may work with a given version, but AMReX does not track or guarantee such functionality. @@ -1237,7 +1237,7 @@ the destructor of :cpp:`MFIter`. This ensures that all GPU work inside of an :cpp:`MFIter` loop will complete before code outside of the loop is executed. Any CUDA kernel launches made outside of an :cpp:`MFIter` loop must ensure appropriate device synchronization -occurs. This can be done by calling :cpp:`Gpu::synchronize()`. +occurs. This can be done by calling :cpp:`Gpu::streamSynchronize()`. CUDA supports multiple streams and kernels. Kernels launched in the same stream are executed sequentially, but different streams of kernel @@ -1342,6 +1342,48 @@ will show little improvement or even perform worse. So, this conditional stateme should be added to MFIter loops that contain GPU work, unless users specifically test the performance or are designing more complex workflows that require OpenMP. +.. _sec:gpu:stream: + +Stream and Synchronization +========================== + +As mentioned in Section :ref:`sec:gpu:overview`, AMReX uses a number of GPU +streams that are CUDA streams, HIP streams, or SYCL queues. Many +GPU functions (e.g., :cpp:`ParallelFor` and :cpp:`Gpu::copyAsync`) are +asynchronous with respect to the host. To facilitate synchronization that +is sometimes necessary, AMReX provides :cpp:`Gpu::streamSynchronize()` and +:cpp:`Gpu::streamSynchronizeAll()` to synchronize the current stream and all +AMReX streams, respectively. For performance reasons, one should try to +minimize the number of synchronization calls. For example, + +.. highlight:: c++ + +:: + + // The synchronous version is NOT recommended + Gpu::copy(Gpu::deviceToHost, ....); + Gpu::copy(Gpu::deviceToHost, ....); + Gpu::copy(Gpu::deviceToHost, ....); + + // NOT recommended because of unnecessary synchronization + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + + // recommended + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::copyAsync(Gpu::deviceToHost, ....); + Gpu::streamSynchronize(); + +In addition to stream synchronization, there is also +:cpp:`Gpu::synchronize()`, which performs a device-wide synchronization. +However, a device-wide synchronization is usually excessive and may +interfere with other libraries (e.g., MPI). + .. _sec:gpu:example: An Example of Migrating to GPU @@ -1437,8 +1479,8 @@ portable way. .. _sec:gpu:assertion: -Assertions, Error Checking and Synchronization -================================================ +Assertions and Error Checking +============================= To help debugging, we often use :cpp:`amrex::Assert` and :cpp:`amrex::Abort`. These functions are GPU safe and can be used in @@ -1461,10 +1503,11 @@ However, due to asynchronicity, determining the source of the error can be difficult. Even if GPU kernels launched earlier in the code result in a CUDA error, the error may not be output at a nearby call to :cpp:`AMREX_GPU_ERROR_CHECK()` by the CPU. When tracking down a CUDA -launch error, :cpp:`Gpu::synchronize()` and -:cpp:`Gpu::streamSynchronize()` can be used to synchronize -the device or the CUDA stream, respectively, and track down the specific -launch that causes the error.
+launch error, :cpp:`Gpu::synchronize()`, +:cpp:`Gpu::streamSynchronize()`, or :cpp:`Gpu::streamSynchronizeAll()` can +be used to synchronize the device, the current GPU stream, or all GPU +streams, respectively, and track down the specific launch that causes the +error. .. =================================================================== diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp index ece300cfe81..a88489f9512 100644 --- a/Src/Amr/AMReX_AmrLevel.cpp +++ b/Src/Amr/AMReX_AmrLevel.cpp @@ -1495,13 +1495,13 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, dcomp, m_scomp, m_ncomp); - Gpu::synchronize(); // In case this runs on GPU + Gpu::streamSynchronize(); // In case this runs on GPU } if (m_FixUpCorners) { FixUpPhysCorners(fab,m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); - Gpu::synchronize(); // In case this runs on GPU + Gpu::streamSynchronize(); // In case this runs on GPU } } diff --git a/Src/Amr/AMReX_StateData.cpp b/Src/Amr/AMReX_StateData.cpp index 3f7fca2ec9b..92f8c791986 100644 --- a/Src/Amr/AMReX_StateData.cpp +++ b/Src/Amr/AMReX_StateData.cpp @@ -511,9 +511,9 @@ StateData::FillBoundary (FArrayBox& dest, } #ifdef AMREX_USE_GPU - // Add a synchronize here in case the user code launched kernels + // Add a streamSynchronize here in case the user code launched kernels // to handle the boundary fills. - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } diff --git a/Src/AmrCore/AMReX_AmrCore.H b/Src/AmrCore/AMReX_AmrCore.H index 73e4d25ea8c..cd532bef63b 100644 --- a/Src/AmrCore/AMReX_AmrCore.H +++ b/Src/AmrCore/AMReX_AmrCore.H @@ -40,8 +40,8 @@ public: AmrCore (Geometry const& level_0_geom, AmrInfo const& amr_info); - AmrCore (AmrCore&& rhs) = default; - AmrCore& operator= (AmrCore&& rhs) = default; + AmrCore (AmrCore&& rhs); + AmrCore& operator= (AmrCore&& rhs); AmrCore (const AmrCore& rhs) = delete; AmrCore& operator= (const AmrCore& rhs) = delete; diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 59617ea4e75..54ec31eded2 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -11,6 +11,7 @@ #endif #include +#include #include namespace amrex { @@ -46,6 +47,26 @@ AmrCore::AmrCore (Geometry const& level_0_geom, AmrInfo const& amr_info) #endif } +AmrCore::AmrCore (AmrCore&& rhs) + : AmrMesh(std::move(rhs)) +{ +#ifdef AMREX_PARTICLES + m_gdb = std::move(rhs.m_gdb); + m_gdb->m_amrcore = this; +#endif +} + +AmrCore& AmrCore::operator= (AmrCore&& rhs) +{ + AmrMesh::operator=(std::move(rhs)); +#ifdef AMREX_PARTICLES + m_gdb = std::move(rhs.m_gdb); + m_gdb->m_amrcore = this; +#endif + + return *this; +} + AmrCore::~AmrCore () { } diff --git a/Src/AmrCore/AMReX_AmrParGDB.H b/Src/AmrCore/AMReX_AmrParGDB.H index 35e6d666db8..95cd16a874b 100644 --- a/Src/AmrCore/AMReX_AmrParGDB.H +++ b/Src/AmrCore/AMReX_AmrParGDB.H @@ -10,6 +10,8 @@ namespace amrex { class AmrParGDB : public ParGDBBase { + friend AmrCore; + public: explicit AmrParGDB (AmrCore* amr) noexcept diff --git a/Src/AmrCore/AMReX_ErrorList.H b/Src/AmrCore/AMReX_ErrorList.H index 9011655a50d..90f49b02749 100644 --- a/Src/AmrCore/AMReX_ErrorList.H +++ b/Src/AmrCore/AMReX_ErrorList.H @@ -382,6 +382,7 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); int m_max_level = 1000; Real m_min_time = std::numeric_limits<Real>::lowest(); Real m_max_time = std::numeric_limits<Real>::max(); + int m_volume_weighting = 0; RealBox m_realbox; AMRErrorTagInfo& SetMaxLevel (int max_level) noexcept { @@ -400,6 +401,10 @@ std::ostream& operator <<
(std::ostream& os, const ErrorList& elst); m_realbox = realbox; return *this; } + AMRErrorTagInfo& SetVolumeWeighting (int volume_weighting) noexcept { + m_volume_weighting = volume_weighting; + return *this; + } }; class AMRErrorTag diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index 5285cd4bfdc..1594ba740a9 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -291,6 +291,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { auto const& datma = mf->const_arrays(); auto threshold = m_value[level]; + auto const volume_weighting = m_info.m_volume_weighting; + auto geomdata = geom.data(); if (m_test == GRAD) { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept @@ -340,7 +342,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { - if (datma[bi](i,j,k) <= threshold) { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + if (datma[bi](i,j,k) * vol <= threshold) { tagma[bi](i,j,k) = tagval; } }); @@ -349,7 +352,8 @@ AMRErrorTag::operator() (TagBoxArray& tba, { ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { - if (datma[bi](i,j,k) >= threshold) { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + if (datma[bi](i,j,k) * vol >= threshold) { tagma[bi](i,j,k) = tagval; } }); diff --git a/Src/AmrCore/AMReX_FluxRegister.cpp b/Src/AmrCore/AMReX_FluxRegister.cpp index 308fc00ad9e..877d01258c7 100644 --- a/Src/AmrCore/AMReX_FluxRegister.cpp +++ b/Src/AmrCore/AMReX_FluxRegister.cpp @@ -721,7 +721,7 @@ FluxRegister::ClearInternalBorders (const Geometry& geom) } #ifdef AMREX_USE_GPU - // There is Gpu::synchronize in Parallelfor below internally. + // There is Gpu::streamSynchronize in ParallelFor below internally. ParallelFor(tags, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n, Array4BoxTag const& tag) { diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 798a552976d..6a989ffbbf1 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -503,10 +503,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const PODVector > dv_tags_offset(ntotblocks); int* dp_tags_offset = dv_tags_offset.data(); - Gpu::htod_memcpy(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); -#ifdef AMREX_USE_DPCPP - Gpu::synchronize(); -#endif + Gpu::htod_memcpy_async(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); PODVector > dv_tags(ntotaltags); IntVect* dp_tags = dv_tags.data(); diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 871343f173c..f06806babcd 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -528,7 +528,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, HYPRE_Init(); #ifdef HYPRE_USING_CUDA -#if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22100) +#if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22400) #ifdef HYPRE_USING_DEVICE_POOL /* device pool allocator */ @@ -541,7 +541,9 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, HYPRE_SetGPUMemoryPoolSize( mempool_bin_growth, mempool_min_bin, mempool_max_bin, mempool_max_cached_bytes ); #endif - HYPRE_SetSpGemmUseCusparse(false); + /* This API below used to be HYPRE_SetSpGemmUseCusparse(). This was changed in + Hypre master commit dfdd1cd12f. */ + HYPRE_SetSpGemmUseVendor(false);
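The comment above refers to an upstream rename: hypre generalized HYPRE_SetSpGemmUseCusparse() into HYPRE_SetSpGemmUseVendor(), presumably so that the same switch covers non-CUDA vendor sparse libraries. A code that must build against both older and newer hypre releases could guard the call on the release number, as in the sketch below; it assumes the usual HYPRE_RELEASE_NUMBER encoding (e.g., 22400 for version 2.24.0) and is an illustration, not part of this patch:

    #if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22400)
        HYPRE_SetSpGemmUseVendor(false);    // current name of the setting
    #elif defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22100)
        HYPRE_SetSpGemmUseCusparse(false);  // pre-2.24 name of the same setting
    #endif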
HYPRE_SetMemoryLocation(HYPRE_MEMORY_DEVICE); HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); HYPRE_SetUseGpuRand(true); @@ -603,7 +605,7 @@ void amrex::Finalize (amrex::AMReX* pamrex) { #ifdef AMREX_USE_GPU - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); #endif AMReX::erase(pamrex); diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index 59d62ad0fac..cccde1f200c 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -126,7 +126,14 @@ Arena::allocate_system (std::size_t nbytes) if (arena_info.use_cpu_memory) { p = std::malloc(nbytes); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif } else if (arena_info.device_use_hostalloc) { diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 8cde01a9ff4..625dc0dad27 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -314,7 +314,6 @@ private: std::string regname; }; - namespace BLProfilerUtils { void WriteHeader(std::ostream &os, const int colWidth, const Real maxlen, const bool bwriteavg); @@ -388,7 +387,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { } #define BL_PROFILE_INITIALIZE() amrex::BLProfiler::Initialize(); -#define BL_PROFILE_INITPARAMS() amrex::BLProfiler::InitParams(); +#define BL_PROFILE_INITPARAMS() amrex::BLProfiler::InitParams(); amrex::BLProfileSync::InitParams() #define BL_PROFILE_FINALIZE() amrex::BLProfiler::Finalize(); #define BL_TINY_PROFILE_INITIALIZE() @@ -426,7 +425,6 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { bl_profiler_##rvname.start(); #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) bl_profiler_##rvname.stop(); \ amrex::BLProfiler::RegionStop(fname); - #define BL_PROFILE_TINY_FLUSH() #define BL_PROFILE_FLUSH() { amrex::BLProfiler::Finalize(true); } @@ -475,7 +473,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_INITPARAMS() #define BL_PROFILE_FINALIZE() -#define BL_TINY_PROFILE_INITIALIZE() amrex::TinyProfiler::Initialize() +#define BL_TINY_PROFILE_INITIALIZE() amrex::TinyProfiler::Initialize(); amrex::BLProfileSync::InitParams() #define BL_TINY_PROFILE_FINALIZE() amrex::TinyProfiler::Finalize() #define BL_PROFILE(fname) BL_PROFILE_IMPL(fname, __COUNTER__) @@ -599,6 +597,51 @@ class BLProfiler #endif +// ============================================================ +// Sync macros +// ============================================================ + +#if (defined(BL_PROFILING) || defined(AMREX_TINY_PROFILING)) + +namespace amrex { + + class BLProfileSync { + + public: + static void Sync() noexcept; + static void Sync(const std::string& name) noexcept; + static void Sync(const char* name) noexcept; + + static void InitParams() noexcept; + + static void StartSyncRegion() noexcept; + static void StartSyncRegion(const std::string& name) noexcept; + static void StartSyncRegion(const char* name) noexcept; + static void EndSyncRegion() noexcept; + + private: + static int sync_counter; + static int use_prof_syncs; + }; + +} + +#define BL_PROFILE_SYNC() amrex::BLProfileSync::Sync() +#define BL_PROFILE_SYNC_TIMED(fname) amrex::BLProfileSync::Sync(fname) +#define BL_PROFILE_SYNC_START() amrex::BLProfileSync::StartSyncRegion() +#define BL_PROFILE_SYNC_START_TIMED(fname) amrex::BLProfileSync::StartSyncRegion(fname) +#define BL_PROFILE_SYNC_STOP() amrex::BLProfileSync::EndSyncRegion() + +#else + +#define BL_PROFILE_SYNC() +#define BL_PROFILE_SYNC_TIMED(fname) +#define BL_PROFILE_SYNC_START() +#define BL_PROFILE_SYNC_START_TIMED(fname) +#define BL_PROFILE_SYNC_STOP() + +#endif +
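These macros compile to nothing unless profiling is enabled, and the barriers themselves are further gated at run time by the amrex.use_profiler_syncs ParmParse flag (see InitParams() in the implementation below). They let a profile attribute waiting caused by load imbalance to a named barrier rather than folding it into the subsequent MPI wait. A sketch of bracketing a custom communication phase; the region name, the MultiFab mf, and geom are illustrative, and note that this patch already brackets FillBoundary and ParallelCopy internally (nested regions are handled via sync_counter, which only issues the barrier for the outermost region):

    // Illustrative only; enable at run time with amrex.use_profiler_syncs=1
    BL_PROFILE_SYNC_START_TIMED("SyncBeforeMyComms"); // timed barrier here
    mf.FillBoundary(geom.periodicity());              // communication region
    BL_PROFILE_SYNC_STOP();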
// ============================================================ // Third party macros. // Mutually exclusive, including from BL_PROFILE. diff --git a/Src/Base/AMReX_BLProfiler.cpp b/Src/Base/AMReX_BLProfiler.cpp index 516d0e11833..9d12b23708b 100644 --- a/Src/Base/AMReX_BLProfiler.cpp +++ b/Src/Base/AMReX_BLProfiler.cpp @@ -1692,3 +1692,93 @@ BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTOP_CPP_INT,bl_proffortfuncstop_cpp_int) #endif #endif + +#if (defined(BL_PROFILING) || defined(AMREX_TINY_PROFILING)) + +#include <AMReX_BLProfiler.H> +#include <AMReX_ParallelDescriptor.H> +#include <AMReX_ParallelContext.H> +#include <AMReX_ParmParse.H> + +namespace amrex { + +int BLProfileSync::use_prof_syncs = 0; +int BLProfileSync::sync_counter = 0; + +void +BLProfileSync::Sync () noexcept +{ + if (use_prof_syncs) + { ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); } +} + +void +BLProfileSync::Sync (const std::string& name) noexcept +{ + if (use_prof_syncs) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::Sync (const char* name) noexcept +{ + if (use_prof_syncs) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } +} + +void +BLProfileSync::InitParams () noexcept +{ + ParmParse pParse("amrex"); + pParse.queryAdd("use_profiler_syncs", BLProfileSync::use_prof_syncs); + + sync_counter = 0; +} + +void +BLProfileSync::StartSyncRegion () noexcept +{ + if (use_prof_syncs) { + if (sync_counter == 0) { + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const std::string& name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::StartSyncRegion (const char* name) noexcept { + if (use_prof_syncs) { + if (sync_counter == 0) { + BL_PROFILE(name); + ParallelDescriptor::Barrier(ParallelContext::CommunicatorSub()); + } + sync_counter++; + } +} + +void +BLProfileSync::EndSyncRegion () noexcept { + if (use_prof_syncs) { + sync_counter--; + } +} + +} + +#endif diff --git a/Src/Base/AMReX_BlockMutex.cpp b/Src/Base/AMReX_BlockMutex.cpp index 32e6d8face1..6e82fd9c4aa 100644 --- a/Src/Base/AMReX_BlockMutex.cpp +++ b/Src/Base/AMReX_BlockMutex.cpp @@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept { amrex::ignore_unused(state,N); amrex::Abort("xxxxx DPCPP todo"); #else - amrex::launch((N+255)/256, 256, Gpu::nullStream(), + amrex::launch((N+255)/256, 256, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -38,4 +38,3 @@ BlockMutex::~BlockMutex () { #endif } - diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 6bbfcecbb0f..61ef452b601 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -775,11 +775,11 @@ FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int nc detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); - // There is Gpu::synchronize in fab_to_fab. +// There is Gpu::streamSynchronize in fab_to_fab.
if (pbuffer != send_data[0]) { Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); - Gpu::synchronize(); + Gpu::streamSynchronize(); The_Arena()->free(pbuffer); } } @@ -808,7 +808,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; pbuffer = (char*)The_Arena()->alloc(szbuffer); Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -885,7 +885,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, } } - // There is Gpu::synchronize in fab_to_fab. + // There is Gpu::streamSynchronize in fab_to_fab. if (pbuffer != recv_data[0]) { The_Arena()->free(pbuffer); diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 166f3d964ea..6eef7caa579 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -1807,7 +1807,7 @@ FabArray::define (const BoxArray& bxs, if(info.alloc) { AllocFabs(*m_factory, m_dallocator.m_arena, info.tags); - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); #ifdef BL_USE_TEAM ParallelDescriptor::MyTeam().MemoryBarrier(); #endif @@ -2696,7 +2696,7 @@ template void FabArray::FillBoundaryAndSync (const Periodicity& period) { - BL_PROFILE("FAbArray::FillBoundaryAndSync()"); + BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (n_grow.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(0, nComp(), n_grow, period); FillBoundaryAndSync_finish(); @@ -2708,7 +2708,7 @@ void FabArray::FillBoundaryAndSync (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period) { - BL_PROFILE("FAbArray::FillBoundaryAndSync()"); + BL_PROFILE("FabArray::FillBoundaryAndSync()"); if (nghost.max() > 0 || !is_cell_centered()) { FillBoundaryAndSync_nowait(scomp, ncomp, nghost, period); FillBoundaryAndSync_finish(); @@ -2908,7 +2908,6 @@ template void FabArray::FillBoundary_nowait (int scomp, int ncomp, const Periodicity& period, bool cross) { - BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nGrowVect(), period, cross); } @@ -2918,7 +2917,6 @@ void FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross) { - BL_PROFILE("FillBoundary_nowait()"); FBEP_nowait(scomp, ncomp, nghost, period, cross); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index ef57321ebf6..40431a61557 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -10,6 +10,9 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, bool enforce_periodicity_only, bool override_sync) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE("FillBoundary_nowait()"); + AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress."); AMREX_ASSERT(!enforce_periodicity_only || !override_sync); @@ -168,7 +171,7 @@ FabArray::FillBoundary_finish () BL_PROFILE("FillBoundary_finish()"); - if (!fbd) { n_filled = IntVect::TheZeroVector(); return; } + if (!fbd) { n_filled = IntVect::TheZeroVector(); BL_PROFILE_SYNC_STOP(); return; } const FB* TheFB = fbd->fb; const int N_rcvs = TheFB->m_RcvTags->size(); @@ -240,6 +243,8 @@ FabArray::FillBoundary_finish () fbd.reset(); #endif + + BL_PROFILE_SYNC_STOP(); } template @@ -255,7 +260,6 @@ FabArray::ParallelCopy (const FabArray& src, const FabArrayBase::CPC * a_cpc) { BL_PROFILE("FabArray::ParallelCopy()"); - ParallelCopy_nowait(src, scomp, dcomp, ncomp, 
snghost, dnghost, period, op, a_cpc); ParallelCopy_finish(); } @@ -312,6 +316,7 @@ FabArray<FAB>::ParallelCopy_nowait (const FabArray<FAB>& src, const FabArrayBase::CPC * a_cpc, bool to_ghost_cells_only) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); BL_PROFILE("FabArray::ParallelCopy_nowait()"); AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress."); @@ -494,7 +499,7 @@ FabArray<FAB>::ParallelCopy_finish () BL_PROFILE("FabArray::ParallelCopy_finish()"); - if (!pcd) { return; } + if (!pcd) { BL_PROFILE_SYNC_STOP(); return; } const CPC* thecpc = pcd->cpc; @@ -559,6 +564,8 @@ FabArray<FAB>::ParallelCopy_finish () #endif /*BL_USE_MPI*/ + BL_PROFILE_SYNC_STOP(); + } template diff --git a/Src/Base/AMReX_FabConv.cpp b/Src/Base/AMReX_FabConv.cpp index b758dd94261..f45a4a26d33 100644 --- a/Src/Base/AMReX_FabConv.cpp +++ b/Src/Base/AMReX_FabConv.cpp @@ -375,15 +375,14 @@ _pd_extract_field (char const* in, // Byte reverse nitems words. Each word is nb bytes Long where nb is even. // +template <int NB> static void -_pd_btrvout (char* out, - Long nb, - Long nitems) +_pd_btrvout (char* out, Long nitems) { - for (Long jl = 0, nbo2 = nb >> 1; jl < nbo2; jl++) + for (int jl = 0, nbo2 = NB >> 1; jl < nbo2; jl++) { - Long jh = nb - jl - 1; + int jh = NB - jl - 1; char* p1 = out + jh; char* p2 = out + jl; for (Long i = 0L; i < nitems; i++) @@ -391,8 +390,8 @@ _pd_btrvout (char* out, char tmp = *p1; *p1 = *p2; *p2 = tmp; - p1 += nb; - p2 += nb; + p1 += NB; + p2 += NB; } } } @@ -458,8 +457,13 @@ _pd_insert_field (Long in_long, // // Reorder the bytes appropriately. // - if (l_order == REVERSE_ORDER) - _pd_btrvout(in, l_bytes, 1L); + if (l_order == REVERSE_ORDER) { + if (l_bytes == 4) { + _pd_btrvout<4>(in, 1L); + } else { // It's either 4 or 8. There is an assertion in PD_fconvert. + _pd_btrvout<8>(in, 1L); + } + } // // Copy the remaining aligned bytes over. // @@ -622,6 +626,9 @@ PD_fconvert (void* out, int onescmp) { // BL_PROFILE("PD_fconvert"); + + AMREX_ASSERT(l_bytes == 4 || l_bytes == 8); // Otherwise, we need to update _pd_btrvout + Long i, expn, expn_max, hexpn, mant, DeltaBias, hmbo, hmbi; int nbits, inbytes, outbytes, sign; int indxin, indxout, inrem, outrem, dindx; diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 773e6c69519..1a9910bc526 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -228,6 +228,64 @@ public: const BoxArray& grds, int idx, int grow) const; + + //! Return the volume of the specified cell.
+ AMREX_GPU_HOST_DEVICE AMREX_INLINE + static Real Volume (const IntVect& point, const GeometryData& geomdata) + { + auto dx = geomdata.CellSize(); + + Real vol; + +#if AMREX_SPACEDIM == 1 + + auto coord = geomdata.Coord(); + + if (coord == CoordSys::cartesian) { + // Cartesian + + vol = dx[0]; + } + else { + // Spherical + + Real rl = geomdata.ProbLo()[0] + static_cast<Real>(point[0]) * dx[0]; + Real rr = rl + dx[0]; + + vol = (4.0_rt / 3.0_rt) * M_PI * dx[0] * (rl * rl + rl * rr + rr * rr); + } + +#elif AMREX_SPACEDIM == 2 + + auto coord = geomdata.Coord(); + + if (coord == CoordSys::cartesian) { + // Cartesian + + vol = dx[0] * dx[1]; + } + else { + // Cylindrical + + Real r_l = geomdata.ProbLo()[0] + static_cast<Real>(point[0]) * dx[0]; + Real r_r = geomdata.ProbLo()[0] + static_cast<Real>(point[0]+1) * dx[0]; + + vol = M_PI * (r_l + r_r) * dx[0] * dx[1]; + } + +#else + + amrex::ignore_unused(point); + + // Cartesian + + vol = dx[0] * dx[1] * dx[2]; + +#endif + + return vol; + } + /** * \brief Compute d(log(A))/dr at cell centers in given region and * stuff the results into the passed MultiFab.
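Geometry::Volume above is what AMRErrorTag consults when volume weighting is enabled (see the AMReX_ErrorList changes earlier in this diff): for the LESS and GREATER tests the field value is multiplied by the cell volume before being compared with the threshold. A sketch of requesting this from user code follows; the field name and numbers are made up, and the AMRErrorTag constructor arguments are assumed from its existing uses rather than quoted from this patch:

    // Tag where density * cell-volume (roughly a per-cell mass) exceeds
    // the threshold, instead of comparing the density alone.
    AMRErrorTagInfo info;
    info.SetVolumeWeighting(1).SetMaxLevel(2); // setters chain; both return info
    AMRErrorTag mass_tag(1.0e-3, AMRErrorTag::GREATER, "density", info);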
diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 98c94eb7e31..7e207c49b9b 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -124,7 +124,7 @@ public: #ifdef AMREX_USE_GPU if (d_data) { - Gpu::dtoh_memcpy_async(h_p, d_data, n*sizeof(T)); + Gpu::dtoh_memcpy(h_p, d_data, n*sizeof(T)); } else #endif diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H index e5b8bf4903d..6c497f2a206 100644 --- a/Src/Base/AMReX_GpuBuffer.H +++ b/Src/Base/AMReX_GpuBuffer.H @@ -33,9 +33,6 @@ public: { d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); -#ifdef AMREX_USE_DPCPP - if (Gpu::onNullStream()) Gpu::synchronize(); -#endif } #endif } @@ -55,9 +52,6 @@ public: { d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); -#ifdef AMREX_USE_DPCPP - if (Gpu::onNullStream()) Gpu::synchronize(); -#endif } #endif } diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index 0c980907680..cc68770ff3f 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -222,7 +222,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::hostToDevice, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::hostToDevice, a.begin(), a.end(), b.begin()); */ template <class InIter, class OutIter> void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept @@ -256,7 +256,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::deviceToHost, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::deviceToHost, a.begin(), a.end(), b.begin()); */ template <class InIter, class OutIter> void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept @@ -290,7 +290,7 @@ namespace Gpu { * * Example usage: * - * Gpu::copy(Gpu::deviceToDevice, a.begin(), a.end(), b.begin()); + * Gpu::copyAsync(Gpu::deviceToDevice, a.begin(), a.end(), b.begin()); */ template <class InIter, class OutIter> void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept @@ -335,7 +335,7 @@ namespace Gpu { #endif #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -366,7 +366,7 @@ namespace Gpu { #endif #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } }} diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 4399279aa99..8a327704a1d 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -53,17 +53,13 @@ public: #if defined(AMREX_USE_GPU) static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; } - static gpuStream_t nullStream () noexcept { return gpu_default_stream; } #ifdef AMREX_USE_CUDA /** for backward compatibility */ static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; } #endif #ifdef AMREX_USE_DPCPP - static sycl::queue& nullQueue () noexcept { return *(gpu_default_stream.queue); } static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); } static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); } - static bool onNullStream () noexcept { return gpu_stream[OpenMP::get_thread_num()] == gpu_default_stream; } - static bool onNullStream (gpuStream_t stream) noexcept { return stream == gpu_default_stream; } #endif #endif @@ -90,13 +86,16 @@ public: static void synchronize () noexcept; /** - * Halt execution of code until GPU stream has finished processing all + * Halt execution of code until the current AMReX GPU stream has finished processing all * previously requested tasks. */ static void streamSynchronize () noexcept; -#ifdef AMREX_USE_DPCPP - static void nonNullStreamSynchronize () noexcept; -#endif + + /** + * Halt execution of code until all AMReX GPU streams have finished processing all + * previously requested tasks. + */ + static void streamSynchronizeAll () noexcept; #if defined(__CUDACC__) /** Generic graph selection. These should be called by users. */ @@ -176,9 +175,12 @@ private: static dim3 numThreadsMin; static dim3 numBlocksOverride, numThreadsOverride; + // We build gpu_default_stream and gpu_stream_pool. + // The non-owning gpu_stream stores the stream currently in use. + // gpu_stream is a vector so that it is thread-safe to write to it.
static gpuStream_t gpu_default_stream; - static Vector<gpuStream_t> gpu_stream_pool; - static Vector<gpuStream_t> gpu_stream; + static Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_streams + static Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads static gpuDeviceProp_t device_prop; static int memory_pools_supported; static unsigned int max_blocks_per_launch; @@ -198,12 +200,6 @@ gpuStream () noexcept { return Device::gpuStream(); } - -inline gpuStream_t -nullStream () noexcept -{ - return Device::nullStream(); -} #endif inline int @@ -224,84 +220,19 @@ streamSynchronize () noexcept Device::streamSynchronize(); } -#ifdef AMREX_USE_DPCPP inline void -nonNullStreamSynchronize () noexcept +streamSynchronizeAll () noexcept { - Device::nonNullStreamSynchronize(); + Device::streamSynchronizeAll(); } -#endif #ifdef AMREX_USE_GPU -inline void -htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q = Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("htod_memcpy: ")+ex.what()+"!!!!!"); - } - if (Device::onNullStream()) Gpu::synchronize(); -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_d, p_h, sz, hipMemcpyHostToDevice));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_d, p_h, sz, cudaMemcpyHostToDevice)); ) -#endif -} - -inline void -dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q = Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("dtoh_memcpy: ")+ex.what()+"!!!!!"); - } - Gpu::synchronize(); // To mimic cuda behavior -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_h, p_d, sz, hipMemcpyDeviceToHost));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_h, p_d, sz, cudaMemcpyDeviceToHost)); ) -#endif -} - -inline void -dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept -{ - if (sz == 0) return; -#ifdef AMREX_USE_DPCPP - Device::nonNullStreamSynchronize(); - auto& q = Device::nullQueue(); - q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); }); - try { - q.wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("dtod_memcpy: ")+ex.what()+"!!!!!"); - } -#else - AMREX_HIP_OR_CUDA( - AMREX_HIP_SAFE_CALL(hipMemcpy(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice));, - AMREX_CUDA_SAFE_CALL(cudaMemcpy(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice)); ) -#endif -} - inline void htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream()) Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); }); #else @@ -316,7 +247,6 @@ dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream()) Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); }); #else @@ -331,7 +261,6 @@ dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noe { if (sz == 0) return; #ifdef AMREX_USE_DPCPP - if (Device::onNullStream())
Device::nonNullStreamSynchronize(); auto& q = Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); }); #else @@ -341,20 +270,30 @@ dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noe #endif } -#endif +inline void +htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept +{ + if (sz == 0) return; + htod_memcpy_async(p_d, p_h, sz); + Gpu::streamSynchronize(); +} -#ifdef AMREX_USE_DPCPP -inline bool -onNullStream () +inline void +dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept { - return Device::onNullStream(); + if (sz == 0) return; + dtoh_memcpy_async(p_h, p_d, sz); + Gpu::streamSynchronize(); } -inline bool -onNullStream (gpuStream_t stream) +inline void +dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept { - return Device::onNullStream(stream); + if (sz == 0) return; + dtod_memcpy_async(p_d_dst, p_d_src, sz); + Gpu::streamSynchronize(); } + #endif }} diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index c921a37d355..05790837111 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -403,7 +403,7 @@ Device::initialize_gpu () AMREX_HIP_SAFE_CALL(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeFourByte)); } - gpu_default_stream = 0; + AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_default_stream)); for (int i = 0; i < max_gpu_streams; ++i) { AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_stream_pool[i])); } @@ -424,7 +424,7 @@ Device::initialize_gpu () AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); } - gpu_default_stream = 0; + AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_default_stream)); for (int i = 0; i < max_gpu_streams; ++i) { AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i])); #ifdef AMREX_USE_ACC @@ -559,7 +559,7 @@ Device::numDevicesUsed () noexcept int Device::streamIndex (gpuStream_t s) noexcept { - if (s == nullStream()) { + if (s == gpu_default_stream) { return -1; } else { auto it = std::find(std::begin(gpu_stream_pool), std::end(gpu_stream_pool), s); @@ -611,12 +611,19 @@ void Device::synchronize () noexcept { #ifdef AMREX_USE_DPCPP - nonNullStreamSynchronize(); + auto& q = *(gpu_default_stream.queue); try { - gpu_default_stream.queue->wait_and_throw(); + q.wait_and_throw(); } catch (sycl::exception const& ex) { amrex::Abort(std::string("synchronize: ")+ex.what()+"!!!!!"); } + for (auto const& s : gpu_stream_pool) { + try { + s.queue->wait_and_throw(); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("synchronize: ")+ex.what()+"!!!!!"); + } + } #else AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipDeviceSynchronize());, AMREX_CUDA_SAFE_CALL(cudaDeviceSynchronize()); ) @@ -634,24 +641,27 @@ Device::streamSynchronize () noexcept amrex::Abort(std::string("streamSynchronize: ")+ex.what()+"!!!!!"); } #else - AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_stream[OpenMP::get_thread_num()]));, - AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_stream[OpenMP::get_thread_num()])); ) + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpuStream()));, + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpuStream())); ) #endif } -#ifdef AMREX_USE_DPCPP void -Device::nonNullStreamSynchronize () noexcept +Device::streamSynchronizeAll () noexcept { +#ifdef AMREX_USE_GPU +#ifdef AMREX_USE_DPCPP + Device::synchronize(); +#else + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpu_default_stream));, + 
AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpu_default_stream)); ) for (auto const& s : gpu_stream_pool) { - try { - s.queue->wait_and_throw(); - } catch (sycl::exception const& ex) { - amrex::Abort(std::string("nonNullStreamSynchronize: ")+ex.what()+"!!!!!"); - } + AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(s));, + AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); ) } -} #endif +#endif +} #if defined(__CUDACC__) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ec2bbe49b3a..12206f69b70 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -9,7 +9,6 @@ namespace amrex { template void single_task (gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -24,7 +23,6 @@ template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); int nthreads_total = nthreads_per_block * nblocks; std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); @@ -49,7 +47,6 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, template void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept { - if (Gpu::onNullStream(stream)) Gpu::nonNullStreamSynchronize(); int nthreads_total = nthreads_per_block * nblocks; auto& q = *(stream.queue); try { @@ -72,8 +69,6 @@ void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -149,8 +144,6 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -203,8 +196,6 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -269,8 +260,6 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -331,11 +320,9 @@ void ParallelForRNG (T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) 
Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); - auto& q = Gpu::Device::nullQueue(); + auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { q.submit([&] (sycl::handler& h) { @@ -370,11 +357,9 @@ void ParallelForRNG (Box const& box, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); - auto& q = Gpu::Device::nullQueue(); + auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { q.submit([&] (sycl::handler& h) { @@ -416,8 +401,6 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); @@ -470,8 +453,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b const auto len1x = len1.x; const auto len2x = len2.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -533,8 +514,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len2x = len2.x; const auto len3x = len3.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -602,8 +581,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len1x = len1.x; const auto len2x = len2.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -673,8 +650,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len2x = len2.x; const auto len3x = len3.x; const auto ec = Gpu::ExecutionConfig(ncells); - // If we are on default queue, block all other streams - if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize(); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -897,7 +872,7 @@ ParallelForRNG (T n, L&& f) noexcept randState_t* rand_state = getRandState(); const auto ec = Gpu::ExecutionConfig(n); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = 
blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -905,6 +880,7 @@ ParallelForRNG (T n, L&& f) noexcept f(i,engine); } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } @@ -921,7 +897,7 @@ ParallelForRNG (Box const& box, L&& f) noexcept const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -935,6 +911,7 @@ ParallelForRNG (Box const& box, L&& f) noexcept f(i,j,k,engine); } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } @@ -951,7 +928,7 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), - ec.numThreads, 0, Gpu::nullStream(), // use null stream + ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; @@ -967,6 +944,7 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept } } }); + Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H index 827ee2265d0..89aa1f24bc9 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -9,7 +9,6 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -68,7 +67,6 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -151,7 +149,6 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -243,7 +240,6 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -298,7 +294,6 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int 
amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ @@ -371,7 +366,6 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - if (amrex::Gpu::onNullStream()) amrex::Gpu::nonNullStreamSynchronize(); \ int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \ int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \ auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \ diff --git a/Src/Base/AMReX_GpuUtility.cpp b/Src/Base/AMReX_GpuUtility.cpp index 60f73265d35..d40a1aef31b 100644 --- a/Src/Base/AMReX_GpuUtility.cpp +++ b/Src/Base/AMReX_GpuUtility.cpp @@ -38,6 +38,12 @@ StreamIter::init() noexcept amrex::ignore_unused(m_threadsafe); amrex::ignore_unused(m_sync); #if defined(AMREX_USE_GPU) + if (m_sync) { +#ifdef AMREX_USE_OMP +#pragma omp single +#endif + Gpu::streamSynchronize(); + } Gpu::Device::setStreamIndex(m_i); #elif defined(AMREX_USE_OMP) int nthreads = omp_get_num_threads(); @@ -59,7 +65,7 @@ StreamIter::init() noexcept StreamIter::~StreamIter () { #ifdef AMREX_USE_GPU if (m_sync) { - Gpu::synchronize(); + Gpu::streamSynchronizeAll(); } AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); @@ -79,4 +85,3 @@ StreamIter::operator++ () noexcept #endif }} - diff --git a/Src/Base/AMReX_IntegratorBase.H b/Src/Base/AMReX_IntegratorBase.H index 7e136dc5767..0b9c11c3ed9 100644 --- a/Src/Base/AMReX_IntegratorBase.H +++ b/Src/Base/AMReX_IntegratorBase.H @@ -97,23 +97,33 @@ struct IntegratorOps scomp={}, const Vector ncomp={}, bool Grow = true) { // Copy the contents of Other into Y const int size = Y.size(); + bool specify_components = scomp.size() > 0 && ncomp.size() == scomp.size(); for (int i = 0; i < size; ++i) { IntVect nGrow = Grow ? Other[i].nGrowVect() : IntVect(0); - amrex::MultiFab::Copy(Y[i], Other[i], 0, 0, Other[i].nComp(), nGrow); + const int iscomp = specify_components ? scomp[i] : 0; + const int incomp = specify_components ? ncomp[i] : Other[i].nComp(); + if (incomp > 0) { + amrex::MultiFab::Copy(Y[i], Other[i], iscomp, iscomp, incomp, nGrow); + } } } - static void Saxpy (T& Y, const amrex::Real a, const T& X, bool Grow = false) + static void Saxpy (T& Y, const amrex::Real a, const T& X, const Vector scomp={}, const Vector ncomp={}, bool Grow = false) { // Calculate Y += a * X const int size = Y.size(); + bool specify_components = scomp.size() > 0 && ncomp.size() == scomp.size(); for (int i = 0; i < size; ++i) { IntVect nGrow = Grow ? X[i].nGrowVect() : IntVect(0); - amrex::MultiFab::Saxpy(Y[i], a, X[i], 0, 0, X[i].nComp(), nGrow); + const int iscomp = specify_components ? scomp[i] : 0; + const int incomp = specify_components ? ncomp[i] : X[i].nComp(); + if (incomp > 0) { + amrex::MultiFab::Saxpy(Y[i], a, X[i], iscomp, iscomp, incomp, nGrow); + } } } @@ -130,18 +140,20 @@ struct IntegratorOps V.emplace_back(std::make_unique(Other.boxArray(), Other.DistributionMap(), Other.nComp(), nGrow)); } - static void Copy (T& Y, const T& Other, bool Grow = true) + static void Copy (T& Y, const T& Other, const int scomp=0, const int ncomp=-1, bool Grow = true) { // Copy the contents of Other into Y IntVect nGrow = Grow ? Other.nGrowVect() : IntVect(0); - amrex::MultiFab::Copy(Y, Other, 0, 0, Other.nComp(), nGrow); + const int mf_ncomp = ncomp > 0 ? 
ncomp : Other.nComp(); + amrex::MultiFab::Copy(Y, Other, scomp, scomp, mf_ncomp, nGrow); } - static void Saxpy (T& Y, const amrex::Real a, const T& X, bool Grow = false) + static void Saxpy (T& Y, const amrex::Real a, const T& X, const int scomp=0, const int ncomp=-1, bool Grow = false) { // Calculate Y += a * X IntVect nGrow = Grow ? X.nGrowVect() : IntVect(0); - amrex::MultiFab::Saxpy(Y, a, X, 0, 0, X.nComp(), nGrow); + const int mf_ncomp = ncomp > 0 ? ncomp : X.nComp(); + amrex::MultiFab::Saxpy(Y, a, X, scomp, scomp, mf_ncomp, nGrow); } }; diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index c0ad9e8ce87..e8a97256d3d 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -222,7 +222,7 @@ MFIter::~MFIter () #endif #ifdef AMREX_USE_GPU - if (device_sync) Gpu::synchronize(); + if (device_sync) Gpu::streamSynchronizeAll(); #endif #ifdef AMREX_USE_GPU @@ -251,6 +251,15 @@ MFIter::Initialize () "Nested or multiple active MFIters is not supported by default. This can be changed by calling MFIter::allowMultipleMFIters(true)"); } +#ifdef AMREX_USE_GPU + if (device_sync) { +#ifdef AMREX_USE_OMP +#pragma omp single +#endif + Gpu::streamSynchronize(); + } +#endif + if (flags & AllBoxes) // a very special case { index_map = &(fabArray.IndexArray()); diff --git a/Src/Base/AMReX_NonLocalBCImpl.H b/Src/Base/AMReX_NonLocalBCImpl.H index dbae5ff9b9f..d2ac3b5a41c 100644 --- a/Src/Base/AMReX_NonLocalBCImpl.H +++ b/Src/Base/AMReX_NonLocalBCImpl.H @@ -296,7 +296,7 @@ unpack_recv_buffer_gpu (FabArray& mf, int scomp, int ncomp, szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; pbuffer = (char*)The_Arena()->alloc(szbuffer); Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -327,7 +327,7 @@ unpack_recv_buffer_gpu (FabArray& mf, int scomp, int ncomp, tag.dfab(i,j,k,scomp+n) = proj(tag.sfab, si ,n); }); - // There is Gpu::synchronize in ParallelFor above + // There is Gpu::streamSynchronize in ParallelFor above if (pbuffer != recv_data[0]) { The_Arena()->free(pbuffer); diff --git a/Src/Base/AMReX_Partition.H b/Src/Base/AMReX_Partition.H index b36a1a24273..bff8156537d 100644 --- a/Src/Base/AMReX_Partition.H +++ b/Src/Base/AMReX_Partition.H @@ -51,7 +51,7 @@ namespace detail { amrex::Swap(p[i], p[n2-1-i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } } diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index fce6096f714..b02ad2f1e8d 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -40,7 +40,7 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) #ifdef AMREX_USE_DPCPP rand_engine_descr = new dpcpp_rng_descr - (Gpu::Device::nullQueue(), sycl::range<1>(N), gpu_seed, 1); + (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -54,7 +54,7 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) }); #endif - Gpu::synchronize(); + Gpu::streamSynchronize(); } } #endif @@ -182,7 +182,7 @@ amrex::DeallocateRandomSeedDevArray () #ifdef AMREX_USE_DPCPP if (rand_engine_descr) { delete rand_engine_descr; - Gpu::synchronize(); + Gpu::streamSynchronize(); rand_engine_descr = nullptr; } #else diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 57145322033..9c07b7b4a2a 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -813,7 +813,7 @@ bool AnyOf (N n, T const* v, P&& pred) #ifdef AMREX_USE_DPCPP const int num_ints =
std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::nullStream(), + amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]); if (gh.threadIdx() == 0) { *has_any = *dp; } @@ -834,7 +834,7 @@ bool AnyOf (N n, T const* v, P&& pred) } }); #else - amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, 0, + amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { __shared__ int has_any; if (threadIdx.x == 0) has_any = *dp; @@ -874,7 +874,7 @@ bool AnyOf (Box const& box, P&& pred) #ifdef AMREX_USE_DPCPP const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::nullStream(), + amrex::launch(ec.numBlocks.x, ec.numThreads.x, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]); if (gh.threadIdx() == 0) { *has_any = *dp; } @@ -899,7 +899,7 @@ bool AnyOf (Box const& box, P&& pred) } }); #else - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, 0, + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { __shared__ int has_any; if (threadIdx.x == 0) has_any = *dp; diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index fe5502583a3..be8af6d06b2 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -215,7 +215,7 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f) #endif }); - Gpu::synchronize(); + Gpu::streamSynchronize(); The_Pinned_Arena()->free(h_buffer); The_Arena()->free(d_buffer); } diff --git a/Src/Base/AMReX_TimeIntegrator.H b/Src/Base/AMReX_TimeIntegrator.H index e1f5a5f3d30..210f87d8ef8 100644 --- a/Src/Base/AMReX_TimeIntegrator.H +++ b/Src/Base/AMReX_TimeIntegrator.H @@ -25,6 +25,8 @@ template<class T> class TimeIntegrator { private: + amrex::Real time, timestep; + int step_number; std::unique_ptr<IntegratorBase<T> > integrator_ptr; std::function<void ()> post_timestep; @@ -75,6 +77,11 @@ private: // By default, do nothing set_rhs([](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}); set_fast_rhs([](T& /* S_rhs */, T& /* S_extra */, const T& /* S_data */, const amrex::Real /* time */){}); + + // By default, initialize time, timestep, step number to 0's + time = 0.0_rt; + timestep = 0.0_rt; + step_number = 0; } public: @@ -161,6 +168,26 @@ public: return integrator_ptr->get_fast_timestep(); } + int get_step_number () + { + return step_number; + } + + amrex::Real get_time () + { + return time; + } + + amrex::Real get_timestep () + { + return timestep; + } + + void set_timestep (amrex::Real dt) + { + timestep = dt; + } + std::function<void ()> get_post_timestep () { return post_timestep; @@ -187,12 +214,12 @@ public: } void integrate (T& S_old, T& S_new, amrex::Real start_time, const amrex::Real start_timestep, - const amrex::Real end_time, const int nsteps) + const amrex::Real end_time, const int start_step, const int max_steps) { - amrex::Real time = start_time; - amrex::Real timestep = start_timestep; + time = start_time; + timestep = start_timestep; bool stop_advance = false; - for (int
step_number = 0; step_number < nsteps && !stop_advance; ++step_number) + for (step_number = start_step; step_number < max_steps && !stop_advance; ++step_number) { if (end_time - time < timestep) { timestep = end_time - time; diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 3688e13032d..4c524a47104 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -95,7 +95,7 @@ TinyProfiler::start () noexcept #ifdef AMREX_USE_GPU if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #endif @@ -187,7 +187,7 @@ TinyProfiler::stop () noexcept #ifdef AMREX_USE_GPU if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #endif @@ -271,7 +271,7 @@ TinyProfiler::stop (unsigned boxUintID) noexcept } if (device_synchronize_around_region) { - amrex::Gpu::Device::synchronize(); + amrex::Gpu::streamSynchronize(); } #ifdef AMREX_USE_CUDA diff --git a/Src/Base/AMReX_VisMF.cpp b/Src/Base/AMReX_VisMF.cpp index 9a99efa7446..c4983276d02 100644 --- a/Src/Base/AMReX_VisMF.cpp +++ b/Src/Base/AMReX_VisMF.cpp @@ -1022,7 +1022,7 @@ VisMF::Write (const FabArray& mf, fio.write_header(hss, fab, fab.nComp()); hLength = static_cast(hss.tellp()); auto tstr = hss.str(); - memcpy(afPtr, tstr.c_str(), hLength); // ---- the fab header + std::memcpy(afPtr, tstr.c_str(), hLength); // ---- the fab header } Real const* fabdata = fab.dataPtr(); #ifdef AMREX_USE_GPU @@ -1721,7 +1721,7 @@ VisMF::Read (FabArray &mf, RealDescriptor::convertToNativeFormat(fabdata, readDataItems, afPtr, hdr.m_writtenRD); } else { - memcpy(fabdata, afPtr, fab.nBytes()); + std::memcpy(fabdata, afPtr, fab.nBytes()); } currentOffset += readDataItems * hdr.m_writtenRD.numBytes(); #ifdef AMREX_USE_GPU @@ -2342,7 +2342,7 @@ VisMF::AsyncWriteDoit (const FabArray& mf, const std::string& mf_name if (strip_ghost) { new_fab.copy(mf[mfi], bx); } else { - Gpu::dtoh_memcpy(new_fab.dataPtr(), mf[mfi].dataPtr(), new_fab.size()*sizeof(Real)); + Gpu::dtoh_memcpy_async(new_fab.dataPtr(), mf[mfi].dataPtr(), new_fab.size()*sizeof(Real)); } } else #endif diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 820c4b6f214..ab86c6dfcc6 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -260,7 +260,7 @@ target_sources( amrex # Profiling # this source file has zero symbols in default conditions, which creates # ranlib warnings, e.g., on macOS -if(AMREX_PROFILING OR AMReX_FORTRAN) +if(AMReX_BASE_PROFILE OR AMReX_FORTRAN) target_sources( amrex PRIVATE AMReX_BLProfiler.cpp diff --git a/Src/EB/AMReX_EBMultiFabUtil.cpp b/Src/EB/AMReX_EBMultiFabUtil.cpp index fc80d1b4e6a..f215c2a1ae0 100644 --- a/Src/EB/AMReX_EBMultiFabUtil.cpp +++ b/Src/EB/AMReX_EBMultiFabUtil.cpp @@ -939,7 +939,6 @@ EB_interp_CC_to_FaceCentroid (const MultiFab& cc, { Gpu::copy(Gpu::hostToDevice, a_bcs.begin(), a_bcs.begin()+ncomp, dv_bcs.begin()); d_bcs = dv_bcs.dataPtr(); - Gpu::synchronize(); } else #endif @@ -1078,7 +1077,6 @@ EB_interp_CellCentroid_to_FaceCentroid (const MultiFab& phi_centroid, { Gpu::copy(Gpu::hostToDevice, a_bcs.begin(), a_bcs.begin()+ncomp, dv_bcs.begin()); d_bcs = dv_bcs.dataPtr(); - Gpu::synchronize(); } else #endif diff --git a/Src/EB/AMReX_EB_utils.cpp b/Src/EB/AMReX_EB_utils.cpp index 1bcdfe08e65..035375cbda5 100644 --- a/Src/EB/AMReX_EB_utils.cpp +++ b/Src/EB/AMReX_EB_utils.cpp @@ -213,7 +213,7 @@ namespace amrex { div(i,j,k,icomp+n) = divc(i,j,k,n) + optmp(i,j,k,n); }); - 
Gpu::synchronize(); + Gpu::streamSynchronize(); } // @@ -658,7 +658,7 @@ void FillSignedDistance (MultiFab& mf, EB2::Level const& ls_lev, fab(i,j,k) = (-fluid_sign) * usd; } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } else { amrex::ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept diff --git a/Src/Extern/HDF5/AMReX_ParticleHDF5.H b/Src/Extern/HDF5/AMReX_ParticleHDF5.H index d4585046ceb..d861a3684d8 100644 --- a/Src/Extern/HDF5/AMReX_ParticleHDF5.H +++ b/Src/Extern/HDF5/AMReX_ParticleHDF5.H @@ -1477,21 +1477,21 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H index 4716b5b28c6..4270000ba84 100644 --- a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H +++ b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H @@ -201,7 +201,7 @@ void WriteHDF5ParticleDataSync (PC const& pc, } } - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); if(pc.GetUsePrePost()) { diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp index 489f6f3f5ef..6f3d03fde03 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp @@ -675,7 +675,7 @@ HypreABecLap3::loadVectors (MultiFab& soln, const MultiFab& rhs) bp[0] = Real(0.0); } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } HYPRE_IJVectorSetValues(b, nrows, cell_id_vec[mfi].dataPtr(), rhs_diag[mfi].dataPtr()); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index af53bd1c0e8..8f6921950e7 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -1312,7 +1312,7 @@ MLCellLinOp::BndryCondLoc::setLOBndryConds (const Geometry& geom, const Real* dx } } Gpu::copyAsync(Gpu::hostToDevice, hv.begin(), hv.end(), bctl_dv.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); } void diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index bafd1860967..cfa7595b515 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ 
b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -302,7 +302,7 @@ MLEBNodeFDLaplacian::prepareForSolve () }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); #if (AMREX_SPACEDIM == 2) if (m_rz) { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp index 543800f8286..eb21e4f2731 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp @@ -323,7 +323,7 @@ MLNodeLaplacian::restriction (int amrlev, int cmglev, MultiFab& crse, MultiFab& mlndlap_restriction_rap(i,j,k,pcrse_ma[box_no],fine_ma[box_no],st_ma[box_no],msk_ma[box_no]); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { @@ -451,7 +451,7 @@ MLNodeLaplacian::interpolation (int amrlev, int fmglev, MultiFab& fine, const Mu mlndlap_semi_interpadd_aa(i, j, k, fine_ma[box_no], crse_ma[box_no], sig_ma[box_no], msk_ma[box_no], idir); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp index e4a3bb65cc8..df5ab489d2f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp @@ -248,7 +248,7 @@ MLNodeLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i #endif }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } else #endif { @@ -410,7 +410,7 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } - Gpu::synchronize(); + Gpu::streamSynchronize(); if (m_smooth_num_sweeps > 1) nodalSync(amrlev, mglev, sol); } else // cpu diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp index dcd7dea627c..e3caf4ecced 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sten.cpp @@ -359,7 +359,7 @@ MLNodeLaplacian::buildStencil () }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); // This is only needed at the bottom. 
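// A minimal sketch (not from this patch) of the idiom the linear-solver hunks
// apply throughout: kernels queued by amrex::ParallelFor run on the current
// AMReX stream, so only that stream has to be drained before the host uses the
// result; a device-wide Gpu::synchronize() would also stall unrelated streams.
// The function name, box, and array below are illustrative assumptions.
#include <AMReX_Gpu.H>
#include <AMReX_Box.H>
#include <AMReX_Array4.H>

void scale_on_stream (amrex::Box const& bx, amrex::Array4<amrex::Real> const& fab)
{
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
    {
        fab(i,j,k) *= amrex::Real(2.0); // asynchronous with respect to the host
    });
    amrex::Gpu::streamSynchronize(); // wait on the current stream only
}
// (the buildStencil hunk continues below with its norm0 normalization line)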
m_s0_norm0[0].back() = m_stencil[0].back()->norm0(0,0) * m_normalization_threshold; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp index 065ae048420..9f8b01a8ce9 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp @@ -222,7 +222,7 @@ MLNodeTensorLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const Multi { mlndtslap_adotx(i,j,k, out_a[box_no], in_a[box_no], dmsk_a[box_no], s); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } @@ -264,7 +264,7 @@ MLNodeTensorLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const Mult mlndtslap_gauss_seidel(i, j, k, sol_a[box_no], rhs_a[box_no], dmsk_a[box_no], s); } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); #endif } diff --git a/Src/Particle/AMReX_DenseBins.H b/Src/Particle/AMReX_DenseBins.H index e5e35f0e252..93c9415ad25 100644 --- a/Src/Particle/AMReX_DenseBins.H +++ b/Src/Particle/AMReX_DenseBins.H @@ -217,7 +217,7 @@ public: Gpu::exclusive_scan(m_counts.begin(), m_counts.end(), m_offsets.begin()); - Gpu::copy(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin()); + Gpu::copyAsync(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin()); index_type* pperm = m_perm.dataPtr(); constexpr index_type max_index = std::numeric_limits<index_type>::max(); diff --git a/Src/Particle/AMReX_NeighborList.H b/Src/Particle/AMReX_NeighborList.H index 490d6a2b562..a015bede0d7 100644 --- a/Src/Particle/AMReX_NeighborList.H +++ b/Src/Particle/AMReX_NeighborList.H @@ -334,8 +334,9 @@ public: Gpu::HostVector<unsigned int> host_nbor_offsets(m_nbor_offsets.size()); Gpu::HostVector<unsigned int> host_nbor_list(m_nbor_list.size()); - Gpu::copy(Gpu::deviceToHost, m_nbor_offsets.begin(), m_nbor_offsets.end(), host_nbor_offsets.begin()); - Gpu::copy(Gpu::deviceToHost, m_nbor_list.begin(), m_nbor_list.end(), host_nbor_list.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_nbor_offsets.begin(), m_nbor_offsets.end(), host_nbor_offsets.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_nbor_list.begin(), m_nbor_list.end(), host_nbor_list.begin()); + Gpu::streamSynchronize(); for (int i = 0; i < numParticles(); ++i) { amrex::Print() << "Particle " << i << " could collide with: "; diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H index 10590fc400b..066318198fd 100644 --- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H +++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H @@ -101,15 +101,13 @@ buildNeighborMask () } m_code_array[grid].resize(h_code_arr.size()); - Gpu::copy(Gpu::hostToDevice, h_code_arr.begin(), h_code_arr.end(), + Gpu::copyAsync(Gpu::hostToDevice, h_code_arr.begin(), h_code_arr.end(), m_code_array[grid].begin()); m_isec_boxes[grid].resize(h_isec_boxes.size()); - Gpu::copy(Gpu::hostToDevice, h_isec_boxes.begin(), h_isec_boxes.end(), + Gpu::copyAsync(Gpu::hostToDevice, h_isec_boxes.begin(), h_isec_boxes.end(), m_isec_boxes[grid].begin()); - - - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); } RemoveDuplicates(neighbor_procs); @@ -186,7 +184,8 @@ buildNeighborCopyOp (bool use_boundary_neighbor) amrex::Gpu::exclusive_scan(counts.begin(), counts.end(), offsets.begin()); int num_copies; - Gpu::dtoh_memcpy(&num_copies, offsets.data()+np, sizeof(int)); + Gpu::dtoh_memcpy_async(&num_copies, offsets.data()+np, sizeof(int)); + Gpu::streamSynchronize(); neighbor_copy_op.resize(gid, lev, num_copies); @@ -267,11 +266,11 @@
updateNeighborsGPU (bool boundary_neighbors_only) } else { - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); pinned_snd_buffer.resize(snd_buffer.size()); Gpu::dtoh_memcpy_async(pinned_snd_buffer.dataPtr(), snd_buffer.dataPtr(), snd_buffer.size()); neighbor_copy_plan.buildMPIFinish(this->BufferMap()); - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); communicateParticlesStart(*this, neighbor_copy_plan, pinned_snd_buffer, pinned_rcv_buffer); rcv_buffer.resize(pinned_rcv_buffer.size()); unpackBuffer(*this, neighbor_copy_plan, snd_buffer, NeighborUnpackPolicy()); @@ -280,7 +279,7 @@ updateNeighborsGPU (bool boundary_neighbors_only) unpackRemotes(*this, neighbor_copy_plan, rcv_buffer, NeighborUnpackPolicy()); } - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); } template diff --git a/Src/Particle/AMReX_Particle.H b/Src/Particle/AMReX_Particle.H index 50cec1bb647..0548886010c 100644 --- a/Src/Particle/AMReX_Particle.H +++ b/Src/Particle/AMReX_Particle.H @@ -432,7 +432,7 @@ operator<< (std::ostream& os, const Particle& p) for (int i = 0; i < NReal; i++) os << p.rdata(i) << ' '; - for (int i = 2; i < NInt; i++) + for (int i = 0; i < NInt; i++) os << p.idata(i) << ' '; if (!os.good()) @@ -470,7 +470,7 @@ operator<< (std::ostream& os, const Particle<0, NInt>& p) for (int i = 0; i < AMREX_SPACEDIM; i++) os << p.pos(i) << ' '; - for (int i = 2; i < NInt; i++) + for (int i = 0; i < NInt; i++) os << p.idata(i) << ' '; if (!os.good()) diff --git a/Src/Particle/AMReX_ParticleBufferMap.cpp b/Src/Particle/AMReX_ParticleBufferMap.cpp index 0921aa69056..693db092b39 100644 --- a/Src/Particle/AMReX_ParticleBufferMap.cpp +++ b/Src/Particle/AMReX_ParticleBufferMap.cpp @@ -104,9 +104,10 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb) d_lev_offsets.resize(0); d_lev_offsets.resize(m_lev_offsets.size()); - Gpu::copy(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin()); - Gpu::copy(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin()); - Gpu::copy(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin()); + Gpu::streamSynchronize(); } bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb) const diff --git a/Src/Particle/AMReX_ParticleCommunication.H b/Src/Particle/AMReX_ParticleCommunication.H index 3e2b15027df..a9b169cb21c 100644 --- a/Src/Particle/AMReX_ParticleCommunication.H +++ b/Src/Particle/AMReX_ParticleCommunication.H @@ -186,22 +186,24 @@ struct ParticleCopyPlan m_box_offsets.begin()); m_box_counts_h.resize(m_box_counts_d.size()); - Gpu::copy(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(), - m_box_counts_h.begin()); + Gpu::copyAsync(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(), + m_box_counts_h.begin()); m_snd_pad_correction_h.resize(0); m_snd_pad_correction_h.resize(ParallelContext::NProcsSub()+1, 0); m_snd_pad_correction_d.resize(m_snd_pad_correction_h.size()); - Gpu::copy(Gpu::hostToDevice, m_snd_pad_correction_h.begin(), m_snd_pad_correction_h.end(), - m_snd_pad_correction_d.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_snd_pad_correction_h.begin(), m_snd_pad_correction_h.end(), + 
m_snd_pad_correction_d.begin()); d_int_comp_mask.resize(int_comp_mask.size()); - Gpu::copy(Gpu::hostToDevice, int_comp_mask.begin(), int_comp_mask.end(), - d_int_comp_mask.begin()); + Gpu::copyAsync(Gpu::hostToDevice, int_comp_mask.begin(), int_comp_mask.end(), + d_int_comp_mask.begin()); d_real_comp_mask.resize(real_comp_mask.size()); - Gpu::copy(Gpu::hostToDevice, real_comp_mask.begin(), real_comp_mask.end(), - d_real_comp_mask.begin()); + Gpu::copyAsync(Gpu::hostToDevice, real_comp_mask.begin(), real_comp_mask.end(), + d_real_comp_mask.begin()); + + Gpu::streamSynchronize(); int NStructReal = PC::ParticleContainerType::NStructReal; int NStructInt = PC::ParticleContainerType::NStructInt; @@ -572,7 +574,7 @@ void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, Un Vector offsets; policy.resizeTiles(tiles, sizes, offsets); - Gpu::Device::synchronize(); + Gpu::streamSynchronize(); int uindex = 0; int procindex = 0, rproc = plan.m_rcv_box_pids[0]; for (int i = 0, N = plan.m_rcv_box_counts.size(); i < N; ++i) @@ -604,7 +606,7 @@ void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, Un p_comm_real, p_comm_int); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } #else diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index cd6ae4c6a3d..82832e6e09e 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -864,7 +864,7 @@ ParticleContainer }); last_offset+=next_offset; - Gpu::synchronize(); + Gpu::streamSynchronize(); } // last_offset should equal virts.numParticles() @@ -876,10 +876,10 @@ ParticleContainer auto np = src_tile.numParticles(); virts.resize(virts_offset+np); virts_offset += filterAndTransformParticles(virts, src_tile, FilterVirt(assign_buffer_grid,plo,dxi,domain), TransformerVirt(),0,virts_offset); - Gpu::synchronize(); + Gpu::streamSynchronize(); } - //Resize implicitly includes a Gpu::synchronize() virts.resize(virts_offset); + Gpu::streamSynchronize(); } } @@ -966,8 +966,8 @@ ParticleContainer ghosts.resize(ghost_offset+np); ghost_offset += filterAndTransformParticles(ghosts, src_tile, AssignGridFilter(assign_grid,gid,level,nGrow), TransformerGhost(),0,ghost_offset); } - //Resize implicitly includes a Gpu::synchronize() ghosts.resize(ghost_offset); + Gpu::streamSynchronize(); } template ::So dst[i] = src.m_aos[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetArrayOfStructs()().swap(tmp_particles); } @@ -1142,7 +1142,7 @@ ParticleContainer::So dst[i] = src[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real); } @@ -1156,7 +1156,7 @@ ParticleContainer::So dst[i] = src[inds[i]]; }); - Gpu::synchronize(); + Gpu::streamSynchronize(); ptile.GetStructOfArrays().GetIntData(comp).swap(tmp_int); } @@ -1332,13 +1332,13 @@ ParticleContainer } else { - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); Gpu::PinnedVector pinned_snd_buffer; Gpu::PinnedVector pinned_rcv_buffer; pinned_snd_buffer.resize(snd_buffer.size()); Gpu::dtoh_memcpy_async(pinned_snd_buffer.dataPtr(), snd_buffer.dataPtr(), snd_buffer.size()); plan.buildMPIFinish(BufferMap()); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); communicateParticlesStart(*this, plan, pinned_snd_buffer, pinned_rcv_buffer); rcv_buffer.resize(pinned_rcv_buffer.size()); unpackBuffer(*this, plan, snd_buffer, RedistributeUnpackPolicy()); @@ -1347,7 +1347,7 @@ ParticleContainer 
unpackRemotes(*this, plan, rcv_buffer, RedistributeUnpackPolicy()); } - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); AMREX_ASSERT(numParticlesOutOfRange(*this, lev_min, lev_max, nGrow) == 0); #else amrex::ignore_unused(lev_min,lev_max,nGrow,local,remove_negative); @@ -1954,22 +1954,22 @@ RedistributeMPI (std::map >& not_ours, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H index 9beb86feb8a..cedbdb329c4 100644 --- a/Src/Particle/AMReX_ParticleIO.H +++ b/Src/Particle/AMReX_ParticleIO.H @@ -995,21 +995,21 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } diff --git a/Src/Particle/AMReX_ParticleInit.H b/Src/Particle/AMReX_ParticleInit.H index 0f50370c4a1..ee8afc778e2 100644 --- a/Src/Particle/AMReX_ParticleInit.H +++ b/Src/Particle/AMReX_ParticleInit.H @@ -330,18 +330,19 @@ ParticleContainer auto new_size = old_size + src_tile.size(); 
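// A minimal sketch (not from this patch) of the batching pattern the particle
// hunks adopt: queue every host-to-device transfer with Gpu::copyAsync and pay
// for one Gpu::streamSynchronize() at the end, instead of a blocking Gpu::copy
// per component. The container names here are illustrative assumptions.
#include <cstddef>
#include <AMReX_Gpu.H>
#include <AMReX_GpuContainers.H>
#include <AMReX_Vector.H>

void upload_components (amrex::Vector<amrex::Gpu::HostVector<amrex::Real>> const& host,
                        amrex::Vector<amrex::Gpu::DeviceVector<amrex::Real>>& device)
{
    for (std::size_t i = 0; i < host.size(); ++i) {
        device[i].resize(host[i].size());
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice,
                              host[i].begin(), host[i].end(), device[i].begin());
    }
    amrex::Gpu::streamSynchronize(); // one wait covers every queued copy
}
// (the ParticleInit.H hunk continues below with dst_tile.resize(new_size))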
dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); if((host_real_attribs[lev][std::make_pair(grid, tile)]).size() > (long unsigned int) NArrayReal) for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); - } + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + } } } + Gpu::streamSynchronize(); Redistribute(); } @@ -402,17 +403,18 @@ ParticleContainer auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } } } + Gpu::streamSynchronize(); Redistribute(); @@ -836,10 +838,11 @@ InitFromBinaryFile (const std::string& file, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); } } + Gpu::streamSynchronize(); Redistribute(); @@ -1093,24 +1096,25 @@ InitRandom (Long icount, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + 
dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } + Gpu::streamSynchronize(); AMREX_ASSERT(OK()); } @@ -1197,25 +1201,27 @@ InitRandom (Long icount, auto new_size = old_size + src_tile.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), - host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(), + host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } - // Let Redistribute() sort out where the particles beLong. + Gpu::streamSynchronize(); + + // Let Redistribute() sort out where the particles belong. Redistribute(); } @@ -1399,7 +1405,7 @@ InitOnePerCell (Real x_off, Real y_off, Real z_off, const ParticleInitData& pdat m_particles[0][ind].resize(ptile_tmp.numParticles()); amrex::copyParticles(m_particles[0][ind], ptile_tmp); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); } Redistribute(); @@ -1521,27 +1527,27 @@ InitNRandomPerCell (int n_per_cell, const ParticleInitData& pdata) auto new_size = old_size + src_tid.size(); dst_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, src_tid.begin(), src_tid.end(), - dst_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, src_tid.begin(), src_tid.end(), + dst_tile.GetArrayOfStructs().begin() + old_size); for (int i = 0; i < NArrayReal; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), - host_real_attribs[host_lev][std::make_pair(gid,tid)][i].end(), - dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), + host_real_attribs[host_lev][std::make_pair(gid,tid)][i].end(), + dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size); } for (int i = 0; i < NArrayInt; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), - host_int_attribs[host_lev][std::make_pair(gid,tid)][i].end(), - dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int_attribs[host_lev][std::make_pair(gid,tid)][i].begin(), + host_int_attribs[host_lev][std::make_pair(gid,tid)][i].end(), + dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size); } } } - + Gpu::streamSynchronize(); } if (m_verbose > 1) diff --git a/Src/Particle/AMReX_ParticleLocator.H b/Src/Particle/AMReX_ParticleLocator.H index 4d5f2c83e14..f8e3da93163 100644 
--- a/Src/Particle/AMReX_ParticleLocator.H +++ b/Src/Particle/AMReX_ParticleLocator.H @@ -116,7 +116,7 @@ public: for (int i = 0; i < num_boxes; ++i) m_host_boxes.push_back(ba[i]); m_device_boxes.resize(num_boxes); - Gpu::copy(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin()); + Gpu::copyAsync(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin()); // compute the lo, hi and the max box size in each direction ReduceOps)*num_levels); - Gpu::synchronize(); + Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(), + sizeof(AssignGrid)*num_levels); + Gpu::streamSynchronize(); #else for (int lev = 0; lev < num_levels; ++lev) { @@ -329,9 +329,9 @@ public: m_locators[lev].setGeometry(a_gdb->Geom(lev)); h_grid_assignors[lev] = m_locators[lev].getGridAssignor(); } - Gpu::htod_memcpy(m_grid_assignors.data(), h_grid_assignors.data(), - sizeof(AssignGrid)*num_levels); - Gpu::synchronize(); + Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(), + sizeof(AssignGrid)*num_levels); + Gpu::streamSynchronize(); #else for (int lev = 0; lev < num_levels; ++lev) { diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H index 1bb34058491..4374197d335 100644 --- a/Src/Particle/AMReX_ParticleTile.H +++ b/Src/Particle/AMReX_ParticleTile.H @@ -583,7 +583,7 @@ struct ParticleTile #ifdef AMREX_USE_GPU if ((h_runtime_r_ptrs.size() > 0) || (h_runtime_i_ptrs.size() > 0)) { - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif @@ -638,7 +638,7 @@ struct ParticleTile #ifdef AMREX_USE_GPU if ((h_runtime_r_cptrs.size() > 0) || (h_runtime_i_cptrs.size() > 0)) { - Gpu::synchronize(); + Gpu::streamSynchronize(); } #endif diff --git a/Src/Particle/AMReX_ParticleTransformation.H b/Src/Particle/AMReX_ParticleTransformation.H index 95e4a747ae2..8fe57749c4a 100644 --- a/Src/Particle/AMReX_ParticleTransformation.H +++ b/Src/Particle/AMReX_ParticleTransformation.H @@ -162,7 +162,7 @@ void copyParticles (DstTile& dst, const SrcTile& src, copyParticle(dst_data, src_data, src_start+i, dst_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -216,7 +216,7 @@ void transformParticles (DstTile& dst, const SrcTile& src, f(dst_data, src_data, src_start+i, dst_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -277,7 +277,7 @@ void transformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& src, f(dst1_data, dst2_data, src_data, src_start+i, dst1_start+i, dst2_start+i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ -338,7 +338,7 @@ Index filterParticles (DstTile& dst, const SrcTile& src, const Index* mask, if (mask[i]) copyParticle(dst_data, src_data, src_start+i, dst_start+p_offsets[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -434,7 +434,7 @@ Index filterAndTransformParticles (DstTile& dst, const SrcTile& src, Index* mask dst_start+p_offsets[src_start+i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -522,7 +522,7 @@ Index filterAndTransformParticles (DstTile1& dst1, DstTile2& dst2, if (mask[i]) f(dst_data1, dst_data2, src_data, i, p_offsets[i], p_offsets[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); return last_mask + last_offset; } @@ -627,7 +627,7 @@ void gatherParticles (PTile& dst, const PTile& src, N np, const Index* inds) copyParticle(dst_data, src_data, inds[i], i); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } /** @@ 
-657,7 +657,7 @@ void scatterParticles (PTile& dst, const PTile& src, N np, const Index* inds) copyParticle(dst_data, src_data, i, inds[i]); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } diff --git a/Src/Particle/AMReX_SparseBins.H b/Src/Particle/AMReX_SparseBins.H index 93635784822..d9743c8282e 100644 --- a/Src/Particle/AMReX_SparseBins.H +++ b/Src/Particle/AMReX_SparseBins.H @@ -139,13 +139,15 @@ public: } m_bins.resize(host_bins.size()); - Gpu::copy(Gpu::hostToDevice, host_bins.begin(), host_bins.end(), m_bins.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_bins.begin(), host_bins.end(), m_bins.begin()); m_offsets.resize(host_offsets.size()); - Gpu::copy(Gpu::hostToDevice, host_offsets.begin(), host_offsets.end(), m_offsets.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_offsets.begin(), host_offsets.end(), m_offsets.begin()); m_perm.resize(host_perm.size()); - Gpu::copy(Gpu::hostToDevice, host_perm.begin(), host_perm.end(), m_perm.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_perm.begin(), host_perm.end(), m_perm.begin()); + + Gpu::streamSynchronize(); } //! \brief the number of items in the container diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H index 2d09b114934..5a71d015763 100644 --- a/Src/Particle/AMReX_WriteBinaryParticleData.H +++ b/Src/Particle/AMReX_WriteBinaryParticleData.H @@ -184,11 +184,11 @@ packIOData (Vector& idata, Vector& rdata, const PC& pc, int l typename PC::IntVector write_int_comp_d(write_int_comp.size()); typename PC::IntVector write_real_comp_d(write_real_comp.size()); - Gpu::copy(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(), - write_int_comp_d.begin()); - Gpu::copy(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(), - write_real_comp_d.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(), + write_int_comp_d.begin()); + Gpu::copyAsync(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(), + write_real_comp_d.begin()); + Gpu::Device::streamSynchronize(); const auto write_int_comp_d_ptr = write_int_comp_d.data(); const auto write_real_comp_d_ptr = write_real_comp_d.data(); @@ -243,9 +243,9 @@ packIOData (Vector& idata, Vector& rdata, const PC& pc, int l poffset += num_copies; } - Gpu::copy(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin()); - Gpu::copy(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin()); + Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin()); + Gpu::Device::streamSynchronize(); } template @@ -375,7 +375,7 @@ void WriteBinaryParticleDataSync (PC const& pc, } } - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); if(pc.GetUsePrePost()) { diff --git a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp index 9b2452ad46c..515dbf75261 100644 --- a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp +++ b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp @@ -678,7 +678,7 @@ AmrCoreAdv::timeStepNoSubcycling (Real time, int iteration) } } - DefineVelocityAllLevels(time); + DefineVelocityAllLevels(time+0.5_rt*dt[0]); AdvancePhiAllLevels (time, dt[0], iteration); #ifdef AMREX_PARTICLES diff --git a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp index ad4d186e607..db69749a85f 
100644 --- a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp +++ b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp @@ -414,6 +414,10 @@ AmrLevelAdv::estTimeStep (Real) GpuArray prob_lo = geom.ProbLoArray(); const Real cur_time = state[Phi_Type].curTime(); const MultiFab& S_new = get_new_data(Phi_Type); + Real pred_time = cur_time; + if (cur_time > 0._rt) { + pred_time += 0.5_rt*parent->dtLevel(level); + } #ifdef AMREX_USE_OMP #pragma omp parallel reduction(min:dt_est) @@ -431,7 +435,7 @@ AmrLevelAdv::estTimeStep (Real) // Note: no need to set elixir on uface[i] temporary fabs since // norm kernel launch is blocking. - get_face_velocity(cur_time, + get_face_velocity(pred_time, AMREX_D_DECL(uface[0], uface[1], uface[2]), dx, prob_lo); diff --git a/Tests/EB_CNS/Source/CNS.cpp b/Tests/EB_CNS/Source/CNS.cpp index 9e6d78716de..ab289e557f8 100644 --- a/Tests/EB_CNS/Source/CNS.cpp +++ b/Tests/EB_CNS/Source/CNS.cpp @@ -340,7 +340,7 @@ CNS::errorEst (TagBoxArray& tags, int, int, Real /*time*/, int, int) } } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } if (level < refine_max_dengrad_lev) @@ -361,7 +361,7 @@ CNS::errorEst (TagBoxArray& tags, int, int, Real /*time*/, int, int) { cns_tag_denerror(i, j, k, tagma[box_no], rhoma[box_no], dengrad_threshold, tagval); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } @@ -424,7 +424,7 @@ CNS::read_params () if (!refine_boxes.empty()) { #ifdef AMREX_USE_GPU dp_refine_boxes = (RealBox*)The_Arena()->alloc(sizeof(RealBox)*refine_boxes.size()); - Gpu::htod_memcpy(dp_refine_boxes, refine_boxes.data(), sizeof(RealBox)*refine_boxes.size()); + Gpu::htod_memcpy_async(dp_refine_boxes, refine_boxes.data(), sizeof(RealBox)*refine_boxes.size()); #else dp_refine_boxes = refine_boxes.data(); #endif @@ -439,7 +439,7 @@ CNS::read_params () pp.query("T_S" , h_parm->T_S); h_parm->Initialize(); - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_parm, h_parm+1, d_parm); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_parm, h_parm+1, d_parm); // eb_weights_type: // 0 -- weights = 1 @@ -447,12 +447,16 @@ CNS::read_params () // 2 -- use_mass_as_eb_weights // 3 -- use_volfrac_as_eb_weights pp.query("eb_weights_type", eb_weights_type); - if (eb_weights_type < 0 || eb_weights_type > 3) + if (eb_weights_type < 0 || eb_weights_type > 3) { amrex::Abort("CNS: eb_weights_type must be 0,1,2 or 3"); + } pp.query("do_reredistribution", do_reredistribution); - if (do_reredistribution != 0 && do_reredistribution != 1) + if (do_reredistribution != 0 && do_reredistribution != 1) { amrex::Abort("CNS: do_reredistibution must be 0 or 1"); + } + + amrex::Gpu::streamSynchronize(); } void diff --git a/Tests/GPU/AnyOf/main.cpp b/Tests/GPU/AnyOf/main.cpp index 00ddb27c13a..20ee03d3982 100644 --- a/Tests/GPU/AnyOf/main.cpp +++ b/Tests/GPU/AnyOf/main.cpp @@ -54,7 +54,7 @@ void main_main () { BL_PROFILE("Vector AnyOf"); - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); bool anyof_M = Reduce::AnyOf(nitem, vec.dataPtr(), [=] AMREX_GPU_DEVICE (Real item) noexcept -> int diff --git a/Tests/GPU/AtomicIf/main.cpp b/Tests/GPU/AtomicIf/main.cpp index 379efcdfdaf..843ae313a95 100644 --- a/Tests/GPU/AtomicIf/main.cpp +++ b/Tests/GPU/AtomicIf/main.cpp @@ -53,11 +53,11 @@ void test () }); }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); std::vector v_h(N); Gpu::copyAsync(Gpu::deviceToHost, v_d.begin(), v_d.end(), v_h.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); // The first 4000 entries should all be 0.0 for (int i = 0; i < 4000; ++i) { @@ -81,4 +81,3 @@ int 
main (int argc, char* argv[]) test(); amrex::Finalize(); } - diff --git a/Tests/GPU/RandomNumberGeneration/main.cpp b/Tests/GPU/RandomNumberGeneration/main.cpp index 5eb913ab0ee..66746cc0d83 100644 --- a/Tests/GPU/RandomNumberGeneration/main.cpp +++ b/Tests/GPU/RandomNumberGeneration/main.cpp @@ -45,7 +45,7 @@ void RandomNumGen () z_d_ptr[i] = amrex::Random(engine); }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } std::vector x_h(Ndraw); @@ -54,7 +54,7 @@ void RandomNumGen () Gpu::copyAsync(Gpu::deviceToHost, x_d.begin(), x_d.end(), x_h.begin()); Gpu::copyAsync(Gpu::deviceToHost, y_d.begin(), y_d.end(), y_h.begin()); Gpu::copyAsync(Gpu::deviceToHost, z_d.begin(), z_d.end(), z_h.begin()); - Gpu::synchronize(); + Gpu::streamSynchronize(); Real xmean=0., ymean=0., zmean=0., xvar=0., yvar=0., zvar=0.; for (int i = 0; i < Ndraw; ++i) { diff --git a/Tests/GPU/Vector/main.cpp b/Tests/GPU/Vector/main.cpp index 00cec25474a..cfa44437fb5 100644 --- a/Tests/GPU/Vector/main.cpp +++ b/Tests/GPU/Vector/main.cpp @@ -94,7 +94,7 @@ void async_test() amrex::Print() << "Async Synching -- should print second." << std::endl; #endif - Gpu::Device::synchronize(); + Gpu::Device::streamSynchronize(); } int main (int argc, char* argv[]) @@ -113,4 +113,3 @@ int main (int argc, char* argv[]) } amrex::Finalize(); } - diff --git a/Tests/Particles/AsyncIO/main.cpp b/Tests/Particles/AsyncIO/main.cpp index ce484e7b773..825f077f5d9 100644 --- a/Tests/Particles/AsyncIO/main.cpp +++ b/Tests/Particles/AsyncIO/main.cpp @@ -121,44 +121,44 @@ class MyParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } Redistribute(); diff --git a/Tests/Particles/DenseBins/main.cpp b/Tests/Particles/DenseBins/main.cpp index d1ed276fe9e..08f9049dc3f 100644 --- a/Tests/Particles/DenseBins/main.cpp +++ b/Tests/Particles/DenseBins/main.cpp @@ -60,8 +60,8 @@ void testGPU (int nbins, const amrex::Vector& items) { // copy to device 
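// A minimal sketch (not from this patch) of the device-to-host readback idiom
// these tests use: an asynchronous copy into host memory is complete only after
// the stream is synchronized, so the host must not read the data before the
// Gpu::streamSynchronize() call. The name x_d is an illustrative assumption.
#include <vector>
#include <AMReX_Gpu.H>
#include <AMReX_GpuContainers.H>

std::vector<amrex::Real> read_back (amrex::Gpu::DeviceVector<amrex::Real> const& x_d)
{
    std::vector<amrex::Real> x_h(x_d.size());
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, x_d.begin(), x_d.end(), x_h.begin());
    amrex::Gpu::streamSynchronize(); // x_h is valid on the host only after this
    return x_h;
}
// (the DenseBins test continues below with the copy to device it announced)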
Gpu::DeviceVector items_d(items.size()); - Gpu::copy(Gpu::hostToDevice, items.begin(), items.end(), items_d.begin()); - Gpu::Device::synchronize(); + Gpu::copyAsync(Gpu::hostToDevice, items.begin(), items.end(), items_d.begin()); + Gpu::Device::streamSynchronize(); amrex::DenseBins bins; bins.build(BinPolicy::GPU, items_d.size(), items_d.data(), nbins, [=] AMREX_GPU_DEVICE (int j) noexcept -> unsigned int { return j ; }); diff --git a/Tests/Particles/Intersection/main.cpp b/Tests/Particles/Intersection/main.cpp index ab760b171a2..04e2a491ddb 100644 --- a/Tests/Particles/Intersection/main.cpp +++ b/Tests/Particles/Intersection/main.cpp @@ -83,7 +83,7 @@ void testIntersection() int num_cells = host_cells.size(); Gpu::DeviceVector device_cells(num_cells); - Gpu::copy(Gpu::hostToDevice, host_cells.begin(), host_cells.end(), device_cells.begin()); + Gpu::copyAsync(Gpu::hostToDevice, host_cells.begin(), host_cells.end(), device_cells.begin()); Gpu::DeviceVector device_grids(num_cells); diff --git a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp index b934f664f4e..d94528dc1c7 100644 --- a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp +++ b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp @@ -112,27 +112,27 @@ InitParticles(const IntVect& a_num_particles_per_cell, auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NumRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NumIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } amrex::PrintToFile("neighbor_test") << " Number of particles is " << this->TotalNumberOfParticles()<< " \n"; diff --git a/Tests/Particles/ParallelContext/main.cpp b/Tests/Particles/ParallelContext/main.cpp index 311c39a3bab..ff63697a049 100644 --- a/Tests/Particles/ParallelContext/main.cpp +++ b/Tests/Particles/ParallelContext/main.cpp @@ -126,44 +126,44 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - 
host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } RedistributeLocal(); diff --git a/Tests/Particles/ParticleReduce/main.cpp b/Tests/Particles/ParticleReduce/main.cpp index c03c4b49240..e798f8b5edf 100644 --- a/Tests/Particles/ParticleReduce/main.cpp +++ b/Tests/Particles/ParticleReduce/main.cpp @@ -99,23 +99,23 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } } }; diff --git a/Tests/Particles/ParticleTransformations/main.cpp b/Tests/Particles/ParticleTransformations/main.cpp index e60a6762f93..cc13b24f32d 100644 --- a/Tests/Particles/ParticleTransformations/main.cpp +++ b/Tests/Particles/ParticleTransformations/main.cpp @@ -100,23 +100,23 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_particles.begin(), host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_real[i].begin(), host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, host_int[i].begin(), host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } } }; diff --git a/Tests/Particles/Redistribute/main.cpp 
b/Tests/Particles/Redistribute/main.cpp index d5a416ae598..2cbdbe955a7 100644 --- a/Tests/Particles/Redistribute/main.cpp +++ b/Tests/Particles/Redistribute/main.cpp @@ -138,44 +138,44 @@ class TestParticleContainer auto new_size = old_size + host_particles.size(); particle_tile.resize(new_size); - Gpu::copy(Gpu::hostToDevice, - host_particles.begin(), - host_particles.end(), - particle_tile.GetArrayOfStructs().begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); auto& soa = particle_tile.GetStructOfArrays(); for (int i = 0; i < NAR; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_real[i].begin(), - host_real[i].end(), - soa.GetRealData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_real[i].begin(), + host_real[i].end(), + soa.GetRealData(i).begin() + old_size); } for (int i = 0; i < NAI; ++i) { - Gpu::copy(Gpu::hostToDevice, - host_int[i].begin(), - host_int[i].end(), - soa.GetIntData(i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_int[i].begin(), + host_int[i].end(), + soa.GetIntData(i).begin() + old_size); } for (int i = 0; i < NumRuntimeRealComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_real[i].begin(), - host_runtime_real[i].end(), - soa.GetRealData(NAR+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_real[i].begin(), + host_runtime_real[i].end(), + soa.GetRealData(NAR+i).begin() + old_size); } for (int i = 0; i < NumRuntimeIntComps(); ++i) { - Gpu::copy(Gpu::hostToDevice, - host_runtime_int[i].begin(), - host_runtime_int[i].end(), - soa.GetIntData(NAI+i).begin() + old_size); + Gpu::copyAsync(Gpu::hostToDevice, + host_runtime_int[i].begin(), + host_runtime_int[i].end(), + soa.GetIntData(NAI+i).begin() + old_size); } - Gpu::synchronize(); + Gpu::streamSynchronize(); } RedistributeLocal(); diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index d4be691c6dd..9df9dc0204e 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -85,6 +85,7 @@ set(AMReX_HYPRE_FOUND @AMReX_HYPRE@) set(AMReX_PETSC_FOUND @AMReX_PETSC@) set(AMReX_SUNDIALS_FOUND @AMReX_SUNDIALS@) set(AMReX_HDF5_FOUND @AMReX_HDF5@) +set(AMReX_HDF5_ZFP_FOUND @AMReX_HDF5_ZFP@) # Compilation options set(AMReX_FPE_FOUND @AMReX_FPE@) @@ -113,6 +114,7 @@ set(AMReX_CUDA @AMReX_CUDA@) set(AMReX_SYCL @AMReX_DPCPP@) set(AMReX_HIP @AMReX_HIP@) set(AMReX_GPU_BACKEND @AMReX_GPU_BACKEND@) +set(AMReX_GPU_RDC @AMReX_GPU_RDC@) set(AMReX_PRECISION @AMReX_PRECISION@) set(AMReX_FORTRAN @AMReX_FORTRAN@) @@ -131,6 +133,7 @@ set(AMReX_ASCENT @AMReX_ASCENT@) set(AMReX_HYPRE @AMReX_HYPRE@) set(AMReX_PETSC @AMReX_PETSC@) set(AMReX_HDF5 @AMReX_HDF5@) +set(AMReX_HDF5_ZFP @AMReX_HDF5_ZFP@) # Compilation options set(AMReX_FPE @AMReX_FPE@) @@ -200,6 +203,10 @@ if (@AMReX_HDF5@) find_dependency(HDF5 REQUIRED) endif () +if (@AMReX_HDF5_ZFP@) + find_dependency(H5Z-ZFP REQUIRED) +endif () + if (@AMReX_HYPRE@) find_dependency(HYPRE 2.20.0 REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 6b45fb62d0a..663fefa606b 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -303,6 +303,10 @@ if (AMReX_HDF5_ASYNC) message(FATAL_ERROR "\nAMReX_HDF5_ASYNC not yet supported\n") endif () +cmake_dependent_option(AMReX_HDF5_ZFP "Enable ZFP compression in HDF5-based IO" OFF + "AMReX_HDF5" OFF ) +print_option(AMReX_HDF5_ZFP) + # SUNDIALS option( 
 print_option( AMReX_SUNDIALS )
diff --git a/Tools/CMake/AMReXSetDefines.cmake b/Tools/CMake/AMReXSetDefines.cmake
index cd4c12bf838..358f08db766 100644
--- a/Tools/CMake/AMReXSetDefines.cmake
+++ b/Tools/CMake/AMReXSetDefines.cmake
@@ -165,6 +165,7 @@ endif ()

 #
 add_amrex_define(AMREX_USE_HDF5 NO_LEGACY IF AMReX_HDF5)
 add_amrex_define(AMREX_USE_HDF5_ASYNC NO_LEGACY IF AMReX_HDF5_ASYNC)
+add_amrex_define(AMREX_USE_HDF5_ZFP NO_LEGACY IF AMReX_HDF5_ZFP)

 #
diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake
index 9f1f771d9fb..e8be52dc173 100644
--- a/Tools/CMake/AMReXThirdPartyLibraries.cmake
+++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake
@@ -2,11 +2,16 @@
 # HDF5 -- here it would be best to create an imported target
 #
 if (AMReX_HDF5)
-    set(HDF5_PREFER_PARALLEL TRUE)
+    if (AMReX_MPI)
+        set(HDF5_PREFER_PARALLEL TRUE)
+    endif ()
     find_package(HDF5 1.10.4 REQUIRED)
     if (AMReX_MPI AND (NOT HDF5_IS_PARALLEL))
-       message(FATAL_ERROR "\nHDF5 library does not support parallel I/O")
-    endif ()
+        message(FATAL_ERROR "\nHDF5 library does not support parallel I/O")
+    endif ()
+    if (HDF5_IS_PARALLEL AND (NOT AMReX_MPI))
+        message(FATAL_ERROR "\nMPI enabled in HDF5 but not in AMReX, which will likely fail to build")
+    endif ()

     if (TARGET hdf5::hdf5) # CMake >= 3.19
         target_link_libraries(amrex PUBLIC hdf5::hdf5)
@@ -18,6 +23,23 @@ if (AMReX_HDF5)

 endif ()

+#
+# H5Z-ZFP
+#
+if (AMReX_HDF5_ZFP)
+    set(H5Z_ZFP_USE_STATIC_LIBS ON) # Static ON means using as a library, or OFF as an HDF5 plugin
+    find_package(H5Z_ZFP 1.0.1 CONFIG)
+    if (NOT AMReX_HDF5)
+        message(FATAL_ERROR "\nHDF5 must be enabled for ZFP support in HDF5")
+    endif ()
+
+    if (TARGET h5z_zfp::h5z_zfp) # CMake >= 3.19
+        target_link_libraries(amrex PUBLIC h5z_zfp::h5z_zfp)
+    else () # CMake < 3.19 -- Remove when minimum cmake version is bumped up
+        target_include_directories(amrex PUBLIC ${H5Z_ZFP_INCLUDE_DIR})
+        target_link_libraries(amrex PUBLIC ${H5Z_ZFP_LIBRARY})
+    endif ()
+endif ()

 #
 # Sensei
diff --git a/Tools/CMake/AMReX_Config.H.in b/Tools/CMake/AMReX_Config.H.in
index c4cf03571f8..c38d6a4e9b2 100644
--- a/Tools/CMake/AMReX_Config.H.in
+++ b/Tools/CMake/AMReX_Config.H.in
@@ -50,6 +50,7 @@
 #cmakedefine AMREX_PARTICLES
 #cmakedefine AMREX_USE_HDF5
 #cmakedefine AMREX_USE_HDF5_ASYNC
+#cmakedefine AMREX_USE_HDF5_ZFP
 #cmakedefine AMREX_USE_HYPRE
 #cmakedefine AMREX_USE_PETSC
 #cmakedefine AMREX_USE_SUNDIALS
diff --git a/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp b/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp
index ca7ca6ab6bb..b5f7fac2add 100644
--- a/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp
+++ b/Tools/Postprocessing/C_Src/WritePlotfileToASCII.cpp
@@ -37,13 +37,13 @@ main (int argc,
         PrintUsage(argv[0]);
     }

-    // plotfile names for the coarse, fine, and subtracted output
+    // plotfile name
     std::string iFile;

     // read in parameters from inputs file
     ParmParse pp;

-    // coarse MultiFab
+    // read in plotfile name
     pp.query("infile", iFile);
     if (iFile.empty())
         amrex::Abort("You must specify `infile'");
@@ -51,16 +51,13 @@ main (int argc,
     int comp_in_line = 0;
     pp.query("comp_in_line", comp_in_line);

-    // single-level for now
-    // AMR comes later, where we iterate over each level in isolation
-    // for the Header
-    std::string iFile2 = iFile;
-    iFile2 += "/Header";
+    std::string Header = iFile;
+    Header += "/Header";

     // open header
     ifstream x;
-    x.open(iFile2.c_str(), ios::in);
+    x.open(Header.c_str(), ios::in);

     // read in first line of header
     string str;
@@ -85,70 +82,72 @@ main (int argc,
         Abort();
     }

-    // now read in the plotfile data
-    // check to see whether the user pointed to the plotfile base directory
-    // or the data itself
-    if (amrex::FileExists(iFile+"/Level_0/Cell_H")) {
-        iFile += "/Level_0/Cell";
-    }
-    if (amrex::FileExists(iFile+"/Level_00/Cell_H")) {
-        iFile += "/Level_00/Cell";
-    }
+    int lev = 0;

-    // storage for the input coarse and fine MultiFabs
-    MultiFab mf;
+    do {

-    // read in plotfiles, 'coarse' and 'fine' to MultiFabs
-    // note: fine could be the same resolution as coarse
-    VisMF::Read(mf, iFile);
+        if (lev > 9) {
+            Abort("Utility only works for 10 levels of refinement or less");
+        }

-    ncomp = mf.nComp();
-    Print() << "ncomp = " << ncomp << std::endl;
+        // storage for the MultiFab
+        MultiFab mf;

-    // check nodality
-    IntVect c_nodality = mf.ixType().toIntVect();
-    Print() << "nodality " << c_nodality << std::endl;
+        std::string iFile_lev = iFile;

-    // get boxArray
-    BoxArray ba = mf.boxArray();
+        std::string levX  = "/Level_"+to_string(lev)+"/Cell";
+        std::string levXX = "/Level_0"+to_string(lev)+"/Cell";

-    // minimalBox() computes a single box to enclose all the boxes
-    // enclosedCells() converts it to a cell-centered Box
-    Box bx_onegrid = ba.minimalBox().enclosedCells();
+        // now read in the plotfile data
+        // check to see whether the user pointed to the plotfile base directory
+        // or the data itself
+        if (amrex::FileExists(iFile+levX+"_H")) {
+            iFile_lev += levX;
+        } else if (amrex::FileExists(iFile+levXX+"_H")) {
+            iFile_lev += levXX;
+        } else {
+            break; // terminate while loop
+        }

-    // number of cells in the coarse domain
-    Print() << "npts in coarse domain = " << bx_onegrid.numPts() << std::endl;
-    long npts_coarsedomain = bx_onegrid.numPts();
+        // read in plotfile to MultiFab
+        VisMF::Read(mf, iFile_lev);

-    // BoxArray, DistributionMapping, and MultiFab with one grid
-    BoxArray ba_onegrid(bx_onegrid);
-    DistributionMapping dmap_onegrid(ba_onegrid);
-    MultiFab mf_onegrid(ba_onegrid,dmap_onegrid,ncomp,0);
+        if (lev == 0) {
+            ncomp = mf.nComp();
+            Print() << "Number of components in the plotfile = " << ncomp << std::endl;
+            Print() << "Nodality of plotfile = " << mf.ixType().toIntVect() << std::endl;
+        }

-    // copy data into MultiFab with one grid
-    mf_onegrid.ParallelCopy(mf,0,0,ncomp,0,0);
+        // get boxArray to compute number of grid points at the level
+        BoxArray ba = mf.boxArray();
+        Print() << "Number of grid points at level " << lev << " = " << ba.numPts() << std::endl;

-    for ( MFIter mfi(mf_onegrid,false); mfi.isValid(); ++mfi ) {
+        for ( MFIter mfi(mf,false); mfi.isValid(); ++mfi ) {

-        const Box& bx = mfi.validbox();
-        const auto lo = amrex::lbound(bx);
-        const auto hi = amrex::ubound(bx);
+            const Box& bx = mfi.validbox();
+            const auto lo = amrex::lbound(bx);
+            const auto hi = amrex::ubound(bx);

-        const Array4<Real>& mfdata = mf_onegrid.array(mfi);
+            const Array4<Real>& mfdata = mf.array(mfi);

-        if (comp_in_line == 1){
-            std::cout << mf_onegrid[mfi];
-        }else{
-            for (auto n=0; n