From 85423e5315f2c8ec50c4ca4196f7b34a3834e5f4 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Sun, 26 Apr 2020 21:38:54 -0400
Subject: [PATCH 1/9] move the fourth order center <-> average stuff to C++

---
 Source/hydro/fourth_center_average.cpp | 240 +++++++++++++++
 Source/hydro/fourth_order_nd.F90       | 410 -------------------------
 2 files changed, 240 insertions(+), 410 deletions(-)
 create mode 100644 Source/hydro/fourth_center_average.cpp
diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
new file mode 100644
index 0000000000..b27941e108
--- /dev/null
+++ b/Source/hydro/fourth_center_average.cpp
@@ -0,0 +1,240 @@
+
+
+// Note: pretty much all of these routines below assume that dx(1) = dx(2) = dx(3)
+
+Real compute_laplacian(int i, int j, int k, int n,
+                       Array4<Real const> const a,
+                       GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  Real lapx = 0.0;
+  Real lapy = 0.0;
+  Real lapz = 0.0;
+
+  // we use 2nd-order accurate one-sided stencils at the physical
+  // boundaries note: this differs from the suggestion in MC2011 --
+  // they just suggest using the Laplacian from +1 off the interior.
+  // I like the one-sided better.
+
+  if (i == domlo[0] && physbc_lo[0] != Interior) {
+    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
+
+  } else if (i == domhi[0] && physbc_hi[0] != Interior) {
+    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
+
+  } else {
+    lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n);
+  }
+
+#if AMREX_SPACEDIM >= 2
+  if (j == domlo[1] && physbc_lo[1] != Interior) {
+    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
+
+  } else if (j == domhi[1] && physbc_hi[1] != Interior) {
+    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
+
+  } else {
+    lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n);
+  }
+#endif
+
+#if AMREX_SPACEDIM == 3
+  if (k == domlo[2] && physbc_lo[2] != Interior) {
+    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
+
+  } else if (k == domhi[2] && physbc_hi[2] != Interior) {
+    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
+
+  } else {
+    lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n);
+  }
+#endif
+
+  return lapx + lapy + lapz;
+}
+
+
+Real trans_laplacian(int i, int j, int k, int n,
+                     int idir,
+                     Array4<Real const> const a,
+                     GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  Real lapx = 0.0;
+  Real lapy = 0.0;
+  Real lapz = 0.0;
+
+  // we use 2nd-order accurate one-sided stencils at the physical boundaries
+  if (idir != 0) {
+
+    if (i == domlo[0] && physbc_lo[0] != Interior) {
+      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
+
+    } else if (i == domhi[0] && physbc_hi[0] != Interior) {
+      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
+
+    } else {
+      lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n);
+    }
+  }
+
+  if (idir != 1) {
+
+    if (j == domlo[1] && physbc_lo[1] != Interior) {
+      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
+
+    } else if (j == domhi[1] && physbc_hi[1] != Interior) {
+      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
+
+    } else {
+      lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n);
+    }
+  }
+
+#if AMREX_SPACEDIM == 3
+  if (idir != 2) {
+
+    if (k == domlo[2] && physbc_lo[2] != Interior) {
+      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
+
+    } else if (k == domhi[2] && physbc_hi[2] != Interior) {
+      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
+
+    } else {
+      lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n);
+    }
+  }
+#endif
+
+  return lapx + lapy + lapz;
+}
+
+
+void make_cell_center(const Box& bx,
+                      Array4<Real const> const& U,
+                      Array4<Real> const& U_cc,
+                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Take a cell-average state U and a convert it to a cell-center
+  // state U_cc via U_cc = U - 1/24 L U
+
+  U_lo = U.lbound();
+  U_hi = U.ubound();
+
+  AMREX_ASSERT(U_lo[0] <= lo[0]-1 && U_hi[0] >= hi[0]+1 &&
+               (AMREX_SPACEDIM >= 2 && (U_lo[1] <= lo[1]-1 && U_hi[1] >= hi[1]+1)) &&
+               (AMREX_SPACEDIM == 3 && (U_lo[2] <= lo[2]-1 && U_hi[2] >= hi[2]+1)));
+
+  amrex::ParallelFor(bx, U.nComp(),
+  [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+  {
+    Real lap = compute_laplacian(i, j, k, n, U, domlo, domhi);
+
+    U_cc(i,j,k,n) = U(i,j,k,n) - (1.0_rt/24.0_rt) * lap;
+
+  });
+}
+
+void make_cell_center_in_place(const Box& bx,
+                               Array4<Real> const& U,
+                               Array4<Real> const& tmp,
+                               GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Take a cell-average state U and make it cell-centered in place
+  // via U <- U - 1/24 L U.  Note that this operation is not tile
+  // safe.
+
+  // here, tmp is a temporary memory space we use to store one-component's Laplacian
+
+  for (int n = 0; n < U.nComp(); n++) {
+
+    amrex::ParallelFor(bx,
+    [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+    {
+      tmp(i,j,k) = compute_laplacian(i, j, k, n, U, domlo, domhi);
+    });
+
+    amrex::ParallelFor(bx,
+    [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+    {
+      U(i,j,k,n) = U(i,j,k,n) - (1.0_rt/24.0_rt) * tmp(i,j,k);
+    });
+  }
+}
+
+
+void compute_lap_term(const Box& bx,
+                      Array4<Real const> const& Y,
+                      Array4<Real> const& lap, const int ncomp,
+                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Computes the h**2/24 L U term that is used in correcting
+  // cell-center to averages (and back)
+
+  amrex::ParallelFor(bx,
+  [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+  {
+    lap(i,j,k) = (1.0_rt/24.0_rt) *
+      compute_laplacian(i, j, k, ncomp, U, domlo, domhi);
+  });
+
+}
+
+
+void make_fourth_average(const Box& bx,
+                         Array4<Real> const& q,
+                         Array4<Real const> const& q_bar,
+                         GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Take the cell-center state q and another state q_bar (e.g.,
+  // constructed from the cell-average U) and replace the cell-center
+  // q with a 4th-order accurate cell-average, q <- q + 1/24 L q_bar
+
+  amrex::ParallelFor(bx, q.nComp(),
+  [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+  {
+    Real lap = compute_laplacian(i, j, k, n, q_bar, domlo, domhi);
+
+    q(i,j,k,n) += (1.0_rt/24.0_rt) * lap;
+  });
+}
+
+
+void make_fourth_in_place(const Box& bx,
+                          Array4<Real> const& q,
+                          Array4<Real> const& tmp,
+                          GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Take the cell-center q and makes it a cell-average q, in place
+  // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
+  // Note: this routine is not tile safe.
+
+  for (int n = 0; n < q.nComp(); n++) {
+    make_fourth_in_place_n(bx, q, n, tmp, domlo, domhi);
+  }
+}
+
+
+void make_fourth_in_place_n(const Box& bx,
+                            Array4<Real> const& q, const int ncomp,
+                            Array4<Real> const& tmp,
+                            GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+
+  // Take the cell-center q and makes it a cell-average q, in place
+  // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
+  // Note: this routine is not tile safe.
+  //
+  // This version operates on a single component.  Here ncomp is the
+  // component to update
+
+  amrex::ParallelFor(bx,
+  [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+  {
+    tmp(i,j,k) = compute_laplacian(i, j, k, ncomp, q, domlo, domhi);
+  });
+
+  amrex::ParallelFor(bx,
+  [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+  {
+    q(i,j,k,ncomp) += (1.0_rt/24.0_rt) * lap(i,j,k);
+  });
+
+}
diff --git a/Source/hydro/fourth_order_nd.F90 b/Source/hydro/fourth_order_nd.F90
index 7c0e427277..78a0abe6c9 100644
--- a/Source/hydro/fourth_order_nd.F90
+++ b/Source/hydro/fourth_order_nd.F90
@@ -790,414 +790,4 @@ subroutine ca_states(lo, hi, &
 
   end subroutine ca_states
 
-
-  ! Note: pretty much all of these routines below assume that dx(1) = dx(2) = dx(3)
-  pure function compute_laplacian(i, j, k, n, &
-                                  a, a_lo, a_hi, nc, &
-                                  domlo, domhi) result (lap)
-
-    use prob_params_module, only : physbc_lo, physbc_hi, Interior
-    implicit none
-
-    integer, intent(in) :: i, j, k, n
-    integer, intent(in) :: a_lo(3), a_hi(3)
-    integer, intent(in) :: nc
-    real(rt), intent(in) :: a(a_lo(1):a_hi(1), a_lo(2):a_hi(2), a_lo(3):a_hi(3), nc)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    real(rt) :: lapx, lapy, lapz
-    real(rt) :: lap
-
-    lapx = ZERO
-    lapy = ZERO
-    lapz = ZERO
-
-    ! we use 2nd-order accurate one-sided stencils at the physical
-    ! boundaries note: this differs from the suggestion in MC2011 --
-    ! they just suggest using the Laplacian from +1 off the interior.
-    ! I like the one-sided better.
-
-    if (i == domlo(1) .and. physbc_lo(1) /= Interior) then
-       lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n)
-
-    else if (i == domhi(1) .and. physbc_hi(1) /= Interior) then
-       lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n)
-
-    else
-       lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n)
-    end if
-
-#if AMREX_SPACEDIM >= 2
-    if (j == domlo(2) .and. physbc_lo(2) /= Interior) then
-       lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n)
-
-    else if (j == domhi(2) .and. physbc_hi(2) /= Interior) then
-       lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n)
-
-    else
-       lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n)
-    end if
-
-#endif
-
-#if AMREX_SPACEDIM == 3
-    if (k == domlo(3) .and. physbc_lo(3) /= Interior) then
-       lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n)
-
-    else if (k == domhi(3) .and. physbc_hi(3) /= Interior) then
-       lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n)
-
-    else
-       lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n)
-    end if
-#endif
-
-    lap = lapx + lapy + lapz
-
-  end function compute_laplacian
-
-  subroutine trans_laplacian(i, j, k, n, &
-                             idir, &
-                             a, a_lo, a_hi, nc, &
-                             lap, &
-                             domlo, domhi) bind(C, name="trans_laplacian")
-
-    use prob_params_module, only : physbc_lo, physbc_hi, Interior
-    implicit none
-
-    integer, intent(in), value :: i, j, k, n
-    integer, intent(in), value :: idir
-    integer, intent(in) :: a_lo(3), a_hi(3)
-    integer, intent(in), value :: nc
-    real(rt), intent(in) :: a(a_lo(1):a_hi(1), a_lo(2):a_hi(2), a_lo(3):a_hi(3), nc)
-    integer, intent(in) :: domlo(3), domhi(3)
-    real(rt), intent(out) :: lap
-
-    real(rt) :: lapx, lapy, lapz
-
-    lapx = ZERO
-    lapy = ZERO
-    lapz = ZERO
-
-    ! we use 2nd-order accurate one-sided stencils at the physical boundaries
-    if (idir /= 1) then
-
-       if (i == domlo(1) .and. physbc_lo(1) /= Interior) then
-          lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n)
-
-       else if (i == domhi(1) .and. physbc_hi(1) /= Interior) then
-          lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n)
-
-       else
-          lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n)
-       end if
-    end if
-
-    if (idir /= 2) then
-
-       if (j == domlo(2) .and. physbc_lo(2) /= Interior) then
-          lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n)
-
-       else if (j == domhi(2) .and. physbc_hi(2) /= Interior) then
-          lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n)
-
-       else
-          lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n)
-       end if
-    end if
-
-#if AMREX_SPACEDIM == 3
-    if (idir /= 3) then
-
-       if (k == domlo(3) .and. physbc_lo(3) /= Interior) then
-          lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n)
-
-       else if (k == domhi(3) .and. physbc_hi(3) /= Interior) then
-          lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n)
-
-       else
-          lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n)
-       end if
-    end if
-#endif
-
-    lap = lapx + lapy + lapz
-
-  end subroutine trans_laplacian
-
-
-  subroutine ca_make_cell_center(lo, hi, &
-                                 U, U_lo, U_hi, nc, &
-                                 U_cc, U_cc_lo, U_cc_hi, nc_cc, &
-                                 domlo, domhi) &
-                                 bind(C, name="ca_make_cell_center")
-    ! Take a cell-average state U and a convert it to a cell-center
-    ! state U_cc via U_cc = U - 1/24 L U
-
-    implicit none
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: U_lo(3), U_hi(3)
-    integer, intent(in) :: U_cc_lo(3), U_cc_hi(3)
-    integer, intent(in) :: nc, nc_cc
-    real(rt), intent(in) :: U(U_lo(1):U_hi(1), U_lo(2):U_hi(2), U_lo(3):U_hi(3), nc)
-    real(rt), intent(inout) :: U_cc(U_cc_lo(1):U_cc_hi(1), U_cc_lo(2):U_cc_hi(2), U_cc_lo(3):U_cc_hi(3), nc_cc)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    integer :: i, j, k, n
-    real(rt) :: lap
-
-    if (U_lo(1) > lo(1)-1 .or. U_hi(1) < hi(1)+1 .or. &
-        (AMREX_SPACEDIM >= 2 .and. (U_lo(2) > lo(2)-1 .or. U_hi(2) < hi(2)+1)) .or. &
-        (AMREX_SPACEDIM == 3 .and. (U_lo(3) > lo(3)-1 .or. U_hi(3) < hi(3)+1))) then
-       call bl_error("insufficient ghostcells in ca_make_cell_center")
-    end if
-
-    do n = 1, nc
-
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-
-                lap = compute_laplacian(i, j, k, n, &
-                                        U, U_lo, U_hi, nc, &
-                                        domlo, domhi)
-
-                U_cc(i,j,k,n) = U(i,j,k,n) - TWENTYFOURTH * lap
-
-             end do
-          end do
-       end do
-    end do
-
-  end subroutine ca_make_cell_center
-
-  subroutine ca_make_cell_center_in_place(lo, hi, &
-                                          U, U_lo, U_hi, nc, &
-                                          domlo, domhi) &
-                                          bind(C, name="ca_make_cell_center_in_place")
-    ! Take a cell-average state U and make it cell-centered in place
-    ! via U <- U - 1/24 L U.  Note that this operation is not tile
-    ! safe.
-
-    use amrex_mempool_module, only : bl_allocate, bl_deallocate
-
-    implicit none
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: U_lo(3), U_hi(3)
-    integer, intent(in) :: nc
-    real(rt), intent(inout) :: U(U_lo(1):U_hi(1), U_lo(2):U_hi(2), U_lo(3):U_hi(3), nc)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    integer :: i, j, k, n
-
-    real(rt), pointer :: lap(:,:,:)
-
-    call bl_allocate(lap, lo, hi)
-
-    do n = 1, nc
-
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-
-                lap(i,j,k) = compute_laplacian(i, j, k, n, &
-                                               U, U_lo, U_hi, nc, &
-                                               domlo, domhi)
-
-             end do
-          end do
-       end do
-
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-                U(i,j,k,n) = U(i,j,k,n) - TWENTYFOURTH * lap(i,j,k)
-             end do
-          end do
-       end do
-    end do
-
-    call bl_deallocate(lap)
-
-  end subroutine ca_make_cell_center_in_place
-
-  subroutine ca_compute_lap_term(lo, hi, &
-                                 U, U_lo, U_hi, nc, &
-                                 lap, lap_lo, lap_hi, ncomp, &
-                                 domlo, domhi) &
-                                 bind(C, name="ca_compute_lap_term")
-    ! Computes the h**2/24 L U term that is used in correcting
-    ! cell-center to averages (and back)
-
-    implicit none
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: U_lo(3), U_hi(3)
-    integer, intent(in) :: lap_lo(3), lap_hi(3)
-    integer, intent(in) :: nc
-    real(rt), intent(in) :: U(U_lo(1):U_hi(1), U_lo(2):U_hi(2), U_lo(3):U_hi(3), nc)
-    real(rt), intent(inout) :: lap(lap_lo(1):lap_hi(1), lap_lo(2):lap_hi(2), lap_lo(3):lap_hi(3))
-    integer, intent(in), value :: ncomp
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    ! note: ncomp is C++ index (0-based)
-
-    integer :: i, j, k
-
-    do k = lo(3), hi(3)
-       do j = lo(2), hi(2)
-          do i = lo(1), hi(1)
-
-             lap(i,j,k) = compute_laplacian(i, j, k, ncomp+1, &
-                                            U, U_lo, U_hi, nc, &
-                                            domlo, domhi)
-
-          end do
-       end do
-    end do
-
-    lap(lo(1):hi(1), lo(2):hi(2), lo(3):hi(3)) = &
-         TWENTYFOURTH * lap(lo(1):hi(1), lo(2):hi(2), lo(3):hi(3))
-
-  end subroutine ca_compute_lap_term
-
-  subroutine ca_make_fourth_average(lo, hi, &
-                                    q, q_lo, q_hi, nc, &
-                                    q_bar, q_bar_lo, q_bar_hi, nc_bar, &
-                                    domlo, domhi) &
-                                    bind(C, name="ca_make_fourth_average")
-    ! Take the cell-center state q and another state q_bar (e.g.,
-    ! constructed from the cell-average U) and replace the cell-center
-    ! q with a 4th-order accurate cell-average, q <- q + 1/24 L q_bar
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: q_lo(3), q_hi(3)
-    integer, intent(in) :: q_bar_lo(3), q_bar_hi(3)
-    integer, intent(in) :: nc, nc_bar
-    real(rt), intent(inout) :: q(q_lo(1):q_hi(1), q_lo(2):q_hi(2), q_lo(3):q_hi(3), nc)
-    real(rt), intent(in) :: q_bar(q_bar_lo(1):q_bar_hi(1), q_bar_lo(2):q_bar_hi(2), q_bar_lo(3):q_bar_hi(3), nc_bar)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    integer :: i, j, k, n
-    real(rt) :: lap
-
-    do n = 1, nc
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-
-                lap = compute_laplacian(i, j, k, n, &
-                                        q_bar, q_bar_lo, q_bar_hi, nc, &
-                                        domlo, domhi)
-
-                q(i,j,k,n) = q(i,j,k,n) + TWENTYFOURTH * lap
-
-             end do
-          end do
-       end do
-    end do
-
-  end subroutine ca_make_fourth_average
-
-  subroutine ca_make_fourth_in_place(lo, hi, &
-                                     q, q_lo, q_hi, nc, &
-                                     domlo, domhi) &
-                                     bind(C, name="ca_make_fourth_in_place")
-    ! Take the cell-center q and makes it a cell-average q, in place
-    ! (e.g. q is overwritten by its average), q <- q + 1/24 L q.
-    ! Note: this routine is not tile safe.
-
-    use amrex_mempool_module, only : bl_allocate, bl_deallocate
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: q_lo(3), q_hi(3)
-    integer, intent(in) :: nc
-    real(rt), intent(inout) :: q(q_lo(1):q_hi(1), q_lo(2):q_hi(2), q_lo(3):q_hi(3), nc)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    integer :: i, j, k, n
-    real(rt), pointer :: lap(:,:,:)
-
-    call bl_allocate(lap, lo, hi)
-
-    do n = 1, nc
-
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-
-                lap(i,j,k) = compute_laplacian(i, j, k, n, &
-                                               q, q_lo, q_hi, nc, &
-                                               domlo, domhi)
-
-             end do
-          end do
-       end do
-
-       do k = lo(3), hi(3)
-          do j = lo(2), hi(2)
-             do i = lo(1), hi(1)
-                q(i,j,k,n) = q(i,j,k,n) + TWENTYFOURTH * lap(i,j,k)
-             end do
-          end do
-       end do
-    end do
-
-    call bl_deallocate(lap)
-
-  end subroutine ca_make_fourth_in_place
-
-  subroutine ca_make_fourth_in_place_n(lo, hi, &
-                                       q, q_lo, q_hi, nc, ncomp, &
-                                       domlo, domhi) &
-                                       bind(C, name="ca_make_fourth_in_place_n")
-    ! Take the cell-center q and makes it a cell-average q, in place
-    ! (e.g. q is overwritten by its average), q <- q + 1/24 L q.
-    ! Note: this routine is not tile safe.
-    !
-    ! This version operates on a single component.  Here ncomp is the
-    ! component to update -- we expect this to come in 0-based from
-    ! C++, so we add 1
-
-    use amrex_mempool_module, only : bl_allocate, bl_deallocate
-
-    integer, intent(in) :: lo(3), hi(3)
-    integer, intent(in) :: q_lo(3), q_hi(3)
-    integer, intent(in) :: nc
-    integer, intent(in), value :: ncomp
-    real(rt), intent(inout) :: q(q_lo(1):q_hi(1), q_lo(2):q_hi(2), q_lo(3):q_hi(3), nc)
-    integer, intent(in) :: domlo(3), domhi(3)
-
-    integer :: i, j, k
-    real(rt), pointer :: lap(:,:,:)
-
-    call bl_allocate(lap, lo, hi)
-
-    do k = lo(3), hi(3)
-       do j = lo(2), hi(2)
-          do i = lo(1), hi(1)
-
-             lap(i,j,k) = compute_laplacian(i, j, k, ncomp+1, &
-                                            q, q_lo, q_hi, nc, &
-                                            domlo, domhi)
-
-          end do
-       end do
-    end do
-
-    do k = lo(3), hi(3)
-       do j = lo(2), hi(2)
-          do i = lo(1), hi(1)
-             q(i,j,k,ncomp+1) = q(i,j,k,ncomp+1) + TWENTYFOURTH * lap(i,j,k)
-          end do
-       end do
-    end do
-
-    call bl_deallocate(lap)
-
-  end subroutine ca_make_fourth_in_place_n
-
-
 end module fourth_order

From 3ab2be039e076222640a7a61da5176cfc51e01b9 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 11:03:04 -0400
Subject: [PATCH 2/9] more work on porting the average stuff to C++

---
 Source/driver/Castro.cpp               | 73 ++++++++++++++++----------
 Source/driver/Castro_F.H               | 26 ---------
 Source/driver/Castro_advance_sdc.cpp   | 51 +++++++++---------
 Source/hydro/Castro_hydro.cpp          | 25 ++++-----
 Source/hydro/Castro_mol_hydro.cpp      | 47 +++++++----------
 Source/hydro/Make.package              |  2 +
 Source/hydro/fourth_center_average.H   | 43 +++++++++++++++
 Source/hydro/fourth_center_average.cpp | 11 +++-
 Source/sdc/Castro_sdc.cpp              | 61 ++++++++++++---------
 9 files changed, 188 insertions(+), 151 deletions(-)
 create mode 100644 Source/hydro/fourth_center_average.H

diff --git a/Source/driver/Castro.cpp b/Source/driver/Castro.cpp
index 61bdcddaca..8ef5e17722 100644
--- a/Source/driver/Castro.cpp
+++ b/Source/driver/Castro.cpp
@@ -1056,16 +1056,20 @@ Castro::initData ()
            AmrLevel::FillPatch(*this, Sborder, NUM_GROW, cur_time, State_Type, 0, NUM_STATE);
 
            // note: this cannot be tiled
-           const int* domain_lo = geom.Domain().loVect();
-           const int* domain_hi = geom.Domain().hiVect();
+           auto const domain_lo = geom.Domain().loVect3d();
+           auto const domain_hi = geom.Domain().hiVect3d();
+
+           FArrayBox tmp;
 
            for (MFIter mfi(S_new); mfi.isValid(); ++mfi)
              {
-               const Box& box     = mfi.validbox();
+               const Box& box = mfi.validbox();
+
+               tmp.resize(box, 1);
+               Elixir elix_tmp = tmp.elixir();
+               auto tmp_arr = tmp.array();
 
-               ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(box),
-                                       BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                                       AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+               make_fourth_in_place(box, Sborder.array(mfi), tmp_arr, domain_lo, domain_hi);
              }
 
            // now copy back the averages
@@ -1079,16 +1083,20 @@ Castro::initData ()
          AmrLevel::FillPatch(*this, Sborder, NUM_GROW, cur_time, State_Type, 0, NUM_STATE);
 
          // convert to centers -- not tile safe
-         const int* domain_lo = geom.Domain().loVect();
-         const int* domain_hi = geom.Domain().hiVect();
+         auto const domain_lo = geom.Domain().loVect3d();
+         auto const domain_hi = geom.Domain().hiVect3d();
+
+         FArrayBox tmp;
 
          for (MFIter mfi(S_new); mfi.isValid(); ++mfi)
            {
              const Box& box = mfi.growntilebox(2);
 
-             ca_make_cell_center_in_place(BL_TO_FORTRAN_BOX(box),
-                                          BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                                          AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+             tmp.resize(box, 1);
+             Elixir elix_tmp = tmp.elixir();
+             auto tmp_arr = tmp.array();
+
+             make_cell_center_in_place(box, Sborder.array(mfi), tmp_arr, domain_lo, domain_hi);
            }
 
          // reset the energy -- do this in one ghost cell so we can average in place below
@@ -1131,9 +1139,11 @@ Castro::initData ()
            {
              const Box& box = mfi.validbox();
 
-             ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(box),
-                                     BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                                     AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+             tmp.resize(box, 1);
+             Elixir elix_tmp = tmp.elixir();
+             auto tmp_arr = tmp.array();
+
+             make_fourth_in_place(box, Sborder.array(mfi), tmp_arr, domain_lo, domain_hi);
            }
 
          // now copy back the averages for UEINT and UTEMP only
@@ -3319,22 +3329,24 @@ Castro::computeTemp(MultiFab& State, Real time, int ng)
 
     // convert to cell centers -- this will result in Stemp being
     // cell centered only on 1 ghost cells
-    const int* domain_lo = geom.Domain().loVect();
-    const int* domain_hi = geom.Domain().hiVect();
+    auto const domain_lo = geom.Domain().loVect3d();
+    auto const domain_hi = geom.Domain().hiVect3d();
+
+    FArrayBox tmp;
 
     for (MFIter mfi(Stemp); mfi.isValid(); ++mfi) {
       const Box& bx = mfi.growntilebox(1);
       const Box& bx0 = mfi.tilebox();
       const int idx = mfi.tileIndex();
 
-      ca_compute_lap_term(BL_TO_FORTRAN_BOX(bx0),
-                          BL_TO_FORTRAN_FAB(Stemp[mfi]),
-                          BL_TO_FORTRAN_ANYD(Eint_lap[mfi]), UEINT,
-                          AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+      compute_lap_term(bx0, Stemp.array(mfi), Eint_lap.array(mfi), UEINT,
+                       domain_lo, domain_hi);
+
+      tmp.resize(bx, 1);
+      Elixir elix_tmp = tmp.elixir();
+      auto tmp_arr = tmp.array();
 
-      ca_make_cell_center_in_place(BL_TO_FORTRAN_BOX(bx),
-                                   BL_TO_FORTRAN_FAB(Stemp[mfi]),
-                                   AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+      make_cell_center_in_place(bx, Stemp.arry(mfi), tmp_arr, domain_lo, domain_hi);
 
     }
 
@@ -3421,19 +3433,22 @@ Castro::computeTemp(MultiFab& State, Real time, int ng)
     // cell-averages -- this is 4th-order and will be a no-op for
     // those zones where e wasn't changed.
 
-    const int* domain_lo = geom.Domain().loVect();
-    const int* domain_hi = geom.Domain().hiVect();
+    auto const domain_lo = geom.Domain().loVect3d();
+    auto const domain_hi = geom.Domain().hiVect3d();
+
+    FArrayBox tmp;
 
     for (MFIter mfi(Stemp); mfi.isValid(); ++mfi) {
 
       const Box& bx = mfi.tilebox();
       const int idx = mfi.tileIndex();
 
-      // only temperature
-      ca_make_fourth_in_place_n(BL_TO_FORTRAN_BOX(bx),
-                                BL_TO_FORTRAN_FAB(Stemp[mfi]), UTEMP,
-                                AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+      tmp.resize(bx, 1);
+      Elixir elix_tmp = tmp.elixir();
+      auto tmp_arr = tmp.array();
 
+      // only temperature
+      make_fourth_in_place_n(bx, Stemp.array(mfi), UTEMP, tmp_arr, domain_lo, domain_hi);
     }
 
     // correct UEINT
diff --git a/Source/driver/Castro_F.H b/Source/driver/Castro_F.H
index 3bc9265ace..c0ecda1c16 100644
--- a/Source/driver/Castro_F.H
+++ b/Source/driver/Castro_F.H
@@ -180,32 +180,6 @@ extern "C"
      const amrex::Real* dx, amrex::Real* dt);
 #endif
 
-  void ca_make_fourth_average(const int lo[], const int hi[],
-                              BL_FORT_FAB_ARG_3D(q), const int* nc,
-                              const BL_FORT_FAB_ARG_3D(q_bar), const int* nc_bar,
-                              const int* domlo, const int* domhi);
-
-  void ca_make_fourth_in_place(const int lo[], const int hi[],
-                               BL_FORT_FAB_ARG_3D(q), const int* nc,
-                               const int* domlo, const int* domhi);
-
-  void ca_make_fourth_in_place_n(const int lo[], const int hi[],
-                                 BL_FORT_FAB_ARG_3D(q), const int* nc, const int ncomp,
-                                 const int* domlo, const int* domhi);
-
-  void ca_compute_lap_term(const int lo[], const int hi[],
-                           const BL_FORT_FAB_ARG_3D(U), const int* nc,
-                           BL_FORT_FAB_ARG_ANYD(lap), const int ncomp,
-                           const int* domlo, const int* domhi);
-
-  void ca_make_cell_center(const int lo[], const int hi[],
-                           const BL_FORT_FAB_ARG_3D(U), const int* nc,
-                           BL_FORT_FAB_ARG_3D(U_cc), const int* nc_cc,
-                           const int* domlo, const int* domhi);
-
-  void ca_make_cell_center_in_place(const int lo[], const int hi[],
-                                    BL_FORT_FAB_ARG_3D(q), const int* nc,
-                                    const int* domlo, const int* domhi);
 
 #ifdef RADIATION
   void ca_inelastic_sct
diff --git a/Source/driver/Castro_advance_sdc.cpp b/Source/driver/Castro_advance_sdc.cpp
index d1b9191b89..f39fffaa26 100644
--- a/Source/driver/Castro_advance_sdc.cpp
+++ b/Source/driver/Castro_advance_sdc.cpp
@@ -40,8 +40,8 @@ Castro::do_advance_sdc (Real time,
   MultiFab& S_old = get_old_data(State_Type);
   MultiFab& S_new = get_new_data(State_Type);
 
-  const int* domain_lo = geom.Domain().loVect();
-  const int* domain_hi = geom.Domain().hiVect();
+  auto const domain_lo = geom.Domain().loVect3d();
+  auto const domain_hi = geom.Domain().hiVect3d();
 
   // Perform initialization steps.
 
@@ -103,10 +103,8 @@ Castro::do_advance_sdc (Real time,
 
           for (MFIter mfi(S_new); mfi.isValid(); ++mfi) {
             const Box& gbx = mfi.growntilebox(1);
-            ca_make_cell_center(BL_TO_FORTRAN_BOX(gbx),
-                                BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                                BL_TO_FORTRAN_FAB(Sburn[mfi]),
-                                AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+
+            make_cell_center(gbx, Sborder.array(mfi), Sburn.array(mfi), domain_lo, domain_hi);
 
           }
 
@@ -122,11 +120,16 @@ Castro::do_advance_sdc (Real time,
           AmrLevel::FillPatch(*this, old_source, old_source.nGrow(), prev_time, Source_Type, 0, NSRC);
 
           // Now convert to cell averages.  This loop cannot be tiled.
+          FArrayBox tmp;
+
           for (MFIter mfi(S_new); mfi.isValid(); ++mfi) {
             const Box& bx = mfi.tilebox();
-            ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(bx),
-                                    BL_TO_FORTRAN_FAB(old_source[mfi]),
-                                    AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+
+            tmp.resize(bx, 1);
+            Elixir elix_tmp = tmp.elixir();
+            auto tmp_arr = tmp.array();
+
+            make_fourth_in_place(bx, old_source.array(mfi), tmp_arr, domain_lo, domain_hi);
           }
 
         } else {
@@ -305,6 +308,7 @@ Castro::do_advance_sdc (Real time,
 
   FArrayBox U_center;
   FArrayBox R_center;
+  FArrayBox tmp;
 
   // this cannot be tiled
   for (MFIter mfi(R_new); mfi.isValid(); ++mfi) {
@@ -313,33 +317,30 @@ Castro::do_advance_sdc (Real time,
 
     if (sdc_order == 4) {
 
-      const int* domain_lo = geom.Domain().loVect();
-      const int* domain_hi = geom.Domain().hiVect();
-
       // convert S_new to cell-centers
       U_center.resize(obx, NUM_STATE);
-      ca_make_cell_center(BL_TO_FORTRAN_BOX(obx),
-                          BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                          BL_TO_FORTRAN_FAB(U_center),
-                          AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+      Elixir elix_u_center = U_center.elixir();
+      auto const U_center_arr = U_center.array();
+
+      make_cell_center(obx, Sborder.array(mfi), U_center_arr, domain_lo, domain_hi);
 
       // pass in the reaction source and state at centers, including one ghost cell
       // and derive everything that is needed including 1 ghost cell
       R_center.resize(obx, R_new.nComp());
+      Elixir elix_r_center = R_center.elixir();
+      auto const R_center_arr = R_center.array();
+
       Array4<const Real> const Sburn_arr = Sburn.array(mfi);
-      Array4<const Real> const U_center_arr = U_center.array();
-      Array4<Real> const R_center_arr = R_center.array();
+
       // we don't worry about the difference between centers and averages
-      ca_store_reaction_state(obx,
-                              Sburn_arr,
-                              U_center_arr,
-                              R_center_arr);
+      ca_store_reaction_state(obx, Sburn_arr, U_center_arr, R_center_arr);
 
       // convert R_new from centers to averages in place
-      ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(bx),
-                              BL_TO_FORTRAN_FAB(R_center),
-                              AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+      tmp.resize(bx, 1);
+      Elixir elix_tmp = tmp.elixir();
+      auto const tmp_arr = tmp.array();
 
+      make_fourth_in_place(bx, R_center_arr, tmp_arr, domain_lo, domain_hi);
 
       // store
       R_new[mfi].copy(R_center, bx, 0, bx, 0, R_new.nComp());
diff --git a/Source/hydro/Castro_hydro.cpp b/Source/hydro/Castro_hydro.cpp
index 0e4d8f0f33..e8777f66cc 100644
--- a/Source/hydro/Castro_hydro.cpp
+++ b/Source/hydro/Castro_hydro.cpp
@@ -124,11 +124,13 @@ Castro::cons_to_prim_fourth(const Real time)
     // convert the conservative state cell averages to primitive cell
     // averages with 4th order accuracy
 
-    const int* domain_lo = geom.Domain().loVect();
-    const int* domain_hi = geom.Domain().hiVect();
+    auto const domain_lo = geom.Domain().loVect3d();
+    auto const domain_hi = geom.Domain().hiVect3d();
 
     MultiFab& S_new = get_new_data(State_Type);
 
+    FArrayBox U_cc;
+
     // we don't support radiation here
 #ifdef RADIATION
     amrex::Abort("radiation not supported to fourth order");
@@ -147,13 +149,11 @@ Castro::cons_to_prim_fourth(const Real time)
       // convert U_avg to U_cc -- this will use a Laplacian
       // operation and will result in U_cc defined only on
       // NUM_GROW-1 ghost cells at the end.
-      FArrayBox U_cc;
       U_cc.resize(qbx, NUM_STATE);
+      Elixir elix_u_cc = U_cc.elixir();
+      auto const U_cc_arr = U_cc.array();
 
-      ca_make_cell_center(BL_TO_FORTRAN_BOX(qbxm1),
-                          BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                          BL_TO_FORTRAN_FAB(U_cc),
-                          AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+      make_cell_center(qbxm1, Sborder.array(mfi), U_cc_arr, domain_lo, domain_hi);
 
       // enforce the minimum density on the new cell-centered state
       ca_enforce_minimum_density
@@ -218,17 +218,12 @@ Castro::cons_to_prim_fourth(const Real time)
       // this will create q, qaux in NUM_GROW-1 ghost cells, but that's
       // we need here
 
-      ca_make_fourth_average(BL_TO_FORTRAN_BOX(qbxm1),
-                             BL_TO_FORTRAN_FAB(q[mfi]),
-                             BL_TO_FORTRAN_FAB(q_bar[mfi]),
-                             AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+      make_fourth_average(qbxm1, q.array(mfi), q_bar.array(mfi), domain_lo, domain_hi);
 
       // not sure if we need to convert qaux this way, or if we can
       // just evaluate it (we may not need qaux at all actually)
-      ca_make_fourth_average(BL_TO_FORTRAN_BOX(qbxm1),
-                             BL_TO_FORTRAN_FAB(qaux[mfi]),
-                             BL_TO_FORTRAN_FAB(qaux_bar[mfi]),
-                             AMREX_ARLIM_ANYD(domain_lo), AMREX_ARLIM_ANYD(domain_hi));
+
+      make_fourth_average(qbmx1, qaux.array(mfi), qaux_bar.array(mfi), domain_lo, domain_hi);
 
     }
 
diff --git a/Source/hydro/Castro_mol_hydro.cpp b/Source/hydro/Castro_mol_hydro.cpp
index 7417c836da..cb3ce1bfac 100644
--- a/Source/hydro/Castro_mol_hydro.cpp
+++ b/Source/hydro/Castro_mol_hydro.cpp
@@ -41,8 +41,8 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
   BL_PROFILE_VAR("Castro::advance_hydro_ca_umdrv()", CA_UMDRV);
 
-  const int* domain_lo = geom.Domain().loVect();
-  const int* domain_hi = geom.Domain().hiVect();
+  const auto domain_lo = geom.Domain().loVect3d();
+  const auto domain_hi = geom.Domain().hiVect3d();
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -253,11 +253,8 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
 
             if (do_hydro == 0) {
-              Array4<Real> const f_avg_arr = f_avg.array();
-
               AMREX_PARALLEL_FOR_4D(nbx, NUM_STATE, i, j, k, n, {
                 f_avg_arr(i,j,k,n) = 0.0;});
-
             }
 
 #ifdef DIFFUSION
@@ -289,23 +286,19 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
 #if AMREX_SPACEDIM >= 2
             // construct the face-center interface states q_fc
-            AMREX_PARALLEL_FOR_4D(nbx, NQ, i, j, k, n, {
-                bool test = (n == QGC) || (n == QTEMP);
-
-                if (test) continue;
+            amrex::ParallelFor(bx, NQ,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+            {
+              bool test = (n == QGC) || (n == QTEMP);
 
-                Real lap = 0.0;
-                int ncomp_f = n + 1;
+              if (test) return;
 
-                trans_laplacian(i, j, k, ncomp_f,
-                                idir_f,
-                                BL_TO_FORTRAN_ANYD(q_avg), NQ,
-                                &lap,
-                                ARLIM_3D(domain_lo), ARLIM_3D(domain_hi));
+              Real lap = trans_laplacian(i, j, k, n, idir,
+                                         q_avg_arr, domain_lo, domain_hi);
 
-                q_fc_arr(i,j,k,n) = q_avg_arr(i,j,k,n) - 1.0/24.0 * lap;
+              q_fc_arr(i,j,k,n) = q_avg_arr(i,j,k,n) - (1.0_rt/24.0_rt) * lap;
 
-              });
+            });
 
             // compute the face-center fluxes F(q_fc)
             Array4<Real> const f_arr = (flux[idir]).array();
@@ -336,20 +329,16 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
             // compute the final fluxes
             Array4<Real> const flux_arr = (flux[idir]).array();
 
-            AMREX_PARALLEL_FOR_4D(nbx, NUM_STATE, i, j, k, n, {
-
-                Real lap = 0.0;
-                int ncomp_f = n + 1;
+            amrex::ParallelFor(nbx, NUM_STATE,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+            {
 
-                trans_laplacian(i, j, k, ncomp_f,
-                                idir_f,
-                                BL_TO_FORTRAN_ANYD(f_avg), NUM_STATE,
-                                &lap,
-                                ARLIM_3D(domain_lo), ARLIM_3D(domain_hi));
+              Real lap = trans_laplacian(i, j, k, n, idir,
+                                         f_avg_arr, domain_lo, domain_hi);
 
-                flux_arr(i,j,k,n) = flux_arr(i,j,k,n) + 1.0/24.0 * lap;
+              flux_arr(i,j,k,n) += (1.0_rt/24.0_rt) * lap;
 
-              });
+            });
 
 #endif
 
diff --git a/Source/hydro/Make.package b/Source/hydro/Make.package
index b56b2b6ac0..8fcfca5467 100644
--- a/Source/hydro/Make.package
+++ b/Source/hydro/Make.package
@@ -33,6 +33,8 @@ ca_F90EXE_sources += Castro_ctu_nd.F90
 
 ifeq ($(USE_TRUE_SDC),TRUE)
   ca_F90EXE_sources += fourth_order_nd.F90
+  CEXE_sources += fourth_center_average.cpp
+  CEXE_headers += fourth_center_average.H
   ca_F90EXE_sources += Castro_mol_nd.F90
   ca_F90EXE_sources += Castro_fourth_order_nd.F90
 endif
diff --git a/Source/hydro/fourth_center_average.H b/Source/hydro/fourth_center_average.H
new file mode 100644
index 0000000000..4af48507da
--- /dev/null
+++ b/Source/hydro/fourth_center_average.H
@@ -0,0 +1,43 @@
+#ifndef _FOURTH_CENTER_AVERAGE_H_
+#define _FOURTH_CENTER_AVERAGE_H_
+
+Real compute_laplacian(int i, int j, int k, int n,
+                       Array4<Real const> const a,
+                       GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+Real trans_laplacian(int i, int j, int k, int n,
+                     int idir,
+                     Array4<Real const> const a,
+                     GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void make_cell_center(const Box& bx,
+                      Array4<Real const> const& U,
+                      Array4<Real> const& U_cc,
+                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void make_cell_center_in_place(const Box& bx,
+                               Array4<Real> const& U,
+                               Array4<Real> const& tmp,
+                               GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void compute_lap_term(const Box& bx,
+                      Array4<Real const> const& U,
+                      Array4<Real> const& lap, const int ncomp,
+                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void make_fourth_average(const Box& bx,
+                         Array4<Real> const& q,
+                         Array4<Real const> const& q_bar,
+                         GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void make_fourth_in_place(const Box& bx,
+                          Array4<Real> const& q,
+                          Array4<Real> const& tmp,
+                          GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+void make_fourth_in_place_n(const Box& bx,
+                            Array4<Real> const& q, const int ncomp,
+                            Array4<Real> const& tmp,
+                            GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+
+#endif
diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
index b27941e108..7fbcbe2903 100644
--- a/Source/hydro/fourth_center_average.cpp
+++ b/Source/hydro/fourth_center_average.cpp
@@ -1,3 +1,12 @@
+#include <AMReX_Geometry.H>
+#include <AMReX_Gpu.H>
+
+#include <state_indices.H>
+
+#include "castro_params.H"
+
+using namespace amrex;
+
 
 
 // Note: pretty much all of these routines below assume that dx(1) = dx(2) = dx(3)
@@ -162,7 +171,7 @@ void make_cell_center_in_place(const Box& bx,
 
 
 void compute_lap_term(const Box& bx,
-                      Array4<Real const> const& Y,
+                      Array4<Real const> const& U,
                       Array4<Real> const& lap, const int ncomp,
                       GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
 
diff --git a/Source/sdc/Castro_sdc.cpp b/Source/sdc/Castro_sdc.cpp
index b1e3b1fce6..65cfae3999 100644
--- a/Source/sdc/Castro_sdc.cpp
+++ b/Source/sdc/Castro_sdc.cpp
@@ -26,8 +26,8 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
     // for 4th order reactive SDC, we need to first compute the source, C
     // and do a ghost cell fill on it
 
-    const int* domain_lo = geom.Domain().loVect();
-    const int* domain_hi = geom.Domain().hiVect();
+    auto const domain_lo = geom.Domain().loVect3d();
+    auto const domain_hi = geom.Domain().hiVect3d();
 
     // the timestep from m to m+1
     Real dt_m = (dt_sdc[m_end] - dt_sdc[m_start]) * dt;
@@ -123,6 +123,7 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
     FArrayBox C_center;
     FArrayBox U_new_center;
     FArrayBox R_new;
+    FArrayBox tmp;
 
     FArrayBox C2;
 
@@ -193,10 +194,10 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
             // convert the starting U to cell-centered on a fab-by-fab basis
             // -- including one ghost cell
             U_center.resize(bx1, NUM_STATE);
-            ca_make_cell_center(BL_TO_FORTRAN_BOX(bx1),
-                                BL_TO_FORTRAN_FAB(Sborder[mfi]),
-                                BL_TO_FORTRAN_FAB(U_center),
-                                AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+            Elixir elix_u_center = U_center.elixir();
+            auto U_center_arr = U_center.array();
+
+            make_cell_center(bx1, Sborder.array(mfi), U_center_arr, domain_lo, domain_hi);
 
             // sometimes the Laplacian can make the species go negative near discontinuities
             ca_normalize_species(AMREX_INT_ANYD(bx1.loVect()), AMREX_INT_ANYD(bx1.hiVect()),
@@ -204,21 +205,20 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
 
             // convert the C source to cell-centers
             C_center.resize(bx1, NUM_STATE);
-            ca_make_cell_center(BL_TO_FORTRAN_BOX(bx1),
-                                BL_TO_FORTRAN_FAB(C_source[mfi]),
-                                BL_TO_FORTRAN_FAB(C_center),
-                                AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+            Elixir elix_c_center = C_center.elixir();
+            auto C_center_arr = C_center.array();
+
+            make_cell_center(bx1, C_source.array(mfi), C_center_arr, domain_lo, domain_hi);
 
             // solve for the updated cell-center U using our cell-centered C -- we
             // need to do this with one ghost cell
             U_new_center.resize(bx1, NUM_STATE);
+            Elixir elix_u_new_center = U_new_center.elixir();
+            auto U_new_center_arr = U_new_center.array();
 
             // initialize U_new with our guess for the new state, stored as
             // an average in Sburn
-            ca_make_cell_center(BL_TO_FORTRAN_BOX(bx1),
-                                BL_TO_FORTRAN_FAB(Sburn[mfi]),
-                                BL_TO_FORTRAN_FAB(U_new_center),
-                                AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+            make_cell_center(bx1, Sburn.array(mfi), U_new_center_arr, domain_lo, domain_hi);
 
             ca_sdc_update_centers_o4(BL_TO_FORTRAN_BOX(bx1), &dt_m,
                                      BL_TO_FORTRAN_3D(U_center),
@@ -236,9 +236,11 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
                                    BL_TO_FORTRAN_3D(U_new_center),
                                    BL_TO_FORTRAN_3D(R_new));
 
-            ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(bx),
-                                    BL_TO_FORTRAN_FAB(R_new),
-                                    AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+            tmp.resize(bx, 1);
+            Elixir elix_tmp = tmp.elixir();
+            auto const tmp_arr = tmp.array();
+
+            make_fourth_in_place(bx, R_new_arr, tmp_arr, domain_lo, domain_hi);
 
             // now do the conservative update using this <R> to get <U>
             // We'll also need to pass in <C>
@@ -313,8 +315,8 @@ Castro::construct_old_react_source(MultiFab& U_state,
 
     BL_PROFILE("Castro::construct_old_react_source()");
 
-    const int* domain_lo = geom.Domain().loVect();
-    const int* domain_hi = geom.Domain().hiVect();
+    auto const domain_lo = geom.Domain().loVect3d();
+    auto const domain_hi = geom.Domain().hiVect3d();
 
     if (sdc_order == 4 && input_is_average)
     {
@@ -323,6 +325,7 @@ Castro::construct_old_react_source(MultiFab& U_state,
 
         FArrayBox U_center;
         FArrayBox R_center;
+        FArrayBox tmp;
 
         for (MFIter mfi(U_state); mfi.isValid(); ++mfi)
         {
@@ -332,13 +335,16 @@ Castro::construct_old_react_source(MultiFab& U_state,
 
             // Convert to centers
             U_center.resize(obx, NUM_STATE);
-            ca_make_cell_center(BL_TO_FORTRAN_BOX(obx),
-                                BL_TO_FORTRAN_FAB(U_state[mfi]),
-                                BL_TO_FORTRAN_FAB(U_center),
-                                AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+            Elixir elix_u_center = U_center.elixir();
+            auto const U_center_arr = U_center.array();
+
+            make_cell_center(obx, U_state.array(mfi), U_center_arr, domain_lo, domain_hi);
 
             // burn, including one ghost cell
             R_center.resize(obx, NUM_STATE);
+            Elixir elix_r_center = R_center.elixir();
+            auto const R_center_arr = R_center.array();
+
             ca_instantaneous_react(BL_TO_FORTRAN_BOX(obx),
                                    BL_TO_FORTRAN_3D(U_center),
                                    BL_TO_FORTRAN_3D(R_center));
@@ -349,9 +355,12 @@ Castro::construct_old_react_source(MultiFab& U_state,
             Sburn[mfi].copy(R_center, obx, 0, obx, 0, NUM_STATE);
 
             // convert R to averages (in place)
-            ca_make_fourth_in_place(BL_TO_FORTRAN_BOX(bx),
-                                    BL_TO_FORTRAN_FAB(R_center),
-                                    AMREX_INT_ANYD(domain_lo), AMREX_INT_ANYD(domain_hi));
+
+            tmp.resize(bx, 1);
+            Elixir elix_tmp = tmp.elixir();
+            auto const tmp_arr = tmp.array();
+
+            make_fourth_in_place(bx, R_center_arr, tmp_arr, domain_lo, domain_hi);
 
             // copy this to the center
             R_source[mfi].copy(R_center, bx, 0, bx, 0, NUM_STATE);

From 557114ed462d68c71b6c3369beb26cd9a13a2f1c Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 12:31:33 -0400
Subject: [PATCH 3/9] this compiles now

---
 Source/driver/Castro.cpp               |  18 +-
 Source/driver/Castro_advance_sdc.cpp   |   4 +-
 Source/hydro/Castro_hydro.H            |  46 +++++
 Source/hydro/Castro_hydro.cpp          |   7 +-
 Source/hydro/Castro_mol_hydro.cpp      |  24 ++-
 Source/hydro/fourth_center_average.H   | 148 +++++++++++----
 Source/hydro/fourth_center_average.cpp | 240 ++++++++++---------------
 7 files changed, 283 insertions(+), 204 deletions(-)

diff --git a/Source/driver/Castro.cpp b/Source/driver/Castro.cpp
index 8ef5e17722..4cfa4542c7 100644
--- a/Source/driver/Castro.cpp
+++ b/Source/driver/Castro.cpp
@@ -1056,8 +1056,8 @@ Castro::initData ()
            AmrLevel::FillPatch(*this, Sborder, NUM_GROW, cur_time, State_Type, 0, NUM_STATE);
 
            // note: this cannot be tiled
-           auto const domain_lo = geom.Domain().loVect3d();
-           auto const domain_hi = geom.Domain().hiVect3d();
+           auto domain_lo = geom.Domain().loVect3d();
+           auto domain_hi = geom.Domain().hiVect3d();
 
            FArrayBox tmp;
 
@@ -1083,8 +1083,8 @@ Castro::initData ()
          AmrLevel::FillPatch(*this, Sborder, NUM_GROW, cur_time, State_Type, 0, NUM_STATE);
 
          // convert to centers -- not tile safe
-         auto const domain_lo = geom.Domain().loVect3d();
-         auto const domain_hi = geom.Domain().hiVect3d();
+         auto domain_lo = geom.Domain().loVect3d();
+         auto domain_hi = geom.Domain().hiVect3d();
 
          FArrayBox tmp;
 
@@ -3329,8 +3329,8 @@ Castro::computeTemp(MultiFab& State, Real time, int ng)
 
     // convert to cell centers -- this will result in Stemp being
     // cell centered only on 1 ghost cells
-    auto const domain_lo = geom.Domain().loVect3d();
-    auto const domain_hi = geom.Domain().hiVect3d();
+    auto domain_lo = geom.Domain().loVect3d();
+    auto domain_hi = geom.Domain().hiVect3d();
 
     FArrayBox tmp;
 
@@ -3346,7 +3346,7 @@ Castro::computeTemp(MultiFab& State, Real time, int ng)
       Elixir elix_tmp = tmp.elixir();
       auto tmp_arr = tmp.array();
 
-      make_cell_center_in_place(bx, Stemp.arry(mfi), tmp_arr, domain_lo, domain_hi);
+      make_cell_center_in_place(bx, Stemp.array(mfi), tmp_arr, domain_lo, domain_hi);
 
     }
 
@@ -3433,8 +3433,8 @@ Castro::computeTemp(MultiFab& State, Real time, int ng)
     // cell-averages -- this is 4th-order and will be a no-op for
     // those zones where e wasn't changed.
 
-    auto const domain_lo = geom.Domain().loVect3d();
-    auto const domain_hi = geom.Domain().hiVect3d();
+    auto domain_lo = geom.Domain().loVect3d();
+    auto domain_hi = geom.Domain().hiVect3d();
 
     FArrayBox tmp;
 
diff --git a/Source/driver/Castro_advance_sdc.cpp b/Source/driver/Castro_advance_sdc.cpp
index f39fffaa26..a0579916c6 100644
--- a/Source/driver/Castro_advance_sdc.cpp
+++ b/Source/driver/Castro_advance_sdc.cpp
@@ -40,8 +40,8 @@ Castro::do_advance_sdc (Real time,
   MultiFab& S_old = get_old_data(State_Type);
   MultiFab& S_new = get_new_data(State_Type);
 
-  auto const domain_lo = geom.Domain().loVect3d();
-  auto const domain_hi = geom.Domain().hiVect3d();
+  auto domain_lo = geom.Domain().loVect3d();
+  auto domain_hi = geom.Domain().hiVect3d();
 
   // Perform initialization steps.
 
diff --git a/Source/hydro/Castro_hydro.H b/Source/hydro/Castro_hydro.H
index 5ae037bc01..ed37d67bfd 100644
--- a/Source/hydro/Castro_hydro.H
+++ b/Source/hydro/Castro_hydro.H
@@ -541,4 +541,50 @@
 /// @param ng       Number of ghost cells
 ///
     void linear_to_hybrid_momentum(amrex::MultiFab& state, int ng = 0);
+
+#endif
+
+#ifdef TRUE_SDC
+
+    Real compute_laplacian(int i, int j, int k, int n,
+                           Array4<Real const> const a,
+                           GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                           GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
+
+    Real trans_laplacian(int i, int j, int k, int n,
+                         int idir,
+                         Array4<Real const> const a,
+                         GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                         GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
+
+    void make_cell_center(const Box& bx,
+                          Array4<Real const> const& U,
+                          Array4<Real> const& U_cc,
+                          GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
+    void make_cell_center_in_place(const Box& bx,
+                                   Array4<Real> const& U,
+                                   Array4<Real> const& tmp,
+                                   GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
+    void compute_lap_term(const Box& bx,
+                          Array4<Real const> const& U,
+                          Array4<Real> const& lap, const int ncomp,
+                          GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
+    void make_fourth_average(const Box& bx,
+                             Array4<Real> const& q,
+                             Array4<Real const> const& q_bar,
+                             GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
+    void make_fourth_in_place(const Box& bx,
+                              Array4<Real> const& q,
+                              Array4<Real> const& tmp,
+                              GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
+    void make_fourth_in_place_n(const Box& bx,
+                                Array4<Real> const& q, const int ncomp,
+                                Array4<Real> const& tmp,
+                                GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+
 #endif
diff --git a/Source/hydro/Castro_hydro.cpp b/Source/hydro/Castro_hydro.cpp
index e8777f66cc..74718b8697 100644
--- a/Source/hydro/Castro_hydro.cpp
+++ b/Source/hydro/Castro_hydro.cpp
@@ -124,8 +124,8 @@ Castro::cons_to_prim_fourth(const Real time)
     // convert the conservative state cell averages to primitive cell
     // averages with 4th order accuracy
 
-    auto const domain_lo = geom.Domain().loVect3d();
-    auto const domain_hi = geom.Domain().hiVect3d();
+    auto domain_lo = geom.Domain().loVect3d();
+    auto domain_hi = geom.Domain().hiVect3d();
 
     MultiFab& S_new = get_new_data(State_Type);
 
@@ -182,7 +182,6 @@ Castro::cons_to_prim_fourth(const Real time)
       // convert U_cc to q_cc (we'll store this temporarily in q,
       // qaux).  This will remain valid only on the NUM_GROW-1 ghost
       // cells.
-      auto U_cc_arr = U_cc.array();
       auto q_arr = q.array(mfi);
       auto qaux_arr = qaux.array(mfi);
 
@@ -223,7 +222,7 @@ Castro::cons_to_prim_fourth(const Real time)
       // not sure if we need to convert qaux this way, or if we can
       // just evaluate it (we may not need qaux at all actually)
 
-      make_fourth_average(qbmx1, qaux.array(mfi), qaux_bar.array(mfi), domain_lo, domain_hi);
+      make_fourth_average(qbxm1, qaux.array(mfi), qaux_bar.array(mfi), domain_lo, domain_hi);
 
     }
 
diff --git a/Source/hydro/Castro_mol_hydro.cpp b/Source/hydro/Castro_mol_hydro.cpp
index cb3ce1bfac..aca963366a 100644
--- a/Source/hydro/Castro_mol_hydro.cpp
+++ b/Source/hydro/Castro_mol_hydro.cpp
@@ -7,6 +7,8 @@
 #include "diffusion_util.H"
 #endif
 
+#include "fourth_center_average.H"
+
 using namespace amrex;
 
 ///
@@ -41,8 +43,8 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
   BL_PROFILE_VAR("Castro::advance_hydro_ca_umdrv()", CA_UMDRV);
 
-  const auto domain_lo = geom.Domain().loVect3d();
-  const auto domain_hi = geom.Domain().hiVect3d();
+  GpuArray<int, 3> domain_lo = geom.Domain().loVect3d();
+  GpuArray<int, 3> domain_hi = geom.Domain().hiVect3d();
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -286,6 +288,16 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
 #if AMREX_SPACEDIM >= 2
             // construct the face-center interface states q_fc
+            const int* lo_bc = phys_bc.lo();
+            const int* hi_bc = phys_bc.hi();
+
+            GpuArray<bool, 3> lo_periodic;
+            GpuArray<bool, 3> hi_periodic;
+            for (int idir = 0; idir < 3; idir++) {
+              lo_periodic[idir] = lo_bc[idir] == Interior;
+              hi_periodic[idir] = hi_bc[idir] == Interior;
+            }
+
             amrex::ParallelFor(bx, NQ,
             [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
             {
@@ -293,8 +305,8 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
 
               if (test) return;
 
-              Real lap = trans_laplacian(i, j, k, n, idir,
-                                         q_avg_arr, domain_lo, domain_hi);
+              Real lap = trans_laplacian(i, j, k, n, idir, q_avg_arr,
+                                         lo_periodic, hi_periodic, domain_lo, domain_hi);
 
               q_fc_arr(i,j,k,n) = q_avg_arr(i,j,k,n) - (1.0_rt/24.0_rt) * lap;
 
@@ -333,8 +345,8 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
             [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
             {
 
-              Real lap = trans_laplacian(i, j, k, n, idir,
-                                         f_avg_arr, domain_lo, domain_hi);
+              Real lap = trans_laplacian(i, j, k, n, idir, f_avg_arr,
+                                         lo_periodic, hi_periodic, domain_lo, domain_hi);
 
               flux_arr(i,j,k,n) += (1.0_rt/24.0_rt) * lap;
 
diff --git a/Source/hydro/fourth_center_average.H b/Source/hydro/fourth_center_average.H
index 4af48507da..eda63cec16 100644
--- a/Source/hydro/fourth_center_average.H
+++ b/Source/hydro/fourth_center_average.H
@@ -1,43 +1,115 @@
 #ifndef _FOURTH_CENTER_AVERAGE_H_
 #define _FOURTH_CENTER_AVERAGE_H_
 
-Real compute_laplacian(int i, int j, int k, int n,
-                       Array4<Real const> const a,
-                       GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-Real trans_laplacian(int i, int j, int k, int n,
-                     int idir,
-                     Array4<Real const> const a,
-                     GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void make_cell_center(const Box& bx,
-                      Array4<Real const> const& U,
-                      Array4<Real> const& U_cc,
-                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void make_cell_center_in_place(const Box& bx,
-                               Array4<Real> const& U,
-                               Array4<Real> const& tmp,
-                               GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void compute_lap_term(const Box& bx,
-                      Array4<Real const> const& U,
-                      Array4<Real> const& lap, const int ncomp,
-                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void make_fourth_average(const Box& bx,
-                         Array4<Real> const& q,
-                         Array4<Real const> const& q_bar,
-                         GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void make_fourth_in_place(const Box& bx,
-                          Array4<Real> const& q,
-                          Array4<Real> const& tmp,
-                          GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
-
-void make_fourth_in_place_n(const Box& bx,
-                            Array4<Real> const& q, const int ncomp,
-                            Array4<Real> const& tmp,
-                            GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi);
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+Real
+Castro:: compute_laplacian(int i, int j, int k, int n,
+                           Array4<Real const> const a,
+                           GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                           GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
+
+  Real lapx = 0.0;
+  Real lapy = 0.0;
+  Real lapz = 0.0;
+
+  // we use 2nd-order accurate one-sided stencils at the physical
+  // boundaries note: this differs from the suggestion in MC2011 --
+  // they just suggest using the Laplacian from +1 off the interior.
+  // I like the one-sided better.
+
+  if (i == domlo[0] && !lo_periodic[0]) {
+    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
+
+  } else if (i == domhi[0] && !hi_periodic[0]) {
+    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
+
+  } else {
+    lapx = a(i+1,j,k,n) - 2.0_rt*a(i,j,k,n) + a(i-1,j,k,n);
+  }
+
+#if AMREX_SPACEDIM >= 2
+  if (j == domlo[1] && !lo_periodic[1]) {
+    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
+
+  } else if (j == domhi[1] && !hi_periodic[1]) {
+    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
+
+  } else {
+    lapy = a(i,j+1,k,n) - 2.0_rt*a(i,j,k,n) + a(i,j-1,k,n);
+  }
+#endif
+
+#if AMREX_SPACEDIM == 3
+  if (k == domlo[2] && !lo_periodic[2]) {
+    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
+
+  } else if (k == domhi[2] && !hi_periodic[2]) {
+    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
+
+  } else {
+    lapz = a(i,j,k+1,n) - 2.0_rt*a(i,j,k,n) + a(i,j,k-1,n);
+  }
+#endif
+
+  return lapx + lapy + lapz;
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+Real
+Castro::trans_laplacian(int i, int j, int k, int n,
+                        int idir,
+                        Array4<Real const> const a,
+                        GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                        GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
+
+  Real lapx = 0.0;
+  Real lapy = 0.0;
+  Real lapz = 0.0;
+
+  // we use 2nd-order accurate one-sided stencils at the physical boundaries
+  if (idir != 0) {
+
+    if (i == domlo[0] && !lo_periodic[0]) {
+      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
+
+    } else if (i == domhi[0] && !hi_periodic[0]) {
+      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
+
+    } else {
+      lapx = a(i+1,j,k,n) - 2.0_rt*a(i,j,k,n) + a(i-1,j,k,n);
+    }
+  }
+
+  if (idir != 1) {
+
+    if (j == domlo[1] && !lo_periodic[1]) {
+      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
+
+    } else if (j == domhi[1] && !hi_periodic[1]) {
+      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
+
+    } else {
+      lapy = a(i,j+1,k,n) - 2.0_rt*a(i,j,k,n) + a(i,j-1,k,n);
+    }
+  }
+
+#if AMREX_SPACEDIM == 3
+  if (idir != 2) {
+
+    if (k == domlo[2] && !lo_periodic[2]) {
+      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
+
+    } else if (k == domhi[2] && !hi_periodic[2]) {
+      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
+
+    } else {
+      lapz = a(i,j,k+1,n) - 2.0_rt*a(i,j,k,n) + a(i,j,k-1,n);
+    }
+  }
+#endif
+
+  return lapx + lapy + lapz;
+}
+
 
 #endif
diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
index 7fbcbe2903..9f715082b8 100644
--- a/Source/hydro/fourth_center_average.cpp
+++ b/Source/hydro/fourth_center_average.cpp
@@ -1,151 +1,53 @@
-#include <AMReX_Geometry.H>
-#include <AMReX_Gpu.H>
-
-#include <state_indices.H>
-
-#include "castro_params.H"
+#include "Castro.H"
+#include "fourth_center_average.H"
 
 using namespace amrex;
 
-
-
 // Note: pretty much all of these routines below assume that dx(1) = dx(2) = dx(3)
 
-Real compute_laplacian(int i, int j, int k, int n,
-                       Array4<Real const> const a,
-                       GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
-
-  Real lapx = 0.0;
-  Real lapy = 0.0;
-  Real lapz = 0.0;
-
-  // we use 2nd-order accurate one-sided stencils at the physical
-  // boundaries note: this differs from the suggestion in MC2011 --
-  // they just suggest using the Laplacian from +1 off the interior.
-  // I like the one-sided better.
-
-  if (i == domlo[0] && physbc_lo[0] != Interior) {
-    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
-
-  } else if (i == domhi[0] && physbc_hi[0] != Interior) {
-    lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
-
-  } else {
-    lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n);
-  }
-
-#if AMREX_SPACEDIM >= 2
-  if (j == domlo[1] && physbc_lo[1] != Interior) {
-    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
-
-  } else if (j == domhi[1] && physbc_hi[1] != Interior) {
-    lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
-
-  } else {
-    lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n);
-  }
-#endif
-
-#if AMREX_SPACEDIM == 3
-  if (k == domlo[2] && physbc_lo[2] != Interior) {
-    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
-
-  } else if (k == domhi[2] && physbc_hi[2] != Interior) {
-    lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
-
-  } else {
-    lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n);
-  }
-#endif
-
-  return lapx + lapy + lapz;
-}
-
-
-Real trans_laplacian(int i, int j, int k, int n,
-                     int idir,
-                     Array4<Real const> const a,
-                     GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
-
-  Real lapx = 0.0;
-  Real lapy = 0.0;
-  Real lapz = 0.0;
-
-  // we use 2nd-order accurate one-sided stencils at the physical boundaries
-  if (idir != 0) {
-
-    if (i == domlo[0] && physbc_lo[0] != Interior) {
-      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i+1,j,k,n) + 4.0_rt*a(i+2,j,k,n) - a(i+3,j,k,n);
-
-    } else if (i == domhi[0] && physbc_hi[0] != Interior) {
-      lapx = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i-1,j,k,n) + 4.0_rt*a(i-2,j,k,n) - a(i-3,j,k,n);
-
-    } else {
-      lapx = a(i+1,j,k,n) - TWO*a(i,j,k,n) + a(i-1,j,k,n);
-    }
-  }
-
-  if (idir != 1) {
 
-    if (j == domlo[1] && physbc_lo[1] != Interior) {
-      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j+1,k,n) + 4.0_rt*a(i,j+2,k,n) - a(i,j+3,k,n);
-
-    } else if (j == domhi[1] && physbc_hi[1] != Interior) {
-      lapy = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j-1,k,n) + 4.0_rt*a(i,j-2,k,n) - a(i,j-3,k,n);
-
-    } else {
-      lapy = a(i,j+1,k,n) - TWO*a(i,j,k,n) + a(i,j-1,k,n);
-    }
-  }
-
-#if AMREX_SPACEDIM == 3
-  if (idir != 2) {
-
-    if (k == domlo[2] && physbc_lo[2] != Interior) {
-      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k+1,n) + 4.0_rt*a(i,j,k+2,n) - a(i,j,k+3,n);
-
-    } else if (k == domhi[2] && physbc_hi[2] != Interior) {
-      lapz = 2.0_rt*a(i,j,k,n) - 5.0_rt*a(i,j,k-1,n) + 4.0_rt*a(i,j,k-2,n) - a(i,j,k-3,n);
-
-    } else {
-      lapz = a(i,j,k+1,n) - TWO*a(i,j,k,n) + a(i,j,k-1,n);
-    }
-  }
-#endif
-
-  return lapx + lapy + lapz;
-}
-
-
-void make_cell_center(const Box& bx,
-                      Array4<Real const> const& U,
-                      Array4<Real> const& U_cc,
-                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::make_cell_center(const Box& bx,
+                         Array4<Real const> const& U,
+                         Array4<Real> const& U_cc,
+                         GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Take a cell-average state U and a convert it to a cell-center
   // state U_cc via U_cc = U - 1/24 L U
 
-  U_lo = U.lbound();
-  U_hi = U.ubound();
+  auto U_lo = lbound(U);
+  auto U_hi = ubound(U);
 
   AMREX_ASSERT(U_lo[0] <= lo[0]-1 && U_hi[0] >= hi[0]+1 &&
                (AMREX_SPACEDIM >= 2 && (U_lo[1] <= lo[1]-1 && U_hi[1] >= hi[1]+1)) &&
                (AMREX_SPACEDIM == 3 && (U_lo[2] <= lo[2]-1 && U_hi[2] >= hi[2]+1)));
 
+  const int* lo_bc = phys_bc.lo();
+  const int* hi_bc = phys_bc.hi();
+
+  GpuArray<bool, 3> lo_periodic;
+  GpuArray<bool, 3> hi_periodic;
+  for (int idir = 0; idir < 3; idir++) {
+    lo_periodic[idir] = lo_bc[idir] == Interior;
+    hi_periodic[idir] = hi_bc[idir] == Interior;
+  }
+
   amrex::ParallelFor(bx, U.nComp(),
   [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
   {
-    Real lap = compute_laplacian(i, j, k, n, U, domlo, domhi);
+    Real lap = compute_laplacian(i, j, k, n, U,
+                                 lo_periodic, hi_periodic, domlo, domhi);
 
     U_cc(i,j,k,n) = U(i,j,k,n) - (1.0_rt/24.0_rt) * lap;
 
   });
 }
 
-void make_cell_center_in_place(const Box& bx,
-                               Array4<Real> const& U,
-                               Array4<Real> const& tmp,
-                               GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::make_cell_center_in_place(const Box& bx,
+                                  Array4<Real> const& U,
+                                  Array4<Real> const& tmp,
+                                  GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Take a cell-average state U and make it cell-centered in place
   // via U <- U - 1/24 L U.  Note that this operation is not tile
@@ -153,12 +55,23 @@ void make_cell_center_in_place(const Box& bx,
 
   // here, tmp is a temporary memory space we use to store one-component's Laplacian
 
+  const int* lo_bc = phys_bc.lo();
+  const int* hi_bc = phys_bc.hi();
+
+  GpuArray<bool, 3> lo_periodic;
+  GpuArray<bool, 3> hi_periodic;
+  for (int idir = 0; idir < 3; idir++) {
+    lo_periodic[idir] = lo_bc[idir] == Interior;
+    hi_periodic[idir] = hi_bc[idir] == Interior;
+  }
+
   for (int n = 0; n < U.nComp(); n++) {
 
     amrex::ParallelFor(bx,
     [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
     {
-      tmp(i,j,k) = compute_laplacian(i, j, k, n, U, domlo, domhi);
+      tmp(i,j,k) = compute_laplacian(i, j, k, n, U,
+                                     lo_periodic, hi_periodic, domlo, domhi);
     });
 
     amrex::ParallelFor(bx,
@@ -170,47 +83,72 @@ void make_cell_center_in_place(const Box& bx,
 }
 
 
-void compute_lap_term(const Box& bx,
-                      Array4<Real const> const& U,
-                      Array4<Real> const& lap, const int ncomp,
-                      GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::compute_lap_term(const Box& bx,
+                         Array4<Real const> const& U,
+                         Array4<Real> const& lap, const int ncomp,
+                         GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Computes the h**2/24 L U term that is used in correcting
   // cell-center to averages (and back)
 
+  const int* lo_bc = phys_bc.lo();
+  const int* hi_bc = phys_bc.hi();
+
+  GpuArray<bool, 3> lo_periodic;
+  GpuArray<bool, 3> hi_periodic;
+  for (int idir = 0; idir < 3; idir++) {
+    lo_periodic[idir] = lo_bc[idir] == Interior;
+    hi_periodic[idir] = hi_bc[idir] == Interior;
+  }
+
   amrex::ParallelFor(bx,
   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
   {
     lap(i,j,k) = (1.0_rt/24.0_rt) *
-      compute_laplacian(i, j, k, ncomp, U, domlo, domhi);
+      compute_laplacian(i, j, k, ncomp, U,
+                        lo_periodic, hi_periodic, domlo, domhi);
   });
 
 }
 
 
-void make_fourth_average(const Box& bx,
-                         Array4<Real> const& q,
-                         Array4<Real const> const& q_bar,
-                         GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::make_fourth_average(const Box& bx,
+                            Array4<Real> const& q,
+                            Array4<Real const> const& q_bar,
+                            GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Take the cell-center state q and another state q_bar (e.g.,
   // constructed from the cell-average U) and replace the cell-center
   // q with a 4th-order accurate cell-average, q <- q + 1/24 L q_bar
 
+  const int* lo_bc = phys_bc.lo();
+  const int* hi_bc = phys_bc.hi();
+
+  GpuArray<bool, 3> lo_periodic;
+  GpuArray<bool, 3> hi_periodic;
+  for (int idir = 0; idir < 3; idir++) {
+    lo_periodic[idir] = lo_bc[idir] == Interior;
+    hi_periodic[idir] = hi_bc[idir] == Interior;
+  }
+
   amrex::ParallelFor(bx, q.nComp(),
   [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
   {
-    Real lap = compute_laplacian(i, j, k, n, q_bar, domlo, domhi);
+    Real lap = compute_laplacian(i, j, k, n, q_bar,
+                                 lo_periodic, hi_periodic, domlo, domhi);
 
     q(i,j,k,n) += (1.0_rt/24.0_rt) * lap;
   });
 }
 
 
-void make_fourth_in_place(const Box& bx,
-                          Array4<Real> const& q,
-                          Array4<Real> const& tmp,
-                          GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::make_fourth_in_place(const Box& bx,
+                             Array4<Real> const& q,
+                             Array4<Real> const& tmp,
+                             GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Take the cell-center q and makes it a cell-average q, in place
   // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
@@ -222,10 +160,11 @@ void make_fourth_in_place(const Box& bx,
 }
 
 
-void make_fourth_in_place_n(const Box& bx,
-                            Array4<Real> const& q, const int ncomp,
-                            Array4<Real> const& tmp,
-                            GpuArray<Real, 3>& domlo, GpuArray<Real, 3>& domhi) {
+void
+Castro::make_fourth_in_place_n(const Box& bx,
+                               Array4<Real> const& q, const int ncomp,
+                               Array4<Real> const& tmp,
+                               GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
 
   // Take the cell-center q and makes it a cell-average q, in place
   // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
@@ -234,16 +173,27 @@ void make_fourth_in_place_n(const Box& bx,
   // This version operates on a single component.  Here ncomp is the
   // component to update
 
+  const int* lo_bc = phys_bc.lo();
+  const int* hi_bc = phys_bc.hi();
+
+  GpuArray<bool, 3> lo_periodic;
+  GpuArray<bool, 3> hi_periodic;
+  for (int idir = 0; idir < 3; idir++) {
+    lo_periodic[idir] = lo_bc[idir] == Interior;
+    hi_periodic[idir] = hi_bc[idir] == Interior;
+  }
+
   amrex::ParallelFor(bx,
   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
   {
-    tmp(i,j,k) = compute_laplacian(i, j, k, ncomp, q, domlo, domhi);
+    tmp(i,j,k) = compute_laplacian(i, j, k, ncomp, q,
+                                   lo_periodic, hi_periodic, domlo, domhi);
   });
 
   amrex::ParallelFor(bx,
   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
   {
-    q(i,j,k,ncomp) += (1.0_rt/24.0_rt) * lap(i,j,k);
+    q(i,j,k,ncomp) += (1.0_rt/24.0_rt) * tmp(i,j,k);
   });
 
 }

From 969b046d60ae0a083821b24d3e20d56639d6e3e1 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 13:07:24 -0400
Subject: [PATCH 4/9] fix assert

---
 Source/hydro/fourth_center_average.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
index 9f715082b8..1a5685995a 100644
--- a/Source/hydro/fourth_center_average.cpp
+++ b/Source/hydro/fourth_center_average.cpp
@@ -18,9 +18,12 @@ Castro::make_cell_center(const Box& bx,
   auto U_lo = lbound(U);
   auto U_hi = ubound(U);
 
-  AMREX_ASSERT(U_lo[0] <= lo[0]-1 && U_hi[0] >= hi[0]+1 &&
-               (AMREX_SPACEDIM >= 2 && (U_lo[1] <= lo[1]-1 && U_hi[1] >= hi[1]+1)) &&
-               (AMREX_SPACEDIM == 3 && (U_lo[2] <= lo[2]-1 && U_hi[2] >= hi[2]+1)));
+  auto lo = bx.loVect();
+  auto hi = bx.hiVect();
+
+  AMREX_ASSERT(U_lo.x <= lo[0]-1 && U_hi.x >= hi[0]+1 &&
+               (AMREX_SPACEDIM >= 2 && (U_lo.y <= lo[1]-1 && U_hi.y >= hi[1]+1)) &&
+               (AMREX_SPACEDIM == 3 && (U_lo.z <= lo[2]-1 && U_hi.z >= hi[2]+1)));
 
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();

From afce97b212f95bd25676099cd2070cf75401d754 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 13:18:04 -0400
Subject: [PATCH 5/9] fix the fix

---
 Source/hydro/fourth_center_average.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
index 1a5685995a..95053f3e0f 100644
--- a/Source/hydro/fourth_center_average.cpp
+++ b/Source/hydro/fourth_center_average.cpp
@@ -21,9 +21,13 @@ Castro::make_cell_center(const Box& bx,
   auto lo = bx.loVect();
   auto hi = bx.hiVect();
 
-  AMREX_ASSERT(U_lo.x <= lo[0]-1 && U_hi.x >= hi[0]+1 &&
-               (AMREX_SPACEDIM >= 2 && (U_lo.y <= lo[1]-1 && U_hi.y >= hi[1]+1)) &&
-               (AMREX_SPACEDIM == 3 && (U_lo.z <= lo[2]-1 && U_hi.z >= hi[2]+1)));
+  AMREX_ASSERT(U_lo.x <= lo[0]-1 && U_hi.x >= hi[0]+1);
+#if AMREX_SPACEDIM >= 2
+  AMREX_ASSERT(U_lo.y <= lo[1]-1 && U_hi.y >= hi[1]+1);
+#endif
+#if AMREX_SPACEDIM == 3
+  AMREX_ASSERT(U_lo.z <= lo[2]-1 && U_hi.z >= hi[2]+1);
+#endif
 
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();

From 4322ff6484aa64b934b2a5a0cd8955ec922d9e8a Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 13:23:00 -0400
Subject: [PATCH 6/9] fix a copy for SDC -- wrong number of ghost cells were
 assumed

---
 Source/driver/Castro_advance_sdc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/driver/Castro_advance_sdc.cpp b/Source/driver/Castro_advance_sdc.cpp
index d1b9191b89..a7da9e7b1c 100644
--- a/Source/driver/Castro_advance_sdc.cpp
+++ b/Source/driver/Castro_advance_sdc.cpp
@@ -147,7 +147,7 @@ Castro::do_advance_sdc (Real time,
 
         // store the result in sources_for_hydro -- this is what will
         // be used in the final conservative update
-        MultiFab::Copy(sources_for_hydro, old_source, 0, 0, NSRC, sources_for_hydro.nGrow());
+        MultiFab::Copy(sources_for_hydro, old_source, 0, 0, NSRC, old_source.nGrow());
 
       } else {
         sources_for_hydro.setVal(0.0, 0);

From 2415b8c400461f08a2aad9cac36adf1f7fb20970 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 13:38:39 -0400
Subject: [PATCH 7/9] fix a box

---
 Source/hydro/Castro_mol_hydro.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/hydro/Castro_mol_hydro.cpp b/Source/hydro/Castro_mol_hydro.cpp
index aca963366a..a6f021a2aa 100644
--- a/Source/hydro/Castro_mol_hydro.cpp
+++ b/Source/hydro/Castro_mol_hydro.cpp
@@ -298,7 +298,7 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
               hi_periodic[idir] = hi_bc[idir] == Interior;
             }
 
-            amrex::ParallelFor(bx, NQ,
+            amrex::ParallelFor(nbx, NQ,
             [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
             {
               bool test = (n == QGC) || (n == QTEMP);

From fd605153687a4d2fdbb2ab746ed54ac02bfaf850 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Mon, 27 Apr 2020 14:18:41 -0400
Subject: [PATCH 8/9] fix compilation

---
 Source/sdc/Castro_sdc.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Source/sdc/Castro_sdc.cpp b/Source/sdc/Castro_sdc.cpp
index 65cfae3999..3aebc95047 100644
--- a/Source/sdc/Castro_sdc.cpp
+++ b/Source/sdc/Castro_sdc.cpp
@@ -26,8 +26,8 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
     // for 4th order reactive SDC, we need to first compute the source, C
     // and do a ghost cell fill on it
 
-    auto const domain_lo = geom.Domain().loVect3d();
-    auto const domain_hi = geom.Domain().hiVect3d();
+    auto domain_lo = geom.Domain().loVect3d();
+    auto domain_hi = geom.Domain().hiVect3d();
 
     // the timestep from m to m+1
     Real dt_m = (dt_sdc[m_end] - dt_sdc[m_start]) * dt;
@@ -123,7 +123,7 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
     FArrayBox C_center;
     FArrayBox U_new_center;
     FArrayBox R_new;
-    FArrayBox tmp;
+    FArrayBox tlap;
 
     FArrayBox C2;
 
@@ -230,17 +230,17 @@ Castro::do_sdc_update(int m_start, int m_end, Real dt)
             // place (only for the interior)
             R_new.resize(bx1, NUM_STATE);
             Elixir elix_R_new = R_new.elixir();
-            Array4<const Real> const& R_new_arr=R_new.array();
+            Array4<Real> const& R_new_arr = R_new.array();
 
             ca_instantaneous_react(BL_TO_FORTRAN_BOX(bx1),
                                    BL_TO_FORTRAN_3D(U_new_center),
                                    BL_TO_FORTRAN_3D(R_new));
 
-            tmp.resize(bx, 1);
-            Elixir elix_tmp = tmp.elixir();
-            auto const tmp_arr = tmp.array();
+            tlap.resize(bx, 1);
+            Elixir elix_tlap = tlap.elixir();
+            auto const tlap_arr = tlap.array();
 
-            make_fourth_in_place(bx, R_new_arr, tmp_arr, domain_lo, domain_hi);
+            make_fourth_in_place(bx, R_new_arr, tlap_arr, domain_lo, domain_hi);
 
             // now do the conservative update using this <R> to get <U>
             // We'll also need to pass in <C>
@@ -315,8 +315,8 @@ Castro::construct_old_react_source(MultiFab& U_state,
 
     BL_PROFILE("Castro::construct_old_react_source()");
 
-    auto const domain_lo = geom.Domain().loVect3d();
-    auto const domain_hi = geom.Domain().hiVect3d();
+    auto domain_lo = geom.Domain().loVect3d();
+    auto domain_hi = geom.Domain().hiVect3d();
 
     if (sdc_order == 4 && input_is_average)
     {

From 859a796e15a65bc38e2651836b5aaca344261797 Mon Sep 17 00:00:00 2001
From: Michael Zingale <michael.zingale@stonybrook.edu>
Date: Wed, 6 May 2020 09:24:26 -0400
Subject: [PATCH 9/9] address some PR reviews

---
 Source/hydro/Castro_hydro.H            | 19 ++++++------
 Source/hydro/Castro_mol_hydro.cpp      |  6 ++--
 Source/hydro/fourth_center_average.H   |  8 ++---
 Source/hydro/fourth_center_average.cpp | 42 +++++++++++++-------------
 4 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/Source/hydro/Castro_hydro.H b/Source/hydro/Castro_hydro.H
index ed37d67bfd..f1e238c668 100644
--- a/Source/hydro/Castro_hydro.H
+++ b/Source/hydro/Castro_hydro.H
@@ -545,46 +545,47 @@
 #endif
 
 #ifdef TRUE_SDC
-
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Real compute_laplacian(int i, int j, int k, int n,
                            Array4<Real const> const a,
-                           GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                           GpuArray<bool, AMREX_SPACEDIM> const& lo_periodic, GpuArray<bool, AMREX_SPACEDIM> const& hi_periodic,
                            GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Real trans_laplacian(int i, int j, int k, int n,
                          int idir,
                          Array4<Real const> const a,
-                         GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                         GpuArray<bool, AMREX_SPACEDIM> const& lo_periodic, GpuArray<bool, AMREX_SPACEDIM> const& hi_periodic,
                          GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void make_cell_center(const Box& bx,
                           Array4<Real const> const& U,
                           Array4<Real> const& U_cc,
-                          GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                          GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void make_cell_center_in_place(const Box& bx,
                                    Array4<Real> const& U,
                                    Array4<Real> const& tmp,
-                                   GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                                   GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void compute_lap_term(const Box& bx,
                           Array4<Real const> const& U,
                           Array4<Real> const& lap, const int ncomp,
-                          GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                          GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void make_fourth_average(const Box& bx,
                              Array4<Real> const& q,
                              Array4<Real const> const& q_bar,
-                             GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                             GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void make_fourth_in_place(const Box& bx,
                               Array4<Real> const& q,
                               Array4<Real> const& tmp,
-                              GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                              GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
     void make_fourth_in_place_n(const Box& bx,
                                 Array4<Real> const& q, const int ncomp,
                                 Array4<Real> const& tmp,
-                                GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi);
+                                GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi);
 
 #endif
diff --git a/Source/hydro/Castro_mol_hydro.cpp b/Source/hydro/Castro_mol_hydro.cpp
index a6f021a2aa..05c0690324 100644
--- a/Source/hydro/Castro_mol_hydro.cpp
+++ b/Source/hydro/Castro_mol_hydro.cpp
@@ -291,9 +291,9 @@ Castro::construct_mol_hydro_source(Real time, Real dt, MultiFab& A_update)
             const int* lo_bc = phys_bc.lo();
             const int* hi_bc = phys_bc.hi();
 
-            GpuArray<bool, 3> lo_periodic;
-            GpuArray<bool, 3> hi_periodic;
-            for (int idir = 0; idir < 3; idir++) {
+            GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+            GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+            for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
               lo_periodic[idir] = lo_bc[idir] == Interior;
               hi_periodic[idir] = hi_bc[idir] == Interior;
             }
diff --git a/Source/hydro/fourth_center_average.H b/Source/hydro/fourth_center_average.H
index eda63cec16..86bb837091 100644
--- a/Source/hydro/fourth_center_average.H
+++ b/Source/hydro/fourth_center_average.H
@@ -1,11 +1,11 @@
 #ifndef _FOURTH_CENTER_AVERAGE_H_
 #define _FOURTH_CENTER_AVERAGE_H_
 
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 Real
 Castro:: compute_laplacian(int i, int j, int k, int n,
                            Array4<Real const> const a,
-                           GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                           GpuArray<bool, AMREX_SPACEDIM> const& lo_periodic, GpuArray<bool, AMREX_SPACEDIM> const& hi_periodic,
                            GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   Real lapx = 0.0;
@@ -54,12 +54,12 @@ Castro:: compute_laplacian(int i, int j, int k, int n,
   return lapx + lapy + lapz;
 }
 
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 Real
 Castro::trans_laplacian(int i, int j, int k, int n,
                         int idir,
                         Array4<Real const> const a,
-                        GpuArray<bool, 3> const& lo_periodic, GpuArray<bool, 3> const& hi_periodic,
+                        GpuArray<bool, AMREX_SPACEDIM> const& lo_periodic, GpuArray<bool, AMREX_SPACEDIM> const& hi_periodic,
                         GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   Real lapx = 0.0;
diff --git a/Source/hydro/fourth_center_average.cpp b/Source/hydro/fourth_center_average.cpp
index 95053f3e0f..04a909b795 100644
--- a/Source/hydro/fourth_center_average.cpp
+++ b/Source/hydro/fourth_center_average.cpp
@@ -10,7 +10,7 @@ void
 Castro::make_cell_center(const Box& bx,
                          Array4<Real const> const& U,
                          Array4<Real> const& U_cc,
-                         GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                         GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Take a cell-average state U and a convert it to a cell-center
   // state U_cc via U_cc = U - 1/24 L U
@@ -32,9 +32,9 @@ Castro::make_cell_center(const Box& bx,
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();
 
-  GpuArray<bool, 3> lo_periodic;
-  GpuArray<bool, 3> hi_periodic;
-  for (int idir = 0; idir < 3; idir++) {
+  GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+  GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+  for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
     lo_periodic[idir] = lo_bc[idir] == Interior;
     hi_periodic[idir] = hi_bc[idir] == Interior;
   }
@@ -54,7 +54,7 @@ void
 Castro::make_cell_center_in_place(const Box& bx,
                                   Array4<Real> const& U,
                                   Array4<Real> const& tmp,
-                                  GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                                  GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Take a cell-average state U and make it cell-centered in place
   // via U <- U - 1/24 L U.  Note that this operation is not tile
@@ -65,9 +65,9 @@ Castro::make_cell_center_in_place(const Box& bx,
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();
 
-  GpuArray<bool, 3> lo_periodic;
-  GpuArray<bool, 3> hi_periodic;
-  for (int idir = 0; idir < 3; idir++) {
+  GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+  GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+  for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
     lo_periodic[idir] = lo_bc[idir] == Interior;
     hi_periodic[idir] = hi_bc[idir] == Interior;
   }
@@ -94,7 +94,7 @@ void
 Castro::compute_lap_term(const Box& bx,
                          Array4<Real const> const& U,
                          Array4<Real> const& lap, const int ncomp,
-                         GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                         GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Computes the h**2/24 L U term that is used in correcting
   // cell-center to averages (and back)
@@ -102,9 +102,9 @@ Castro::compute_lap_term(const Box& bx,
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();
 
-  GpuArray<bool, 3> lo_periodic;
-  GpuArray<bool, 3> hi_periodic;
-  for (int idir = 0; idir < 3; idir++) {
+  GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+  GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+  for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
     lo_periodic[idir] = lo_bc[idir] == Interior;
     hi_periodic[idir] = hi_bc[idir] == Interior;
   }
@@ -124,7 +124,7 @@ void
 Castro::make_fourth_average(const Box& bx,
                             Array4<Real> const& q,
                             Array4<Real const> const& q_bar,
-                            GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                            GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Take the cell-center state q and another state q_bar (e.g.,
   // constructed from the cell-average U) and replace the cell-center
@@ -133,9 +133,9 @@ Castro::make_fourth_average(const Box& bx,
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();
 
-  GpuArray<bool, 3> lo_periodic;
-  GpuArray<bool, 3> hi_periodic;
-  for (int idir = 0; idir < 3; idir++) {
+  GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+  GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+  for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
     lo_periodic[idir] = lo_bc[idir] == Interior;
     hi_periodic[idir] = hi_bc[idir] == Interior;
   }
@@ -155,7 +155,7 @@ void
 Castro::make_fourth_in_place(const Box& bx,
                              Array4<Real> const& q,
                              Array4<Real> const& tmp,
-                             GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                             GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Take the cell-center q and makes it a cell-average q, in place
   // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
@@ -171,7 +171,7 @@ void
 Castro::make_fourth_in_place_n(const Box& bx,
                                Array4<Real> const& q, const int ncomp,
                                Array4<Real> const& tmp,
-                               GpuArray<int, 3>& domlo, GpuArray<int, 3>& domhi) {
+                               GpuArray<int, 3> const& domlo, GpuArray<int, 3> const& domhi) {
 
   // Take the cell-center q and makes it a cell-average q, in place
   // (e.g. q is overwritten by its average), q <- q + 1/24 L q.
@@ -183,9 +183,9 @@ Castro::make_fourth_in_place_n(const Box& bx,
   const int* lo_bc = phys_bc.lo();
   const int* hi_bc = phys_bc.hi();
 
-  GpuArray<bool, 3> lo_periodic;
-  GpuArray<bool, 3> hi_periodic;
-  for (int idir = 0; idir < 3; idir++) {
+  GpuArray<bool, AMREX_SPACEDIM> lo_periodic;
+  GpuArray<bool, AMREX_SPACEDIM> hi_periodic;
+  for (int idir = 0; idir < AMREX_SPACEDIM; idir++) {
     lo_periodic[idir] = lo_bc[idir] == Interior;
     hi_periodic[idir] = hi_bc[idir] == Interior;
   }