diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl
index a3a4be35d1..2be14183e6 100644
--- a/src/DataLayouts/cuda.jl
+++ b/src/DataLayouts/cuda.jl
@@ -78,7 +78,10 @@ function Base.copyto!(
 ) where {S, Nij, A <: CUDA.CuArray}
     _, _, _, _, Nh = size(bc)
     if Nh > 0
-        CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(dest, bc)
+        CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(
+            dest,
+            bc,
+        )
     end
     return dest
 end
@@ -92,7 +95,10 @@ function Base.fill!(
 }
     _, _, _, _, Nh = size(dest)
     if Nh > 0
-        CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(dest, val)
+        CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(
+            dest,
+            val,
+        )
     end
     return dest
 end
@@ -133,14 +139,20 @@ function Base.copyto!(
 ) where {S, A <: CUDA.CuArray}
     _, _, _, Nv, Nh = size(bc)
     if Nv > 0 && Nh > 0
-        CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(dest, bc)
+        CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(
+            dest,
+            bc,
+        )
     end
     return dest
 end
 function Base.fill!(dest::VF{S, A}, val) where {S, A <: CUDA.CuArray}
     _, _, _, Nv, Nh = size(dest)
     if Nv > 0 && Nh > 0
-        CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_fill!(dest, val)
+        CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_fill!(
+            dest,
+            val,
+        )
     end
     return dest
 end
@@ -149,10 +161,16 @@ function Base.copyto!(
     dest::DataF{S},
     bc::Union{DataF{S, A}, Base.Broadcast.Broadcasted{DataFStyle{A}}},
 ) where {S, A <: CUDA.CuArray}
-    CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_copyto!(dest, bc)
+    CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_copyto!(
+        dest,
+        bc,
+    )
     return dest
 end
 function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray}
-    CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_fill!(dest, val)
+    CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_fill!(
+        dest,
+        val,
+    )
     return dest
 end
diff --git a/src/Fields/mapreduce_cuda.jl b/src/Fields/mapreduce_cuda.jl
index 64a88e13c7..a54b7a53af 100644
--- a/src/Fields/mapreduce_cuda.jl
+++ b/src/Fields/mapreduce_cuda.jl
@@ -124,7 +124,7 @@ function mapreduce_cuda(
     reduce_cuda = CuArray{T}(undef, nblocks, Nf)
     shmemsize = nthreads
     # place each field on a different block
-    @cuda threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!(
+    @cuda always_inline = true threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!(
         reduce_cuda,
         f,
         op,
@@ -138,7 +138,7 @@ function mapreduce_cuda(
     if nblocks > 1
         nthreads = min(32, nblocks)
         shmemsize = nthreads
-        @cuda threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!(
+        @cuda always_inline = true threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!(
             reduce_cuda,
             op,
             Val(shmemsize),
diff --git a/src/Limiters/quasimonotone.jl b/src/Limiters/quasimonotone.jl
index f87fd924e3..c740087ef7 100644
--- a/src/Limiters/quasimonotone.jl
+++ b/src/Limiters/quasimonotone.jl
@@ -106,7 +106,7 @@ function compute_element_bounds!(
     (Ni, Nj, _, Nv, Nh) = S
     nthreads, nblocks = config_threadblock(Nv, Nh)
 
-    CUDA.@cuda threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
         limiter,
         Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
         Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
@@ -221,7 +221,7 @@ function compute_neighbor_bounds_local!(
     topology = Spaces.topology(axes(ρ))
     Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
     nthreads, nblocks = config_threadblock(Nv, Nh)
-    CUDA.@cuda threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
         limiter,
         topology.local_neighbor_elem,
         topology.local_neighbor_elem_offset,
@@ -388,7 +388,7 @@ function apply_limiter!(
     maxiter = Ni * Nj
     WJ = Spaces.local_geometry_data(axes(ρq)).WJ
     nthreads, nblocks = config_threadblock(Nv, Nh)
-    CUDA.@cuda threads = nthreads blocks = nblocks apply_limiter_kernel!(
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks apply_limiter_kernel!(
         limiter,
         Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
         Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
diff --git a/src/MatrixFields/single_field_solver.jl b/src/MatrixFields/single_field_solver.jl
index a0f9f7b2c1..330a742c07 100644
--- a/src/MatrixFields/single_field_solver.jl
+++ b/src/MatrixFields/single_field_solver.jl
@@ -51,7 +51,7 @@ single_field_solve!(::ClimaComms.AbstractCPUDevice, cache, x, A, b) =
 function single_field_solve!(::ClimaComms.CUDADevice, cache, x, A, b)
     Ni, Nj, _, _, Nh = size(Fields.field_values(A))
     nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
-    CUDA.@cuda threads = nthreads blocks = nblocks single_field_solve_kernel!(
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks single_field_solve_kernel!(
         cache,
         x,
         A,
diff --git a/src/Operators/integrals.jl b/src/Operators/integrals.jl
index 4406b9ef28..364d1a6c63 100644
--- a/src/Operators/integrals.jl
+++ b/src/Operators/integrals.jl
@@ -23,7 +23,7 @@ function column_integral_definite!(
     space = axes(∫field)
     Ni, Nj, _, _, Nh = size(Fields.field_values(∫field))
     nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
-    @cuda threads = nthreads blocks = nblocks column_integral_definite_kernel!(
+    @cuda always_inline = true threads = nthreads blocks = nblocks column_integral_definite_kernel!(
         strip_space(∫field, space),
         strip_space(ᶜfield, space),
     )
@@ -114,7 +114,7 @@ function column_integral_indefinite!(
 )
     Ni, Nj, _, _, Nh = size(Fields.field_values(ᶠ∫field))
     nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
-    @cuda threads = nthreads blocks = nblocks column_integral_indefinite_kernel!(
+    @cuda always_inline = true threads = nthreads blocks = nblocks column_integral_indefinite_kernel!(
         ᶠ∫field,
         ᶜfield,
     )
@@ -295,7 +295,7 @@ function column_mapreduce_device!(
     else
         column_mapreduce_kernel!
     end
-    @cuda threads = nthreads blocks = nblocks kernel!(
+    @cuda always_inline = true threads = nthreads blocks = nblocks kernel!(
         fn,
         op,
         # reduced_field,
diff --git a/src/Operators/thomas_algorithm.jl b/src/Operators/thomas_algorithm.jl
index f5f460c6ff..8047be965f 100644
--- a/src/Operators/thomas_algorithm.jl
+++ b/src/Operators/thomas_algorithm.jl
@@ -17,7 +17,10 @@ column_thomas_solve!(::ClimaComms.AbstractCPUDevice, A, b) =
 function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
     Ni, Nj, _, _, Nh = size(Fields.field_values(A))
     nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
-    @cuda threads = nthreads blocks = nblocks thomas_algorithm_kernel!(A, b)
+    @cuda always_inline = true threads = nthreads blocks = nblocks thomas_algorithm_kernel!(
+        A,
+        b,
+    )
 end
 
 function thomas_algorithm_kernel!(
diff --git a/src/Remapping/distributed_remapping.jl b/src/Remapping/distributed_remapping.jl
index c399a424a3..72178f8519 100644
--- a/src/Remapping/distributed_remapping.jl
+++ b/src/Remapping/distributed_remapping.jl
@@ -515,7 +515,7 @@ function _set_interpolated_values!(
     if ClimaComms.device(field) isa ClimaComms.CUDADevice
         nblocks, _ = size(interpolation_matrix[1])
         nthreads = length(vert_interpolation_weights)
-        @cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
+        @cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
             out,
             interpolation_matrix,
             local_horiz_indices,
@@ -559,7 +559,7 @@ function _set_interpolated_values!(
     if ClimaComms.device(space) isa ClimaComms.CUDADevice
         nitems = length(out)
         nthreads, nblocks = Topologies._configure_threadblock(nitems)
-        @cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
+        @cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
             out,
             local_horiz_interpolation_weights,
             local_horiz_indices,
diff --git a/src/Remapping/interpolate_array.jl b/src/Remapping/interpolate_array.jl
index b5b957af13..990b726801 100644
--- a/src/Remapping/interpolate_array.jl
+++ b/src/Remapping/interpolate_array.jl
@@ -89,7 +89,7 @@ function interpolate_slab!(
     nitems = length(output_array)
     nthreads, nblocks = Topologies._configure_threadblock(nitems)
 
-    @cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!(
+    @cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!(
         output_cuarray,
         field,
         cuslab_indices,
@@ -332,7 +332,7 @@ function interpolate_slab_level!(
 
     nitems = length(vidx_ref_coordinates)
     nthreads, nblocks = Topologies._configure_threadblock(nitems)
-    @cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!(
+    @cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!(
         output_cuarray,
         field,
         cuvidx_ref_coordinates,
diff --git a/src/Topologies/dss_cuda.jl b/src/Topologies/dss_cuda.jl
index ba65ef0ef3..17a6e865e7 100644
--- a/src/Topologies/dss_cuda.jl
+++ b/src/Topologies/dss_cuda.jl
@@ -21,7 +21,7 @@ function dss_load_perimeter_data!(
     (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
     nitems = nlevels * nperimeter * nfid * nelems
     nthreads, nblocks = _configure_threadblock(nitems)
-    CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!(
+    CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!(
         pperimeter_data,
         pdata,
         perimeter,
@@ -58,7 +58,7 @@ function dss_unload_perimeter_data!(
     (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
     nitems = nlevels * nperimeter * nfid * nelems
     nthreads, nblocks = _configure_threadblock(nitems)
-    CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!(
+    CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!(
         pdata,
         pperimeter_data,
         perimeter,
@@ -98,7 +98,7 @@ function dss_local!(
 
         nitems = nlevels * nfid * (nlocalfaces + nlocalvertices)
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_kernel!(
             pperimeter_data,
             topology.local_vertices,
             topology.local_vertex_offset,
@@ -184,7 +184,7 @@ function dss_transform!(
         (nlevels, nperimeter, _, _) = size(pperimeter_data)
         nitems = nlevels * nperimeter * nlocalelems
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_transform_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_transform_kernel!(
             pperimeter_data,
             pdata,
             p∂ξ∂x,
@@ -290,7 +290,7 @@ function dss_untransform!(
         (nlevels, nperimeter, _, _) = size(pperimeter_data)
         nitems = nlevels * nperimeter * nlocalelems
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!(
             pperimeter_data,
             pdata,
             p∂ξ∂x,
@@ -376,7 +376,7 @@ function dss_local_ghost!(
         max_threads = 256
         nitems = nlevels * nfid * nghostvertices
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!(
             pperimeter_data,
             topology.ghost_vertices,
             topology.ghost_vertex_offset,
@@ -431,7 +431,7 @@ function fill_send_buffer!(
     if nsend > 0
         nitems = nsend * nlevels * nfid
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!(
             send_data,
             send_buf_idx,
             pperimeter_data,
@@ -473,7 +473,7 @@ function load_from_recv_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer)
     if nrecv > 0
         nitems = nrecv * nlevels * nfid
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!(
             pperimeter_data,
             recv_data,
             recv_buf_idx,
@@ -517,7 +517,7 @@ function dss_ghost!(
         nlevels, _, nfidx, _ = size(pperimeter_data)
         nitems = nlevels * nfidx * nghostvertices
         nthreads, nblocks = _configure_threadblock(nitems)
-        CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!(
+        CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!(
             pperimeter_data,
             topology.ghost_vertices,
             topology.ghost_vertex_offset,