diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl
index 167696e93d..8952a6b24a 100644
--- a/ext/ClimaCoreCUDAExt.jl
+++ b/ext/ClimaCoreCUDAExt.jl
@@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
 import ClimaCore.RecursiveApply:
     ⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
 import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
+import ClimaCore.DataLayouts: UniversalSize
 
 include(joinpath("cuda", "cuda_utils.jl"))
 include(joinpath("cuda", "data_layouts.jl"))
diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl
index 15ee90ce34..8c7d87a3bb 100644
--- a/ext/cuda/cuda_utils.jl
+++ b/ext/cuda/cuda_utils.jl
@@ -90,6 +90,12 @@ function auto_launch!(
     return nothing
 end
 
+function threads_via_occupancy(f!::F!, args) where {F!}
+    kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
+    config = CUDA.launch_configuration(kernel.fun)
+    return config.threads
+end
+
 """
     thread_index()
diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl
index 20cc9d7178..6af88f897d 100644
--- a/ext/cuda/data_layouts.jl
+++ b/ext/cuda/data_layouts.jl
@@ -29,6 +29,7 @@ include("data_layouts_fill.jl")
 include("data_layouts_copyto.jl")
 include("data_layouts_fused_copyto.jl")
 include("data_layouts_mapreduce.jl")
+include("data_layouts_threadblock.jl")
 
 adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
 adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
index 82cf46d88c..013892e602 100644
--- a/ext/cuda/data_layouts_copyto.jl
+++ b/ext/cuda/data_layouts_copyto.jl
@@ -1,111 +1,38 @@
 DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()
 
-function knl_copyto!(dest, src)
-
-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-
-    if v <= size(dest, 4)
-        I = CartesianIndex((i, j, 1, v, h))
+function knl_copyto!(dest, src, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
         @inbounds dest[I] = src[I]
     end
     return nothing
 end
 
-function Base.copyto!(
-    dest::IJFH{S, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nij, Nh}
-    if Nh > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VIJFH{S, Nv, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VF{S, Nv},
-    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    if Nv > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (1, 1),
-            blocks_s = (1, Nv),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::DataF{S},
-    bc::DataLayouts.BroadcastedUnionDataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
-    return dest
-end
-
-import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            dest[I] = bc[I]
-        end
-    end
-    return nothing
-end
-
 function cuda_copyto!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_copyto!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end
 
-# TODO: can we use CUDA's luanch configuration for all data layouts?
-# Currently, it seems to have a slight performance degradation.
 #! format: off
-# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
 #! format: on
diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
index cac5bdf526..b38d8e7ff5 100644
--- a/ext/cuda/data_layouts_fill.jl
+++ b/ext/cuda/data_layouts_fill.jl
@@ -1,32 +1,27 @@
-function knl_fill_flat!(dest::AbstractData, val, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            @inbounds dest[I] = val
-        end
+function knl_fill!(dest, val, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
+        @inbounds dest[I] = val
     end
     return nothing
 end
 
-function cuda_fill!(dest::AbstractData, val)
+function cuda_fill!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_fill!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_fill!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end
 
-#! format: off
-Base.fill!(dest::IJFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IJF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::DataF{<:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-#! format: on
+Base.fill!(dest::AbstractData, val, ::ToCUDA) = cuda_fill!(dest, val)
diff --git a/ext/cuda/data_layouts_fused_copyto.jl b/ext/cuda/data_layouts_fused_copyto.jl
index 0b1d1126d1..a39a0f9430 100644
--- a/ext/cuda/data_layouts_fused_copyto.jl
+++ b/ext/cuda/data_layouts_fused_copyto.jl
@@ -1,106 +1,59 @@
 Base.@propagate_inbounds function rcopyto_at!(
     pair::Pair{<:AbstractData, <:Any},
     I,
-    v,
+    us,
 )
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         dest[I] = isascalar(bc) ? bc[] : bc[I]
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v)
+Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         bcI = isascalar(bc) ? bc[] : bc[I]
         dest[] = bcI
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v)
-    rcopyto_at!(first(pairs), I, v)
-    rcopyto_at!(Base.tail(pairs), I, v)
+Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
+    rcopyto_at!(first(pairs), I, us)
+    rcopyto_at!(Base.tail(pairs), I, us)
 end
-Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, v) =
-    rcopyto_at!(first(pairs), I, v)
-@inline rcopyto_at!(pairs::Tuple{}, I, v) = nothing
-
-function knl_fused_copyto!(fmbc::FusedMultiBroadcast)
+Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
+    rcopyto_at!(first(pairs), I, us)
+@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing
+function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
     @inbounds begin
-        i = CUDA.threadIdx().x
-        j = CUDA.threadIdx().y
-
-        h = CUDA.blockIdx().x
-        v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-        (; pairs) = fmbc
-        I = CartesianIndex((i, j, 1, v, h))
-        rcopyto_at!(pairs, I, v)
+        I = universal_index(dest1)
+        if is_valid_index(dest1, I, us)
+            (; pairs) = fmbc
+            rcopyto_at!(pairs, I, us)
+        end
     end
     return nothing
 end
 
 function fused_copyto!(
     fmbc::FusedMultiBroadcast,
-    dest1::VIJFH{S, Nv, Nij, Nh},
+    dest1::DataLayouts.AbstractData,
     ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return nothing
-end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::IJFH{S, Nij},
-    ::ToCUDA,
-) where {S, Nij}
-    _, _, _, _, Nh = size(dest1)
-    if Nh > 0
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return nothing
-end
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::VF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    _, _, _, _, Nh = size(dest1)
+)
+    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest1)
     if Nv > 0 && Nh > 0
+        us = DataLayouts.UniversalSize(dest1)
+        args = (fmbc, dest1, us)
+        threads = threads_via_occupancy(knl_fused_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest1, n_max_threads)
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (1, 1),
-            blocks_s = (Nh, Nv),
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
         )
     end
     return nothing
 end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::DataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(
-        knl_fused_copyto!,
-        (fmbc,);
-        threads_s = (1, 1),
-        blocks_s = (1, 1),
-    )
-    return nothing
-end
diff --git a/ext/cuda/data_layouts_threadblock.jl b/ext/cuda/data_layouts_threadblock.jl
new file mode 100644
index 0000000000..ff37ac278f
--- /dev/null
+++ b/ext/cuda/data_layouts_threadblock.jl
@@ -0,0 +1,212 @@
+const CI5 = CartesianIndex{5}
+
+maximum_allowable_threads() = (
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z),
+)
+
+"""
+    partition(::AbstractData, n_max_threads)
+
+Given `n_max_threads`, which should be determined from CUDA's occupancy API,
+`partition` returns a NamedTuple containing:
+
+ - `threads`: the thread partition to pass to CUDA
+ - `blocks`: the block partition to pass to CUDA
+
+Each method follows two criteria, which have produced good results in practice:
+
+ - maximize the number of (allowable) threads in the thread partition;
+ - order the thread partition to follow the fastest-changing indices in the
+   data layout (e.g., `VIJ` in `VIJFH`).
+"""
+function partition end
+
+"""
+    universal_index(::AbstractData)
+
+Returns a universal Cartesian index, computed from CUDA's `threadIdx`,
+`blockIdx`, and `blockDim`.
+"""
+function universal_index end
+
+"""
+    is_valid_index(::AbstractData, I::CartesianIndex, us::UniversalSize)
+
+Checks the minimal set of index bounds needed to ensure that the result of
+`universal_index` is a valid index for the given data layout.
+"""
+function is_valid_index end
+
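+# Example (illustrative only): for a `VIJFH` layout with `Nij = 4`, `Nv = 63`,
+# `Nh = 30` and, say, an occupancy-derived `n_max_threads = 256`, `partition`
+# packs `Nv_thread = fld(256, 4 * 4) = 16` levels per block, giving
+# `threads = (16, 4, 4)` and `blocks = (cld(63, 16), 30) = (4, 30)`. In the
+# kernel, `universal_index` rebuilds `CartesianIndex((i, j, 1, v, h))` from
+# `threadIdx`/`blockIdx`, and `is_valid_index` discards threads with `v > Nv`.
+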
+""" +function is_valid_index end + +##### VIJFH +@inline function partition(data::DataLayouts.VIJFH, n_max_threads::Integer) + (Nij, _, _, Nv, Nh) = DataLayouts.universal_size(data) + Nv_thread = min(Int(fld(n_max_threads, Nij * Nij)), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert prod((Nv_thread, Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nv_thread, Nij, Nij))),$n_max_threads)" + return (; threads = (Nv_thread, Nij, Nij), blocks = (Nv_blocks, Nh)) +end +@inline function universal_index(::DataLayouts.VIJFH) + (tv, i, j) = CUDA.threadIdx() + (bv, h) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((i, j, 1, v, h)) +end +@inline is_valid_index(::DataLayouts.VIJFH, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### IJFH +@inline function partition(data::DataLayouts.IJFH, n_max_threads::Integer) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(data) + Nh_thread = min( + Int(fld(n_max_threads, Nij * Nij)), + Nh, + maximum_allowable_threads()[3], + ) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij))),$n_max_threads)" + return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function universal_index(::DataLayouts.IJFH) + (i, j, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().z + return CartesianIndex((i, j, 1, 1, h)) +end +@inline is_valid_index(::DataLayouts.IJFH, I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### IFH +@inline function partition(data::DataLayouts.IFH, n_max_threads::Integer) + (Ni, _, _, _, Nh) = DataLayouts.universal_size(data) + Nh_thread = min(Int(fld(n_max_threads, Ni)), Nh) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Ni, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nh_thread))),$n_max_threads)" + return (; threads = (Ni, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function universal_index(::DataLayouts.IFH) + (i, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().y + return CartesianIndex((i, 1, 1, 1, h)) +end +@inline is_valid_index(::DataLayouts.IFH, I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### IJF +@inline function partition(data::DataLayouts.IJF, n_max_threads::Integer) + (Nij, _, _, _, _) = DataLayouts.universal_size(data) + @assert prod((Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij))),$n_max_threads)" + return (; threads = (Nij, Nij), blocks = (1,)) +end +@inline function universal_index(::DataLayouts.IJF) + (i, j) = CUDA.threadIdx() + return CartesianIndex((i, j, 1, 1, 1)) +end +@inline is_valid_index(::DataLayouts.IJF, I::CI5, us::UniversalSize) = true + +##### IF +@inline function partition(data::DataLayouts.IF, n_max_threads::Integer) + (Ni, _, _, _, _) = DataLayouts.universal_size(data) + @assert Ni ≤ n_max_threads "threads,n_max_threads=($(Ni),$n_max_threads)" + return (; threads = (Ni,), blocks = (1,)) +end +@inline function universal_index(::DataLayouts.IF) + (i,) = CUDA.threadIdx() + return CartesianIndex((i, 1, 1, 1, 1)) +end +@inline is_valid_index(::DataLayouts.IF, I::CI5, us::UniversalSize) = true + +##### VIFH +@inline function partition(data::DataLayouts.VIFH, n_max_threads::Integer) + (Ni, _, _, Nv, Nh) = DataLayouts.universal_size(data) + Nv_thread = min(Int(fld(n_max_threads, Ni)), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert prod((Nv_thread, Ni)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nv_thread, 
Ni))),$n_max_threads)" + return (; threads = (Nv_thread, Ni), blocks = (Nv_blocks, Nh)) +end +@inline function universal_index(::DataLayouts.VIFH) + (tv, i) = CUDA.threadIdx() + (bv, h) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((i, 1, 1, v, h)) +end +@inline is_valid_index(::DataLayouts.VIFH, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### VF +@inline function partition(data::DataLayouts.VF, n_max_threads::Integer) + (_, _, _, Nv, _) = DataLayouts.universal_size(data) + Nvt = fld(n_max_threads, Nv) + Nv_thread = Nvt == 0 ? n_max_threads : min(Int(Nvt), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert Nv_thread ≤ n_max_threads "threads,n_max_threads=($(Nv_thread),$n_max_threads)" + (; threads = (Nv_thread,), blocks = (Nv_blocks,)) +end +@inline function universal_index(::DataLayouts.VF) + (tv,) = CUDA.threadIdx() + (bv,) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((1, 1, 1, v, 1)) +end +@inline is_valid_index(::DataLayouts.VF, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### DataF +@inline partition(data::DataLayouts.DataF, n_max_threads::Integer) = + (; threads = 1, blocks = 1) +@inline universal_index(::DataLayouts.DataF) = CartesianIndex((1, 1, 1, 1, 1)) +@inline is_valid_index(::DataLayouts.DataF, I::CI5, us::UniversalSize) = true + +##### +##### Custom partitions +##### + +##### Column-wise +@inline function columnwise_partition( + us::DataLayouts.UniversalSize, + n_max_threads::Integer, +) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) + Nh_thread = min(Int(fld(n_max_threads, Nij * Nij)), Nh) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Nij, Nij, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nh_thread))),$n_max_threads)" + return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function columnwise_universal_index() + (i, j, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().z + return CartesianIndex((i, j, 1, 1, h)) +end +@inline columnwise_is_valid_index(I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### Element-wise (e.g., limiters) +# TODO + +##### Multiple-field solve partition +@inline function multiple_field_solve_partition( + us::DataLayouts.UniversalSize, + n_max_threads::Integer; + Nnames, +) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) + @assert prod((Nij, Nij, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nnames))),$n_max_threads)" + return (; threads = (Nij, Nij, Nnames), blocks = (Nh,)) +end +@inline function multiple_field_solve_universal_index() + (i, j, iname) = CUDA.threadIdx() + (h,) = CUDA.blockIdx() + return (CartesianIndex((i, j, 1, 1, h)), iname) +end +@inline multiple_field_solve_is_valid_index(I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl index d4150da5bc..57a7c5c0cb 100644 --- a/ext/cuda/matrix_fields_multiple_field_solve.jl +++ b/ext/cuda/matrix_fields_multiple_field_solve.jl @@ -18,10 +18,9 @@ NVTX.@annotate function multiple_field_solve!( b, x1, ) - Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) names = MatrixFields.matrix_row_keys(keys(A)) Nnames = length(names) - nthreads, nblocks = _configure_threadblock(Ni * Nj * Nh * Nnames) + Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) sscache = Operators.strip_space(cache) ssx = Operators.strip_space(x) ssA = 
@@ -36,11 +35,17 @@ NVTX.@annotate function multiple_field_solve!(
 
         args = (device, caches, xs, As, bs, x1, Val(Nnames))
 
+        us = UniversalSize(Fields.field_values(x1))
+        nitems = Ni * Nj * Nh * Nnames
+        threads = threads_via_occupancy(multiple_field_solve_kernel!, args)
+        n_max_threads = min(threads, nitems)
+        p = multiple_field_solve_partition(us, n_max_threads; Nnames)
+
         auto_launch!(
             multiple_field_solve_kernel!,
             args;
-            threads_s = nthreads,
-            blocks_s = nblocks,
+            threads_s = p.threads,
+            blocks_s = p.blocks,
             always_inline = true,
         )
     end
@@ -83,11 +88,10 @@ function multiple_field_solve_kernel!(
     ::Val{Nnames},
 ) where {Nnames}
     @inbounds begin
-        Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
-        tidx = thread_index()
-        n = (Ni, Nj, Nh, Nnames)
-        if valid_range(tidx, prod(n))
-            (i, j, h, iname) = kernel_indexes(tidx, n).I
+        us = UniversalSize(Fields.field_values(x1))
+        (I, iname) = multiple_field_solve_universal_index()
+        if multiple_field_solve_is_valid_index(I, us)
+            (i, j, _, _, h) = I.I
             generated_single_field_solve!(
                 device,
                 caches,
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index 67e520a823..59c9e1424a 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -15,24 +15,24 @@ import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 
 function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
     Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    us = UniversalSize(Fields.field_values(A))
+    args = (device, cache, x, A, b, us)
+    threads = threads_via_occupancy(single_field_solve_kernel!, args)
     nitems = Ni * Nj * Nh
-    nthreads = min(256, nitems)
-    nblocks = cld(nitems, nthreads)
-    args = (device, cache, x, A, b)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
     auto_launch!(
         single_field_solve_kernel!,
        args;
-        threads_s = nthreads,
-        blocks_s = nblocks,
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
 end
 
-function single_field_solve_kernel!(device, cache, x, A, b)
-    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    if idx <= Ni * Nj * Nh
-        (i, j, h) = CartesianIndices((1:Ni, 1:Nj, 1:Nh))[idx].I
-
+function single_field_solve_kernel!(device, cache, x, A, b, us)
+    I = columnwise_universal_index()
+    if columnwise_is_valid_index(I, us)
+        (i, j, _, _, h) = I.I
         _single_field_solve!(
             device,
             Spaces.column(cache, i, j, h),
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
index da23bc01ea..c93b4e0797 100644
--- a/ext/cuda/operators_finite_difference.jl
+++ b/ext/cuda/operators_finite_difference.jl
@@ -18,27 +18,21 @@ function Base.copyto!(
     },
 )
     space = axes(out)
-    if space isa Spaces.ExtrudedFiniteDifferenceSpace
-        QS = Spaces.quadrature_style(space)
-        Nq = Quadratures.degrees_of_freedom(QS)
-        Nh = Topologies.nlocalelems(Spaces.topology(space))
-    else
-        Nq = 1
-        Nh = 1
-    end
-    (li, lw, rw, ri) = bounds = Operators.window_bounds(space, bc)
-    Nv = ri - li + 1
-    max_threads = 256
-    us = DataLayouts.UniversalSize(Fields.field_values(out))
-    nitems = Nv * Nq * Nq * Nh # # of independent items
-    (nthreads, nblocks) = _configure_threadblock(max_threads, nitems)
+    bounds = Operators.window_bounds(space, bc)
+    out_fv = Fields.field_values(out)
+    us = DataLayouts.UniversalSize(out_fv)
     args =
         (strip_space(out, space), strip_space(bc, space), axes(out), bounds, us)
+
+    threads = threads_via_occupancy(copyto_stencil_kernel!, args)
+    n_max_threads = min(threads, get_N(us))
+    p = partition(out_fv, n_max_threads)
+
     auto_launch!(
         copyto_stencil_kernel!,
         args;
-        threads_s = (nthreads,),
-        blocks_s = (nblocks,),
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
     return out
 end
@@ -46,13 +40,11 @@ import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
 
 function copyto_stencil_kernel!(out, bc, space, bds, us)
     @inbounds begin
-        gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-        if gid ≤ get_N(us)
+        out_fv = Fields.field_values(out)
+        I = universal_index(out_fv)
+        if is_valid_index(out_fv, I, us)
             (li, lw, rw, ri) = bds
-            Nv = get_Nv(us)
-            Nq = get_Nij(us)
-            Nh = get_Nh(us)
-            (v, i, j, h) = cart_ind((Nv, Nq, Nq, Nh), gid).I
+            (i, j, _, v, h) = I.I
             hidx = (i, j, h)
             idx = v - 1 + li
             if idx < lw
diff --git a/ext/cuda/operators_integral.jl b/ext/cuda/operators_integral.jl
index cf5e5d2ac8..ef116daa0d 100644
--- a/ext/cuda/operators_integral.jl
+++ b/ext/cuda/operators_integral.jl
@@ -19,7 +19,6 @@ function column_reduce_device!(
     space,
 ) where {F, T}
     Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-    threads_s, blocks_s = _configure_threadblock(Ni * Nj * Nh)
     args = (
         single_column_reduce!,
         f,
@@ -29,7 +28,17 @@ function column_reduce_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
+    us = UniversalSize(Fields.field_values(output))
+    nitems = Ni * Nj * Nh
+    threads = threads_via_occupancy(bycolumn_kernel!, args)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
+    auto_launch!(
+        bycolumn_kernel!,
+        args;
+        threads_s = p.threads,
+        blocks_s = p.blocks,
+    )
 end
 
 function column_accumulate_device!(
@@ -41,8 +50,7 @@ function column_accumulate_device!(
     init,
     space,
 ) where {F, T}
-    Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-    threads_s, blocks_s = _configure_threadblock(Ni * Nj * Nh)
+    us = UniversalSize(Fields.field_values(output))
     args = (
         single_column_accumulate!,
         f,
@@ -52,7 +60,17 @@ function column_accumulate_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(output))
+    nitems = Ni * Nj * Nh
+    threads = threads_via_occupancy(bycolumn_kernel!, args)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
+    auto_launch!(
+        bycolumn_kernel!,
+        args;
+        threads_s = p.threads,
+        blocks_s = p.blocks,
+    )
 end
 
 bycolumn_kernel!(
@@ -67,10 +85,10 @@ bycolumn_kernel!(
     if space isa Spaces.FiniteDifferenceSpace
         single_column_function!(f, transform, output, input, init, space)
     else
-        idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-        Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-        if idx <= Ni * Nj * Nh
-            i, j, h = cart_ind((Ni, Nj, Nh), idx).I
+        I = columnwise_universal_index()
+        us = UniversalSize(Fields.field_values(output))
+        if columnwise_is_valid_index(I, us)
+            (i, j, _, _, h) = I.I
             single_column_function!(
                 f,
                 transform,
diff --git a/ext/cuda/operators_thomas_algorithm.jl b/ext/cuda/operators_thomas_algorithm.jl
index 83546518ab..11eea95fed 100644
--- a/ext/cuda/operators_thomas_algorithm.jl
+++ b/ext/cuda/operators_thomas_algorithm.jl
@@ -5,14 +5,18 @@ import ClimaCore.Operators:
 import CUDA
 using CUDA: @cuda
 function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    nthreads, nblocks = _configure_threadblock(Ni * Nj * Nh)
+    us = UniversalSize(Fields.field_values(A))
     args = (A, b)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    threads = threads_via_occupancy(thomas_algorithm_kernel!, args)
+    nitems = Ni * Nj * Nh
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
     auto_launch!(
         thomas_algorithm_kernel!,
         args;
-        threads_s = nthreads,
-        blocks_s = nblocks,
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
 end
@@ -20,10 +24,10 @@ function thomas_algorithm_kernel!(
     A::Fields.ExtrudedFiniteDifferenceField,
     b::Fields.ExtrudedFiniteDifferenceField,
 )
-    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    if idx <= Ni * Nj * Nh
-        i, j, h = cart_ind((Ni, Nj, Nh), idx).I
+    I = columnwise_universal_index()
+    us = UniversalSize(Fields.field_values(A))
+    if columnwise_is_valid_index(I, us)
+        (i, j, _, _, h) = I.I
         thomas_algorithm!(Spaces.column(A, i, j, h), Spaces.column(b, i, j, h))
     end
     return nothing
diff --git a/test/Fields/benchmark_field_multi_broadcast_fusion.jl b/test/Fields/benchmark_field_multi_broadcast_fusion.jl
index 8d0aefeceb..e30f00f4ca 100644
--- a/test/Fields/benchmark_field_multi_broadcast_fusion.jl
+++ b/test/Fields/benchmark_field_multi_broadcast_fusion.jl
@@ -11,8 +11,9 @@ include("utils_field_multi_broadcast_fusion.jl")
     device = ClimaComms.device()
     space = TU.CenterExtrudedFiniteDifferenceSpace(
         FT;
-        zelem = 3,
-        helem = 4,
+        zelem = 63,
+        helem = 30,
+        Nq = 4,
         context = ClimaComms.context(device),
     )
     X = Fields.FieldVector(
@@ -43,8 +44,9 @@ end
     if device isa ClimaComms.CPUSingleThreaded
         space = CenterExtrudedFiniteDifferenceSpaceLineHSpace(
             FT;
-            zelem = 3,
-            helem = 4,
+            zelem = 63,
+            helem = 30,
+            Nq = 4,
             context = ClimaComms.context(device),
         )
         X = Fields.FieldVector(
@@ -101,7 +103,7 @@ end
     device = ClimaComms.device()
     colspace = TU.ColumnCenterFiniteDifferenceSpace(
         FT;
-        zelem = 3,
+        zelem = 63,
        context = ClimaComms.context(device),
     )
     VF_data() = Fields.Field(FT, colspace)
diff --git a/test/Operators/integrals.jl b/test/Operators/integrals.jl
index 6d7137e0ae..76701c378c 100644
--- a/test/Operators/integrals.jl
+++ b/test/Operators/integrals.jl
@@ -26,7 +26,7 @@ test_allocs(allocs) =
     if ClimaComms.device() isa ClimaComms.AbstractCPUDevice
         @test allocs == 0
     else
-        @test allocs ≤ 5000 # GPU always has ~2 kB of non-deterministic allocs.
+        @test allocs ≤ 5008 # GPU always has ~2 kB of non-deterministic allocs.
     end
 
 function test_column_integral_definite!(center_space)
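
The pieces above compose into a single launch pattern, sketched here for illustration only (not part of the patch; `knl!` stands for any kernel with the same shape as `knl_copyto!`, and `dest`/`bc` are an `AbstractData` destination and a broadcasted source):

    us = DataLayouts.UniversalSize(dest)
    args = (dest, bc, us)
    threads = threads_via_occupancy(knl!, args)   # occupancy-based thread budget
    n_max_threads = min(threads, get_N(us))       # never request more threads than items
    p = partition(dest, n_max_threads)            # layout-specific thread/block shape
    auto_launch!(knl!, args; threads_s = p.threads, blocks_s = p.blocks)

    # Inside the kernel, each thread recovers its universal index and bounds-checks it:
    function knl!(dest, bc, us)
        I = universal_index(dest)
        if is_valid_index(dest, I, us)
            @inbounds dest[I] = bc[I]
        end
        return nothing
    end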