diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 78c1683f..fe6fae05 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,6 @@ steps: - label: "Triggering Pipelines (Pull Request)" - if: "build.pull_request.base_branch == 'main'" + if: build.branch != "main" && build.tag == null agents: queue: "juliagpu" plugins: diff --git a/.buildkite/testing.yml b/.buildkite/testing.yml index 82a68ba5..2146ea94 100644 --- a/.buildkite/testing.yml +++ b/.buildkite/testing.yml @@ -24,55 +24,108 @@ steps: julia: - "1" - - group: ":telescope: Downstream CUDA" + - group: ":julia: AMD GPU" steps: - - label: ":julia: {{matrix.repo}} (Julia 1 + CUDA GPU)" + - label: ":julia: Julia: {{matrix.julia}} + AMD GPU" plugins: - JuliaCI/julia#v1: - version: "1" + version: "{{matrix.julia}}" + - JuliaCI/julia-test#v1: + test_args: "--quickfail" - JuliaCI/julia-coverage#v1: codecov: true dirs: - src - ext - command: julia --code-coverage=user --color=yes --project .buildkite/scripts/downstream.jl "{{matrix.repo}}" "CUDA" + env: + RETESTITEMS_NWORKERS: 2 + BACKEND_GROUP: "AMDGPU" agents: queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.branch != "main" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/ timeout_in_minutes: 240 matrix: setup: - repo: - - "Boltz" - - "Lux" + julia: + - "1" - - group: ":julia: AMD GPU" + # - group: ":julia: Metal GPU" + # steps: + # - label: ":julia: Julia {{matrix.julia}} + Metal GPU" + # soft_fail: true + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail" + # - JuliaCI/julia-coverage#v1: + # codecov: true + # dirs: + # - src + # - ext + # agents: + # queue: "juliaecosystem" + # os: "macos" + # arch: "aarch64" + # env: + # BACKEND_GROUP: "Metal" + # if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/ + # timeout_in_minutes: 240 + # matrix: + # setup: + # julia: + # - "1" + + # - group: ":julia: oneAPI GPU" + # steps: + # - label: ":julia: Julia {{matrix.julia}} + oneAPI GPU" + # soft_fail: true + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail" + # - JuliaCI/julia-coverage#v1: + # codecov: true + # dirs: + # - src + # - ext + # agents: + # queue: "juliagpu" + # intel: "*" + # env: + # BACKEND_GROUP: "oneAPI" + # if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/ + # timeout_in_minutes: 240 + # matrix: + # setup: + # julia: + # - "1" + + - group: ":telescope: Downstream CUDA" steps: - - label: ":julia: Julia: {{matrix.julia}} + AMD GPU" + - label: ":julia: {{matrix.repo}} (Julia 1 + CUDA GPU)" plugins: - JuliaCI/julia#v1: - version: "{{matrix.julia}}" - - JuliaCI/julia-test#v1: - test_args: "--quickfail" + version: "1" - JuliaCI/julia-coverage#v1: codecov: true dirs: - src - ext - env: - RETESTITEMS_NWORKERS: 2 - BACKEND_GROUP: "AMDGPU" + command: julia --code-coverage=user --color=yes --project .buildkite/scripts/downstream.jl "{{matrix.repo}}" "CUDA" agents: queue: "juliagpu" - rocm: "*" - rocmgpu: "*" - if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/ + cuda: "*" + if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.branch != "main" timeout_in_minutes: 240 matrix: setup: - julia: - - "1" + repo: + - "Boltz" + - 
"Lux" - group: ":telescope: Downstream AMD GPU" steps: diff --git a/Project.toml b/Project.toml index 37a4d383..2e3fb8ed 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LuxLib" uuid = "82251201-b29d-42c6-8e01-566dec8acb11" authors = ["Avik Pal and contributors"] -version = "1.2.4" +version = "1.3.0" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/benchmarks/setup.jl b/benchmarks/setup.jl index 06211e9d..53e0bd11 100644 --- a/benchmarks/setup.jl +++ b/benchmarks/setup.jl @@ -236,11 +236,6 @@ end function setup_batched_matmul_benchmarks!(suite::BenchmarkGroup, cpu_or_gpu::String, backend::String, dev::MLDataDevices.AbstractDevice) - if dev isa MetalDevice || dev isa oneAPIDevice - @warn "Skipping batched_matmul benchmarks for $(dev)..." - return - end - for N in [2, 16, 128, 512], Bsize in [4, 32, 128, 512] benchmark_name = "batchedmm($N, Bsize=$Bsize)" diff --git a/src/impl/Impl.jl b/src/impl/Impl.jl index bdd79cbf..c1818c77 100644 --- a/src/impl/Impl.jl +++ b/src/impl/Impl.jl @@ -21,8 +21,8 @@ using Random: Random, AbstractRNG, rand! using Statistics: Statistics, mean, var using LuxCore: LuxCore -using MLDataDevices: get_device_type, AMDGPUDevice, CUDADevice, AbstractGPUDevice, - AbstractDevice +using MLDataDevices: get_device_type, CPUDevice, AMDGPUDevice, CUDADevice, + AbstractGPUDevice, AbstractDevice using NNlib: NNlib, ConvDims using ..LuxLib: Optional, Numeric, ∂∅, internal_operation_mode, AbstractInternalArrayOpMode, diff --git a/src/impl/batched_mul.jl b/src/impl/batched_mul.jl index c5e3fdf3..87afb452 100644 --- a/src/impl/batched_mul.jl +++ b/src/impl/batched_mul.jl @@ -8,22 +8,26 @@ function batched_matmul(::GenericBroadcastOp, x::AbstractArray{xT, 3}, return NNlib.batched_mul(x, y) end -function batched_matmul(::GPUBroadcastOp{<:AbstractGPUDevice}, +for dev in (AMDGPUDevice, CUDADevice) + @eval function batched_matmul(::GPUBroadcastOp{$(dev)}, + x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT} + return NNlib.batched_mul(x, y) # GPU versions are well optimized + end +end + +function batched_matmul(opmode::GPUBroadcastOp{<:AbstractGPUDevice}, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT} - return NNlib.batched_mul(x, y) # GPU versions are well optimized + if isconcretetype(Core.Compiler._return_type( + NNlib.batched_mul, Tuple{typeof(x), typeof(y)})) + return NNlib.batched_mul(x, y) # GPU versions are well optimized + end + return fallback_batched_matmul(opmode, x, y) end -function batched_matmul(::GPUBroadcastOp{AMDGPUDevice}, x::AbstractArray{<:Complex, 3}, +function batched_matmul( + opmode::GPUBroadcastOp{AMDGPUDevice}, x::AbstractArray{<:Complex, 3}, y::AbstractArray{<:Complex, 3}) - if (size(x, 3) != size(y, 3) && size(x, 3) != 1 && size(y, 3) != 1) || - (size(x, 2) != size(y, 1)) - throw(DimensionMismatch(lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul.")) - end - @warn "Using fallback implementation of `batched_matmul` for complex numbers on \ - AMDGPUDevice" maxlog=1 - size(x, 3) == size(y, 3) && return stack(*, batchview(x), batchview(y)) - size(x, 3) == 1 && return stack(Base.Fix1(*, batchview(x, 1)), batchview(y)) - return stack(Base.Fix2(*, batchview(y, 1)), batchview(x)) + return fallback_batched_matmul(opmode, x, y) end function batched_matmul(opmode::LoopedArrayOp, x::AbstractArray{xT, 3}, @@ -73,6 +77,39 @@ function batched_matmul_loopvec_impl!( end end +function fallback_batched_matmul( + dev, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where 
{xT, yT} + z = similar(x, promote_type(eltype(x), eltype(y)), size(x, 1), + size(y, 2), max(size(x, 3), size(y, 3))) + fallback_batched_matmul!(z, dev, x, y) + return z +end + +function fallback_batched_matmul!( + z::AbstractArray{zT, 3}, dev, x::AbstractArray{xT, 3}, + y::AbstractArray{yT, 3}) where {zT, xT, yT} + @warn "Using fallback Batched Matrix Multiply routine for $(dev) with A: size = \ + $(size(x)) eltype = $(xT) and B: size = $(size(y)) eltype = $(yT). This may be \ + slow." maxlog=1 + if (size(x, 3) != size(y, 3) && size(x, 3) != 1 && size(y, 3) != 1) || + (size(x, 2) != size(y, 1)) + throw(DimensionMismatch(lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul.")) + end + if size(x, 3) == size(y, 3) + Threads.@threads for L in indices((x, y), 3) + mul!(batchview(z, L), batchview(x, L), batchview(y, L)) + end + elseif size(x, 3) == 1 + Threads.@threads for L in indices((x, y), 3) + mul!(batchview(z, L), batchview(x, 1), batchview(y, L)) + end + else # has to be size(y, 3) == 1 + Threads.@threads for L in indices((x, y), 3) + mul!(batchview(z, L), batchview(x, L), batchview(y, 1)) + end + end +end + function CRC.rrule(::typeof(batched_matmul), x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT} ∇batched_matmul = @closure Δ_ -> begin diff --git a/src/impl/conv.jl b/src/impl/conv.jl index 4cee0adc..f5181b65 100644 --- a/src/impl/conv.jl +++ b/src/impl/conv.jl @@ -31,7 +31,7 @@ function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractDevice}, NNlib.conv!(y, x, weight, cdims) return end -function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractGPUDevice}, +function conv!(y::AbstractArray{yT, N}, ::Type{<:Union{CUDADevice, AMDGPUDevice}}, x::AbstractArray{xT, N}, weight::AbstractArray{wT, N}, cdims::ConvDims) where {yT, xT, wT, N} if xT !== wT !== yT @@ -43,11 +43,53 @@ function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractGPUDevice}, contiguous(ofeltype_array(yT, weight)), cdims) return end +function conv!(y::AbstractArray{yT, N}, dev::Type{<:AbstractGPUDevice}, + x::AbstractArray{xT, N}, weight::AbstractArray{wT, N}, + cdims::ConvDims) where {yT, xT, wT, N} + if xT !== wT !== yT + safe_warning( + "Mixed Precision Inputs received for GPU convolution [weight: $(wT)] and \ + [x: $(xT)]. Promoting to $(yT).", 1) + end + x_cont = contiguous(ofeltype_array(yT, x)) + weight_cont = contiguous(ofeltype_array(yT, weight)) + fallback_slow_conv!(y, dev, x_cont, weight_cont, cdims) + return +end -function conv(x′, weight′, cdims::ConvDims) +function fallback_slow_conv!(y::AbstractArray{yT, N}, dev::Type{<:AbstractDevice}, + x::AbstractArray{xT, N}, weight::AbstractArray{wT, N}, + cdims::ConvDims) where {yT, xT, wT, N} + @warn "Falling back to slow convolution routine for $(dev) with x: size = \ + $(size(x)) eltype = $(xT) and weight: size = $(size(weight)) \ + eltype = $(wT)." maxlog=1 + # TODO: We should be able to reuse `y` for some part here for some efficiency + @assert NNlib.groupcount(cdims)==1 "Only groups=1 is supported for now." 
# FIXME + tmp = NNlib.unfold(x, cdims) + weight_compact = reshape(weight, :, size(weight, N), 1) + res = batched_matmul(tmp, weight_compact) + copyto!(y, reshape(res, size(y))) + return +end + +conv(x, weight, cdims::ConvDims) = conv(get_device_type((x, weight)), x, weight, cdims) + +function conv( + ::Type{<:Union{CPUDevice, CUDADevice, AMDGPUDevice}}, x′, weight′, cdims::ConvDims) x, weight = get_conv_input_weight(x′, weight′) return NNlib.conv(x, weight, cdims) end +function conv(dev::Type{<:AbstractDevice}, x′, weight′, cdims::ConvDims) + x, weight = get_conv_input_weight(dev, x′, weight′) + return fallback_slow_conv(dev, x, weight, cdims) +end + +function fallback_slow_conv(dev, x, weight, cdims::ConvDims) + y = similar(x, promote_type(eltype(x), eltype(weight)), NNlib.output_size(cdims)..., + NNlib.channels_out(cdims), size(x, ndims(x))) + fallback_slow_conv!(y, dev, x, weight, cdims) + return y +end function ∇conv_data(x′, weight′, cdims::ConvDims) x, weight = get_conv_input_weight(x′, weight′) diff --git a/src/impl/dropout.jl b/src/impl/dropout.jl index 264156a3..64d28fa5 100644 --- a/src/impl/dropout.jl +++ b/src/impl/dropout.jl @@ -190,6 +190,7 @@ function generate_dropout_mask_loop!(y::AbstractArray, p, invp) end function generate_dropout_mask_simd_loop!(y::AbstractArray{T}, p, invp) where {T} + p, invp = T(p), T(invp) @simd ivdep for I in indices(y) y[I] = (y[I] > p) * invp end @@ -197,7 +198,9 @@ end @enzyme_alternative generate_dropout_mask_loop! generate_dropout_mask_simd_loop! -function generate_dropout_mask!(y::AbstractArray, ::AbstractInternalArrayOpMode, p, invp) +function generate_dropout_mask!( + y::AbstractArray{T}, ::AbstractInternalArrayOpMode, p, invp) where {T} + p, invp = T(p), T(invp) @. y = (y > p) * invp return end diff --git a/test/Project.toml b/test/Project.toml index 51b229fc..ab1b5736 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -61,3 +61,9 @@ Statistics = "1.10" Test = "1.10" Tracker = "0.2.34" Zygote = "0.6.70" + +[extras] +CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" + +[preferences.CUDA_Driver_jll] +compat = false diff --git a/test/common_ops/activation_tests.jl b/test/common_ops/activation_tests.jl index a5c3e2f8..2045f20f 100644 --- a/test/common_ops/activation_tests.jl +++ b/test/common_ops/activation_tests.jl @@ -5,11 +5,13 @@ apply_act_fast(f::F, x) where {F} = sum(abs2, fast_activation!!(f, copy(x))) apply_act_fast2(f::F, x) where {F} = sum(abs2, fast_activation(f, x)) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$f: $T" for f in [identity, relu, sigmoid, sigmoid_fast, softplus, logsigmoid, gelu, swish, lisht, tanh, tanh_fast], T in [Float16, Float32, Float64] + !fp64 && T == Float64 && continue + x = rand(rng, T, 4, 3) |> aType y1 = apply_act(f, x) diff --git a/test/common_ops/bias_act_tests.jl b/test/common_ops/bias_act_tests.jl index 2bdbc830..1429c9b2 100644 --- a/test/common_ops/bias_act_tests.jl +++ b/test/common_ops/bias_act_tests.jl @@ -11,13 +11,15 @@ end (f::__Fix1)(x, b) = f.f(f.act, x, b) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$act, $T, $sz" for act in [ identity, relu, sigmoid, sigmoid_fast, softplus, logsigmoid, gelu, swish, lisht, tanh, tanh_fast], T in [Float16, Float32, Float64], sz in [(2, 2, 3, 4), (4, 5)] + !fp64 && T == Float64 && continue + x = rand(rng, T, sz) |> aType b = rand(rng, T, sz[end - 1]) |> aType diff --git 
a/test/common_ops/conv_tests.jl b/test/common_ops/conv_tests.jl index 5c208cd4..c7426b20 100644 --- a/test/common_ops/conv_tests.jl +++ b/test/common_ops/conv_tests.jl @@ -92,8 +92,9 @@ export expand, convfilter, calc_padding, anonact, TEST_BLOCKS, run_conv_testing end @testitem "Fused Conv: Group 1" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[1] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_conv_testing(generate_fixed_array, activation, kernel, stride, padding, hasbias, groups, Tw, Tx, aType, mode, ongpu) end @@ -101,8 +102,9 @@ end end @testitem "Fused Conv: Group 2" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[2] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_conv_testing(generate_fixed_array, activation, kernel, stride, padding, hasbias, groups, Tw, Tx, aType, mode, ongpu) end @@ -110,8 +112,9 @@ end end @testitem "Fused Conv: Group 3" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[3] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_conv_testing(generate_fixed_array, activation, kernel, stride, padding, hasbias, groups, Tw, Tx, aType, mode, ongpu) end @@ -119,8 +122,9 @@ end end @testitem "Fused Conv: Group 4" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[4] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_conv_testing(generate_fixed_array, activation, kernel, stride, padding, hasbias, groups, Tw, Tx, aType, mode, ongpu) end @@ -128,8 +132,9 @@ end end @testitem "Fused Conv: Group 5" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[5] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_conv_testing(generate_fixed_array, activation, kernel, stride, padding, hasbias, groups, Tw, Tx, aType, mode, ongpu) end diff --git a/test/common_ops/dense_tests.jl b/test/common_ops/dense_tests.jl index a14906b6..e438647c 
100644 --- a/test/common_ops/dense_tests.jl +++ b/test/common_ops/dense_tests.jl @@ -79,40 +79,45 @@ export ALL_TEST_CONFIGS, TEST_BLOCKS, run_dense_testing end @testitem "Fused Dense: Group 1" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[1] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu) end end end @testitem "Fused Dense: Group 2" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[2] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu) end end end @testitem "Fused Dense: Group 3" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[3] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu) end end end @testitem "Fused Dense: Group 4" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[4] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu) end end end @testitem "Fused Dense: Group 5" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[5] + !fp64 && (Tx == Float64 || Tw == Float64) && continue run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu) end end diff --git a/test/common_ops/dropout_tests.jl b/test/common_ops/dropout_tests.jl index 2dd6f5e2..45f8fd01 100644 --- a/test/common_ops/dropout_tests.jl +++ b/test/common_ops/dropout_tests.jl @@ -1,11 +1,13 @@ @testitem "Dropout" tags=[:other_ops] setup=[SharedTestSetup] begin rng = StableRNG(12345) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$T, $x_shape, $dims" for T in (Float16, Float32, Float64), x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)), dims in (:, 1, (1, 2)) + !fp64 && T == Float64 && continue + x = randn(rng, T, x_shape) |> aType @test @inferred(dropout(rng, x, T(0.5), Val(true), T(2), dims)) isa Any @@ -46,10 +48,12 @@ end rng = StableRNG(12345) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$T: $x_shape" for T in (Float16, Float32, Float64), x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 
2, 1, 3, 1)) + !fp64 && T == Float64 && continue + x = randn(rng, T, x_shape) |> aType mask = rand(T, x_shape) |> aType @@ -133,10 +137,12 @@ end rng = StableRNG(12345) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "$T: $x_shape" for T in (Float16, Float32, Float64), x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) + !fp64 && T == Float64 && continue + x = randn(rng, T, x_shape) |> aType @test @inferred(alpha_dropout(rng, x, T(0.5), Val(true))) isa Any diff --git a/test/normalization/batchnorm_tests.jl b/test/normalization/batchnorm_tests.jl index 3d935809..3936200a 100644 --- a/test/normalization/batchnorm_tests.jl +++ b/test/normalization/batchnorm_tests.jl @@ -123,8 +123,9 @@ export setup_batchnorm, ALL_TEST_CONFIGS, TEST_BLOCKS, run_batchnorm_testing end @testitem "Batch Norm: Group 1" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[1] + !fp64 && T == Float64 && continue run_batchnorm_testing(generate_fixed_array, T, sz, training, affine, track_stats, act, aType, mode, ongpu) end @@ -132,8 +133,9 @@ end end @testitem "Batch Norm: Group 2" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[2] + !fp64 && T == Float64 && continue run_batchnorm_testing(generate_fixed_array, T, sz, training, affine, track_stats, act, aType, mode, ongpu) end @@ -141,8 +143,9 @@ end end @testitem "Batch Norm: Group 3" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[3] + !fp64 && T == Float64 && continue run_batchnorm_testing(generate_fixed_array, T, sz, training, affine, track_stats, act, aType, mode, ongpu) end @@ -150,8 +153,9 @@ end end @testitem "Batch Norm: Group 4" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[4] + !fp64 && T == Float64 && continue run_batchnorm_testing(generate_fixed_array, T, sz, training, affine, track_stats, act, aType, mode, ongpu) end @@ -159,8 +163,9 @@ end end @testitem "Batch Norm: Group 5" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[5] + !fp64 && T == Float64 && continue run_batchnorm_testing(generate_fixed_array, T, sz, training, affine, track_stats, act, aType, mode, ongpu) end @@ -168,7 +173,9 @@ end end @testitem "Batch Norm: Mixed Precision" tags=[:batch_norm] setup=[SharedTestSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset 
"$mode" for (mode, aType, ongpu, fp64) in MODES + !fp64 && aType == Float64 && continue + x = rand(Float64, 4, 4, 6, 2) |> aType scale = rand(Float32, 6) |> aType bias = rand(Float32, 6) |> aType diff --git a/test/normalization/groupnorm_tests.jl b/test/normalization/groupnorm_tests.jl index 3d5e821a..3c638885 100644 --- a/test/normalization/groupnorm_tests.jl +++ b/test/normalization/groupnorm_tests.jl @@ -93,40 +93,45 @@ export setup_groupnorm, ALL_TEST_CONFIGS, TEST_BLOCKS, run_groupnorm_testing end @testitem "Group Norm: Group 1" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[1] + !fp64 && T == Float64 && continue run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu) end end end @testitem "Group Norm: Group 2" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[2] + !fp64 && T == Float64 && continue run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu) end end end @testitem "Group Norm: Group 3" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[3] + !fp64 && T == Float64 && continue run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu) end end end @testitem "Group Norm: Group 4" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[4] + !fp64 && T == Float64 && continue run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu) end end end @testitem "Group Norm: Group 5" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[5] + !fp64 && T == Float64 && continue run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu) end end diff --git a/test/normalization/instancenorm_tests.jl b/test/normalization/instancenorm_tests.jl index a48a502d..ff166cfa 100644 --- a/test/normalization/instancenorm_tests.jl +++ b/test/normalization/instancenorm_tests.jl @@ -84,8 +84,9 @@ end @testitem "Instance Norm: Group 1" tags=[:instance_norm] setup=[ SharedTestSetup, InstanceNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[1] + !fp64 && T == Float64 && continue run_instancenorm_testing( generate_fixed_array, T, sz, training, act, aType, mode, ongpu) end @@ -94,8 +95,9 @@ end @testitem "Instance Norm: Group 2" tags=[:instance_norm] setup=[ SharedTestSetup, InstanceNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset 
"$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[2] + !fp64 && T == Float64 && continue run_instancenorm_testing( generate_fixed_array, T, sz, training, act, aType, mode, ongpu) end @@ -104,8 +106,9 @@ end @testitem "Instance Norm: Group 3" tags=[:instance_norm] setup=[ SharedTestSetup, InstanceNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[3] + !fp64 && T == Float64 && continue run_instancenorm_testing( generate_fixed_array, T, sz, training, act, aType, mode, ongpu) end @@ -114,8 +117,9 @@ end @testitem "Instance Norm: Group 4" tags=[:instance_norm] setup=[ SharedTestSetup, InstanceNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[4] + !fp64 && T == Float64 && continue run_instancenorm_testing( generate_fixed_array, T, sz, training, act, aType, mode, ongpu) end @@ -124,8 +128,9 @@ end @testitem "Instance Norm: Group 5" tags=[:instance_norm] setup=[ SharedTestSetup, InstanceNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[5] + !fp64 && T == Float64 && continue run_instancenorm_testing( generate_fixed_array, T, sz, training, act, aType, mode, ongpu) end diff --git a/test/normalization/layernorm_tests.jl b/test/normalization/layernorm_tests.jl index bdfccb47..37ca3c70 100644 --- a/test/normalization/layernorm_tests.jl +++ b/test/normalization/layernorm_tests.jl @@ -90,8 +90,9 @@ export ALL_TEST_CONFIGS, TEST_BLOCKS, run_layernorm_testing end @testitem "Layer Norm: Group 1" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[1] + !fp64 && T == Float64 && continue run_layernorm_testing( generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode) end @@ -99,8 +100,9 @@ end end @testitem "Layer Norm: Group 2" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[2] + !fp64 && T == Float64 && continue run_layernorm_testing( generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode) end @@ -108,8 +110,9 @@ end end @testitem "Layer Norm: Group 3" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[3] + !fp64 && T == Float64 && continue run_layernorm_testing( generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode) end @@ -117,8 +120,9 @@ end end @testitem "Layer Norm: Group 4" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, 
ongpu, fp64) in MODES @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[4] + !fp64 && T == Float64 && continue run_layernorm_testing( generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode) end @@ -126,8 +130,9 @@ end end @testitem "Layer Norm: Group 5" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[5] + !fp64 && T == Float64 && continue run_layernorm_testing( generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode) end @@ -135,7 +140,9 @@ end end @testitem "Layer Norm: Error Checks" tags=[:layer_norm] setup=[SharedTestSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES + !fp64 && continue + x = rand(2, 3) |> aType @test_throws ArgumentError layernorm(x, nothing, nothing, identity, nothing, 1e-5) diff --git a/test/others/bmm_tests.jl b/test/others/bmm_tests.jl index ea847568..2b89b0ef 100644 --- a/test/others/bmm_tests.jl +++ b/test/others/bmm_tests.jl @@ -46,8 +46,10 @@ end @testitem "batched_mul" tags=[:batched_ops] setup=[SharedTestSetup, BatchedMMSetup] begin rng = StableRNG(1234) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES @testset "batched_mul: Float64 × $(TB)" for TB in [Float64, Float32] + !fp64 && continue + @testset "real" begin A = randn(rng, 7, 5, 3) |> aType B = randn(rng, TB, 5, 7, 3) |> aType @@ -131,7 +133,9 @@ end SharedTestSetup, BatchedMMSetup] begin rng = StableRNG(1234) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES + !fp64 && continue + @testset "Float64 × $(TB)" for TB in [Float64, ComplexF64] @testset "trivial dimensions & unit strides" begin @testset "$tA(rand$((sA...,3))) ⊠ $tB(rand$((sB...,3)))" for tA in [ @@ -228,7 +232,9 @@ end SharedTestSetup, BatchedMMSetup] begin rng = StableRNG(1234) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES + !fp64 && continue + @testset "Float64 × $(TB)" for TB in [Float64, ComplexF64] A = randn(rng, 3, 3, 3) |> aType M = aType(rand(rng, TB, 3, 3)) .+ im @@ -259,42 +265,44 @@ end fn(A, B) = sum(batched_matmul(A, B)) fn_vec(A, B) = sum(batched_vec(A, B)) - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES M, P, Q = 13, 7, 11 B = 3 @testset "Two 3-arrays" begin - @test_gradients(fn, aType(randn(rng, M, P, B)), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, batched_adjoint(aType(randn(rng, P, M, B))), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, aType(randn(rng, M, P, B)), - batched_transpose(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P, B)), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, batched_adjoint(aType(randn(rng, Float32, P, M, B))), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P, B)), + batched_transpose(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, + rtol=1e-3) end @testset "One a matrix..." 
begin - @test_gradients(fn, aType(randn(rng, M, P)), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, adjoint(aType(randn(rng, P, M))), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, aType(randn(rng, M, P)), - batched_adjoint(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3) - - @test_gradients(fn, aType(randn(rng, M, P)), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, adjoint(aType(randn(rng, P, M))), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, aType(randn(rng, M, P)), - batched_adjoint(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P)), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, adjoint(aType(randn(rng, Float32, P, M))), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P)), + batched_adjoint(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, rtol=1e-3) + + @test_gradients(fn, aType(randn(rng, Float32, M, P)), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, adjoint(aType(randn(rng, Float32, P, M))), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P)), + batched_adjoint(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, rtol=1e-3) end @testset "... or equivalent to a matrix" begin - @test_gradients(fn, aType(randn(rng, M, P, 1)), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, batched_transpose(aType(randn(rng, P, M, 1))), - aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3) - @test_gradients(fn, aType(randn(rng, M, P, 1)), - batched_transpose(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P, 1)), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, batched_transpose(aType(randn(rng, Float32, P, M, 1))), + aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3) + @test_gradients(fn, aType(randn(rng, Float32, M, P, 1)), + batched_transpose(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, + rtol=1e-3) end end end diff --git a/test/others/forwarddiff_tests.jl b/test/others/forwarddiff_tests.jl index 23c279e8..228aa7d3 100644 --- a/test/others/forwarddiff_tests.jl +++ b/test/others/forwarddiff_tests.jl @@ -38,7 +38,7 @@ end end - @testset "$(mode): Jacobian Vector Products" for (mode, aType, ongpu) in MODES + @testset "$(mode): Jacobian Vector Products" for (mode, aType, ongpu, fp64) in MODES @testset "$(op)(; flipped = $flipped)" for flipped in (true, false), op in (depthwiseconv, conv) @@ -98,7 +98,7 @@ end rng = StableRNG(12345) - @testset "$mode: dropout" for (mode, aType, ongpu) in MODES + @testset "$mode: dropout" for (mode, aType, ongpu, fp64) in MODES x = randn(rng, Float32, 10, 2) |> aType x_dual = ForwardDiff.Dual.(x) diff --git a/test/others/misc_tests.jl b/test/others/misc_tests.jl index 6943de74..6e046eea 100644 --- a/test/others/misc_tests.jl +++ b/test/others/misc_tests.jl @@ -1,5 +1,5 @@ @testitem "internal_operation_mode: Wrapped Arrays" tags=[:others] setup=[SharedTestSetup] begin - @testset "$mode" for (mode, aType, ongpu) in MODES + @testset "$mode" for (mode, aType, ongpu, fp64) in MODES x = rand(Float32, 4, 3) |> aType retval = ongpu ? 
LuxLib.GPUBroadcastOp : LuxLib.LoopedArrayOp @test LuxLib.internal_operation_mode(x) isa retval diff --git a/test/runtests.jl b/test/runtests.jl index 799d0c2b..54223a63 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,6 +14,8 @@ const LUXLIB_BLAS_BACKEND = lowercase(get(ENV, "LUXLIB_BLAS_BACKEND", "default") (BACKEND_GROUP == "all" || BACKEND_GROUP == "cuda") && push!(EXTRA_PKGS, "LuxCUDA") (BACKEND_GROUP == "all" || BACKEND_GROUP == "amdgpu") && push!(EXTRA_PKGS, "AMDGPU") +(BACKEND_GROUP == "all" || BACKEND_GROUP == "oneapi") && push!(EXTRA_PKGS, "oneAPI") +(BACKEND_GROUP == "all" || BACKEND_GROUP == "metal") && push!(EXTRA_PKGS, "Metal") if !isempty(EXTRA_PKGS) @info "Installing Extra Packages for testing" EXTRA_PKGS=EXTRA_PKGS diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl index 4cf27cfb..487a50d5 100644 --- a/test/shared_testsetup.jl +++ b/test/shared_testsetup.jl @@ -33,6 +33,14 @@ if BACKEND_GROUP == "all" || BACKEND_GROUP == "amdgpu" using AMDGPU end +if BACKEND_GROUP == "all" || BACKEND_GROUP == "oneapi" + using oneAPI +end + +if BACKEND_GROUP == "all" || BACKEND_GROUP == "metal" + using Metal +end + cpu_testing() = BACKEND_GROUP == "all" || BACKEND_GROUP == "cpu" function cuda_testing() return (BACKEND_GROUP == "all" || BACKEND_GROUP == "cuda") && @@ -42,12 +50,22 @@ function amdgpu_testing() return (BACKEND_GROUP == "all" || BACKEND_GROUP == "amdgpu") && MLDataDevices.functional(AMDGPUDevice) end +function oneapi_testing() + return (BACKEND_GROUP == "all" || BACKEND_GROUP == "oneapi") && + MLDataDevices.functional(oneAPIDevice) +end +function metal_testing() + return (BACKEND_GROUP == "all" || BACKEND_GROUP == "metal") && + MLDataDevices.functional(MetalDevice) +end const MODES = begin modes = [] - cpu_testing() && push!(modes, ("cpu", Array, false)) - cuda_testing() && push!(modes, ("cuda", CuArray, true)) - amdgpu_testing() && push!(modes, ("amdgpu", ROCArray, true)) + cpu_testing() && push!(modes, ("cpu", Array, false, true)) + cuda_testing() && push!(modes, ("cuda", CuArray, true, true)) + amdgpu_testing() && push!(modes, ("amdgpu", ROCArray, true, true)) + oneapi_testing() && push!(modes, ("oneapi", oneArray, true, false)) + metal_testing() && push!(modes, ("metal", MtlArray, true, false)) modes end
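
Note on the new `fallback_batched_matmul!` above: for backends without a tuned `NNlib.batched_mul` kernel (the generic `GPUBroadcastOp` method only falls back when `Core.Compiler._return_type` reports that `NNlib.batched_mul` does not infer to a concrete type), the batched product is computed slice by slice with `mul!`, broadcasting a singleton batch dimension on either operand. The following is a self-contained sketch of that strategy using plain `view`s instead of LuxLib's internal `batchview` helper; it is illustrative only, not the package's implementation:

    using LinearAlgebra

    # Multiply each batch slice with mul!, broadcasting a singleton batch dimension.
    function naive_batched_matmul(x::AbstractArray{<:Number, 3}, y::AbstractArray{<:Number, 3})
        size(x, 2) == size(y, 1) ||
            throw(DimensionMismatch("inner dimensions of x and y must match"))
        (size(x, 3) == size(y, 3) || size(x, 3) == 1 || size(y, 3) == 1) ||
            throw(DimensionMismatch("batch dimensions must match or be 1"))
        B = max(size(x, 3), size(y, 3))
        z = similar(x, promote_type(eltype(x), eltype(y)), size(x, 1), size(y, 2), B)
        Threads.@threads for L in 1:B
            xL = view(x, :, :, size(x, 3) == 1 ? 1 : L)   # reuse slice 1 if x has batch size 1
            yL = view(y, :, :, size(y, 3) == 1 ? 1 : L)   # likewise for y
            mul!(view(z, :, :, L), xL, yL)
        end
        return z
    end

    x, y = randn(Float32, 4, 3, 8), randn(Float32, 3, 5, 1)  # y's singleton batch dim broadcasts
    @assert size(naive_batched_matmul(x, y)) == (4, 5, 8)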
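
Similarly, `fallback_slow_conv!` reduces convolution to a matrix product: `NNlib.unfold` lays every sliding window of `x` out as a row (im2col), so multiplying by the flattened kernel and reshaping reproduces `NNlib.conv`. A sketch of that identity under the same assumption as the `@assert` in the diff (groups = 1); `flipkernel=true` is chosen here only to keep the example unambiguous:

    using NNlib

    x = randn(Float32, 8, 8, 3, 2)                     # W × H × C_in × batch
    w = randn(Float32, 3, 3, 3, 4)                     # kW × kH × C_in × C_out
    cdims = DenseConvDims(x, w; flipkernel=true)       # groups = 1

    cols = NNlib.unfold(x, cdims)                      # (out_W*out_H, kW*kH*C_in, batch)
    wmat = reshape(w, :, size(w, 4), 1)                # (kW*kH*C_in, C_out, 1)
    res  = NNlib.batched_mul(cols, wmat)               # (out_W*out_H, C_out, batch)
    y    = reshape(res, NNlib.output_size(cdims)..., size(w, 4), size(x, 4))

    @assert y ≈ NNlib.conv(x, w, cdims)                # same identity the fallback relies on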
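
Finally, each `MODES` entry now carries a fourth `fp64` flag (true for CPU/CUDA/AMDGPU, false for oneAPI and Metal, which lack usable Float64), and every test suite guards its double-precision configurations on it. A minimal sketch of the pattern the test files follow; the `MODES` value below is a stand-in, not the real SharedTestSetup:

    using Test

    # Stand-in for SharedTestSetup's MODES: (name, array type, on GPU?, supports Float64?)
    MODES = [("cpu", Array, false, true), ("no_fp64_backend", Array, true, false)]

    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
        @testset "eltype $T" for T in (Float16, Float32, Float64)
            !fp64 && T == Float64 && continue      # skip Float64 where the backend lacks it
            x = aType(rand(T, 4, 3))
            @test eltype(x) == T
        end
    end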