From e8b9675ca82f30953bdbb24cd645b0ee98247c8a Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Sat, 21 Sep 2024 12:11:22 -0400
Subject: [PATCH] test: check for FP64 support

---
 test/common_ops/activation_tests.jl      |  4 +-
 test/common_ops/bias_act_tests.jl        |  4 +-
 test/common_ops/conv_tests.jl            | 15 ++--
 test/common_ops/dense_tests.jl           | 15 ++--
 test/common_ops/dropout_tests.jl         | 12 ++-
 test/normalization/batchnorm_tests.jl    | 19 +++--
 test/normalization/groupnorm_tests.jl    | 15 ++--
 test/normalization/instancenorm_tests.jl | 15 ++--
 test/normalization/layernorm_tests.jl    | 19 +++--
 test/others/bmm_tests.jl                 | 94 +++++++++++++-----------
 test/others/forwarddiff_tests.jl         |  4 +-
 test/others/misc_tests.jl                |  2 +-
 test/shared_testsetup.jl                 | 10 +--
 13 files changed, 140 insertions(+), 88 deletions(-)

diff --git a/test/common_ops/activation_tests.jl b/test/common_ops/activation_tests.jl
index a5c3e2f8..2045f20f 100644
--- a/test/common_ops/activation_tests.jl
+++ b/test/common_ops/activation_tests.jl
@@ -5,11 +5,13 @@
     apply_act_fast(f::F, x) where {F} = sum(abs2, fast_activation!!(f, copy(x)))
     apply_act_fast2(f::F, x) where {F} = sum(abs2, fast_activation(f, x))
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$f: $T" for f in [identity, relu, sigmoid, sigmoid_fast, softplus,
                 logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
             T in [Float16, Float32, Float64]
 
+            !fp64 && T == Float64 && continue
+
             x = rand(rng, T, 4, 3) |> aType
 
             y1 = apply_act(f, x)
diff --git a/test/common_ops/bias_act_tests.jl b/test/common_ops/bias_act_tests.jl
index 2bdbc830..1429c9b2 100644
--- a/test/common_ops/bias_act_tests.jl
+++ b/test/common_ops/bias_act_tests.jl
@@ -11,13 +11,15 @@
     end
     (f::__Fix1)(x, b) = f.f(f.act, x, b)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$act, $T, $sz" for act in [
                 identity, relu, sigmoid, sigmoid_fast, softplus,
                 logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
             T in [Float16, Float32, Float64],
             sz in [(2, 2, 3, 4), (4, 5)]
 
+            !fp64 && T == Float64 && continue
+
             x = rand(rng, T, sz) |> aType
             b = rand(rng, T, sz[end - 1]) |> aType
 
diff --git a/test/common_ops/conv_tests.jl b/test/common_ops/conv_tests.jl
index 5c208cd4..c7426b20 100644
--- a/test/common_ops/conv_tests.jl
+++ b/test/common_ops/conv_tests.jl
@@ -92,8 +92,9 @@ export expand, convfilter, calc_padding, anonact, TEST_BLOCKS, run_conv_testing
 end
 
 @testitem "Fused Conv: Group 1" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[1]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(generate_fixed_array, activation, kernel, stride, padding,
                 hasbias, groups, Tw, Tx, aType, mode, ongpu)
         end
@@ -101,8 +102,9 @@ end
 end
 
 @testitem "Fused Conv: Group 2" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[2]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(generate_fixed_array, activation, kernel, stride, padding,
                 hasbias, groups, Tw, Tx, aType, mode, ongpu)
         end
@@ -110,8 +112,9 @@ end
 end
 
 @testitem "Fused Conv: Group 3" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[3]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(generate_fixed_array, activation, kernel, stride, padding,
                 hasbias, groups, Tw, Tx, aType, mode, ongpu)
         end
@@ -119,8 +122,9 @@ end
 end
 
 @testitem "Fused Conv: Group 4" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[4]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(generate_fixed_array, activation, kernel, stride, padding,
                 hasbias, groups, Tw, Tx, aType, mode, ongpu)
         end
@@ -128,8 +132,9 @@ end
 end
 
 @testitem "Fused Conv: Group 5" tags=[:conv] setup=[SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for ((Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)) in TEST_BLOCKS[5]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(generate_fixed_array, activation, kernel, stride, padding,
                 hasbias, groups, Tw, Tx, aType, mode, ongpu)
         end
diff --git a/test/common_ops/dense_tests.jl b/test/common_ops/dense_tests.jl
index a14906b6..e438647c 100644
--- a/test/common_ops/dense_tests.jl
+++ b/test/common_ops/dense_tests.jl
@@ -79,40 +79,45 @@ export ALL_TEST_CONFIGS, TEST_BLOCKS, run_dense_testing
 end
 
 @testitem "Fused Dense: Group 1" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[1]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Fused Dense: Group 2" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[2]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Fused Dense: Group 3" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[3]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Fused Dense: Group 4" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[4]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Fused Dense: Group 5" tags=[:dense] setup=[SharedTestSetup, DenseSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $Tw x $Tx, size $M x $N, bias $hasbias, activation $activation" for ((Tx, Tw), M, N, hasbias, activation) in TEST_BLOCKS[5]
+            !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
         end
     end
diff --git a/test/common_ops/dropout_tests.jl b/test/common_ops/dropout_tests.jl
index 2dd6f5e2..45f8fd01 100644
--- a/test/common_ops/dropout_tests.jl
+++ b/test/common_ops/dropout_tests.jl
@@ -1,11 +1,13 @@
 @testitem "Dropout" tags=[:other_ops] setup=[SharedTestSetup] begin
     rng = StableRNG(12345)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$T, $x_shape, $dims" for T in (Float16, Float32, Float64),
             x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)),
             dims in (:, 1, (1, 2))
 
+            !fp64 && T == Float64 && continue
+
             x = randn(rng, T, x_shape) |> aType
 
             @test @inferred(dropout(rng, x, T(0.5), Val(true), T(2), dims)) isa Any
@@ -46,10 +48,12 @@ end
 
     rng = StableRNG(12345)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$T: $x_shape" for T in (Float16, Float32, Float64),
             x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1))
 
+            !fp64 && T == Float64 && continue
+
             x = randn(rng, T, x_shape) |> aType
             mask = rand(T, x_shape) |> aType
 
@@ -133,10 +137,12 @@ end
 
     rng = StableRNG(12345)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$T: $x_shape" for T in (Float16, Float32, Float64),
             x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1))
 
+            !fp64 && T == Float64 && continue
+
             x = randn(rng, T, x_shape) |> aType
 
             @test @inferred(alpha_dropout(rng, x, T(0.5), Val(true))) isa Any
diff --git a/test/normalization/batchnorm_tests.jl b/test/normalization/batchnorm_tests.jl
index 3d935809..3936200a 100644
--- a/test/normalization/batchnorm_tests.jl
+++ b/test/normalization/batchnorm_tests.jl
@@ -123,8 +123,9 @@ export setup_batchnorm, ALL_TEST_CONFIGS, TEST_BLOCKS, run_batchnorm_testing
 end
 
 @testitem "Batch Norm: Group 1" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[1]
+            !fp64 && T == Float64 && continue
             run_batchnorm_testing(generate_fixed_array, T, sz, training,
                 affine, track_stats, act, aType, mode, ongpu)
         end
@@ -132,8 +133,9 @@ end
 end
 
 @testitem "Batch Norm: Group 2" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[2]
+            !fp64 && T == Float64 && continue
             run_batchnorm_testing(generate_fixed_array, T, sz, training,
                 affine, track_stats, act, aType, mode, ongpu)
         end
@@ -141,8 +143,9 @@ end
 end
 
 @testitem "Batch Norm: Group 3" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[3]
+            !fp64 && T == Float64 && continue
             run_batchnorm_testing(generate_fixed_array, T, sz, training,
                 affine, track_stats, act, aType, mode, ongpu)
         end
@@ -150,8 +153,9 @@ end
 end
 
 @testitem "Batch Norm: Group 4" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[4]
+            !fp64 && T == Float64 && continue
             run_batchnorm_testing(generate_fixed_array, T, sz, training,
                 affine, track_stats, act, aType, mode, ongpu)
         end
@@ -159,8 +163,9 @@ end
 end
 
 @testitem "Batch Norm: Group 5" tags=[:batch_norm] setup=[SharedTestSetup, BatchNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $act $affine $track_stats" for (T, sz, training, affine, track_stats, act) in TEST_BLOCKS[5]
+            !fp64 && T == Float64 && continue
             run_batchnorm_testing(generate_fixed_array, T, sz, training,
                 affine, track_stats, act, aType, mode, ongpu)
         end
@@ -168,7 +173,9 @@ end
 end
 
 @testitem "Batch Norm: Mixed Precision" tags=[:batch_norm] setup=[SharedTestSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
+        !fp64 && continue
+
         x = rand(Float64, 4, 4, 6, 2) |> aType
         scale = rand(Float32, 6) |> aType
         bias = rand(Float32, 6) |> aType
diff --git a/test/normalization/groupnorm_tests.jl b/test/normalization/groupnorm_tests.jl
index 3d5e821a..3c638885 100644
--- a/test/normalization/groupnorm_tests.jl
+++ b/test/normalization/groupnorm_tests.jl
@@ -93,40 +93,45 @@ export setup_groupnorm, ALL_TEST_CONFIGS, TEST_BLOCKS, run_groupnorm_testing
 end
 
 @testitem "Group Norm: Group 1" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[1]
+            !fp64 && T == Float64 && continue
             run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Group Norm: Group 2" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[2]
+            !fp64 && T == Float64 && continue
             run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Group Norm: Group 3" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[3]
+            !fp64 && T == Float64 && continue
             run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Group Norm: Group 4" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[4]
+            !fp64 && T == Float64 && continue
             run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu)
         end
     end
 end
 
 @testitem "Group Norm: Group 5" tags=[:group_norm] setup=[SharedTestSetup, GroupNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $groups, $affine, $act" for (T, sz, groups, affine, act) in TEST_BLOCKS[5]
+            !fp64 && T == Float64 && continue
             run_groupnorm_testing(T, sz, groups, affine, act, aType, mode, ongpu)
         end
     end
diff --git a/test/normalization/instancenorm_tests.jl b/test/normalization/instancenorm_tests.jl
index a48a502d..ff166cfa 100644
--- a/test/normalization/instancenorm_tests.jl
+++ b/test/normalization/instancenorm_tests.jl
@@ -84,8 +84,9 @@ end
 
 @testitem "Instance Norm: Group 1" tags=[:instance_norm] setup=[
     SharedTestSetup, InstanceNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[1]
+            !fp64 && T == Float64 && continue
             run_instancenorm_testing(
                 generate_fixed_array, T, sz, training, act, aType, mode, ongpu)
         end
@@ -94,8 +95,9 @@ end
 
 @testitem "Instance Norm: Group 2" tags=[:instance_norm] setup=[
     SharedTestSetup, InstanceNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[2]
+            !fp64 && T == Float64 && continue
             run_instancenorm_testing(
                 generate_fixed_array, T, sz, training, act, aType, mode, ongpu)
         end
@@ -104,8 +106,9 @@ end
 
 @testitem "Instance Norm: Group 3" tags=[:instance_norm] setup=[
     SharedTestSetup, InstanceNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[3]
+            !fp64 && T == Float64 && continue
             run_instancenorm_testing(
                 generate_fixed_array, T, sz, training, act, aType, mode, ongpu)
         end
@@ -114,8 +117,9 @@ end
 
 @testitem "Instance Norm: Group 4" tags=[:instance_norm] setup=[
     SharedTestSetup, InstanceNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[4]
+            !fp64 && T == Float64 && continue
             run_instancenorm_testing(
                 generate_fixed_array, T, sz, training, act, aType, mode, ongpu)
         end
@@ -124,8 +128,9 @@ end
 
 @testitem "Instance Norm: Group 5" tags=[:instance_norm] setup=[
     SharedTestSetup, InstanceNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $sz, $training $act" for (T, sz, training, act) in TEST_BLOCKS[5]
+            !fp64 && T == Float64 && continue
             run_instancenorm_testing(
                 generate_fixed_array, T, sz, training, act, aType, mode, ongpu)
         end
diff --git a/test/normalization/layernorm_tests.jl b/test/normalization/layernorm_tests.jl
index bdfccb47..37ca3c70 100644
--- a/test/normalization/layernorm_tests.jl
+++ b/test/normalization/layernorm_tests.jl
@@ -90,8 +90,9 @@ export ALL_TEST_CONFIGS, TEST_BLOCKS, run_layernorm_testing
 end
 
 @testitem "Layer Norm: Group 1" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[1]
+            !fp64 && T == Float64 && continue
             run_layernorm_testing(
                 generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode)
         end
@@ -99,8 +100,9 @@ end
 end
 
 @testitem "Layer Norm: Group 2" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[2]
+            !fp64 && T == Float64 && continue
             run_layernorm_testing(
                 generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode)
         end
@@ -108,8 +110,9 @@ end
 end
 
 @testitem "Layer Norm: Group 3" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[3]
+            !fp64 && T == Float64 && continue
             run_layernorm_testing(
                 generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode)
         end
@@ -117,8 +120,9 @@ end
 end
 
 @testitem "Layer Norm: Group 4" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[4]
+            !fp64 && T == Float64 && continue
             run_layernorm_testing(
                 generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode)
         end
@@ -126,8 +130,9 @@ end
 end
 
 @testitem "Layer Norm: Group 5" tags=[:layer_norm] setup=[SharedTestSetup, LayerNormSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "eltype $T, size $x_shape, $act" for (T, x_shape, affine_shape, act) in TEST_BLOCKS[5]
+            !fp64 && T == Float64 && continue
             run_layernorm_testing(
                 generate_fixed_array, aType, T, x_shape, affine_shape, act, ongpu, mode)
         end
@@ -135,7 +140,9 @@ end
 end
 
 @testitem "Layer Norm: Error Checks" tags=[:layer_norm] setup=[SharedTestSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
+        !fp64 && continue
+
         x = rand(2, 3) |> aType
 
         @test_throws ArgumentError layernorm(x, nothing, nothing, identity, nothing, 1e-5)
diff --git a/test/others/bmm_tests.jl b/test/others/bmm_tests.jl
index ea847568..e7a71ae6 100644
--- a/test/others/bmm_tests.jl
+++ b/test/others/bmm_tests.jl
@@ -46,12 +46,14 @@ end
 @testitem "batched_mul" tags=[:batched_ops] setup=[SharedTestSetup, BatchedMMSetup] begin
     rng = StableRNG(1234)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "batched_mul: Float64 × $(TB)" for TB in [Float64, Float32]
+            !fp64 && continue
+
             @testset "real" begin
-                A = randn(rng, 7, 5, 3) |> aType
+                A = randn(rng, Float32, 7, 5, 3) |> aType
                 B = randn(rng, TB, 5, 7, 3) |> aType
-                C = randn(rng, 7, 6, 3) |> aType
+                C = randn(rng, Float32, 7, 6, 3) |> aType
 
                 @test batched_matmul(A, B) ≈ bmm_test(A, B)
                 @test batched_matmul(batched_transpose(A), batched_transpose(B)) ≈
@@ -131,7 +133,9 @@
     SharedTestSetup, BatchedMMSetup] begin
     rng = StableRNG(1234)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
+        !fp64 && continue
+
         @testset "Float64 × $(TB)" for TB in [Float64, ComplexF64]
             @testset "trivial dimensions & unit strides" begin
                 @testset "$tA(rand$((sA...,3))) ⊠ $tB(rand$((sB...,3)))" for tA in [
@@ -162,9 +166,9 @@ end
     rng = StableRNG(1234)
 
     @testset "Float64 × $(TB)" for TB in [Float64, Float32]
-        A = randn(rng, 7, 5, 3)
+        A = randn(rng, Float32, 7, 5, 3)
         B = randn(rng, TB, 5, 7, 3)
-        C = randn(rng, 7, 6, 3)
+        C = randn(rng, Float32, 7, 6, 3)
 
         function interface_tests(X, _X)
             @test length(_X) == length(X)
@@ -228,9 +232,11 @@
     SharedTestSetup, BatchedMMSetup] begin
     rng = StableRNG(1234)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
+        !fp64 && continue
+
         @testset "Float64 × $(TB)" for TB in [Float64, ComplexF64]
-            A = randn(rng, 3, 3, 3) |> aType
+            A = randn(rng, Float32, 3, 3, 3) |> aType
             M = aType(rand(rng, TB, 3, 3)) .+ im
             V = aType(rand(rng, TB, 3))
 
@@ -259,42 +265,44 @@ end
     fn(A, B) = sum(batched_matmul(A, B))
     fn_vec(A, B) = sum(batched_vec(A, B))
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         M, P, Q = 13, 7, 11
         B = 3
 
         @testset "Two 3-arrays" begin
-            @test_gradients(fn, aType(randn(rng, M, P, B)),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, batched_adjoint(aType(randn(rng, P, M, B))),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, aType(randn(rng, M, P, B)),
-                batched_transpose(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P, B)),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, batched_adjoint(aType(randn(rng, Float32, P, M, B))),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P, B)),
+                batched_transpose(aType(randn(rng, Float32, Q, P, B))); atol=1e-3,
+                rtol=1e-3)
         end
 
         @testset "One a matrix..." begin
-            @test_gradients(fn, aType(randn(rng, M, P)),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, adjoint(aType(randn(rng, P, M))),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, aType(randn(rng, M, P)),
-                batched_adjoint(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3)
-
-            @test_gradients(fn, aType(randn(rng, M, P)),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, adjoint(aType(randn(rng, P, M))),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, aType(randn(rng, M, P)),
-                batched_adjoint(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P)),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, adjoint(aType(randn(rng, Float32, P, M))),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P)),
+                batched_adjoint(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, rtol=1e-3)
+
+            @test_gradients(fn, aType(randn(rng, Float32, M, P)),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, adjoint(aType(randn(rng, Float32, P, M))),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P)),
+                batched_adjoint(aType(randn(rng, Float32, Q, P, B))); atol=1e-3, rtol=1e-3)
         end
 
         @testset "... or equivalent to a matrix" begin
-            @test_gradients(fn, aType(randn(rng, M, P, 1)),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, batched_transpose(aType(randn(rng, P, M, 1))),
-                aType(randn(rng, P, Q, B)); atol=1e-3, rtol=1e-3)
-            @test_gradients(fn, aType(randn(rng, M, P, 1)),
-                batched_transpose(aType(randn(rng, Q, P, B))); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P, 1)),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, batched_transpose(aType(randn(rng, Float32, P, M, 1))),
+                aType(randn(rng, Float32, P, Q, B)); atol=1e-3, rtol=1e-3)
+            @test_gradients(fn, aType(randn(rng, Float32, M, P, 1)),
+                batched_transpose(aType(randn(rng, Float32, Q, P, B))); atol=1e-3,
+                rtol=1e-3)
         end
     end
 end
@@ -308,8 +316,8 @@ end
 
     ops = (identity, NNlib.batched_adjoint, NNlib.batched_transpose)
 
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         x = randn(rng, Float32, 3, 3, 2) |> aType
 
         @testset "$(op1) x $(op2)" for (op1, op2) in Iterators.product(ops, ops)
             x1 = op1(x)
diff --git a/test/others/forwarddiff_tests.jl b/test/others/forwarddiff_tests.jl
index 23c279e8..228aa7d3 100644
--- a/test/others/forwarddiff_tests.jl
+++ b/test/others/forwarddiff_tests.jl
@@ -38,7 +38,7 @@
         end
     end
 
-    @testset "$(mode): Jacobian Vector Products" for (mode, aType, ongpu) in MODES
+    @testset "$(mode): Jacobian Vector Products" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(op)(; flipped = $flipped)" for flipped in (true, false),
             op in (depthwiseconv, conv)
 
@@ -98,7 +98,7 @@ end
 
     rng = StableRNG(12345)
 
-    @testset "$mode: dropout" for (mode, aType, ongpu) in MODES
+    @testset "$mode: dropout" for (mode, aType, ongpu, fp64) in MODES
         x = randn(rng, Float32, 10, 2) |> aType
         x_dual = ForwardDiff.Dual.(x)
diff --git a/test/others/misc_tests.jl b/test/others/misc_tests.jl
index 6943de74..6e046eea 100644
--- a/test/others/misc_tests.jl
+++ b/test/others/misc_tests.jl
@@ -1,5 +1,5 @@
 @testitem "internal_operation_mode: Wrapped Arrays" tags=[:others] setup=[SharedTestSetup] begin
-    @testset "$mode" for (mode, aType, ongpu) in MODES
+    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         x = rand(Float32, 4, 3) |> aType
         retval = ongpu ? LuxLib.GPUBroadcastOp : LuxLib.LoopedArrayOp
         @test LuxLib.internal_operation_mode(x) isa retval
diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl
index fb7bb9c3..487a50d5 100644
--- a/test/shared_testsetup.jl
+++ b/test/shared_testsetup.jl
@@ -61,11 +61,11 @@ end
 
 const MODES = begin
     modes = []
-    cpu_testing() && push!(modes, ("cpu", Array, false))
-    cuda_testing() && push!(modes, ("cuda", CuArray, true))
-    amdgpu_testing() && push!(modes, ("amdgpu", ROCArray, true))
-    oneapi_testing() && push!(modes, ("oneapi", oneArray, true))
-    metal_testing() && push!(modes, ("metal", MtlArray, true))
+    cpu_testing() && push!(modes, ("cpu", Array, false, true))
+    cuda_testing() && push!(modes, ("cuda", CuArray, true, true))
+    amdgpu_testing() && push!(modes, ("amdgpu", ROCArray, true, true))
+    oneapi_testing() && push!(modes, ("oneapi", oneArray, true, false))
+    metal_testing() && push!(modes, ("metal", MtlArray, true, false))
 
     modes
 end
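
-- 
Each MODES entry is now a 4-tuple (mode, aType, ongpu, fp64), with fp64 set to
false for the oneAPI and Metal backends. A minimal sketch of the gating pattern
the tests above rely on; the MODES stand-in, T, and x below are illustrative
and not part of the diff:

    # Julia: stand-in for the MODES table defined in test/shared_testsetup.jl
    MODES = [("cpu", Array, false, true)]

    for (mode, aType, ongpu, fp64) in MODES
        for T in (Float16, Float32, Float64)
            !fp64 && T == Float64 && continue  # skip FP64 configs on oneAPI/Metal
            x = aType(rand(T, 4, 3))  # test data as the backend's array type
            @assert eltype(x) == T
        end
    end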