
Commit 6aad315
test: more enzyme testing
avik-pal committed Jul 21, 2024
1 parent ea9788e commit 6aad315
Showing 5 changed files with 79 additions and 18 deletions.
.buildkite/testing.yml (8 changes: 4 additions & 4 deletions)
@@ -18,7 +18,7 @@ steps:
env:
BACKEND_GROUP: "CUDA"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
-timeout_in_minutes: 60
+timeout_in_minutes: 240
matrix:
setup:
julia:
@@ -40,7 +40,7 @@ steps:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.pull_request.labels includes "run downstream test"
-timeout_in_minutes: 60
+timeout_in_minutes: 240
matrix:
setup:
repo:
@@ -70,7 +70,7 @@ steps:
rocm: "*"
rocmgpu: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
-timeout_in_minutes: 60
+timeout_in_minutes: 240
matrix:
setup:
julia:
@@ -97,7 +97,7 @@
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/
-timeout_in_minutes: 60
+timeout_in_minutes: 240
matrix:
setup:
repo:
test/common_ops/conv_tests.jl (3 changes: 1 addition & 2 deletions)
@@ -79,8 +79,7 @@
∂w_enz = Enzyme.make_zero(weight)
∂x_enz = Enzyme.make_zero(x)
∂b = if hasbias
-∂b_enz = Enzyme.make_zero(bias)
-Duplicated(bias, ∂b_enz)
+Duplicated(bias, Enzyme.make_zero(bias))
else
Const(nothing)
end
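
The conv test now builds the optional-bias annotation inline. The pattern rests on Enzyme's activity annotations: `Duplicated` pairs a value with a zeroed shadow that reverse mode accumulates gradients into, and `Const(nothing)` marks an absent bias. A minimal, self-contained sketch of that pattern, using a toy loss rather than the fused conv op under test:

using Enzyme

# Stand-in loss, not the actual conv kernel from the test.
toyloss(w, x, b) = sum(abs2, w .* x .+ b)

w, x, b = rand(Float32, 4), rand(Float32, 4), rand(Float32, 4)
∂w = Enzyme.make_zero(w)            # zeroed shadow; gradients accumulate here
∂x = Enzyme.make_zero(x)
∂b = Duplicated(b, Enzyme.make_zero(b))

Enzyme.autodiff(Reverse, toyloss, Active,
    Duplicated(w, ∂w), Duplicated(x, ∂x), ∂b)

∂w, ∂x, ∂b.dval                     # ∂/∂w, ∂/∂x, ∂/∂b of toyloss
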
test/normalization/batchnorm_tests.jl (32 changes: 26 additions & 6 deletions)
@@ -50,8 +50,10 @@
rtol = fp16 ? 1.0f-2 : 1.0f-3

@test y ≈ y_simple atol=atol rtol=rtol
-@test nt.running_mean ≈ nt_simple.running_mean atol=atol rtol=rtol
-@test nt.running_var ≈ nt_simple.running_var atol=atol rtol=rtol
+if track_stats
+    @test nt.running_mean ≈ nt_simple.running_mean atol=atol rtol=rtol
+    @test nt.running_var ≈ nt_simple.running_var atol=atol rtol=rtol
+end

# Check the rrules
_f = (args...) -> sum(first(batchnorm(
@@ -63,8 +65,10 @@
∂x_simple, ∂scale_simple, ∂bias_simple = Zygote.gradient(
    sum ∘ _f2, x, scale, bias)
@test ∂x ≈ ∂x_simple atol=atol rtol=rtol
-@test ∂scale ≈ ∂scale_simple atol=atol rtol=rtol
-@test ∂bias ≈ ∂bias_simple atol=atol rtol=rtol
+if affine
+    @test ∂scale ≈ ∂scale_simple atol=atol rtol=rtol
+    @test ∂bias ≈ ∂bias_simple atol=atol rtol=rtol
+end

@test @inferred(batchnorm(
x, scale, bias, rm, rv, training, act, T(0.9), epsilon)) isa Any
@@ -87,11 +91,27 @@
end

if anonact !== act
-lfn = (x, sc, b, rm, rv, tr, act, ϵ) -> sum(batchnorm(
-    x, sc, b, rm, rv, tr, act, ϵ))
+lfn = (x, sc, b, rm, rv, tr, act, ϵ) -> sum(first(batchnorm(
+    x, sc, b, rm, rv, tr, act, ϵ)))
@test @inferred(Zygote.gradient(
lfn, x, scale, bias, rm, rv, training, act, epsilon)) isa Any
end

+if !on_gpu && !fp16 && __istraining(training) && affine
+    __f = (args...) -> sum(first(batchnorm(
+        args..., rm, rv, training, act, T(0.9), epsilon)))
+    ∂x, ∂scale, ∂bias = Zygote.gradient(__f, x, scale, bias)
+
+    ∂x_enz = Enzyme.make_zero(x)
+    ∂scale_enz = Enzyme.make_zero(scale)
+    ∂bias_enz = Enzyme.make_zero(bias)
+    Enzyme.autodiff(Reverse, __f, Active, Duplicated(x, ∂x_enz),
+        Duplicated(scale, ∂scale_enz), Duplicated(bias, ∂bias_enz))
+
+    @test ∂x ≈ ∂x_enz rtol=rtol atol=atol
+    @test ∂scale ≈ ∂scale_enz rtol=rtol atol=atol
+    @test ∂bias ≈ ∂bias_enz rtol=rtol atol=atol
+end
end

@testset "mixed precision" begin
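
The new batchnorm block cross-checks Enzyme's reverse-mode gradients against Zygote's on the same closure. A hedged sketch of that cross-check on a stand-in loss (the real `__f` closes over running statistics, momentum, and epsilon):

using Enzyme, Zygote, Test

loss(x, scale, bias) = sum(abs2, scale .* x .+ bias)  # stand-in for the batchnorm closure

x, scale, bias = rand(4, 3), rand(1, 3), rand(1, 3)
∂x_zyg, ∂scale_zyg, ∂bias_zyg = Zygote.gradient(loss, x, scale, bias)

∂x_enz, ∂scale_enz, ∂bias_enz = Enzyme.make_zero.((x, scale, bias))
Enzyme.autodiff(Reverse, loss, Active, Duplicated(x, ∂x_enz),
    Duplicated(scale, ∂scale_enz), Duplicated(bias, ∂bias_enz))

# The two ADs should agree to floating-point tolerance.
@test ∂x_zyg ≈ ∂x_enz
@test ∂scale_zyg ≈ ∂scale_enz
@test ∂bias_zyg ≈ ∂bias_enz
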
test/normalization/instancenorm_tests.jl (28 changes: 23 additions & 5 deletions)
@@ -35,26 +35,44 @@
if !affine && act === identity
_target_std = ones(
ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...)
-@test check_approx(
-    std(Array(y); dims=1:(length(sz) - 2)), _target_std; atol=0.2, rtol=0.2)
+@test std(Array(y); dims=1:(length(sz) - 2)) ≈ _target_std atol=0.2 rtol=0.2
end
@test std(y; dims=1:(length(sz) - 2)) != std(x; dims=1:(length(sz) - 2))

+fp16 = T == Float16
+atol = fp16 ? 1.0f-2 : 1.0f-3
+rtol = fp16 ? 1.0f-2 : 1.0f-3

if __istraining(training) && affine
-fp16 = T == Float16
__f = (args...) -> sum(first(instancenorm(
x, args..., training, act, epsilon)))
skip_fd = act === relu
allow_unstable() do
-@eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu skip_finite_differences=$(skip_fd)
+@eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=$atol rtol=$rtol gpu_testing=$on_gpu skip_finite_differences=$(skip_fd)
end
end

if anonact !== act
-lfn = (x, sc, b, tr, act, ϵ) -> sum(instancenorm(x, sc, b, tr, act, ϵ))
+lfn = (x, sc, b, tr, act, ϵ) -> sum(first(instancenorm(
+    x, sc, b, tr, act, ϵ)))
@test @inferred(Zygote.gradient(
lfn, x, scale, bias, training, act, epsilon)) isa Any
end

+if !on_gpu && !fp16 && __istraining(training) && affine
+    __f = (args...) -> sum(first(instancenorm(args..., training, act, epsilon)))
+    ∂x, ∂scale, ∂bias = Zygote.gradient(__f, x, scale, bias)
+
+    ∂x_enz = Enzyme.make_zero(x)
+    ∂scale_enz = Enzyme.make_zero(scale)
+    ∂bias_enz = Enzyme.make_zero(bias)
+    Enzyme.autodiff(Reverse, __f, Active, Duplicated(x, ∂x_enz),
+        Duplicated(scale, ∂scale_enz), Duplicated(bias, ∂bias_enz))
+
+    @test ∂x ≈ ∂x_enz rtol=rtol atol=atol
+    @test ∂scale ≈ ∂scale_enz rtol=rtol atol=atol
+    @test ∂bias ≈ ∂bias_enz rtol=rtol atol=atol
+end
end
end
end
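
Both `lfn` rewrites in this commit make the same fix: `batchnorm` and `instancenorm` return an `(output, stats)` tuple, so a scalar loss for AD has to reduce over `first(...)` rather than the tuple itself. A small sketch of the distinction, with `norm_like` as a hypothetical stand-in for the tuple-returning kernels:

using Zygote

# Hypothetical kernel mimicking the (output, stats) return shape.
norm_like(x) = (x .- sum(x) / length(x), (; mean=sum(x) / length(x)))

lfn = x -> sum(abs2, first(norm_like(x)))  # reduce the output array, ignore the stats
∂x, = Zygote.gradient(lfn, rand(5))
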
test/normalization/layernorm_tests.jl (26 changes: 25 additions & 1 deletion)
@@ -39,12 +39,16 @@
@test check_approx(std(y; dims), 1; atol=1e-1, rtol=1e-1)
end

+fp16 = T == Float16
+atol = fp16 ? 1.0f-2 : 1.0f-3
+rtol = fp16 ? 1.0f-2 : 1.0f-3

if affine_shape !== nothing
fp16 = T == Float16
__f = (args...) -> sum(_f(x, args...))
skip_fd = act === relu
allow_unstable() do
-@eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu skip_finite_differences=$(skip_fd)
+@eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=$atol rtol=$rtol gpu_testing=$on_gpu skip_finite_differences=$(skip_fd)
end
end

@@ -53,6 +57,26 @@
@test @inferred(Zygote.gradient(
lfn, x, scale, bias, act, dims, epsilon)) isa Any
end

+if !on_gpu && !fp16
+    __f = (args...) -> sum(first(layernorm(args..., act, dims, epsilon)))
+    ∂x, ∂scale, ∂bias = Zygote.gradient(__f, x, scale, bias)
+
+    ∂x_enz = Enzyme.make_zero(x)
+    (∂b, ∂sc) = if bias === nothing
+        Const(nothing), Const(nothing)
+    else
+        (Duplicated(bias, Enzyme.make_zero(bias)),
+            Duplicated(scale, Enzyme.make_zero(scale)))
+    end
+    Enzyme.autodiff(Reverse, __f, Active, Duplicated(x, ∂x_enz), ∂sc, ∂b)
+
+    @test ∂x ≈ ∂x_enz rtol=rtol atol=atol
+    if bias !== nothing
+        @test ∂sc.dval ≈ ∂scale rtol=rtol atol=atol
+        @test ∂b.dval ≈ ∂bias rtol=rtol atol=atol
+    end
+end
end
end
end
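
The layernorm variant differs from the other two Enzyme blocks: `scale` and `bias` may be `nothing`, so the activity annotations are built conditionally and the gradients are read back from each `Duplicated` wrapper's `.dval` shadow. A minimal sketch of that read-back pattern, with a toy `g` in place of the layernorm closure:

using Enzyme

g(x, sc) = sum(abs2, sc === nothing ? x : sc .* x)  # toy stand-in

x, sc = rand(4), rand(4)
dx  = Duplicated(x, Enzyme.make_zero(x))
dsc = sc === nothing ? Const(nothing) : Duplicated(sc, Enzyme.make_zero(sc))

Enzyme.autodiff(Reverse, g, Active, dx, dsc)

dx.dval                        # ∂g/∂x, accumulated in the shadow
sc === nothing || dsc.dval     # ∂g/∂sc exists only when sc is an array
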
