diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 2d554e56..b96cb400 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -101,7 +101,7 @@ jobs:
             # force it to use this PR's version of the package
             Pkg.develop(PackageSpec(path="."))  # resolver may fail with main deps
             Pkg.update()
-            Pkg.test(; coverage=true)  # resolver may fail with test time deps
+            Pkg.test(; coverage="user")  # resolver may fail with test time deps
           catch err
             err isa Pkg.Resolve.ResolverError || rethrow()
             # If we can't resolve that means this is incompatible by SemVer and this is fine
diff --git a/src/api/groupnorm.jl b/src/api/groupnorm.jl
index 72f5f8e6..5f713cf3 100644
--- a/src/api/groupnorm.jl
+++ b/src/api/groupnorm.jl
@@ -33,8 +33,7 @@ function groupnorm(x::AbstractArray{<:Real, N}, scale::Optional{<:AbstractVector
     sz = size(x)
     x_reshaped = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ groups, groups, sz[N])

-    x_ = _groupnorm_impl(
-        x_reshaped, scale, bias, _get_groupnorm_reduce_dims(x), Val(false), epsilon, σ)
+    x_ = _groupnorm_impl(x_reshaped, scale, bias, _get_groupnorm_reduce_dims(x), epsilon, σ)

     return reshape(x_, sz)
 end
diff --git a/src/impl/affine_normalize.jl b/src/impl/affine_normalize.jl
index 441664dc..a08fd60b 100644
--- a/src/impl/affine_normalize.jl
+++ b/src/impl/affine_normalize.jl
@@ -158,7 +158,11 @@ end
     (i, j, k, l) = @index(Global, NTuple)
     @inbounds denom = sqrt(σ²[1, 1, k, l] + ϵ)
     @inbounds denom² = denom * denom
-    @inbounds _sc = scale[1, j, k, 1] / denom
+    if scale !== nothing
+        @inbounds _sc = scale[1, j, k, 1] / denom
+    else
+        @inbounds _sc = inv(denom)
+    end
     @inbounds xμ = x[i, j, k, l] - μ[1, 1, k, l]

     @inbounds ∂x[i, j, k, l] = ∂y[i, j, k, l] * _sc
@@ -182,7 +186,7 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², scale,
         for K in axes(∂y, 3), L in axes(∂y, 4)
             denom = sqrt(σ²[1, 1, K, L] + ϵ)
             denom² = denom * denom
-            _sc = scale[1, J, K, 1] / denom
+            _sc = scale !== nothing ? (scale[1, J, K, 1] / denom) : inv(denom)
             for I in axes(∂y, 1)
                 xμ = x[I, J, K, L] - μ[1, 1, K, L]
diff --git a/src/impl/normalization.jl b/src/impl/normalization.jl
index 87cbecf7..dcfc0cdd 100644
--- a/src/impl/normalization.jl
+++ b/src/impl/normalization.jl
@@ -111,7 +111,8 @@ EnzymeRules.inactive_noinl(::typeof(_get_norm_reshape_dims), ::Any...) = nothing
 # code.
 function _groupnorm_impl(x::AbstractArray, scale::Optional{<:AbstractVector},
         bias::Optional{<:AbstractVector}, reduce_dims::Val,
-        training::Val, epsilon, act::F=identity) where {F}
-    (μ, σ²), _ = _get_batch_statistics(x, nothing, nothing, reduce_dims, training, nothing)
+        epsilon, act::F=identity) where {F}
+    (μ, σ²), _ = _get_batch_statistics(
+        x, nothing, nothing, reduce_dims, Val(false), nothing)
     return _affine_normalize_gn(act, x, μ, σ², scale, bias, epsilon)
 end
diff --git a/test/common_ops/conv_tests.jl b/test/common_ops/conv_tests.jl
index f3674d0a..3e2b7616 100644
--- a/test/common_ops/conv_tests.jl
+++ b/test/common_ops/conv_tests.jl
@@ -53,17 +53,20 @@
             @test y≈y_generic atol=atol rtol=rtol
             @test eltype(y) == promote_type(Tw, Tx)

-            @inferred fused_conv_bias_activation(activation, weight, x, bias, cdims)
+            @test @inferred(fused_conv_bias_activation(
+                activation, weight, x, bias, cdims)) isa Any
             @jet fused_conv_bias_activation(activation, weight, x, bias, cdims)

             __f = (σ, w, x, b, cdims) -> sum(
                 abs2, fused_conv_bias_activation(σ, w, x, b, cdims))

             if mode != "amdgpu" && activation !== anonact
-                @inferred Zygote.gradient(__f, activation, weight, x, bias, cdims)
+                @test @inferred(Zygote.gradient(
+                    __f, activation, weight, x, bias, cdims)) isa Any
             else
                 try
-                    @inferred Zygote.gradient(__f, activation, weight, x, bias, cdims)
+                    @test @inferred(Zygote.gradient(
+                        __f, activation, weight, x, bias, cdims)) isa Any
                     @test true
                 catch
                     @test_broken false
diff --git a/test/common_ops/dense_tests.jl b/test/common_ops/dense_tests.jl
index 8b7fcf4d..0ec78459 100644
--- a/test/common_ops/dense_tests.jl
+++ b/test/common_ops/dense_tests.jl
@@ -25,13 +25,13 @@
             @test y ≈ y_generic
             @test eltype(y) == promote_type(Tw, Tx)

-            @inferred fused_dense_bias_activation(activation, w, x, bias)
+            @test @inferred(fused_dense_bias_activation(activation, w, x, bias)) isa Any
             @jet fused_dense_bias_activation(activation, w, x, bias)

             __f = (σ, w, x, b) -> sum(abs2, fused_dense_bias_activation(σ, w, x, b))

             if activation !== anonact
-                @inferred Zygote.gradient(__f, activation, w, x, bias)
+                @test @inferred(Zygote.gradient(__f, activation, w, x, bias)) isa Any
             else
                 @test length(@inferred(Zygote.gradient(__f, activation, w, x, bias)))==4 broken=true
             end
diff --git a/test/common_ops/dropout_tests.jl b/test/common_ops/dropout_tests.jl
index 55aeaa91..ca5e9b9c 100644
--- a/test/common_ops/dropout_tests.jl
+++ b/test/common_ops/dropout_tests.jl
@@ -9,7 +9,7 @@
         x = randn(rng, T, x_shape) |> aType

-        @inferred dropout(rng, x, T(0.5), Val(true), T(2), Colon())
+        @test @inferred(dropout(rng, x, T(0.5), Val(true), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(rng, x, T(0.5), Val(true), T(2), Colon())
@@ -20,7 +20,7 @@
         @test rng != rng_

         __f = x -> sum(first(dropout(StableRNG(0), x, 0.5, Val(true), 2.0, Colon())))
-        @test size(only(@inferred(Zygote.gradient(__f, x)))) == size(x)
+        @test @inferred(Zygote.gradient(__f, x)) isa Any

         __f = let rng = rng, T = T
             x -> sum(first(dropout(rng, x, T(0.5), Val(true), T(2), Colon())))
@@ -41,7 +41,7 @@ end
         @jet sum(first(dropout(rng, x, T(0.5), Val(true), T(2), Colon())))

-        @inferred dropout(rng, x, T(0.5), Val(true), T(2), Colon())
+        @test @inferred(dropout(rng, x, T(0.5), Val(true), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(rng, x, T(0.5), Val(false), T(2), Colon())
@@ -68,7 +68,8 @@ end
         mask = rand(T, x_shape) |> aType

         # Update mask
-        @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(true), T(2), Colon())
+        @test @inferred(dropout(
+            rng, x, mask, T(0.5), Val(true), Val(true), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(
             rng, x, mask, T(0.5), Val(true), Val(true), T(2), Colon())
@@ -82,7 +83,7 @@ end
         __f = (x, mask) -> sum(first(dropout(
             StableRNG(0), x, mask, 0.5, Val(true), Val(true), 2.0, Colon())))
-        @test size(first(@inferred(Zygote.gradient(__f, x, mask)))) == size(x)
+        @test @inferred(Zygote.gradient(__f, x, mask)) isa Any

         __f = let rng = rng, mask = mask
             x -> sum(first(dropout(
@@ -109,7 +110,8 @@ end
             rng, x, mask, T(0.5), Val(true), Val(true), T(2), Colon())))

         # Try using mask if possible (possible!!)
-        @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())
+        @test @inferred(dropout(
+            rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(
             rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())
@@ -124,7 +126,7 @@ end
         __f = (x, mask) -> sum(first(dropout(
             StableRNG(0), x, mask, 0.5, Val(true), Val(false), 2.0, Colon())))
         # Branching based on runtime values
-        @test_broken size(first(@inferred(Zygote.gradient(__f, x, mask)))) == size(x)
+        @test @inferred(Zygote.gradient(__f, x, mask)) isa Any broken=true

         __f = let rng = rng, mask = mask
             x -> sum(first(dropout(
@@ -147,7 +149,8 @@ end
         mask = rand(T, (x_shape[1:(end - 1)]..., 13)) |> aType

         # Try using mask if possible (not possible!!)
-        @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())
+        @test @inferred(dropout(
+            rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(
             rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())
@@ -162,7 +165,7 @@ end
         __f = (x, mask) -> sum(first(dropout(
             StableRNG(0), x, mask, 0.5, Val(true), Val(false), 2.0, Colon())))
         # Branching based on runtime activity
-        @test_broken size(first(@inferred(Zygote.gradient(__f, x, mask)))) == size(x)
+        @test @inferred(Zygote.gradient(__f, x, mask)) isa Any broken=true

         __f = let rng = rng, mask = mask
             x -> sum(first(dropout(
@@ -187,7 +190,8 @@ end
         @jet sum(first(dropout(
             rng, x, mask, T(0.5), Val(true), Val(false), T(2), Colon())))

         # Testing Mode
-        @inferred dropout(rng, x, mask, T(0.5), Val(false), Val(false), T(2), Colon())
+        @test @inferred(dropout(
+            rng, x, mask, T(0.5), Val(false), Val(false), T(2), Colon())) isa Any

         y, mask_, rng_ = dropout(
             rng, x, mask, T(0.5), Val(false), Val(false), T(2), Colon())
@@ -212,7 +216,7 @@ end
         x = randn(rng, T, x_shape) |> aType

-        @inferred alpha_dropout(rng, x, T(0.5), Val(true))
+        @test @inferred(alpha_dropout(rng, x, T(0.5), Val(true))) isa Any

         y, rng_ = alpha_dropout(rng, x, T(0.5), Val(true))
@@ -223,7 +227,7 @@ end
         @test_broken isapprox(std(y), std(x); atol=1.0f-2, rtol=1.0f-2)

         __f = x -> sum(first(alpha_dropout(StableRNG(0), x, 0.5, Val(true))))
-        @test size(only(@inferred(Zygote.gradient(__f, x)))) == size(x)
+        @test @inferred(Zygote.gradient(__f, x)) isa Any

         __f = let rng = rng
             x -> sum(first(alpha_dropout(rng, x, T(0.5), Val(true))))
@@ -243,8 +247,7 @@ end
         end

         @jet sum(first(alpha_dropout(rng, x, T(0.5), Val(true))))
-
-        @inferred alpha_dropout(rng, x, T(0.5), Val(false))
+        @test @inferred(alpha_dropout(rng, x, T(0.5), Val(false))) isa Any

         y, rng_ = alpha_dropout(rng, x, T(0.5), Val(false))
diff --git a/test/normalization/batchnorm_tests.jl b/test/normalization/batchnorm_tests.jl
index 1c5f82f8..fb3a5d3c 100644
--- a/test/normalization/batchnorm_tests.jl
+++ b/test/normalization/batchnorm_tests.jl
@@ -30,7 +30,8 @@
             y, nt = batchnorm(x, scale, bias, rm, rv, training, act, T(0.9), epsilon)

-            @inferred batchnorm(x, scale, bias, rm, rv, training, act, T(0.9), epsilon)
+            @test @inferred(batchnorm(
+                x, scale, bias, rm, rv, training, act, T(0.9), epsilon)) isa Any
             @jet batchnorm(x, scale, bias, rm, rv, training, act, T(0.9), epsilon)

             @test y isa aType{T, length(sz)}
diff --git a/test/normalization/groupnorm_tests.jl b/test/normalization/groupnorm_tests.jl
index 2fc3393e..3d3d76f9 100644
--- a/test/normalization/groupnorm_tests.jl
+++ b/test/normalization/groupnorm_tests.jl
@@ -8,31 +8,78 @@
         return x, scale, bias
     end

+    # Bypassing all optimizations
+    function __groupnorm_basic(
+            x::AbstractArray{<:Real, N}, scale::LuxLib.Optional{<:AbstractVector},
+            bias::LuxLib.Optional{<:AbstractVector}, groups::Int,
+            σ::F=identity, epsilon::Real=1.0f-5) where {F, N}
+        sz = size(x)
+        x_reshaped = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ groups, groups, sz[N])
+        x_ = LuxLib._normalization(x_reshaped, nothing, nothing, scale, bias,
+            LuxLib._get_groupnorm_reduce_dims(x), Val(false), nothing, epsilon, σ)[1]
+        return reshape(x_, sz)
+    end
+
     @testset "$mode" for (mode, aType, on_gpu) in MODES
         @testset "eltype $T, size $sz, ngroups $groups, $act" for T in (
                 Float16, Float32, Float64),
-            sz in ((4, 6, 2), (8, 8, 8, 6, 2), (3, 16, 16, 12, 2),
+            sz in ((6, 2), (4, 6, 2), (8, 8, 8, 6, 2), (3, 16, 16, 12, 2),
                 (4, 4, 6, 2), (2, 2, 6, 2), (3, 3, 12, 4)),
             groups in (2, 3),
             act in (identity, relu, tanh_fast, sigmoid_fast, x -> x^3)

             _f = (args...) -> groupnorm(args..., groups, act, epsilon)
+            _f2 = (args...) -> groupnorm(args..., groups, act, epsilon)

             epsilon = T(1e-5)
             x, scale, bias = _setup_groupnorm(aType, T, sz)
             y = _f(x, scale, bias)

-            @inferred groupnorm(x, scale, bias, groups, act, epsilon)
+            y_simple = _f2(x, scale, bias)
+
+            fp16 = T == Float16
+            atol = fp16 ? 1.0f-2 : 1.0f-3
+            rtol = fp16 ? 1.0f-2 : 1.0f-3
+
+            @test y≈y_simple atol=atol rtol=rtol
+
+            # Check the rrules
+            ∂x, ∂scale, ∂bias = Zygote.gradient(sum ∘ _f, x, scale, bias)
+            ∂x_simple, ∂scale_simple, ∂bias_simple = Zygote.gradient(
+                sum ∘ _f2, x, scale, bias)
+            @test ∂x≈∂x_simple atol=atol rtol=rtol
+            @test ∂scale≈∂scale_simple atol=atol rtol=rtol
+            @test ∂bias≈∂bias_simple atol=atol rtol=rtol
+
+            @test @inferred(groupnorm(x, scale, bias, groups, act, epsilon)) isa Any
             @jet groupnorm(x, scale, bias, groups, act, epsilon)

+            lfn = (x, sc, b, g, act, ϵ) -> sum(groupnorm(x, sc, b, g, act, ϵ))
+            @test @inferred(Zygote.gradient(lfn, x, scale, bias, groups, act, epsilon)) isa
+                  Any
+
             @test y isa aType{T, length(sz)}
             @test size(y) == sz

-            fp16 = T == Float16
             __f = (args...) -> sum(groupnorm(x, args..., groups, act, epsilon))
             skip_fd = act === relu
             allow_unstable() do
-                @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 skip_finite_differences=$(skip_fd)
+                @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=$atol rtol=$rtol soft_fail=$fp16 skip_finite_differences=$(skip_fd)
+            end
+
+            __f = (x, scale, bias) -> sum(groupnorm(x, scale, bias, groups, act, epsilon))
+            if !on_gpu
+                ∂x, ∂scale, ∂bias = Zygote.gradient(__f, x, scale, bias)
+
+                ∂x_enz = Enzyme.make_zero(x)
+                ∂scale_enz = Enzyme.make_zero(scale)
+                ∂bias_enz = Enzyme.make_zero(bias)
+                Enzyme.autodiff(Reverse, __f, Duplicated(x, ∂x_enz),
+                    Duplicated(scale, ∂scale_enz), Duplicated(bias, ∂bias_enz))
+
+                @test ∂x≈∂x_enz rtol=rtol atol=atol
+                @test ∂scale≈∂scale_enz rtol=rtol atol=atol
+                @test ∂bias≈∂bias_enz rtol=rtol atol=atol
+            end
         end
     end
diff --git a/test/normalization/instancenorm_tests.jl b/test/normalization/instancenorm_tests.jl
index b135c4ed..e989343e 100644
--- a/test/normalization/instancenorm_tests.jl
+++ b/test/normalization/instancenorm_tests.jl
@@ -24,7 +24,7 @@
             y, nt = instancenorm(x, scale, bias, training, act, epsilon)

-            @inferred instancenorm(x, scale, bias, training, act, epsilon)
+            @test @inferred(instancenorm(x, scale, bias, training, act, epsilon)) isa Any
             @jet instancenorm(x, scale, bias, training, act, epsilon)

             @test y isa aType{T, length(sz)}
diff --git a/test/normalization/layernorm_tests.jl b/test/normalization/layernorm_tests.jl
index 7be16eaf..384470ff 100644
--- a/test/normalization/layernorm_tests.jl
+++ b/test/normalization/layernorm_tests.jl
@@ -24,7 +24,7 @@
             x, scale, bias = _setup_layernorm(aType, T, x_shape, affine_shape)

-            @inferred layernorm(x, scale, bias, act, dims, epsilon)
+            @test @inferred(layernorm(x, scale, bias, act, dims, epsilon)) isa Any
             @jet layernorm(x, scale, bias, act, dims, epsilon)

             y = _f(x, scale, bias)
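
A minimal sketch (not part of the diff above) of the inference-check idiom the test files switch to: @inferred alone throws when return-type inference fails but records nothing on success, while @test @inferred(...) isa Any still throws on failure and additionally counts as a passing test. The helper _square_sum below is hypothetical and used only for illustration.

    using Test

    # Hypothetical helper, used only to illustrate the idiom.
    _square_sum(x) = sum(abs2, x)

    @testset "inference idiom sketch" begin
        x = randn(Float32, 4)

        # Old pattern: throws if _square_sum(x) is not inferable, but a
        # successful call is not recorded in the test summary.
        @inferred _square_sum(x)

        # New pattern used throughout the diff: still throws on an inference
        # failure, and otherwise registers as a passing @test.
        @test @inferred(_square_sum(x)) isa Any
    end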