diff --git a/NEWS.md b/NEWS.md index d908a79820..07852c2dde 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # Flux Release Notes +## v0.12.10 +* `Dropout`/`AlphaDropout` now supports [user-specified RNGs](https://github.com/FluxML/Flux.jl/pull/1838) + ## v0.12.9 * Fixed incorrect output and added GPU compatibility for [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/1781). * Add trilinear [Upsample layer](https://github.com/FluxML/Flux.jl/pull/1792). diff --git a/src/functor.jl b/src/functor.jl index bef3559c2f..f9c3f29f62 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -96,6 +96,14 @@ end struct FluxCUDAAdaptor end adapt_storage(to::FluxCUDAAdaptor, x) = CUDA.cu(x) adapt_storage(to::FluxCUDAAdaptor, x::Zygote.FillArrays.AbstractFill) = CUDA.cu(collect(x)) +if VERSION >= v"1.7" + adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng() +else + adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng() +end +adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x +adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) = + error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") # TODO: figure out the correct design for OneElement adapt_storage(to::FluxCUDAAdaptor, x::Zygote.OneElement) = CUDA.cu(collect(x)) @@ -109,6 +117,8 @@ adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x adapt_storage(to::FluxCPUAdaptor, x::T) where T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x +adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng() +adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x Zygote.@adjoint function Array(x::CUDA.CuArray) Array(x), d -> (CUDA.cu(d),) @@ -149,6 +159,9 @@ _isbitsarray(::AbstractArray{<:Number}) = true _isbitsarray(::AbstractArray{T}) where T = isbitstype(T) _isbitsarray(x) = false +_isleaf(::AbstractRNG) = true +_isleaf(x) = _isbitsarray(x) || Functors.isleaf(x) + """ gpu(x) @@ -174,7 +187,7 @@ CuArray{Float32, 2} """ function gpu(x) check_use_cuda() - use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isbitsarray) : x + use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : x end function check_use_cuda() diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 146b7dba56..164fc0d782 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -10,7 +10,7 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) """ - dropout(x, p; dims=:, active=true) + dropout([rng = rng_from_array(x)], x, p; dims=:, active=true) The dropout function. If `active` is `true`, for each input, either sets that input to `0` (with probability @@ -20,6 +20,9 @@ This is used as a regularisation, i.e. it reduces overfitting during training. If `active` is `false`, it just returns the input `x`. +Specify `rng` for custom RNGs instead of the default RNG. +Note that custom RNGs are only supported on the CPU. + Warning: when using this function, you have to manually manage the activation state. Usually in fact, dropout is used while training but is deactivated in the inference phase. This can be @@ -28,26 +31,31 @@ automatically managed using the [`Dropout`](@ref) layer instead of the The [`Dropout`](@ref) layer is what you should use in most scenarios. """ -function dropout(x, p; dims=:, active::Bool=true) +function dropout(rng, x, p; dims=:, active::Bool=true) active || return x - y = dropout_mask(x, p, dims=dims) + y = dropout_mask(rng, x, p, dims=dims) return x .* y end +dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) -@adjoint function dropout(x, p; dims=:, active::Bool=true) +@adjoint function dropout(rng, x, p; dims=:, active::Bool=true) active || return x, Δ -> (Δ, nothing) - y = dropout_mask(x, p, dims=dims) - return x .* y, Δ -> (Δ .* y, nothing) + y = dropout_mask(rng, x, p, dims=dims) + return x .* y, Δ -> (nothing, Δ .* y, nothing) end -function dropout_mask(x, p; dims=:) - y = rand!(similar(x, _dropout_shape(x, dims))) +dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) +dropout_mask(rng, x::CuArray, p; kwargs...) = + throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) +dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) +function _dropout_mask(rng, x, p; dims=:) + y = rand!(rng, similar(x, _dropout_shape(x, dims))) y .= _dropout_kernel.(y, p, 1 - p) return y end """ - Dropout(p; dims=:) + Dropout(p; dims=:, rng = rng_from_array()) Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input. @@ -55,22 +63,31 @@ To apply dropout along certain dimension(s), specify the `dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input (also called 2D dropout). +Specify `rng` to use a custom RNG instead of the default. +Custom RNGs are only supported on the CPU. + Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. """ -mutable struct Dropout{F,D} +mutable struct Dropout{F,D,R<:AbstractRNG} p::F dims::D active::Union{Bool, Nothing} + rng::R end +Dropout(p, dims, active) = Dropout(p, dims, active, rng_from_array()) -function Dropout(p; dims=:) +function Dropout(p; dims=:, rng = rng_from_array()) @assert 0 ≤ p ≤ 1 - Dropout(p, dims, nothing) + Dropout(p, dims, nothing, rng) end +@functor Dropout + +trainable(a::Dropout) = () + function (a::Dropout)(x) _isactive(a) || return x - return dropout(x, a.p; dims=a.dims, active=true) + return dropout(a.rng, x, a.p; dims=a.dims, active=true) end testmode!(m::Dropout, mode=true) = @@ -83,7 +100,7 @@ function Base.show(io::IO, d::Dropout) end """ - AlphaDropout(p) + AlphaDropout(p; rng = rng_from_array()) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -92,14 +109,21 @@ remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. """ -mutable struct AlphaDropout{F} +mutable struct AlphaDropout{F,R<:AbstractRNG} p::F active::Union{Bool, Nothing} - function AlphaDropout(p, active = nothing) + rng::R + function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - new{typeof(p)}(p, active) + new{typeof(p), typeof(rng)}(p, active, rng) end end +AlphaDropout(p, active) = AlphaDropout(p, active, rng_from_array()) +AlphaDropout(p; rng = rng_from_array()) = AlphaDropout(p, nothing, rng) + +@functor AlphaDropout + +trainable(a::AlphaDropout) = () function (a::AlphaDropout)(x::AbstractArray{T}) where T _isactive(a) || return x @@ -111,7 +135,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T A = T(inv(sqrt((1 - p) * (1 + p * α′^2)))) B = T(-A * α′ * p) - noise = rand!(similar(x)) + noise = rand!(a.rng, similar(x)) return A .* ifelse.(noise .> p, x, α′) .+ B end diff --git a/src/utils.jl b/src/utils.jl index c1888829d4..09eadcac61 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -33,6 +33,25 @@ nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of con ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) +""" + rng_from_array([x]) + +Create an instance of the RNG most appropriate for `x`. +The current defaults are: +- `x isa AbstractArray` + - Julia version is < 1.7: `Random.GLOBAL_RNG` + - Julia version is >= 1.7: `Random.default_rng()` +- `x isa CuArray`: `CUDA.default_rng()` +When `x` is unspecified, it is assumed to be a `AbstractArray`. +""" +rng_from_array(::AbstractArray) = rng_from_array() +rng_from_array(::CuArray) = CUDA.default_rng() +if VERSION >= v"1.7" + rng_from_array() = Random.default_rng() +else + rng_from_array() = Random.GLOBAL_RNG +end + """ glorot_uniform([rng=GLOBAL_RNG], dims...) diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 4e67997b4f..396e6c0ab5 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -280,3 +280,13 @@ end end end end + +@testset "Dropout RNGs" begin + @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), 0.1) + @testset for layer in (Dropout, AlphaDropout) + m = layer(0.1; rng = MersenneTwister(123)) + @test_throws ErrorException gpu(m) + m = layer(0.1; rng = CUDA.default_rng()) + @test gpu(m).rng isa CUDA.RNG + end +end diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index 8ed3d66eb4..ebd32b1ec0 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -1,6 +1,7 @@ using Flux, Test, CUDA using Zygote using Zygote: pullback +using Random @info "Testing GPU Support" CUDA.allowscalar(false) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 4548decd91..5e30bcb94b 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -4,85 +4,116 @@ using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] @testset "Dropout" begin - x = [1.,2.,3.] - @test x == Dropout(0.1)(x) - @test x == evalwgrad(Dropout(0), x) - @test zero(x) == evalwgrad(Dropout(1), x) - - x = rand(100) - m = Dropout(0.9) - y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a->a==0, y) == 0 - testmode!(m, false) - y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 - - x = rand(Float32, 100) - m = Chain(Dense(100,100), - Dropout(0.9)) - y = evalwgrad(m, x) - @test count(a->a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a->a == 0, y) == 0 - - x = rand(100, 50) - m = Dropout(0.5, dims = 2) - y = m(x) - c = map(i->count(a->a==0, @view y[i, :]), 1:100) - @test minimum(c) == maximum(c) - m = Dropout(0.5, dims = 1) - y = m(x) - c = map(i->count(a->a==0, @view y[:, i]), 1:50) - @test minimum(c) == maximum(c) - - # issue #1084 - m = Dropout(0.9) - x = rand(100) - - testmode!(m) - y = m(x) - @test count(a->a == 0, y) == 0 - trainmode!(m) - y = m(x) - @test count(a->a == 0, y) > 50 - - y = Flux.dropout(x, 0.9, active=true) - @test count(a->a == 0, y) > 50 - - y = Flux.dropout(x, 0.9, active=false) - @test count(a->a == 0, y) == 0 + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.,2.,3.] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = rand(100) + m = Dropout(0.9; rng_kwargs...) + y = evalwgrad(m, x) + @test count(a->a==0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a->a==0, y) == 0 + testmode!(m, false) + y = evalwgrad(m, x) + @test count(a->a==0, y) > 50 + + x = rand(Float32, 100) + m = Chain(Dense(100,100), + Dropout(0.9; rng_kwargs...)) + y = evalwgrad(m, x) + @test count(a->a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a->a == 0, y) == 0 + + x = rand(100, 50) + m = Dropout(0.5; dims = 2, rng_kwargs...) + y = m(x) + c = map(i->count(a->a==0, @view y[i, :]), 1:100) + @test minimum(c) == maximum(c) + m = Dropout(0.5; dims = 1, rng_kwargs...) + y = m(x) + c = map(i->count(a->a==0, @view y[:, i]), 1:50) + @test minimum(c) == maximum(c) + + # issue #1084 + m = Dropout(0.9; rng_kwargs...) + x = rand(100) + + testmode!(m) + y = m(x) + @test count(a->a == 0, y) == 0 + trainmode!(m) + y = m(x) + @test count(a->a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + @test count(a->a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + @test count(a->a == 0, y) == 0 + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end + else + @test cpu(m).rng === only(values(rng_kwargs)) + end + end end @testset "AlphaDropout" begin - x = [1., 2., 3.] - @test x == AlphaDropout(0.1)(x) - @test x == evalwgrad(AlphaDropout(0), x) - @test zero(x) == evalwgrad(AlphaDropout(1), x) - - x = randn(1000) # large enough to prevent flaky test - m = AlphaDropout(0.5) - - y = evalwgrad(m, x) - # Should preserve unit mean and variance - @test mean(y) ≈ 0 atol=0.1 - @test var(y) ≈ 1 atol=0.1 - - testmode!(m, true) # should override istraining - @test evalwgrad(m, x) == x - - testmode!(m, false) - y = evalwgrad(m, x) - @test mean(y) ≈ 0 atol=0.1 - @test var(y) ≈ 1 atol=0.1 - - # Known good value ranges - # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 - x = ones(100) - @test 40 < sum(evalwgrad(m, x)) < 130 + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1., 2., 3.] + @test x == AlphaDropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) + + x = randn(1000) # large enough to prevent flaky test + m = AlphaDropout(0.5; rng_kwargs...) + + y = evalwgrad(m, x) + # Should preserve unit mean and variance + @test mean(y) ≈ 0 atol=0.2 + @test var(y) ≈ 1 atol=0.2 + + testmode!(m, true) # should override istraining + @test evalwgrad(m, x) == x + + testmode!(m, false) + y = evalwgrad(m, x) + @test mean(y) ≈ 0 atol=0.2 + @test var(y) ≈ 1 atol=0.2 + + # Known good value ranges + # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 + x = ones(100) + if isempty(rng_kwargs) + @test 40 < sum(evalwgrad(m, x)) < 130 + else + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 + end + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end + else + @test cpu(m).rng === only(values(rng_kwargs)) + end + end end @testset "BatchNorm" begin