diff --git a/NEWS.md b/NEWS.md
index d908a79820..07852c2dde 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,8 @@
 # Flux Release Notes
 
+## v0.12.10
+* `Dropout`/`AlphaDropout` now supports [user-specified RNGs](https://github.com/FluxML/Flux.jl/pull/1838)
+
 ## v0.12.9
 * Fixed incorrect output and added GPU compatibility for [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/1781).
 * Add trilinear [Upsample layer](https://github.com/FluxML/Flux.jl/pull/1792).
diff --git a/src/functor.jl b/src/functor.jl
index bef3559c2f..f9c3f29f62 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -96,6 +96,14 @@ end
 struct FluxCUDAAdaptor end
 adapt_storage(to::FluxCUDAAdaptor, x) = CUDA.cu(x)
 adapt_storage(to::FluxCUDAAdaptor, x::Zygote.FillArrays.AbstractFill) = CUDA.cu(collect(x))
+if VERSION >= v"1.7"
+  adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng()
+else
+  adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng()
+end
+adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x
+adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) =
+  error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().")
 
 # TODO: figure out the correct design for OneElement
 adapt_storage(to::FluxCUDAAdaptor, x::Zygote.OneElement) = CUDA.cu(collect(x))
@@ -109,6 +117,8 @@ adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x
 adapt_storage(to::FluxCPUAdaptor, x::T) where T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix = adapt(Array, x)
 adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x
 adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x
+adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng()
+adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x
 
 Zygote.@adjoint function Array(x::CUDA.CuArray)
   Array(x), d -> (CUDA.cu(d),)
@@ -149,6 +159,9 @@ _isbitsarray(::AbstractArray{<:Number}) = true
 _isbitsarray(::AbstractArray{T}) where T = isbitstype(T)
 _isbitsarray(x) = false
 
+_isleaf(::AbstractRNG) = true
+_isleaf(x) = _isbitsarray(x) || Functors.isleaf(x)
+
 """
     gpu(x)
 
@@ -174,7 +187,7 @@ CuArray{Float32, 2}
 """
 function gpu(x)
   check_use_cuda()
-  use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isbitsarray) : x
+  use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : x
 end
 
 function check_use_cuda()
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 146b7dba56..164fc0d782 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -10,7 +10,7 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)
 
 """
-    dropout(x, p; dims=:, active=true)
+    dropout([rng = rng_from_array(x)], x, p; dims=:, active=true)
 
 The dropout function. If `active` is `true`,
 for each input, either sets that input to `0` (with probability
@@ -20,6 +20,9 @@ This is used as a regularisation, i.e. it reduces overfitting during training.
 
 If `active` is `false`, it just returns the input `x`.
 
+Specify `rng` for custom RNGs instead of the default RNG.
+Note that custom RNGs are only supported on the CPU.
+
 Warning: when using this function, you have to manually manage the activation
 state. Usually in fact, dropout is used while training
 but is deactivated in the inference phase. This can be
@@ -28,26 +31,31 @@ automatically managed using the [`Dropout`](@ref) layer instead of the
 
 The [`Dropout`](@ref) layer is what you should use in most scenarios.
 """
-function dropout(x, p; dims=:, active::Bool=true)
+function dropout(rng, x, p; dims=:, active::Bool=true)
   active || return x
-  y = dropout_mask(x, p, dims=dims)
+  y = dropout_mask(rng, x, p, dims=dims)
   return x .* y
 end
+dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
 
-@adjoint function dropout(x, p; dims=:, active::Bool=true)
+@adjoint function dropout(rng, x, p; dims=:, active::Bool=true)
   active || return x, Δ -> (Δ, nothing)
-  y = dropout_mask(x, p, dims=dims)
-  return x .* y, Δ -> (Δ .* y, nothing)
+  y = dropout_mask(rng, x, p, dims=dims)
+  return x .* y, Δ -> (nothing, Δ .* y, nothing)
 end
 
-function dropout_mask(x, p; dims=:)
-  y = rand!(similar(x, _dropout_shape(x, dims)))
+dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
+dropout_mask(rng, x::CuArray, p; kwargs...) =
+  throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays."))
+dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
+function _dropout_mask(rng, x, p; dims=:)
+  y = rand!(rng, similar(x, _dropout_shape(x, dims)))
   y .= _dropout_kernel.(y, p, 1 - p)
   return y
 end
 
 """
-    Dropout(p; dims=:)
+    Dropout(p; dims=:, rng = rng_from_array())
 
 Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.
 
@@ -55,22 +63,31 @@ To apply dropout along certain dimension(s), specify the `dims` keyword.
 e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input
 (also called 2D dropout).
 
+Specify `rng` to use a custom RNG instead of the default.
+Custom RNGs are only supported on the CPU.
+
 Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
 """
-mutable struct Dropout{F,D}
+mutable struct Dropout{F,D,R<:AbstractRNG}
   p::F
   dims::D
   active::Union{Bool, Nothing}
+  rng::R
 end
+Dropout(p, dims, active) = Dropout(p, dims, active, rng_from_array())
 
-function Dropout(p; dims=:)
+function Dropout(p; dims=:, rng = rng_from_array())
   @assert 0 ≤ p ≤ 1
-  Dropout(p, dims, nothing)
+  Dropout(p, dims, nothing, rng)
 end
 
+@functor Dropout
+
+trainable(a::Dropout) = ()
+
 function (a::Dropout)(x)
   _isactive(a) || return x
-  return dropout(x, a.p; dims=a.dims, active=true)
+  return dropout(a.rng, x, a.p; dims=a.dims, active=true)
 end
 
 testmode!(m::Dropout, mode=true) =
@@ -83,7 +100,7 @@ function Base.show(io::IO, d::Dropout)
 end
 
 """
-    AlphaDropout(p)
+    AlphaDropout(p; rng = rng_from_array())
 
 A dropout layer. Used in
 [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -92,14 +109,21 @@ remain the same as before.
 
 Does nothing to the input once [`testmode!`](@ref) is true.
 """
-mutable struct AlphaDropout{F}
+mutable struct AlphaDropout{F,R<:AbstractRNG}
   p::F
   active::Union{Bool, Nothing}
-  function AlphaDropout(p, active = nothing)
+  rng::R
+  function AlphaDropout(p, active, rng)
     @assert 0 ≤ p ≤ 1
-    new{typeof(p)}(p, active)
+    new{typeof(p), typeof(rng)}(p, active, rng)
   end
 end
+AlphaDropout(p, active) = AlphaDropout(p, active, rng_from_array())
+AlphaDropout(p; rng = rng_from_array()) = AlphaDropout(p, nothing, rng)
+
+@functor AlphaDropout
+
+trainable(a::AlphaDropout) = ()
 
 function (a::AlphaDropout)(x::AbstractArray{T}) where T
   _isactive(a) || return x
@@ -111,7 +135,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T
   A = T(inv(sqrt((1 - p) * (1 + p * α′^2))))
   B = T(-A * α′ * p)
 
-  noise = rand!(similar(x))
+  noise = rand!(a.rng, similar(x))
   return A .* ifelse.(noise .> p, x, α′) .+ B
 end
 
diff --git a/src/utils.jl b/src/utils.jl
index c1888829d4..09eadcac61 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -33,6 +33,25 @@ nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of con
 ofeltype(x, y) = convert(float(eltype(x)), y)
 epseltype(x) = eps(float(eltype(x)))
 
+"""
+    rng_from_array([x])
+
+Create an instance of the RNG most appropriate for `x`.
+The current defaults are:
+- `x isa AbstractArray`
+  - Julia version is < 1.7: `Random.GLOBAL_RNG`
+  - Julia version is >= 1.7: `Random.default_rng()`
+- `x isa CuArray`: `CUDA.default_rng()`
+When `x` is unspecified, it is assumed to be a `AbstractArray`.
+"""
+rng_from_array(::AbstractArray) = rng_from_array()
+rng_from_array(::CuArray) = CUDA.default_rng()
+if VERSION >= v"1.7"
+  rng_from_array() = Random.default_rng()
+else
+  rng_from_array() = Random.GLOBAL_RNG
+end
+
 """
     glorot_uniform([rng=GLOBAL_RNG], dims...)
 
diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index 4e67997b4f..396e6c0ab5 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -280,3 +280,13 @@ end
     end
   end
 end
+
+@testset "Dropout RNGs" begin
+  @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), 0.1)
+  @testset for layer in (Dropout, AlphaDropout)
+    m = layer(0.1; rng = MersenneTwister(123))
+    @test_throws ErrorException gpu(m)
+    m = layer(0.1; rng = CUDA.default_rng())
+    @test gpu(m).rng isa CUDA.RNG
+  end
+end
diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl
index 8ed3d66eb4..ebd32b1ec0 100644
--- a/test/cuda/runtests.jl
+++ b/test/cuda/runtests.jl
@@ -1,6 +1,7 @@
 using Flux, Test, CUDA
 using Zygote
 using Zygote: pullback
+using Random
 
 @info "Testing GPU Support"
 CUDA.allowscalar(false)
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 4548decd91..5e30bcb94b 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -4,85 +4,116 @@ using Zygote: pullback
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
 @testset "Dropout" begin
-  x = [1.,2.,3.]
-  @test x == Dropout(0.1)(x)
-  @test x == evalwgrad(Dropout(0), x)
-  @test zero(x) == evalwgrad(Dropout(1), x)
-
-  x = rand(100)
-  m = Dropout(0.9)
-  y = evalwgrad(m, x)
-  @test count(a->a==0, y) > 50
-  testmode!(m, true)
-  y = evalwgrad(m, x) # should override istraining
-  @test count(a->a==0, y) == 0
-  testmode!(m, false)
-  y = evalwgrad(m, x)
-  @test count(a->a==0, y) > 50
-
-  x = rand(Float32, 100)
-  m = Chain(Dense(100,100),
-            Dropout(0.9))
-  y = evalwgrad(m, x)
-  @test count(a->a == 0, y) > 50
-  testmode!(m, true)
-  y = evalwgrad(m, x) # should override istraining
-  @test count(a->a == 0, y) == 0
-
-  x = rand(100, 50)
-  m = Dropout(0.5, dims = 2)
-  y = m(x)
-  c = map(i->count(a->a==0, @view y[i, :]), 1:100)
-  @test minimum(c) == maximum(c)
-  m = Dropout(0.5, dims = 1)
-  y = m(x)
-  c = map(i->count(a->a==0, @view y[:, i]), 1:50)
-  @test minimum(c) == maximum(c)
-
-  # issue #1084
-  m = Dropout(0.9)
-  x = rand(100)
-
-  testmode!(m)
-  y = m(x)
-  @test count(a->a == 0, y) == 0
-  trainmode!(m)
-  y = m(x)
-  @test count(a->a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active=true)
-  @test count(a->a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active=false)
-  @test count(a->a == 0, y) == 0
+  @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
+    x = [1.,2.,3.]
+    @test x == Dropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(Dropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x)
+
+    x = rand(100)
+    m = Dropout(0.9; rng_kwargs...)
+    y = evalwgrad(m, x)
+    @test count(a->a==0, y) > 50
+    testmode!(m, true)
+    y = evalwgrad(m, x) # should override istraining
+    @test count(a->a==0, y) == 0
+    testmode!(m, false)
+    y = evalwgrad(m, x)
+    @test count(a->a==0, y) > 50
+
+    x = rand(Float32, 100)
+    m = Chain(Dense(100,100),
+              Dropout(0.9; rng_kwargs...))
+    y = evalwgrad(m, x)
+    @test count(a->a == 0, y) > 50
+    testmode!(m, true)
+    y = evalwgrad(m, x) # should override istraining
+    @test count(a->a == 0, y) == 0
+
+    x = rand(100, 50)
+    m = Dropout(0.5; dims = 2, rng_kwargs...)
+    y = m(x)
+    c = map(i->count(a->a==0, @view y[i, :]), 1:100)
+    @test minimum(c) == maximum(c)
+    m = Dropout(0.5; dims = 1, rng_kwargs...)
+    y = m(x)
+    c = map(i->count(a->a==0, @view y[:, i]), 1:50)
+    @test minimum(c) == maximum(c)
+
+    # issue #1084
+    m = Dropout(0.9; rng_kwargs...)
+    x = rand(100)
+
+    testmode!(m)
+    y = m(x)
+    @test count(a->a == 0, y) == 0
+    trainmode!(m)
+    y = m(x)
+    @test count(a->a == 0, y) > 50
+
+    y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true)
+    @test count(a->a == 0, y) > 50
+
+    y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false)
+    @test count(a->a == 0, y) == 0
+
+    # CPU RNGs map onto CPU ok
+    if isempty(rng_kwargs)
+      if VERSION >= v"1.7"
+        @test cpu(m).rng isa Random.TaskLocalRNG
+      else
+        @test cpu(m).rng isa Random._GLOBAL_RNG
+      end
+    else
+      @test cpu(m).rng === only(values(rng_kwargs))
+    end
+  end
 end
 
 @testset "AlphaDropout" begin
-  x = [1., 2., 3.]
-  @test x == AlphaDropout(0.1)(x)
-  @test x == evalwgrad(AlphaDropout(0), x)
-  @test zero(x) == evalwgrad(AlphaDropout(1), x)
-
-  x = randn(1000) # large enough to prevent flaky test
-  m = AlphaDropout(0.5)
-
-  y = evalwgrad(m, x)
-  # Should preserve unit mean and variance
-  @test mean(y) ≈ 0 atol=0.1
-  @test var(y) ≈ 1 atol=0.1
-
-  testmode!(m, true) # should override istraining
-  @test evalwgrad(m, x) == x
-
-  testmode!(m, false)
-  y = evalwgrad(m, x)
-  @test mean(y) ≈ 0 atol=0.1
-  @test var(y) ≈ 1 atol=0.1
-  
-  # Known good value ranges
-  # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
-  x = ones(100)
-  @test 40 < sum(evalwgrad(m, x)) < 130
+  @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
+    x = [1., 2., 3.]
+    @test x == AlphaDropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x)
+
+    x = randn(1000) # large enough to prevent flaky test
+    m = AlphaDropout(0.5; rng_kwargs...)
+
+    y = evalwgrad(m, x)
+    # Should preserve unit mean and variance
+    @test mean(y) ≈ 0 atol=0.2
+    @test var(y) ≈ 1 atol=0.2
+
+    testmode!(m, true) # should override istraining
+    @test evalwgrad(m, x) == x
+
+    testmode!(m, false)
+    y = evalwgrad(m, x)
+    @test mean(y) ≈ 0 atol=0.2
+    @test var(y) ≈ 1 atol=0.2
+
+    # Known good value ranges
+    # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
+    x = ones(100)
+    if isempty(rng_kwargs)
+      @test 40 < sum(evalwgrad(m, x)) < 130
+    else
+      # FIXME: this breaks spuriously for MersenneTwister
+      @test_skip 40 < sum(evalwgrad(m, x)) < 130
+    end
+
+    # CPU RNGs map onto CPU ok
+    if isempty(rng_kwargs)
+      if VERSION >= v"1.7"
+        @test cpu(m).rng isa Random.TaskLocalRNG
+      else
+        @test cpu(m).rng isa Random._GLOBAL_RNG
+      end
+    else
+      @test cpu(m).rng === only(values(rng_kwargs))
+    end
+  end
 end
 
 @testset "BatchNorm" begin