Merge pull request #97 from DrChainsaw/cudaext
Make CUDA a weakdep
DrChainsaw authored Aug 10, 2023
2 parents 11a6e18 + 0a1670b commit d38f6b9
Showing 17 changed files with 142 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
@@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
version: '1.9'
- name: Install dependencies
run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
11 changes: 10 additions & 1 deletion Project.toml
@@ -4,7 +4,6 @@ authors = ["DrChainsaw"]
version = "0.10.4"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -13,13 +12,21 @@ MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
NaiveNASflux = "85610aed-7d32-5e57-bb50-4c2e1c9e7997"
NaiveNASlib = "bd45eb3e-47ce-54bd-9eaf-e86c5f900853"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
NaiveGAfluxCUDAExt = "CUDA"

[compat]
CUDA = "3, 4"
Flux = "0.13.4, 0.14"
@@ -29,6 +36,8 @@ MemPool = "0.3"
NaiveNASflux = "2.0.10"
NaiveNASlib = "2.0.11"
Optimisers = "0.2"
PackageExtensionCompat = "1"
PrecompileTools = "1"
Reexport = "0.2.0, 1"
Setfield = "0.3.4, 0.5, 0.6, 0.7, 0.8, 1"
julia = "1.7"
24 changes: 24 additions & 0 deletions ext/NaiveGAfluxCUDAExt.jl
@@ -0,0 +1,24 @@
module NaiveGAfluxCUDAExt

using NaiveGAflux
import CUDA

struct NaiveGAfluxCudaDevice{D}
device::D
end

# Doesn't seem like we can do much with the device we got from CUDA, but let's keep it in case someone finds a use for it
NaiveGAflux.execution_device(a::CUDA.CuArray) = NaiveGAfluxCudaDevice(CUDA.device(a))

function NaiveGAflux._availablebytes(::NaiveGAfluxCudaDevice)
# Doesn't seem like CUDA exposes these per device
info = CUDA.MemoryInfo()
info.free_bytes + info.pool_reserved_bytes - info.pool_used_bytes
end

# Should map data to device, but how?
NaiveGAflux.matchdatatype(::NaiveGAfluxCudaDevice, iter) = GpuIterator(iter)

NaiveGAflux._rangetoarr(a::Type{<:CUDA.CuArray}) = a

end
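
The methods above are the CUDA halves of generic hooks defined further down in src/util.jl (execution_device), src/batchsize.jl (_availablebytes) and src/fitness.jl (matchdatatype). A rough sketch of how they compose once CUDA is loaded; the array and iterator names are hypothetical:

    x = CUDA.zeros(Float32, 4, 4)                # any parameter array living on the GPU
    dev = NaiveGAflux.execution_device(x)        # -> NaiveGAfluxCudaDevice
    NaiveGAflux._availablebytes(dev)             # free bytes plus unused pool reservation
    NaiveGAflux.matchdatatype(dev, data_iter)    # wraps data_iter in a GpuIterator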
13 changes: 10 additions & 3 deletions src/NaiveGAflux.jl
@@ -10,13 +10,12 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend
import Flux
using Flux: Dense, Conv, ConvTranspose, DepthwiseConv, CrossCor, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
MaxPool, MeanPool, Dropout, AlphaDropout, GlobalMaxPool, GlobalMeanPool, cpu, gpu,
SamePad, params
SamePad
import Optimisers
import Optimisers: WeightDecay
using Random
using Logging
using Statistics
import CUDA
import MemPool
import IterTools
import Functors
@@ -28,7 +27,7 @@ using Setfield
# For temporary storage of program state for pause/resume type of operations
using Serialization

const rng_default = MersenneTwister(abs(rand(Int)))
const rng_default = MersenneTwister(1)
const modeldir = "models"

# Fitness
@@ -125,4 +124,12 @@ include("iterators.jl")
include("app/AutoFlux.jl")
include("visualize/callbacks.jl")


using PackageExtensionCompat
function __init__()
@require_extensions
end

include("precompile.jl")

end # module
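
PackageExtensionCompat keeps the extension usable on the older Julia versions still allowed by the compat bound (julia = "1.7"): on 1.9+ the @require_extensions call is a no-op and the native extension mechanism applies, while on 1.7/1.8 it falls back to Requires.jl. One way to check whether the CUDA extension is active (Julia 1.9+ API; illustrative only):

    ext = Base.get_extension(NaiveGAflux, :NaiveGAfluxCUDAExt)
    ext === nothing    # true until CUDA has been loaded alongside NaiveGAflux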
2 changes: 1 addition & 1 deletion src/app/AutoFlux.jl
@@ -24,7 +24,7 @@ Keyword `mdir` is a directory which will be searched for serialized state from w
function fit(x, y; cb=identity, mdir=missing)
if ndims(x) == 4
outsize = ndims(y) == 1 ? length(unique(y)) : size(y, 1)
return fit(ImageClassifier(insize=size(x), outsize, mdir=defaultdir(mdir, "ImageClassifier")), x, y; cb=identity)
return fit(ImageClassifier(;insize=size(x), outsize, mdir=modeldir(mdir, "ImageClassifier")), x, y; cb=identity)
end
error("No model for $(ndims(x))D data")
end
2 changes: 1 addition & 1 deletion src/app/imageclassification/ImageClassification.jl
@@ -1,7 +1,7 @@
module ImageClassification

using ...NaiveGAflux
using ..AutoFlux: fit
using ..AutoFlux: fit, defaultdir
using NaiveGAflux: GlobalPool
using NaiveGAflux: shapetrace, squashshapes, fshape, ndimsout, check_apply
using NaiveGAflux: StatefulGenerationIter
37 changes: 16 additions & 21 deletions src/batchsize.jl
@@ -2,7 +2,7 @@ generic_batchsizefun_docstring(fname="batchsizefun") = """
`$(fname)` is a function with the following signature:
`$(fname)(model, batchsize; inshape_nobatch, availablebytes)`
`$(fname)(batchsize, model; inshape_nobatch, availablebytes)`
It returns the largest batch size not larger than `batchsize` which can be used for `model` without using more than `availablebytes` bytes of memory.
The type of `batchsize` may be used to e.g. determine if one shall account for backwards pass (if `typeof(batchsize) === TrainBatchSize`) or not (if `typeof(batchsize) == ValidationBatchSize`).
@@ -128,8 +128,8 @@ struct BatchSizeSelectionScaled{F}
batchsizefun::F
end
BatchSizeSelectionScaled(scale::AbstractFloat) = BatchSizeSelectionScaled(scale, limit_maxbatchsize)
function (bs::BatchSizeSelectionScaled)(args...; availablebytes=_availablebytes(), kwargs...)
bs.batchsizefun(args...;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...)
function (bs::BatchSizeSelectionScaled)(orgbs, model; availablebytes=_availablebytes(model), kwargs...)
bs.batchsizefun(orgbs, model;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...)
end

"""
@@ -288,29 +288,31 @@ function batchsizeselection(inshape_nobatch::Tuple;
bs = isnothing(alternatives) ? bs : BatchSizeSelectionFromAlternatives(alternatives, bs)
end

function limit_maxbatchsize(bs::TrainBatchSize, model; inshape_nobatch, availablebytes = _availablebytes())
function limit_maxbatchsize(bs::TrainBatchSize, model; inshape_nobatch, availablebytes = _availablebytes(model))
maxsize = maxtrainbatchsize(model, inshape_nobatch, availablebytes)
maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs)
end

function limit_maxbatchsize(bs::ValidationBatchSize,
model;
inshape_nobatch,
availablebytes = _availablebytes()
availablebytes = _availablebytes(model)
)
maxsize = maxvalidationbatchsize(model, inshape_nobatch, availablebytes)
maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs)
end

function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes())
paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0)
function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes(model))
elemsize = _model_parsize(model)
paramsize = elemsize > 0 ? nparams(model) * elemsize : 0
actsize = activationsizes(model, inshape_nobatch)
den = paramsize + 2 * actsize
return den > 0 ? fld(availablebytes - paramsize, den) : -1
end

function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes())
paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0)
function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes(model))
elemsize = _model_parsize(model)
paramsize = elemsize > 0 ? nparams(model) * elemsize : 0
actsize = activationsizes(model, inshape_nobatch)
return actsize > 0 ? fld(availablebytes - paramsize, actsize) : -1
end
@@ -330,17 +332,10 @@ function activationsizes(model::CompGraph, inshape_nobatch, elemsize = _model_pa
end

function _model_parsize(model)
ps = params(model)
isempty(ps) && return 0
return ps |> first |> eltype |> sizeof
anyarr = find_first_array(model)
anyarr === nothing ? 0 : sizeof(eltype(anyarr))
end

# TODO: Take model as input and look at params to determine of cpu or gpu
function _availablebytes()
if CUDA.functional()
info = CUDA.MemoryInfo()
info.free_bytes + info.pool_reserved_bytes - info.pool_used_bytes
else
Int(Sys.free_memory())
end
end
_availablebytes(model) = _availablebytes(execution_device(model))
_availablebytes(::NaiveGAfluxCpuDevice) = Int(Sys.free_memory())
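
_availablebytes now dispatches on the device inferred from the model rather than on a global CUDA.functional() check: a model whose parameters live on the CPU reports Sys.free_memory(), while the CUDA extension supplies the GPU figure. A small sketch under that assumption (the Dense layer is hypothetical):

    m = Flux.Dense(3 => 2)            # CPU parameters
    NaiveGAflux._availablebytes(m)    # roughly Int(Sys.free_memory()) at the time of the call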

14 changes: 12 additions & 2 deletions src/candidate.jl
@@ -279,7 +279,7 @@ struct FittedCandidate{F, C <: AbstractCandidate} <: AbstractWrappingCandidate
c::C
end
FittedCandidate(c::AbstractCandidate, f::AbstractFitness, gen) = FittedCandidate(gen, fitness(f, c), c)
FittedCandidate(c::FittedCandidate, f::AbstractFitness, gen) = FittedCandidate(gen, fitness(f, c), wrappedcand(c))
FittedCandidate(c::FittedCandidate{F}, f::AbstractFitness, gen) where F = FittedCandidate(gen, fitness(f, c)::F, wrappedcand(c))

@functor FittedCandidate

@@ -294,8 +294,18 @@ generation(c::FittedCandidate; default=nothing) = c.gen
# if they are not passed a FittedCandidate. Perhaps having some kind of fitness state container in each candidate?
newcand(c::FittedCandidate, mapfield) = FittedCandidate(c.gen, c.fitness, newcand(wrappedcand(c), mapfield))

"""
nparams(model)
Return the number of trainable parameters in `model`.
"""
nparams(c::AbstractCandidate) = model(nparams, c)
nparams(x) = mapreduce(length, +, params(x).order; init=0)
nparams(x) = nparams(0, x)
nparams(x::Integer, g::CompGraph) = nparams(x, vertices(g))
nparams(x::Integer, v::AbstractVertex) = nparams(x, layer(v))
nparams(x::Integer, m) = nparams(x, Flux.trainable(m))
nparams(x::Integer, tr::Union{Tuple, NamedTuple, AbstractArray}) = foldl(nparams, tr; init=x)
nparams(x::Integer, tr::AbstractArray{<:Number}) = x + length(tr)

"""
MapType{T, F1, F2}
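
The nparams rework above counts trainable parameters by folding over Flux.trainable instead of relying on Flux.params. A quick sanity sketch (the layer is hypothetical):

    m = Flux.Dense(3 => 2)    # 3*2 weights + 2 biases
    NaiveGAflux.nparams(m)    # expected to return 8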
21 changes: 4 additions & 17 deletions src/fitness.jl
@@ -72,10 +72,6 @@ function _fitness(s::GpuFitness, c::AbstractCandidate)
fitval = _fitness(s.f, cgpu)
# In case parameters changed. Would like to do this some other way, perhaps return the candidate too, or move training to evolve...
transferstate!(c, cpu(cgpu)) # Can't load CuArray into a normal array
cgpu = nothing # So we can reclaim the memory
# Should not be needed according to CUDA docs, but programs seems to hang every now and then if not done.
# Should revisit every now and then to see if things have changed...
gpu_gc()
return fitval
end

@@ -112,14 +108,6 @@ function _transferstate!(to::T, from::T) where T <:AbstractArray
foreach(transferstate!, to, from)
end

const gpu_gc = if CUDA.functional()
function(full=true)
GC.gc(full)
CUDA.reclaim()
end
else
() -> nothing
end

"""
AccuracyFitness <: AbstractFitness
@@ -152,13 +140,12 @@ end
function _fitnessiterator(f, c::AbstractCandidate, iter)
geniter = itergeneration(iter, generation(c; default=0))
canditer = f(c; default=geniter)
matchdatatype(params(c), canditer)
matchdatatype(c, canditer)
end

matchdatatype(ps::Flux.Params, iter) = isempty(ps) ? iter : matchdatatype(first(ps), iter)

matchdatatype(::CUDA.CuArray, iter) = GpuIterator(iter)
matchdatatype(::AbstractArray, iter) = iter
matchdatatype(c::AbstractCandidate, iter) = matchdatatype(model(c), iter)
matchdatatype(model, iter) = matchdatatype(execution_device(model), iter)
matchdatatype(::NaiveGAfluxCpuDevice, iter) = iter

"""
TrainThenFitness{I,L,O,F} <: AbstractFitness
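
With the CUDA-specific method gone from this file, matchdatatype resolves the device from the candidate's model instead of from Flux.Params: the CPU method returns the iterator unchanged and the CUDA extension wraps it in a GpuIterator. Roughly, for a hypothetical candidate cand and data iterator iter:

    NaiveGAflux.matchdatatype(cand, iter)    # iter on CPU; GpuIterator(iter) once the CUDA extension is loaded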
3 changes: 2 additions & 1 deletion src/iterators.jl
@@ -161,6 +161,8 @@ Base.IteratorEltype(::Type{SeedIterator{R, T}}) where {R,T} = Base.IteratorEltyp
GpuIterator(itr)
Return an iterator which sends values from `itr` to the GPU.
Will often be used automatically when training a model with parameters on the GPU.
"""
GpuIterator(itr) = Iterators.map(gpuitr, itr) # Iterator.map can't infer eltypes, but we can't either as we don't know for sure what Flux.gpu will do
gpuitr(a) = Flux.gpu(a)
@@ -353,7 +355,6 @@ Base.IteratorEltype(::Type{ReBatchingIterator{I}}) where I = Base.IteratorEltype
_rangetoarr(a) = a
_rangetoarr(t::Type{<:Tuple}) = Tuple{map(_rangetoarr, t.parameters)...}
_rangetoarr(a::Type{<:Array}) = a
_rangetoarr(a::Type{<:CUDA.CuArray}) = a
_rangetoarr(::Type{<:AbstractArray{T,N}}) where {T,N} = Array{T,N}

function Base.iterate(itr::ReBatchingIterator)
19 changes: 19 additions & 0 deletions src/precompile.jl
@@ -0,0 +1,19 @@
using PrecompileTools

@setup_workload begin
@compile_workload begin
## This depends on RNG Implementation which is probably not ideal...
Logging.with_logger(Logging.NullLogger()) do
AutoFlux.fit(
AutoFlux.ImageClassification.ImageClassifier(;
popsize=2,
insize=(32,32,3,0),
outsize=10
),
zeros(Float32, 32,32,3,0), # Zero size so we at least don't compute any gradients and stuff
zeros(Int, 10, 0);
stopcriterion = pop -> generation(pop) > 1)
end
end
Random.seed!(rng_default, 1)
end
27 changes: 25 additions & 2 deletions src/util.jl
@@ -369,7 +369,7 @@ function Base.showerror(io::IO, e::InconsistentAutoOptimiserException{<:CompGrap
end
function _print_implicit_opts(io::IO, g::CompGraph)
for (i, v) in enumerate(vertices(g))
if !isempty(Flux.trainable(layer(v)))
if find_first_array(layer(v)) !== nothing
println(io, "vertex ", i, " implicit: ", _is_implicit_opt(v), ", name: ", name(v))
end
end
@@ -387,7 +387,7 @@ Throws a $InconsistentAutoOptimiserException if some but not all parameters use
function check_implicit_optimiser(g::CompGraph)
prev = nothing
for v in vertices(g)
if !isempty(Flux.trainable(layer(v)))
if find_first_array(layer(v)) !== nothing
isimplicit = _is_implicit_opt(v)
if isnothing(prev)
prev = isimplicit
@@ -400,3 +400,26 @@ function check_implicit_optimiser(g::CompGraph)
end
isnothing(prev) ? false : prev
end

struct NaiveGAfluxCpuDevice end

function execution_device(model)
maybearray = find_first_array(model)
maybearray === nothing ? NaiveGAfluxCpuDevice() : execution_device(maybearray)
end
execution_device(::AbstractArray) = NaiveGAfluxCpuDevice()

# Just to avoid infinite recursion due to inputs -> outputs -> inputs
find_first_array(g::CompGraph) = find_first_array(vertices(g))
find_first_array(v::AbstractVertex) = find_first_array(layer(v))

find_first_array(model) = find_first_array(Flux.trainable(model))
function find_first_array(x::Union{Tuple, NamedTuple, AbstractArray})
for e in x
res = find_first_array(e)
res !== nothing && return res
end
end
find_first_array(x::AbstractArray{<:Number}) = x
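
execution_device and find_first_array form the CPU half of the device detection: they recursively walk CompGraph vertices and Flux.trainable fields until a numeric array turns up, defaulting to NaiveGAfluxCpuDevice when a model has no parameters at all. A rough sketch of the expected behaviour (hypothetical model, not part of the diff):

    c = Flux.Chain(Flux.Dense(4 => 4, Flux.relu))
    NaiveGAflux.find_first_array(c)     # expected: the weight matrix of the Dense layer
    NaiveGAflux.execution_device(c)     # NaiveGAfluxCpuDevice() for CPU-resident parameters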


5 changes: 4 additions & 1 deletion test/Project.toml
@@ -1,4 +1,5 @@
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
@@ -17,5 +18,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[compat]
Documenter = "0.27"
CUDA = "4"
cuDNN = "1"
Documenter = "0.27"
