Merge pull request #97 from DrChainsaw/cudaext
Make CUDA a weakdep
DrChainsaw authored Aug 10, 2023
2 parents 11a6e18 + 0a1670b commit d38f6b9
Showing 17 changed files with 142 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
@@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
version: '1.9'
- name: Install dependencies
run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
11 changes: 10 additions & 1 deletion Project.toml
@@ -4,7 +4,6 @@ authors = ["DrChainsaw"]
version = "0.10.4"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -13,13 +12,21 @@ MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
NaiveNASflux = "85610aed-7d32-5e57-bb50-4c2e1c9e7997"
NaiveNASlib = "bd45eb3e-47ce-54bd-9eaf-e86c5f900853"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
NaiveGAfluxCUDAExt = "CUDA"

[compat]
CUDA = "3, 4"
Flux = "0.13.4, 0.14"
@@ -29,6 +36,8 @@ MemPool = "0.3"
NaiveNASflux = "2.0.10"
NaiveNASlib = "2.0.11"
Optimisers = "0.2"
PackageExtensionCompat = "1"
PrecompileTools = "1"
Reexport = "0.2.0, 1"
Setfield = "0.3.4, 0.5, 0.6, 0.7, 0.8, 1"
julia = "1.7"
24 changes: 24 additions & 0 deletions ext/NaiveGAfluxCUDAExt.jl
@@ -0,0 +1,24 @@
module NaiveGAfluxCUDAExt

using NaiveGAflux
import CUDA

struct NaiveGAfluxCudaDevice{D}
device::D
end

# Doesn't seem like we can do much with the device we got from CUDA, but let's keep it in case someone finds a use for it
NaiveGAflux.execution_device(a::CUDA.CuArray) = NaiveGAfluxCudaDevice(CUDA.device(a))

function NaiveGAflux._availablebytes(::NaiveGAfluxCudaDevice)
# Doesn't seem like CUDA exposes these per device
info = CUDA.MemoryInfo()
info.free_bytes + info.pool_reserved_bytes - info.pool_used_bytes
end

# Should map data to device, but how?
NaiveGAflux.matchdatatype(::NaiveGAfluxCudaDevice, iter) = GpuIterator(iter)

NaiveGAflux._rangetoarr(a::Type{<:CUDA.CuArray}) = a

end
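
The methods above are the CUDA halves of generic hooks defined further down in src/util.jl (execution_device), src/batchsize.jl (_availablebytes) and src/fitness.jl (matchdatatype). A rough sketch of how they compose once CUDA is loaded; the array and iterator names are hypothetical:

    x = CUDA.zeros(Float32, 4, 4)                # any parameter array living on the GPU
    dev = NaiveGAflux.execution_device(x)        # -> NaiveGAfluxCudaDevice
    NaiveGAflux._availablebytes(dev)             # free bytes plus unused pool reservation
    NaiveGAflux.matchdatatype(dev, data_iter)    # wraps data_iter in a GpuIterator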
13 changes: 10 additions & 3 deletions src/NaiveGAflux.jl
@@ -10,13 +10,12 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend
import Flux
using Flux: Dense, Conv, ConvTranspose, DepthwiseConv, CrossCor, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
MaxPool, MeanPool, Dropout, AlphaDropout, GlobalMaxPool, GlobalMeanPool, cpu, gpu,
SamePad, params
SamePad
import Optimisers
import Optimisers: WeightDecay
using Random
using Logging
using Statistics
import CUDA
import MemPool
import IterTools
import Functors
@@ -28,7 +27,7 @@ using Setfield
# For temporary storage of program state for pause/resume type of operations
using Serialization

const rng_default = MersenneTwister(abs(rand(Int)))
const rng_default = MersenneTwister(1)
const modeldir = "models"

# Fitness
@@ -125,4 +124,12 @@ include("iterators.jl")
include("app/AutoFlux.jl")
include("visualize/callbacks.jl")


using PackageExtensionCompat
function __init__()
@require_extensions
end

include("precompile.jl")

end # module
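
PackageExtensionCompat keeps the extension usable on the older Julia versions still allowed by the compat bound (julia = "1.7"): on 1.9+ the @require_extensions call is a no-op and the native extension mechanism applies, while on 1.7/1.8 it falls back to Requires.jl. One way to check whether the CUDA extension is active (Julia 1.9+ API; illustrative only):

    ext = Base.get_extension(NaiveGAflux, :NaiveGAfluxCUDAExt)
    ext === nothing    # true until CUDA has been loaded alongside NaiveGAflux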
2 changes: 1 addition & 1 deletion src/app/AutoFlux.jl
@@ -24,7 +24,7 @@ Keyword `mdir` is a directory which will be searched for serialized state from w
function fit(x, y; cb=identity, mdir=missing)
if ndims(x) == 4
outsize = ndims(y) == 1 ? length(unique(y)) : size(y, 1)
return fit(ImageClassifier(insize=size(x), outsize, mdir=defaultdir(mdir, "ImageClassifier")), x, y; cb=identity)
return fit(ImageClassifier(;insize=size(x), outsize, mdir=modeldir(mdir, "ImageClassifier")), x, y; cb=identity)
end
error("No model for $(ndims(x))D data")
end
2 changes: 1 addition & 1 deletion src/app/imageclassification/ImageClassification.jl
@@ -1,7 +1,7 @@
module ImageClassification

using ...NaiveGAflux
using ..AutoFlux: fit
using ..AutoFlux: fit, defaultdir
using NaiveGAflux: GlobalPool
using NaiveGAflux: shapetrace, squashshapes, fshape, ndimsout, check_apply
using NaiveGAflux: StatefulGenerationIter
37 changes: 16 additions & 21 deletions src/batchsize.jl
@@ -2,7 +2,7 @@ generic_batchsizefun_docstring(fname="batchsizefun") = """
`$(fname)` is a function with the following signature:
`$(fname)(model, batchsize; inshape_nobatch, availablebytes)`
`$(fname)(batchsize, model; inshape_nobatch, availablebytes)`
It returns the largest batch size not larger than `batchsize` which can be used for `model` without using more than `availablebytes` bytes of memory.
The type of `batchsize` may be used to e.g. determine if one shall account for backwards pass (if `typeof(batchsize) === TrainBatchSize`) or not (if `typeof(batchsize) == ValidationBatchSize`).
@@ -128,8 +128,8 @@ struct BatchSizeSelectionScaled{F}
batchsizefun::F
end
BatchSizeSelectionScaled(scale::AbstractFloat) = BatchSizeSelectionScaled(scale, limit_maxbatchsize)
function (bs::BatchSizeSelectionScaled)(args...; availablebytes=_availablebytes(), kwargs...)
bs.batchsizefun(args...;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...)
function (bs::BatchSizeSelectionScaled)(orgbs, model; availablebytes=_availablebytes(model), kwargs...)
bs.batchsizefun(orgbs, model;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...)
end

"""
@@ -288,29 +288,31 @@ function batchsizeselection(inshape_nobatch::Tuple;
bs = isnothing(alternatives) ? bs : BatchSizeSelectionFromAlternatives(alternatives, bs)
end

function limit_maxbatchsize(bs::TrainBatchSize, model; inshape_nobatch, availablebytes = _availablebytes())
function limit_maxbatchsize(bs::TrainBatchSize, model; inshape_nobatch, availablebytes = _availablebytes(model))
maxsize = maxtrainbatchsize(model, inshape_nobatch, availablebytes)
maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs)
end

function limit_maxbatchsize(bs::ValidationBatchSize,
model;
inshape_nobatch,
availablebytes = _availablebytes()
availablebytes = _availablebytes(model)
)
maxsize = maxvalidationbatchsize(model, inshape_nobatch, availablebytes)
maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs)
end

function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes())
paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0)
function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes(model))
elemsize = _model_parsize(model)
paramsize = elemsize > 0 ? nparams(model) * elemsize : 0
actsize = activationsizes(model, inshape_nobatch)
den = paramsize + 2 * actsize
return den > 0 ? fld(availablebytes - paramsize, den) : -1
end

function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes())
paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0)
function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes(model))
elemsize = _model_parsize(model)
paramsize = elemsize > 0 ? nparams(model) * elemsize : 0
actsize = activationsizes(model, inshape_nobatch)
return actsize > 0 ? fld(availablebytes - paramsize, actsize) : -1
end
@@ -330,17 +332,10 @@ function activationsizes(model::CompGraph, inshape_nobatch, elemsize = _model_pa
end

function _model_parsize(model)
ps = params(model)
isempty(ps) && return 0
return ps |> first |> eltype |> sizeof
anyarr = find_first_array(model)
anyarr === nothing ? 0 : sizeof(eltype(anyarr))
end

# TODO: Take model as input and look at params to determine of cpu or gpu
function _availablebytes()
if CUDA.functional()
info = CUDA.MemoryInfo()
info.free_bytes + info.pool_reserved_bytes - info.pool_used_bytes
else
Int(Sys.free_memory())
end
end
_availablebytes(model) = _availablebytes(execution_device(model))
_availablebytes(::NaiveGAfluxCpuDevice) = Int(Sys.free_memory())
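
_availablebytes now dispatches on the device inferred from the model rather than on a global CUDA.functional() check: a model whose parameters live on the CPU reports Sys.free_memory(), while the CUDA extension supplies the GPU figure. A small sketch under that assumption (the Dense layer is hypothetical):

    m = Flux.Dense(3 => 2)            # CPU parameters
    NaiveGAflux._availablebytes(m)    # roughly Int(Sys.free_memory()) at the time of the call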

14 changes: 12 additions & 2 deletions src/candidate.jl
@@ -279,7 +279,7 @@ struct FittedCandidate{F, C <: AbstractCandidate} <: AbstractWrappingCandidate
c::C
end
FittedCandidate(c::AbstractCandidate, f::AbstractFitness, gen) = FittedCandidate(gen, fitness(f, c), c)
FittedCandidate(c::FittedCandidate, f::AbstractFitness, gen) = FittedCandidate(gen, fitness(f, c), wrappedcand(c))
FittedCandidate(c::FittedCandidate{F}, f::AbstractFitness, gen) where F = FittedCandidate(gen, fitness(f, c)::F, wrappedcand(c))

@functor FittedCandidate

@@ -294,8 +294,18 @@ generation(c::FittedCandidate; default=nothing) = c.gen
# if they are not passed a FittedCandidate. Perhaps having some kind of fitness state container in each candidate?
newcand(c::FittedCandidate, mapfield) = FittedCandidate(c.gen, c.fitness, newcand(wrappedcand(c), mapfield))

"""
nparams(model)
Return the number of trainable parameters in `model`.
"""
nparams(c::AbstractCandidate) = model(nparams, c)
nparams(x) = mapreduce(length, +, params(x).order; init=0)
nparams(x) = nparams(0, x)
nparams(x::Integer, g::CompGraph) = nparams(x, vertices(g))
nparams(x::Integer, v::AbstractVertex) = nparams(x, layer(v))
nparams(x::Integer, m) = nparams(x, Flux.trainable(m))
nparams(x::Integer, tr::Union{Tuple, NamedTuple, AbstractArray}) = foldl(nparams, tr; init=x)
nparams(x::Integer, tr::AbstractArray{<:Number}) = x + length(tr)

"""
MapType{T, F1, F2}
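
The nparams rework above counts trainable parameters by folding over Flux.trainable instead of relying on Flux.params. A quick sanity sketch (the layer is hypothetical):

    m = Flux.Dense(3 => 2)    # 3*2 weights + 2 biases
    NaiveGAflux.nparams(m)    # expected to return 8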
21 changes: 4 additions & 17 deletions src/fitness.jl
@@ -72,10 +72,6 @@ function _fitness(s::GpuFitness, c::AbstractCandidate)
fitval = _fitness(s.f, cgpu)
# In case parameters changed. Would like to do this some other way, perhaps return the candidate too, or move training to evolve...
transferstate!(c, cpu(cgpu)) # Can't load CuArray into a normal array
cgpu = nothing # So we can reclaim the memory
# Should not be needed according to CUDA docs, but programs seems to hang every now and then if not done.
# Should revisit every now and then to see if things have changed...
gpu_gc()
return fitval
end

@@ -112,14 +108,6 @@ function _transferstate!(to::T, from::T) where T <:AbstractArray
foreach(transferstate!, to, from)
end

const gpu_gc = if CUDA.functional()
function(full=true)
GC.gc(full)
CUDA.reclaim()
end
else
() -> nothing
end

"""
AccuracyFitness <: AbstractFitness
@@ -152,13 +140,12 @@ end
function _fitnessiterator(f, c::AbstractCandidate, iter)
geniter = itergeneration(iter, generation(c; default=0))
canditer = f(c; default=geniter)
matchdatatype(params(c), canditer)
matchdatatype(c, canditer)
end

matchdatatype(ps::Flux.Params, iter) = isempty(ps) ? iter : matchdatatype(first(ps), iter)

matchdatatype(::CUDA.CuArray, iter) = GpuIterator(iter)
matchdatatype(::AbstractArray, iter) = iter
matchdatatype(c::AbstractCandidate, iter) = matchdatatype(model(c), iter)
matchdatatype(model, iter) = matchdatatype(execution_device(model), iter)
matchdatatype(::NaiveGAfluxCpuDevice, iter) = iter

"""
TrainThenFitness{I,L,O,F} <: AbstractFitness
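
With the CUDA-specific method gone from this file, matchdatatype resolves the device from the candidate's model instead of from Flux.Params: the CPU method returns the iterator unchanged and the CUDA extension wraps it in a GpuIterator. Roughly, for a hypothetical candidate cand and data iterator iter:

    NaiveGAflux.matchdatatype(cand, iter)    # iter on CPU; GpuIterator(iter) once the CUDA extension is loaded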
3 changes: 2 additions & 1 deletion src/iterators.jl
@@ -161,6 +161,8 @@ Base.IteratorEltype(::Type{SeedIterator{R, T}}) where {R,T} = Base.IteratorEltyp
GpuIterator(itr)
Return an iterator which sends values from `itr` to the GPU.
Will often be used automatically when training a model with parameters on the GPU.
"""
GpuIterator(itr) = Iterators.map(gpuitr, itr) # Iterator.map can't infer eltypes, but we can't either as we don't know for sure what Flux.gpu will do
gpuitr(a) = Flux.gpu(a)
@@ -353,7 +355,6 @@ Base.IteratorEltype(::Type{ReBatchingIterator{I}}) where I = Base.IteratorEltype
_rangetoarr(a) = a
_rangetoarr(t::Type{<:Tuple}) = Tuple{map(_rangetoarr, t.parameters)...}
_rangetoarr(a::Type{<:Array}) = a
_rangetoarr(a::Type{<:CUDA.CuArray}) = a
_rangetoarr(::Type{<:AbstractArray{T,N}}) where {T,N} = Array{T,N}

function Base.iterate(itr::ReBatchingIterator)
19 changes: 19 additions & 0 deletions src/precompile.jl
@@ -0,0 +1,19 @@
using PrecompileTools

@setup_workload begin
@compile_workload begin
## This depends on RNG Implementation which is probably not ideal...
Logging.with_logger(Logging.NullLogger()) do
AutoFlux.fit(
AutoFlux.ImageClassification.ImageClassifier(;
popsize=2,
insize=(32,32,3,0),
outsize=10
),
zeros(Float32, 32,32,3,0), # Zero size so we at least don't compute any gradients and stuff
zeros(Int, 10, 0);
stopcriterion = pop -> generation(pop) > 1)
end
end
Random.seed!(rng_default, 1)
end
27 changes: 25 additions & 2 deletions src/util.jl
@@ -369,7 +369,7 @@ function Base.showerror(io::IO, e::InconsistentAutoOptimiserException{<:CompGrap
end
function _print_implicit_opts(io::IO, g::CompGraph)
for (i, v) in enumerate(vertices(g))
if !isempty(Flux.trainable(layer(v)))
if find_first_array(layer(v)) !== nothing
println(io, "vertex ", i, " implicit: ", _is_implicit_opt(v), ", name: ", name(v))
end
end
@@ -387,7 +387,7 @@ Throws a $InconsistentAutoOptimiserException if some but not all parameters use
function check_implicit_optimiser(g::CompGraph)
prev = nothing
for v in vertices(g)
if !isempty(Flux.trainable(layer(v)))
if find_first_array(layer(v)) !== nothing
isimplicit = _is_implicit_opt(v)
if isnothing(prev)
prev = isimplicit
@@ -400,3 +400,26 @@ function check_implicit_optimiser(g::CompGraph)
end
isnothing(prev) ? false : prev
end

struct NaiveGAfluxCpuDevice end

function execution_device(model)
maybearray = find_first_array(model)
maybearray === nothing ? NaiveGAfluxCpuDevice() : execution_device(maybearray)
end
execution_device(::AbstractArray) = NaiveGAfluxCpuDevice()

# Just to avoid infinite recursion due to inputs -> outputs -> inputs
find_first_array(g::CompGraph) = find_first_array(vertices(g))
find_first_array(v::AbstractVertex) = find_first_array(layer(v))

find_first_array(model) = find_first_array(Flux.trainable(model))
function find_first_array(x::Union{Tuple, NamedTuple, AbstractArray})
for e in x
res = find_first_array(e)
res !== nothing && return res
end
end
find_first_array(x::AbstractArray{<:Number}) = x
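
execution_device and find_first_array form the CPU half of the device detection: they recursively walk CompGraph vertices and Flux.trainable fields until a numeric array turns up, defaulting to NaiveGAfluxCpuDevice when a model has no parameters at all. A rough sketch of the expected behaviour (hypothetical model, not part of the diff):

    c = Flux.Chain(Flux.Dense(4 => 4, Flux.relu))
    NaiveGAflux.find_first_array(c)     # expected: the weight matrix of the Dense layer
    NaiveGAflux.execution_device(c)     # NaiveGAfluxCpuDevice() for CPU-resident parameters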


5 changes: 4 additions & 1 deletion test/Project.toml
@@ -1,4 +1,5 @@
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
@@ -17,5 +18,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[compat]
Documenter = "0.27"
CUDA = "4"
cuDNN = "1"
Documenter = "0.27"
