Skip to content

Commit

Permalink
Merge pull request #118 from SciML/ap/static_kernels
Browse files Browse the repository at this point in the history
Using SimpleNonlinearSolve in GPU Kernels
  • Loading branch information
avik-pal authored Jan 13, 2024
2 parents ef11f39 + 91a995d commit 6eb690e
Show file tree
Hide file tree
Showing 11 changed files with 297 additions and 43 deletions.
16 changes: 16 additions & 0 deletions lib/SimpleNonlinearSolve/.buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
steps:
- label: "Julia 1"
plugins:
- JuliaCI/julia#v1:
version: "1"
- JuliaCI/julia-test#v1:
agents:
queue: "juliagpu"
cuda: "*"
timeout_in_minutes: 30
# Don't run Buildkite if the commit message includes the text [skip tests]
if: build.message !~ /\[skip tests\]/

env:
GROUP: CUDA
JULIA_PKG_SERVER: ""
2 changes: 1 addition & 1 deletion lib/SimpleNonlinearSolve/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "SimpleNonlinearSolve"
uuid = "727e6d20-b764-4bd8-a329-72de5adea6c7"
authors = ["SciML"]
version = "1.2.0"
version = "1.2.1"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down
1 change: 1 addition & 0 deletions lib/SimpleNonlinearSolve/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

[![codecov](https://codecov.io/gh/SciML/SimpleNonlinearSolve.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/SciML/SimpleNonlinearSolve.jl)
[![Build Status](https://github.com/SciML/SimpleNonlinearSolve.jl/workflows/CI/badge.svg)](https://github.com/SciML/SimpleNonlinearSolve.jl/actions?query=workflow%3ACI)
[![Build status](https://badge.buildkite.com/c5f7db4f1b5e8a592514378b6fc807d934546cc7d5aa79d645.svg?branch=main)](https://buildkite.com/julialang/simplenonlinearsolve-dot-jl)

[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor%27s%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
[![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle)
Expand Down
6 changes: 6 additions & 0 deletions lib/SimpleNonlinearSolve/src/SimpleNonlinearSolve.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ function SciMLBase.solve(prob::IntervalNonlinearProblem, alg::Nothing,
return solve(prob, ITP(), args...; kwargs...)
end

# By Pass the highlevel checks for NonlinearProblem for Simple Algorithms
function SciMLBase.solve(prob::NonlinearProblem, alg::AbstractSimpleNonlinearSolveAlgorithm,
args...; kwargs...)
return SciMLBase.__solve(prob, alg, args...; kwargs...)
end

@setup_workload begin
for T in (Float32, Float64)
prob_no_brack_scalar = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
Expand Down
46 changes: 30 additions & 16 deletions lib/SimpleNonlinearSolve/src/nlsolve/dfsane.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
SimpleDFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0,
M::Int = 10, γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
M::Union{Int, Val} = Val(10), γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
nexp::Int = 2, η_strategy::Function = (f_1, k, x, F) -> f_1 ./ k^2)
A low-overhead implementation of the df-sane method for solving large-scale nonlinear
Expand Down Expand Up @@ -42,21 +42,27 @@ see the paper [1].
information for solving large-scale nonlinear systems of equations, Mathematics of
Computation, 75, 1429-1448.
"""
@kwdef @concrete struct SimpleDFSane <: AbstractSimpleNonlinearSolveAlgorithm
σ_min = 1e-10
σ_max = 1e10
σ_1 = 1.0
M::Int = 10
γ = 1e-4
τ_min = 0.1
τ_max = 0.5
nexp::Int = 2
η_strategy = (f_1, k, x, F) -> f_1 ./ k^2
@concrete struct SimpleDFSane{M} <: AbstractSimpleNonlinearSolveAlgorithm
σ_min
σ_max
σ_1
γ
τ_min
τ_max
nexp::Int
η_strategy
end

function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
function SimpleDFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0,
M::Union{Int, Val} = Val(10), γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
nexp::Int = 2, η_strategy::F = (f_1, k, x, F) -> f_1 ./ k^2) where {F}
return SimpleDFSane{SciMLBase._unwrap_val(M)}(σ_min, σ_max, σ_1, γ, τ_min, τ_max, nexp,
η_strategy)
end

function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane{M}, args...;
abstol = nothing, reltol = nothing, maxiters = 1000, alias_u0 = false,
termination_condition = nothing, kwargs...)
termination_condition = nothing, kwargs...) where {M}
x = __maybe_unaliased(prob.u0, alias_u0)
fx = _get_fx(prob, x)
T = eltype(x)
Expand All @@ -65,7 +71,7 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
σ_max = T(alg.σ_max)
σ_k = T(alg.σ_1)

(; M, nexp, η_strategy) = alg
(; nexp, η_strategy) = alg
γ = T(alg.γ)
τ_min = T(alg.τ_min)
τ_max = T(alg.τ_max)
Expand All @@ -77,7 +83,11 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
α_1 = one(T)
f_1 = fx_norm

history_f_k = fill(fx_norm, M)
history_f_k = if x isa SArray
ones(SVector{M, T}) * fx_norm
else
fill(fx_norm, M)
end

# Generate the cache
@bb x_cache = similar(x)
Expand Down Expand Up @@ -143,7 +153,11 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
fx_norm = fx_norm_new

# Store function value
history_f_k[mod1(k, M)] = fx_norm_new
if history_f_k isa SVector
history_f_k = Base.setindex(history_f_k, fx_norm_new, mod1(k, M))
else
history_f_k[mod1(k, M)] = fx_norm_new
end
k += 1
end

Expand Down
168 changes: 163 additions & 5 deletions lib/SimpleNonlinearSolve/src/nlsolve/lbroyden.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,22 @@ function SimpleLimitedMemoryBroyden(; threshold::Union{Val, Int} = Val(27))
return SimpleLimitedMemoryBroyden{SciMLBase._unwrap_val(threshold)}()
end

@views function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
args...; termination_condition = nothing, kwargs...)
if prob.u0 isa SArray
if termination_condition === nothing ||
termination_condition isa AbsNormTerminationMode
return __static_solve(prob, alg, args...; termination_condition, kwargs...)
end
@warn "Specifying `termination_condition = $(termination_condition)` for \
`SimpleLimitedMemoryBroyden` with `SArray` is not non-allocating. Use \
either `termination_condition = AbsNormTerminationMode()` or \
`termination_condition = nothing`." maxlog=1
end
return __generic_solve(prob, alg, args...; termination_condition, kwargs...)
end

@views function __generic_solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
args...; abstol = nothing, reltol = nothing, maxiters = 1000, alias_u0 = false,
termination_condition = nothing, kwargs...)
x = __maybe_unaliased(prob.u0, alias_u0)
Expand All @@ -36,7 +51,7 @@ end

fx = _get_fx(prob, x)

U, Vᵀ = __init_low_rank_jacobian(x, fx, threshold)
U, Vᵀ = __init_low_rank_jacobian(x, fx, x isa StaticArray ? threshold : Val(η))

abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fx, x,
termination_condition)
Expand All @@ -48,7 +63,7 @@ end
@bb δf = copy(fx)

@bb vᵀ_cache = copy(x)
Tcache = __lbroyden_threshold_cache(x, threshold)
Tcache = __lbroyden_threshold_cache(x, x isa StaticArray ? threshold : Val(η))
@bb mat_cache = copy(x)

for i in 1:maxiters
Expand Down Expand Up @@ -83,6 +98,97 @@ end
return build_solution(prob, alg, x, fx; retcode = ReturnCode.MaxIters)
end

# Non-allocating StaticArrays version of SimpleLimitedMemoryBroyden is actually quite
# finicky, so we'll implement it separately from the generic version
# Ignore termination_condition. Don't pass things into internal functions
function __static_solve(prob::NonlinearProblem{<:SArray}, alg::SimpleLimitedMemoryBroyden,
args...; abstol = nothing, maxiters = 1000, kwargs...)
x = prob.u0
fx = _get_fx(prob, x)
threshold = __get_threshold(alg)

U, Vᵀ = __init_low_rank_jacobian(vec(x), vec(fx), threshold)

abstol = DiffEqBase._get_tolerance(abstol, eltype(x))

xo, δx, fo, δf = x, -fx, fx, fx

converged, res = __unrolled_lbroyden_initial_iterations(prob, xo, fo, δx, abstol, U, Vᵀ,
threshold)

converged &&
return build_solution(prob, alg, res.x, res.fx; retcode = ReturnCode.Success)

xo, fo, δx = res.x, res.fx, res.δx

for i in 1:(maxiters - SciMLBase._unwrap_val(threshold))
x = xo .+ δx
fx = prob.f(x, prob.p)
δf = fx - fo

maximum(abs, fx) abstol &&
return build_solution(prob, alg, x, fx; retcode = ReturnCode.Success)

vᵀ = _restructure(x, _rmatvec!!(U, Vᵀ, vec(δx)))
mvec = _restructure(x, _matvec!!(U, Vᵀ, vec(δf)))

d = dot(vᵀ, δf)
δx = @. (δx - mvec) / d

U = Base.setindex(U, vec(δx), mod1(i, SciMLBase._unwrap_val(threshold)))
Vᵀ = Base.setindex(Vᵀ, vec(vᵀ), mod1(i, SciMLBase._unwrap_val(threshold)))

δx = -_restructure(fx, _matvec!!(U, Vᵀ, vec(fx)))

xo = x
fo = fx
end

return build_solution(prob, alg, xo, fo; retcode = ReturnCode.MaxIters)
end

@generated function __unrolled_lbroyden_initial_iterations(prob, xo, fo, δx, abstol, U,
Vᵀ, ::Val{threshold}) where {threshold}
calls = []
for i in 1:threshold
static_idx, static_idx_p1 = Val(i - 1), Val(i)
push!(calls,
quote
x = xo .+ δx
fx = prob.f(x, prob.p)
δf = fx - fo

maximum(abs, fx) abstol && return true, (; x, fx, δx)

_U = __first_n_getindex(U, $(static_idx))
_Vᵀ = __first_n_getindex(Vᵀ, $(static_idx))

vᵀ = _restructure(x, _rmatvec!!(_U, _Vᵀ, vec(δx)))
mvec = _restructure(x, _matvec!!(_U, _Vᵀ, vec(δf)))

d = dot(vᵀ, δf)
δx = @. (δx - mvec) / d

U = Base.setindex(U, vec(δx), $(i))
Vᵀ = Base.setindex(Vᵀ, vec(vᵀ), $(i))

_U = __first_n_getindex(U, $(static_idx_p1))
_Vᵀ = __first_n_getindex(Vᵀ, $(static_idx_p1))
δx = -_restructure(fx, _matvec!!(_U, _Vᵀ, vec(fx)))

xo = x
fo = fx
end)
end
push!(calls, quote
# Termination Check
maximum(abs, fx) abstol && return true, (; x, fx, δx)

return false, (; x, fx, δx)
end)
return Expr(:block, calls...)
end

function _rmatvec!!(y, xᵀU, U, Vᵀ, x)
# xᵀ × (-I + UVᵀ)
η = size(U, 2)
Expand All @@ -98,6 +204,9 @@ function _rmatvec!!(y, xᵀU, U, Vᵀ, x)
return y
end

@inline _rmatvec!!(::Nothing, Vᵀ, x) = -x
@inline _rmatvec!!(U, Vᵀ, x) = __mapTdot(__mapdot(x, U), Vᵀ) .- x

function _matvec!!(y, Vᵀx, U, Vᵀ, x)
# (-I + UVᵀ) × x
η = size(U, 2)
Expand All @@ -113,7 +222,56 @@ function _matvec!!(y, Vᵀx, U, Vᵀ, x)
return y
end

@inline _matvec!!(::Nothing, Vᵀ, x) = -x
@inline _matvec!!(U, Vᵀ, x) = __mapTdot(__mapdot(x, Vᵀ), U) .- x

function __mapdot(x::SVector{S1}, Y::SVector{S2, <:SVector{S1}}) where {S1, S2}
return map(Base.Fix1(dot, x), Y)
end
@generated function __mapTdot(x::SVector{S1}, Y::SVector{S1, <:SVector{S2}}) where {S1, S2}
calls = []
syms = [gensym("m$(i)") for i in 1:S1]
for i in 1:S1
push!(calls, :($(syms[i]) = x[$(i)] .* Y[$i]))
end
push!(calls, :(return .+($(syms...))))
return Expr(:block, calls...)
end

@generated function __first_n_getindex(x::SVector{L, T}, ::Val{N}) where {L, T, N}
@assert N L
getcalls = ntuple(i -> :(x[$i]), N)
N == 0 && return :(return nothing)
return :(return SVector{$N, $T}(($(getcalls...))))
end

__lbroyden_threshold_cache(x, ::Val{threshold}) where {threshold} = similar(x, threshold)
function __lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold}
return SArray{Tuple{threshold}, eltype(x)}(ntuple(_ -> zero(eltype(x)), threshold))
function __lbroyden_threshold_cache(x::StaticArray, ::Val{threshold}) where {threshold}
return zeros(MArray{Tuple{threshold}, eltype(x)})
end
__lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold} = nothing

function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2},
::Val{threshold}) where {S1, S2, T1, T2, threshold}
T = promote_type(T1, T2)
fuSize, uSize = Size(fu), Size(u)
Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef)
U = MArray{Tuple{prod(fuSize), threshold}, T}(undef)
return U, Vᵀ
end
@generated function __init_low_rank_jacobian(u::SVector{Lu, T1}, fu::SVector{Lfu, T2},
::Val{threshold}) where {Lu, Lfu, T1, T2, threshold}
T = promote_type(T1, T2)
inner_inits_Vᵀ = [:(zeros(SVector{$Lu, $T})) for i in 1:threshold]
inner_inits_U = [:(zeros(SVector{$Lfu, $T})) for i in 1:threshold]
return quote
Vᵀ = SVector($(inner_inits_Vᵀ...))
U = SVector($(inner_inits_U...))
return U, Vᵀ
end
end
function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold}
Vᵀ = similar(u, threshold, length(u))
U = similar(u, length(fu), threshold)
return U, Vᵀ
end
14 changes: 0 additions & 14 deletions lib/SimpleNonlinearSolve/src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -243,20 +243,6 @@ function __init_identity_jacobian!!(J::SVector{S1}) where {S1}
return ones(SVector{S1, eltype(J)})
end

function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2},
::Val{threshold}) where {S1, S2, T1, T2, threshold}
T = promote_type(T1, T2)
fuSize, uSize = Size(fu), Size(u)
Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef)
U = MArray{Tuple{prod(fuSize), threshold}, T}(undef)
return U, Vᵀ
end
function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold}
Vᵀ = similar(u, threshold, length(u))
U = similar(u, length(fu), threshold)
return U, Vᵀ
end

@inline _vec(v) = vec(v)
@inline _vec(v::Number) = v
@inline _vec(v::AbstractVector) = v
Expand Down
11 changes: 5 additions & 6 deletions lib/SimpleNonlinearSolve/test/basictests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ end
## SimpleDFSane needs to allocate a history vector
@testset "Allocation Checks: $(_nameof(alg))" for alg in (SimpleNewtonRaphson(),
SimpleHalley(), SimpleBroyden(), SimpleKlement(), SimpleLimitedMemoryBroyden(),
SimpleTrustRegion())
@check_allocs nlsolve(prob, alg) = DiffEqBase.__solve(prob, alg; abstol = 1e-9)
SimpleTrustRegion(), SimpleDFSane())
@check_allocs nlsolve(prob, alg) = SciMLBase.solve(prob, alg; abstol = 1e-9)

nlprob_scalar = NonlinearProblem{false}(quadratic_f, 1.0, 2.0)
nlprob_sa = NonlinearProblem{false}(quadratic_f, @SVector[1.0, 1.0], 2.0)
Expand All @@ -175,18 +175,17 @@ end
@test true
catch e
@error e
@test false
# History Vector Allocates
@test false broken=(alg isa SimpleDFSane)
end

# ForwardDiff allocates for hessian since we don't propagate the chunksize
# SimpleLimitedMemoryBroyden needs to do views on the low rank matrices so the sizes
# are dynamic. This can be fixed but no without maintaining the simplicity of the code
try
nlsolve(nlprob_sa, alg)
@test true
catch e
@error e
@test false broken=(alg isa SimpleHalley || alg isa SimpleLimitedMemoryBroyden)
@test false broken=(alg isa SimpleHalley)
end
end

Expand Down
Loading

0 comments on commit 6eb690e

Please sign in to comment.