Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

feat: use fallback GPU implementations with warnings #165

Merged
merged 11 commits into from
Sep 21, 2024
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
steps:
- label: "Triggering Pipelines (Pull Request)"
if: "build.pull_request.base_branch == 'main'"
if: build.branch != "main" && build.tag == null
agents:
queue: "juliagpu"
plugins:
Expand Down
97 changes: 75 additions & 22 deletions .buildkite/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,55 +24,108 @@ steps:
julia:
- "1"

- group: ":telescope: Downstream CUDA"
- group: ":julia: AMD GPU"
steps:
- label: ":julia: {{matrix.repo}} (Julia 1 + CUDA GPU)"
- label: ":julia: Julia: {{matrix.julia}} + AMD GPU"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "{{matrix.julia}}"
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
- ext
command: julia --code-coverage=user --color=yes --project .buildkite/scripts/downstream.jl "{{matrix.repo}}" "CUDA"
env:
RETESTITEMS_NWORKERS: 2
BACKEND_GROUP: "AMDGPU"
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.branch != "main"
rocm: "*"
rocmgpu: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
timeout_in_minutes: 240
matrix:
setup:
repo:
- "Boltz"
- "Lux"
julia:
- "1"

- group: ":julia: AMD GPU"
# - group: ":julia: Metal GPU"
# steps:
# - label: ":julia: Julia {{matrix.julia}} + Metal GPU"
# soft_fail: true
# plugins:
# - JuliaCI/julia#v1:
# version: "{{matrix.julia}}"
# - JuliaCI/julia-test#v1:
# test_args: "--quickfail"
# - JuliaCI/julia-coverage#v1:
# codecov: true
# dirs:
# - src
# - ext
# agents:
# queue: "juliaecosystem"
# os: "macos"
# arch: "aarch64"
# env:
# BACKEND_GROUP: "Metal"
# if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
# timeout_in_minutes: 240
# matrix:
# setup:
# julia:
# - "1"

# - group: ":julia: oneAPI GPU"
# steps:
# - label: ":julia: Julia {{matrix.julia}} + oneAPI GPU"
# soft_fail: true
# plugins:
# - JuliaCI/julia#v1:
# version: "{{matrix.julia}}"
# - JuliaCI/julia-test#v1:
# test_args: "--quickfail"
# - JuliaCI/julia-coverage#v1:
# codecov: true
# dirs:
# - src
# - ext
# agents:
# queue: "juliagpu"
# intel: "*"
# env:
# BACKEND_GROUP: "oneAPI"
# if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
# timeout_in_minutes: 240
# matrix:
# setup:
# julia:
# - "1"

- group: ":telescope: Downstream CUDA"
steps:
- label: ":julia: Julia: {{matrix.julia}} + AMD GPU"
- label: ":julia: {{matrix.repo}} (Julia 1 + CUDA GPU)"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
version: "1"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
- ext
env:
RETESTITEMS_NWORKERS: 2
BACKEND_GROUP: "AMDGPU"
command: julia --code-coverage=user --color=yes --project .buildkite/scripts/downstream.jl "{{matrix.repo}}" "CUDA"
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
cuda: "*"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.branch != "main"
timeout_in_minutes: 240
matrix:
setup:
julia:
- "1"
repo:
- "Boltz"
- "Lux"

- group: ":telescope: Downstream AMD GPU"
steps:
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.2.4"
version = "1.3.0"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
5 changes: 0 additions & 5 deletions benchmarks/setup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -236,11 +236,6 @@ end

function setup_batched_matmul_benchmarks!(suite::BenchmarkGroup, cpu_or_gpu::String,
backend::String, dev::MLDataDevices.AbstractDevice)
if dev isa MetalDevice || dev isa oneAPIDevice
@warn "Skipping batched_matmul benchmarks for $(dev)..."
return
end

for N in [2, 16, 128, 512], Bsize in [4, 32, 128, 512]
benchmark_name = "batchedmm($N, Bsize=$Bsize)"

Expand Down
4 changes: 2 additions & 2 deletions src/impl/Impl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ using Random: Random, AbstractRNG, rand!
using Statistics: Statistics, mean, var

using LuxCore: LuxCore
using MLDataDevices: get_device_type, AMDGPUDevice, CUDADevice, AbstractGPUDevice,
AbstractDevice
using MLDataDevices: get_device_type, CPUDevice, AMDGPUDevice, CUDADevice,
AbstractGPUDevice, AbstractDevice
using NNlib: NNlib, ConvDims

using ..LuxLib: Optional, Numeric, ∂∅, internal_operation_mode, AbstractInternalArrayOpMode,
Expand Down
61 changes: 49 additions & 12 deletions src/impl/batched_mul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,26 @@ function batched_matmul(::GenericBroadcastOp, x::AbstractArray{xT, 3},
return NNlib.batched_mul(x, y)
end

function batched_matmul(::GPUBroadcastOp{<:AbstractGPUDevice},
# Generate one `batched_matmul` method per listed device type. Each method forwards
# straight to `NNlib.batched_mul`, so these devices never go through the return-type
# probe used by the generic `GPUBroadcastOp{<:AbstractGPUDevice}` method.
for DeviceT in (AMDGPUDevice, CUDADevice)
    @eval function batched_matmul(::GPUBroadcastOp{$(DeviceT)},
            x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
        # GPU versions are well optimized
        return NNlib.batched_mul(x, y)
    end
end

# Batched matrix multiply for GPU devices without a dedicated method.
#
# Type inference is asked for the return type of `NNlib.batched_mul` on the given
# array types. When it is concrete, the NNlib implementation is used; otherwise the
# call is routed to `fallback_batched_matmul`.
# NOTE(review): this presumably treats a non-concrete inferred type as "no usable
# NNlib kernel for this backend" — confirm. `Core.Compiler._return_type` is an
# internal API and may change between Julia releases.
function batched_matmul(opmode::GPUBroadcastOp{<:AbstractGPUDevice},
        x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
    # The previous unconditional `return NNlib.batched_mul(x, y)` that preceded this
    # probe made everything below unreachable; it has been removed.
    if isconcretetype(Core.Compiler._return_type(
        NNlib.batched_mul, Tuple{typeof(x), typeof(y)}))
        return NNlib.batched_mul(x, y)  # GPU versions are well optimized
    end
    return fallback_batched_matmul(opmode, x, y)
end

# Complex-valued batched matrix multiply on AMDGPU.
#
# Always delegates to `fallback_batched_matmul`, which performs the dimension check,
# emits the one-time "fallback" warning and multiplies batch by batch. The superseded
# header and the old inline `stack`-based body that were left next to this definition
# have been dropped: they duplicated that logic and made the delegating call
# unreachable.
function batched_matmul(
        opmode::GPUBroadcastOp{AMDGPUDevice}, x::AbstractArray{<:Complex, 3},
        y::AbstractArray{<:Complex, 3})
    return fallback_batched_matmul(opmode, x, y)
end

function batched_matmul(opmode::LoopedArrayOp, x::AbstractArray{xT, 3},
Expand Down Expand Up @@ -73,6 +77,39 @@ function batched_matmul_loopvec_impl!(
end
end

# Allocating wrapper around `fallback_batched_matmul!`.
#
# The output is created with `similar(x, ...)`, uses the promoted element type of `x`
# and `y`, and has size `(size(x, 1), size(y, 2), B)` where `B` is the larger of the
# two batch sizes. `dev` is only forwarded to the in-place routine.
function fallback_batched_matmul(
        dev, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
    out_eltype = promote_type(eltype(x), eltype(y))
    nbatches = max(size(x, 3), size(y, 3))
    z = similar(x, out_eltype, (size(x, 1), size(y, 2), nbatches))
    fallback_batched_matmul!(z, dev, x, y)
    return z
end

# In-place fallback batched matrix multiply: `z[:, :, L] = x[:, :, Lx] * y[:, :, Ly]`.
#
# Arguments
#   z   : 3-d output array, written slice by slice with `mul!`.
#   dev : only interpolated into the warning message.
#   x,y : 3-d inputs. Their batch sizes (dim 3) must be equal, or one of them must be
#         1, in which case that single matrix is reused for every batch.
#
# Throws `DimensionMismatch` when the batch sizes are incompatible or when
# `size(x, 2) != size(y, 1)`. Emits a warning once (`maxlog=1`). Returns `nothing`.
function fallback_batched_matmul!(
        z::AbstractArray{zT, 3}, dev, x::AbstractArray{xT, 3},
        y::AbstractArray{yT, 3}) where {zT, xT, yT}
    @warn "Using fallback Batched Matrix Multiply routine for $(dev) with A: size = \
           $(size(x)) eltype = $(xT) and B: size = $(size(y)) eltype = $(yT). This may be \
           slow." maxlog=1
    if (size(x, 3) != size(y, 3) && size(x, 3) != 1 && size(y, 3) != 1) ||
       (size(x, 2) != size(y, 1))
        throw(DimensionMismatch(lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul."))
    end
    # Iterate over the batch axis of the operand that is actually batched.
    # `indices((x, y), 3)` requires both arrays to share the same axis along dim 3,
    # so it cannot be used in the two broadcasting branches, where one batch size
    # is 1 and the other is not.
    if size(x, 3) == size(y, 3)
        Threads.@threads for L in axes(x, 3)
            mul!(batchview(z, L), batchview(x, L), batchview(y, L))
        end
    elseif size(x, 3) == 1
        Threads.@threads for L in axes(y, 3)
            mul!(batchview(z, L), batchview(x, 1), batchview(y, L))
        end
    else # has to be size(y, 3) == 1
        Threads.@threads for L in axes(x, 3)
            mul!(batchview(z, L), batchview(x, L), batchview(y, 1))
        end
    end
    return nothing
end

function CRC.rrule(::typeof(batched_matmul), x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}) where {xT, yT}
∇batched_matmul = @closure Δ_ -> begin
Expand Down
46 changes: 44 additions & 2 deletions src/impl/conv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractDevice},
NNlib.conv!(y, x, weight, cdims)
return
end
function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractGPUDevice},
function conv!(y::AbstractArray{yT, N}, ::Type{<:Union{CUDADevice, AMDGPUDevice}},
x::AbstractArray{xT, N}, weight::AbstractArray{wT, N},
cdims::ConvDims) where {yT, xT, wT, N}
if xT !== wT !== yT
Expand All @@ -43,11 +43,53 @@ function conv!(y::AbstractArray{yT, N}, ::Type{<:AbstractGPUDevice},
contiguous(ofeltype_array(yT, weight)), cdims)
return
end
# In-place convolution for GPU devices that have no dedicated `conv!` method.
#
# `x` and `weight` are passed through `ofeltype_array(yT, ·)` and `contiguous(·)`
# (presumably: convert to the element type of `y` and make the memory layout dense —
# confirm against those helpers) and then handed to `fallback_slow_conv!`.
# A warning is issued through `safe_warning` whenever the three element types are not
# all identical. Returns `nothing`.
function conv!(y::AbstractArray{yT, N}, dev::Type{<:AbstractGPUDevice},
        x::AbstractArray{xT, N}, weight::AbstractArray{wT, N},
        cdims::ConvDims) where {yT, xT, wT, N}
    # `xT !== wT !== yT` is a chained comparison, i.e. `xT !== wT && wT !== yT`, which
    # stays silent for e.g. x::Float32, weight::Float64, y::Float64. Warn whenever any
    # of the element types differ.
    if !(xT === wT === yT)
        safe_warning(
            "Mixed Precision Inputs received for GPU convolution [weight: $(wT)] and \
             [x: $(xT)]. Promoting to $(yT).", 1)
    end
    x_cont = contiguous(ofeltype_array(yT, x))
    weight_cont = contiguous(ofeltype_array(yT, weight))
    fallback_slow_conv!(y, dev, x_cont, weight_cont, cdims)
    return
end

function conv(x′, weight′, cdims::ConvDims)
# Slow, generic convolution that writes its result into `y`.
#
# The input is expanded with `NNlib.unfold`, the weight is reshaped to a
# `(:, size(weight, N), 1)` array, and the two are multiplied with `batched_matmul`;
# the product is reshaped to `size(y)` and copied into `y`.
#
# Emits a warning once (`maxlog=1`). Throws `ArgumentError` when `cdims` describes a
# grouped convolution, which this routine does not implement. Returns `nothing`.
function fallback_slow_conv!(y::AbstractArray{yT, N}, dev::Type{<:AbstractDevice},
        x::AbstractArray{xT, N}, weight::AbstractArray{wT, N},
        cdims::ConvDims) where {yT, xT, wT, N}
    @warn "Falling back to slow convolution routine for $(dev) with x: size = \
           $(size(x)) eltype = $(xT) and weight: size = $(size(weight)) \
           eltype = $(wT)." maxlog=1
    # TODO: We should be able to reuse `y` for some part here for some efficiency
    # FIXME: grouped convolutions are not supported. This is a user-facing limitation,
    # so raise a real exception instead of relying on `@assert`.
    if NNlib.groupcount(cdims) != 1
        throw(ArgumentError("Only groups=1 is supported for now."))
    end
    unfolded = NNlib.unfold(x, cdims)
    weight_compact = reshape(weight, :, size(weight, N), 1)
    res = batched_matmul(unfolded, weight_compact)
    copyto!(y, reshape(res, size(y)))
    return
end

# Entry point: look up the device type shared by `x` and `weight`, then dispatch to
# the device-specific `conv` method.
function conv(x, weight, cdims::ConvDims)
    dev = get_device_type((x, weight))
    return conv(dev, x, weight, cdims)
end

# Convolution for CPU, CUDA and AMDGPU: preprocess the operands with
# `get_conv_input_weight` and call `NNlib.conv` on the result.
function conv(
        ::Type{<:Union{CPUDevice, CUDADevice, AMDGPUDevice}}, x′, weight′, cdims::ConvDims)
    x_in, w_in = get_conv_input_weight(x′, weight′)
    return NNlib.conv(x_in, w_in, cdims)
end
# Convolution for every other device type: preprocess the operands with the
# device-aware `get_conv_input_weight` and use the slow fallback routine.
function conv(dev::Type{<:AbstractDevice}, x′, weight′, cdims::ConvDims)
    x_in, w_in = get_conv_input_weight(dev, x′, weight′)
    return fallback_slow_conv(dev, x_in, w_in, cdims)
end

# Allocating wrapper around `fallback_slow_conv!`.
#
# The output is created with `similar(x, ...)`, uses the promoted element type of `x`
# and `weight`, and has size
# `(NNlib.output_size(cdims)..., NNlib.channels_out(cdims), size(x, ndims(x)))`.
function fallback_slow_conv(dev, x, weight, cdims::ConvDims)
    out_eltype = promote_type(eltype(x), eltype(weight))
    out_size = (NNlib.output_size(cdims)..., NNlib.channels_out(cdims),
        size(x, ndims(x)))
    y = similar(x, out_eltype, out_size)
    fallback_slow_conv!(y, dev, x, weight, cdims)
    return y
end

function ∇conv_data(x′, weight′, cdims::ConvDims)
x, weight = get_conv_input_weight(x′, weight′)
Expand Down
5 changes: 4 additions & 1 deletion src/impl/dropout.jl
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,17 @@ function generate_dropout_mask_loop!(y::AbstractArray, p, invp)
end

# Overwrite `y` in place with a dropout mask: each entry becomes `invp` if it is
# greater than `p`, and zero otherwise. `p` and `invp` are converted to the element
# type of `y` first so the loop works in a single precision.
function generate_dropout_mask_simd_loop!(y::AbstractArray{T}, p, invp) where {T}
    threshold = T(p)
    scale = T(invp)
    @simd ivdep for I in indices(y)
        y[I] = (y[I] > threshold) * scale
    end
    return nothing
end

@enzyme_alternative generate_dropout_mask_loop! generate_dropout_mask_simd_loop!

function generate_dropout_mask!(y::AbstractArray, ::AbstractInternalArrayOpMode, p, invp)
# Broadcast version of the dropout-mask generation for any internal array op mode.
# `p` and `invp` are converted to the element type of `y`, then `y` is overwritten in
# place with `(y > p) * invp` elementwise.
function generate_dropout_mask!(
        y::AbstractArray{T}, ::AbstractInternalArrayOpMode, p, invp) where {T}
    threshold = T(p)
    scale = T(invp)
    y .= (y .> threshold) .* scale
    return nothing
end
Expand Down
6 changes: 6 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,9 @@ Statistics = "1.10"
Test = "1.10"
Tracker = "0.2.34"
Zygote = "0.6.70"

[extras]
CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"

[preferences.CUDA_Driver_jll]
compat = false
4 changes: 3 additions & 1 deletion test/common_ops/activation_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
apply_act_fast(f::F, x) where {F} = sum(abs2, fast_activation!!(f, copy(x)))
apply_act_fast2(f::F, x) where {F} = sum(abs2, fast_activation(f, x))

@testset "$mode" for (mode, aType, ongpu) in MODES
@testset "$mode" for (mode, aType, ongpu, fp64) in MODES
@testset "$f: $T" for f in [identity, relu, sigmoid, sigmoid_fast, softplus,
logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
T in [Float16, Float32, Float64]

!fp64 && T == Float64 && continue

x = rand(rng, T, 4, 3) |> aType

y1 = apply_act(f, x)
Expand Down
4 changes: 3 additions & 1 deletion test/common_ops/bias_act_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
end
(f::__Fix1)(x, b) = f.f(f.act, x, b)

@testset "$mode" for (mode, aType, ongpu) in MODES
@testset "$mode" for (mode, aType, ongpu, fp64) in MODES
@testset "$act, $T, $sz" for act in [
identity, relu, sigmoid, sigmoid_fast, softplus,
logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
T in [Float16, Float32, Float64],
sz in [(2, 2, 3, 4), (4, 5)]

!fp64 && T == Float64 && continue

x = rand(rng, T, sz) |> aType
b = rand(rng, T, sz[end - 1]) |> aType

Expand Down
Loading
Loading