Skip to content

Commit

Permalink
FieldVectors, add benchmarks, improve perf
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Nov 5, 2024
1 parent b9714ab commit e0dec11
Show file tree
Hide file tree
Showing 7 changed files with 345 additions and 44 deletions.
2 changes: 2 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,7 @@ steps:
key: unit_field
command:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_fieldvectors.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/inference_repro.jl"
Expand All @@ -495,6 +496,7 @@ steps:
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_fieldvectors.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/inference_repro.jl"
Expand Down
90 changes: 90 additions & 0 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,93 @@ function Base.copyto!(
@inbounds bc0 = bc[]
fill!(dest, bc0)
end

# For field-vector operations
# CUDA method: copies `bc` into `array` by launching
# `copyto_per_field_kernel!`, which assigns `array[i] = bc′[i]` for each
# linear index `i`. Returns `array`.
function DataLayouts.copyto_per_field!(
    array::AbstractArray,
    bc::Union{AbstractArray, Base.Broadcast.Broadcasted},
    ::ToCUDA,
)
    bc′ = DataLayouts.to_non_extruded_broadcasted(bc)
    # All field variables are treated separately, so
    # we can parallelize across the field index, and
    # leverage linear indexing:
    nitems = length(array)
    # An empty array needs no kernel launch; returning early also keeps a
    # zero thread count from reaching `linear_partition` / `auto_launch!`.
    nitems == 0 && return array
    # `nitems` doubles as the upper bound checked inside the kernel.
    args = (array, bc′, nitems)
    threads = threads_via_occupancy(copyto_per_field_kernel!, args)
    n_max_threads = min(threads, nitems)
    p = linear_partition(nitems, n_max_threads)
    auto_launch!(
        copyto_per_field_kernel!,
        args;
        threads_s = p.threads,
        blocks_s = p.blocks,
    )
    return array
end
# CUDA kernel: each thread derives its global 1-based linear index `i` from
# its thread and block indices and, when `i` lies within `1:N`, assigns
# `array[i] = bc[i]`. Always returns `nothing`.
function copyto_per_field_kernel!(array, bc, N)
    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    # Threads whose index falls outside `1:N` must not touch `array`.
    if 1 <= i <= N
        @inbounds array[i] = bc[i]
    end
    return nothing
end

# Shared launch logic for the two scalar CUDA methods below: launches
# `copyto_per_field_kernel_0D!`, which assigns `bc′[]` to every linear index
# of `array`. Returns `array`.
function _copyto_per_field_scalar_cuda!(array::AbstractArray, bc)
    bc′ = DataLayouts.to_non_extruded_broadcasted(bc)
    # All field variables are treated separately, so
    # we can parallelize across the field index, and
    # leverage linear indexing:
    nitems = length(array)
    # An empty array needs no kernel launch; returning early also keeps a
    # zero thread count from reaching `linear_partition` / `auto_launch!`.
    nitems == 0 && return array
    # `nitems` doubles as the upper bound checked inside the kernel.
    args = (array, bc′, nitems)
    threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
    n_max_threads = min(threads, nitems)
    p = linear_partition(nitems, n_max_threads)
    auto_launch!(
        copyto_per_field_kernel_0D!,
        args;
        threads_s = p.threads,
        blocks_s = p.blocks,
    )
    return array
end

# Need 2 methods here to avoid unbound arguments:
function DataLayouts.copyto_per_field_scalar!(
    array::AbstractArray,
    bc::Base.Broadcast.Broadcasted{Style},
    ::ToCUDA,
) where {
    Style <:
    Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}},
}
    return _copyto_per_field_scalar_cuda!(array, bc)
end
function DataLayouts.copyto_per_field_scalar!(
    array::AbstractArray,
    bc::Real,
    ::ToCUDA,
)
    return _copyto_per_field_scalar_cuda!(array, bc)
end
# CUDA kernel for scalar sources: each thread derives its global 1-based
# linear index `i` and, when `i` lies within `1:N`, assigns `array[i] = bc[]`.
# Always returns `nothing`.
function copyto_per_field_kernel_0D!(array, bc, N)
    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    # Threads whose index falls outside `1:N` must not touch `array`.
    if 1 <= i <= N
        @inbounds array[i] = bc[]
    end
    return nothing
end
48 changes: 48 additions & 0 deletions src/DataLayouts/copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,51 @@ function Base.copyto!(
end
return dest
end

# CPU method: writes `src[idx]` into `array[idx]` for every linear index of
# `array`, where `src` is `bc` after `to_non_extruded_broadcasted`.
# Returns `array`.
function copyto_per_field!(
    array::AbstractArray,
    bc::Union{AbstractArray, Base.Broadcast.Broadcasted},
    ::ToCPU,
)
    # Each field variable is handled on its own, so one flat pass over the
    # linear indices is all that is needed.
    src = to_non_extruded_broadcasted(bc)
    n = length(array)
    @inbounds @simd for idx in 1:n
        array[idx] = src[idx]
    end
    return array
end

# Need 2 methods here to avoid unbound arguments:
# CPU method for a plain `Real` source: stores the scalar value in every
# linear index of `array` and returns `array`.
function copyto_per_field_scalar!(array::AbstractArray, bc::Real, ::ToCPU)
    # Each field variable is handled on its own, so one flat pass over the
    # linear indices is all that is needed.
    scalar = to_non_extruded_broadcasted(bc)
    n = length(array)
    @inbounds @simd for idx in 1:n
        array[idx] = scalar[]
    end
    return array
end

# CPU method for a 0-dimensional or tuple-style `Broadcasted` source:
# evaluates `scalar[]` for every linear index of `array`, stores the result
# there, and returns `array`.
function copyto_per_field_scalar!(
    array::AbstractArray,
    bc::Base.Broadcast.Broadcasted{Style},
    ::ToCPU,
) where {
    Style <:
    Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}},
}
    # Each field variable is handled on its own, so one flat pass over the
    # linear indices is all that is needed.
    scalar = to_non_extruded_broadcasted(bc)
    n = length(array)
    @inbounds @simd for idx in 1:n
        array[idx] = scalar[]
    end
    return array
end
6 changes: 5 additions & 1 deletion src/Fields/Fields.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ import ..DataLayouts:
FusedMultiBroadcast,
@fused_direct,
isascalar,
check_fused_broadcast_axes
check_fused_broadcast_axes,
ToCPU,
ToCUDA,
copyto_per_field!,
copyto_per_field_scalar!
import ..Domains
import ..Topologies
import ..Quadratures
Expand Down
104 changes: 61 additions & 43 deletions src/Fields/fieldvector.jl
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,6 @@ function Base.similar(
error("Cannot construct FieldVector")
end

# Copy one FieldVector into another of the same type, property by property,
# by copying between the `parent` objects of matching properties.
# Returns `dest`.
@inline function Base.copyto!(dest::FV, src::FV) where {FV <: FieldVector}
    for name in propertynames(dest)
        target = parent(getproperty(dest, name))
        source = parent(getproperty(src, name))
        copyto!(target, source)
    end
    return dest
end

"""
Spaces.create_dss_buffer(fv::FieldVector)
Expand Down Expand Up @@ -218,32 +209,6 @@ function Spaces.weighted_dss!(
Spaces.weighted_dss!(pairs...)
end


# Recursively call transform_bc_args() on broadcast arguments in a way that is statically reducible by the optimizer
# see Base.Broadcast.preprocess_args
# General case: transform the first argument, then recurse on the remaining
# ones; `inds...` is forwarded unchanged to every `transform_broadcasted` call.
@inline transform_bc_args(args::Tuple, inds...) = (
transform_broadcasted(args[1], inds...),
transform_bc_args(Base.tail(args), inds...)...,
)
# Single-argument case: ends the recursion with a one-element tuple.
@inline transform_bc_args(args::Tuple{Any}, inds...) =
(transform_broadcasted(args[1], inds...),)
# Empty case: there are no arguments to transform.
@inline transform_bc_args(args::Tuple{}, inds...) = ()

# Rebuild a FieldVector-style broadcast for the single property `symb`:
# every argument is transformed recursively and the result is wrapped in a
# new `Broadcasted` carrying the supplied `axes`.
@inline function transform_broadcasted(
    bc::Base.Broadcast.Broadcasted{FieldVectorStyle},
    symb,
    axes,
)
    new_args = transform_bc_args(bc.args, symb, axes)
    return Base.Broadcast.Broadcasted(bc.f, new_args, axes)
end
# A FieldVector argument is replaced by the `parent` of its `symb` entry.
@inline function transform_broadcasted(fv::FieldVector, symb, axes)
    entry = getfield(_values(fv), symb)
    return parent(entry)
end
# Any other argument passes through untouched.
@inline function transform_broadcasted(x, symb, axes)
    return x
end

@inline function first_fieldvector_in_bc(args::Tuple, rargs...)
x1 = first_fieldvector_in_bc(args[1], rargs...)
x1 isa FieldVector && return x1
Expand Down Expand Up @@ -330,26 +295,79 @@ function Base.Broadcast.instantiate(
return Base.Broadcast.Broadcasted{FieldVectorStyle}(bc.f, bc.args, axes)
end

# Recursively call transform_bc_args() on broadcast arguments in a way that is statically reducible by the optimizer
# see Base.Broadcast.preprocess_args
# General case: transform the first argument, then recurse on the remaining
# ones; `inds...` is forwarded unchanged to every `transform_broadcasted` call.
@inline transform_bc_args(args::Tuple, inds...) = (
    transform_broadcasted(args[1], inds...),
    transform_bc_args(Base.tail(args), inds...)...,
)
# Single-argument case: ends the recursion with a one-element tuple.
@inline transform_bc_args(args::Tuple{Any}, inds...) =
    (transform_broadcasted(args[1], inds...),)
# Empty case: there are no arguments to transform.
@inline transform_bc_args(args::Tuple{}, inds...) = ()

# Rebuild a FieldVector-style broadcast for the single property `symb`:
# every argument is transformed recursively and the result is wrapped in a
# new `Broadcasted` carrying the supplied `axes`.
@inline function transform_broadcasted(
    bc::Base.Broadcast.Broadcasted{FieldVectorStyle},
    symb,
    axes,
)
    new_args = transform_bc_args(bc.args, symb, axes)
    return Base.Broadcast.Broadcasted(bc.f, new_args, axes)
end
# A FieldVector argument is replaced by the `parent` of its `symb` entry.
@inline function transform_broadcasted(fv::FieldVector, symb, axes)
    entry = getfield(_values(fv), symb)
    return parent(entry)
end
# Any other argument passes through untouched.
@inline function transform_broadcasted(x, symb, axes)
    return x
end

# Copying a FieldVector, or a FieldVector-style broadcast, into a FieldVector
# is delegated to `copyto_per_field!`.
@inline function Base.copyto!(
    dest::FieldVector,
    bc::Union{FieldVector, Base.Broadcast.Broadcasted{FieldVectorStyle}},
)
    return copyto_per_field!(dest, bc)
end

# Copy `bc` into `dest` one property at a time. For each property name the
# broadcast is specialized to that property with `transform_broadcasted`
# and written into the `parent` of the destination entry. Returns `dest`.
@inline function copyto_per_field!(
    dest::FieldVector,
    bc::Union{FieldVector, Base.Broadcast.Broadcasted{FieldVectorStyle}},
)
    map(propertynames(dest)) do symb
        Base.@_inline_meta
        array = parent(getfield(_values(dest), symb))
        bct = transform_broadcasted(bc, symb, axes(array))
        if array isa FieldVector # recurse
            copyto_per_field!(array, bct)
        else
            # Each property is copied exactly once, through the method
            # selected by `device_dispatch`.
            copyto_per_field!(array, bct, DataLayouts.device_dispatch(array))
        end
        # The mapped results are unused; return `nothing` from the closure,
        # as the scalar variant does.
        nothing
    end
    return dest
end

# Tuple-style broadcasts into a FieldVector are delegated to
# `copyto_per_field_scalar!`.
@inline Base.copyto!(
    dest::FieldVector,
    bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.Style{Tuple}},
) = copyto_per_field_scalar!(dest, bc)

# 0-dimensional broadcasts into a FieldVector are delegated to
# `copyto_per_field_scalar!`.
@inline Base.copyto!(
    dest::FieldVector,
    bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}},
) = copyto_per_field_scalar!(dest, bc)

# Copying a `Real` into a FieldVector is delegated to
# `copyto_per_field_scalar!`.
@inline function Base.copyto!(dest::FieldVector, bc::Real)
    return copyto_per_field_scalar!(dest, bc)
end

# Fill `dest` from the scalar-like source `bc` one property at a time: the
# same `bc` is written into the `parent` of every destination entry.
# Returns `dest`.
@inline function copyto_per_field_scalar!(dest::FieldVector, bc)
    map(propertynames(dest)) do symb
        Base.@_inline_meta
        array = parent(getfield(_values(dest), symb))
        if array isa FieldVector # recurse
            copyto_per_field_scalar!(array, bc)
        else
            # Each property is filled exactly once, through the method
            # selected by `device_dispatch`.
            copyto_per_field_scalar!(
                array,
                bc,
                DataLayouts.device_dispatch(array),
            )
        end
        nothing
    end
    return dest
end
Expand Down
Loading

0 comments on commit e0dec11

Please sign in to comment.