
Generalize memopt and create and update GPU unit tests #81

Merged
100 commits merged on Jun 12, 2023
Commits (100)
7a0c376
fix AMDGPU allocator tests
omlins Dec 12, 2022
9851fd2
move macros into code and fix range start
omlins Dec 19, 2022
e74adc1
add error if too many positional args in loopopt
omlins Dec 19, 2022
991c70a
add error if maxsize is not evenly divisible by nthreads
omlins Dec 19, 2022
a654ca6
remove possibility for smaller block size at end
omlins Dec 19, 2022
4feb2a7
add error if maxsize is not evenly divisible by nthreads
omlins Dec 19, 2022
0286434
introduce stencilsize
omlins Dec 20, 2022
5dd5798
introduce stencilsize
omlins Dec 20, 2022
ed60173
move from stencilsize to stencilranges
omlins Dec 20, 2022
333f720
move from stencilsize to stencilranges
omlins Dec 20, 2022
20e7456
fix y-dimension register usage
omlins Dec 20, 2022
4051d6e
generalize substitute and in expression check
omlins Dec 22, 2022
3b726c1
evaluate stencil offsets at parse time
omlins Dec 22, 2022
aebe72d
evaluate stencil offsets at parse time
omlins Dec 22, 2022
808f4f0
extract offset
omlins Dec 22, 2022
deed435
extract offset
omlins Dec 22, 2022
42f5452
generalize register queuing
omlins Jan 4, 2023
c145fb3
generalize register queuing
omlins Jan 5, 2023
9ddd3e0
handle incoherent stencilranges
omlins Jan 8, 2023
527ab36
generalize register queuing
omlins Jan 8, 2023
842acfb
generalize register queuing
omlins Jan 19, 2023
70b652c
improve check package error message
omlins Jan 20, 2023
3d5a6d7
generalize register queuing - clean up
omlins Jan 20, 2023
478af97
generalize register queuing - clean up
omlins Jan 20, 2023
200b490
generalize register queuing - clean up
omlins Jan 20, 2023
2900eac
add unit tests for loopopt
omlins Jan 20, 2023
6c908b2
update kernel language unit tests
omlins Jan 20, 2023
73dba08
update kernel language unit tests
omlins Jan 20, 2023
b9daa83
remove temporarily unused arguments package and number type
omlins Jan 23, 2023
80eee91
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
29cfda4
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
707d99c
accept offset argument in shared memory macro
omlins Feb 20, 2023
981afd4
add get and set name
omlins Feb 20, 2023
5377e71
add global loop optimization keyword argument
omlins Feb 20, 2023
56be3b0
add shared memory offset and function argument types
omlins Feb 20, 2023
32ff86a
create meta data storage
omlins Feb 20, 2023
109e49a
create meta data storage
omlins Feb 20, 2023
ac07238
add tests for two and three arrays
omlins Feb 20, 2023
cab850e
add shared memory offset handling for CPU
omlins Feb 20, 2023
e1cb5de
add shared memory offset handling for CPU
omlins Feb 20, 2023
9f55654
fix documentation
omlins Feb 21, 2023
d16175d
fix shared memory offset and do minor improvements
omlins Feb 21, 2023
ea74c35
add unit test for shared memory offset on cpu
omlins Feb 21, 2023
90b7502
add unit tests for optimization keyword arguments
omlins Feb 21, 2023
929dc3c
add shared memory loop entry and exit
omlins Mar 14, 2023
6d3ce51
add halo read optimization
omlins Mar 20, 2023
c8da94e
add halo read optimization
omlins Mar 20, 2023
5bcb0a4
add halo read optimization
omlins Mar 20, 2023
d3dc138
clean up
omlins Mar 20, 2023
76959a0
share indexing values between areas
omlins Mar 21, 2023
1516fc1
fix shared memory cpu macro
omlins Mar 24, 2023
ebc884d
enable macro name simplification
omlins Mar 24, 2023
65db8eb
fix loop optimization initialization
omlins Mar 24, 2023
040e68a
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
0766ea0
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
b853e3e
move stencil access tests to shared
omlins Mar 24, 2023
9270507
fix kernel language unit tests
omlins Mar 24, 2023
fc592f2
add tests for loopopt in initialization
omlins Mar 24, 2023
284fcf1
add tests for on-the-fly statements in parallel kernels
omlins Mar 24, 2023
db258d3
add register counting
omlins Mar 30, 2023
646fb20
add explicit shared memory and launch handling
omlins Mar 30, 2023
8793ec0
fix keyword argument handling for backends
omlins Apr 4, 2023
125d291
add unit tests for nx, ny, nz != x .* threads
omlins Apr 4, 2023
e472374
improve indexing casting
omlins Apr 12, 2023
d39d379
add support for shared memory without halo
omlins Apr 12, 2023
da1fb9b
fix kernel indexing
omlins Apr 13, 2023
80e47d8
fix kernel indexing
omlins Apr 13, 2023
6ece0f2
fix kernel indexing
omlins Apr 13, 2023
50094b5
fix kernel indexing
omlins Apr 13, 2023
a24a9ed
fix kernel indexing
omlins Apr 13, 2023
abd417a
fix kernel indexing
omlins Apr 13, 2023
4177324
improve optimization variable and ranges handling
omlins Apr 13, 2023
68e3939
change boolean array type
omlins Apr 27, 2023
179840e
introduce non fullrange optranges defaults
omlins May 31, 2023
cd22a1c
add optimization defaults
omlins May 31, 2023
7ac7ee8
change loopopt to memopt
omlins May 31, 2023
df8d227
add documentation about number type omission
omlins May 31, 2023
a05fecd
fix @fill for AMDGPU
omlins Jun 7, 2023
3d5a5e3
add shared memory support for AMDGPU
omlins Jun 7, 2023
2feb70b
introduce loopdim
omlins Jun 7, 2023
44007ec
introduce loopdim
omlins Jun 7, 2023
c655372
introduce loopdim
omlins Jun 7, 2023
4b5bb32
fixing boolean tests to be compatible with CUDA boolean array impleme…
omlins Jun 7, 2023
4a75dc5
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 7, 2023
e37c7bf
adjust thread computation for memopt
omlins Jun 7, 2023
85f6909
introduce loopdim
omlins Jun 7, 2023
c4504b7
complete AMDGPU unit tests
omlins Jun 7, 2023
e53d462
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 8, 2023
a8e0b1a
add compatibility with CUDA
omlins Jun 8, 2023
e43cdd8
fix incremental compilation unit tests
omlins Jun 8, 2023
69af346
fix incremental compilation unit tests
omlins Jun 8, 2023
9bcf5ac
fix incremental compilation unit tests
omlins Jun 8, 2023
e43b628
fix shared memory integer promotion issue
omlins Jun 9, 2023
38e1f1e
remove kernel print tests not available in AMDGPU
omlins Jun 9, 2023
e99322e
fix formatting
omlins Jun 9, 2023
42e89a2
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
ca10e62
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
b851c9c
restore CI with Julia 1.9 and nightly
omlins Jun 9, 2023
0469330
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
d88be9b
remove the nightly CI failing due to AMDGPU not being compatible with LLVM
omlins Jun 12, 2023
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -13,8 +13,8 @@ jobs:
matrix:
version:
- '1.7' # Minimum required Julia version (due to CellArrays' AMDGPU dependency).
#- '1' # Latest stable 1.x release of Julia
#- 'nightly'
- '1' # Latest stable 1.x release of Julia
# - 'nightly'
os:
- ubuntu-latest
- macOS-latest
2 changes: 1 addition & 1 deletion Project.toml
@@ -14,7 +14,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
[compat]
AMDGPU = "0.4"
CellArrays = "0.1"
CUDA = "3.12"
CUDA = "3.12, 4"
MacroTools = "0.5"
StaticArrays = "1"
julia = "1.7"
6 changes: 3 additions & 3 deletions src/ParallelKernel/allocators.jl
@@ -416,8 +416,8 @@ falses_cpu(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}}
zeros_cuda(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype(T); CUDA.zeros(T, args...)) # (blocklength is ignored if neither celldims nor celltype is set)
ones_cuda(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype(T); CUDA.ones(T, args...)) # ...
rand_cuda(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = CuArray(rand_cpu(T, blocklength, args...)) # ...
falses_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.falses(args...) # ...
trues_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.trues(args...) # ...
falses_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.zeros(Bool, args...) # ...
trues_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.ones(Bool, args...) # ...
fill_cuda(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = CuArray(fill_cpu(T, blocklength, args...)) # ...

zeros_cuda(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype(T); fill_cuda(T, blocklength, 0, args...))
@@ -470,7 +470,7 @@ end

function fill_amdgpu(::Type{T}, ::Val{B}, x, args...) where {T <: Union{SArray,FieldArray}, B}
if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end
check_datatype(T, Bool)
check_datatype(T, Bool, Enum)
if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T)))
elseif (length(x) == length(T)) cell = convert(T, x)
else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.")
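For context, a minimal usage sketch of the allocation macros backed by the definitions changed above. The initialization call, array names, and sizes are illustrative and not taken from this PR; the lowering noted in the comments follows the new CUDA definitions in this diff:

    using ParallelStencil
    @init_parallel_stencil(CUDA, Float64, 3)   # Threads and AMDGPU are initialized analogously

    nx, ny, nz = 64, 64, 64
    mask  = @falses(nx, ny, nz)        # with CUDA, now lowers to CUDA.zeros(Bool, nx, ny, nz)
    flags = @trues(nx, ny, nz)         # with CUDA, now lowers to CUDA.ones(Bool, nx, ny, nz)
    T     = @fill(273.15, nx, ny, nz)  # scalar fill; 'celldims' and 'eltype' keywords are also supported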
2 changes: 1 addition & 1 deletion src/ParallelKernel/hide_communication.jl
@@ -117,7 +117,7 @@ function hide_communication_gpu(ranges_outer::Union{Symbol,Expr}, ranges_inner::
bc_and_commcalls = process_bc_and_commcalls(bc_and_commcalls)
quote
for i in 1:length($ranges_outer)
@parallel_async $ranges_outer[i] stream=ParallelStencil.ParallelKernel.@get_priority_stream(i) $(kwargs...) $compkernelcall #NOTE: it cannot directly go to ParallelStencil.ParallelKernel.@parallel_async as else it cannot honour ParallelStencil args as loopopt (fixing it to ParallelStencil is also not possible as it assumes, else the ParalellKernel hide_communication unit tests fail).
@parallel_async $ranges_outer[i] stream=ParallelStencil.ParallelKernel.@get_priority_stream(i) $(kwargs...) $compkernelcall #NOTE: it cannot directly go to ParallelStencil.ParallelKernel.@parallel_async as else it cannot honour ParallelStencil args as memopt (fixing it to ParallelStencil is also not possible as it assumes, else the ParalellKernel hide_communication unit tests fail).
end
for i in 1:length($ranges_inner)
@parallel_async $ranges_inner[i] stream=ParallelStencil.ParallelKernel.@get_stream(i) $(kwargs...) $compkernelcall #NOTE: ...
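The loop above launches one asynchronous kernel per outer (boundary) range on a high-priority stream before covering the inner ranges, which is what allows the halo exchange to proceed while the inner points are still being computed; the change renames the forwarded keyword from loopopt to memopt. A hedged sketch of how this is typically driven from user code, assuming a stencil kernel diffusion3D_step! defined elsewhere and ImplicitGlobalGrid's update_halo! for the communication part:

    @hide_communication (16, 2, 2) begin
        @parallel memopt=true diffusion3D_step!(T2, T, Ci, lam, dt, dx, dy, dz)
        update_halo!(T2)
    end

The tuple argument gives the width of the boundary region that is computed separately, so that the communication triggered by update_halo! can be hidden behind the computation of the inner points.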
20 changes: 13 additions & 7 deletions src/ParallelKernel/kernel_language.jl
@@ -53,14 +53,14 @@ macro sync_threads(args...) check_initialized(); checknoargs(args...); esc(sync_


##
# NOTE: the optional offset parameter of cuDynamicSharedMem is currently not exposed.
const SHAREDMEM_DOC = """
@sharedMem(T, dims)
@sharedMem(T, dims, offset::Integer=0)

Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
When multiple shared memory arrays are created within a kernel, then all arrays except for the first one typically need to define the `offset` to the base shared memory pointer in bytes (note that the CPU and AMDGPU implementation do not require the `offset` and will simply ignore it when present).

!!! note "Note"
The amount of shared memory needs to specified when launching the kernel (keyword argument `shmem`).
The amount of shared memory needs to be specified when launching the kernel (keyword argument `shmem`).
"""
@doc SHAREDMEM_DOC
macro sharedMem(args...) check_initialized(); checkargs_sharedMem(args...); esc(sharedMem(args...)); end
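To make the extended signature concrete: a minimal, hedged sketch of allocating two shared-memory arrays in one kernel under the new offset semantics documented above (the kernel name, tile size, and launch line are illustrative and not taken from this PR):

    @parallel_indices (ix, iy) function step!(A, B)
        A_l = @sharedMem(Float64, (34, 34))                          # first array: base of shared memory
        B_l = @sharedMem(Float64, (34, 34), 34*34*sizeof(Float64))   # second array: byte offset past A_l (ignored on CPU and AMDGPU)
        # ... cooperative loads, @sync_threads(), stencil computation ...
        return
    end

    # The total shared memory for both arrays is passed at launch via the shmem keyword:
    @parallel shmem=2*(34*34)*sizeof(Float64) step!(A, B)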
@@ -93,7 +93,7 @@ function checknoargs(args...)
end

function checkargs_sharedMem(args...)
if (length(args) != 2) @ArgumentError("wrong number of arguments.") end
if !(2 <= length(args) <= 3) @ArgumentError("wrong number of arguments.") end
end


@@ -147,12 +147,16 @@ end

function sharedMem(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.@cuDynamicSharedMem($(args...)))
elseif (package == PKG_AMDGPU) @KeywordArgumentError("not yet supported for AMDGPU.")
elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($(args...)))
elseif (package == PKG_THREADS) return :(ParallelStencil.ParallelKernel.@sharedMem_cpu($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
end

macro sharedMem_amdgpu(T, dims) esc(:(AMDGPU.@ROCDynamicLocalArray($T, $dims, false))) end

macro sharedMem_amdgpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($T, $dims))) end


## FUNCTIONS FOR PRINTING

@@ -167,7 +171,7 @@ end
function pk_println(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
elseif (package == PKG_THREADS) return :(Base.@println($(args...)))
elseif (package == PKG_THREADS) return :(Base.println($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
end
@@ -189,3 +193,5 @@ macro threadIdx_cpu() esc(:(ParallelStencil.ParallelKernel.Dim3(1, 1, 1))) end
macro sync_threads_cpu() esc(:(begin end)) end

macro sharedMem_cpu(T, dims) :(MArray{Tuple{$(esc(dims))...}, $(esc(T)), length($(esc(dims))), prod($(esc(dims)))}(undef)); end # Note: A macro is used instead of a function as a creating a type stable function is not really possible (dims can take any values and they become part of the MArray type...). MArray is not escaped in order not to have to import StaticArrays in the user code.

macro sharedMem_cpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_cpu($T, $dims))) end