Generalize memopt and create and update GPU unit tests #81

Merged on Jun 12, 2023 · 100 commits (the diff below shows changes from 40 commits)

Commits
7a0c376
fix AMDGPU allocator tests
omlins Dec 12, 2022
9851fd2
move macros into code and fix range start
omlins Dec 19, 2022
e74adc1
add error if too many positional args in loopopt
omlins Dec 19, 2022
991c70a
add error if maxsize is not divisible by nthreads without remainder
omlins Dec 19, 2022
a654ca6
remove possibility for smaller block size at end
omlins Dec 19, 2022
4feb2a7
add error if maxsize is not divisible by nthreads without remainder
omlins Dec 19, 2022
0286434
introduce stencilsize
omlins Dec 20, 2022
5dd5798
introduce stencilsize
omlins Dec 20, 2022
ed60173
move from stencilsize to stencilranges
omlins Dec 20, 2022
333f720
move from stencilsize to stencilranges
omlins Dec 20, 2022
20e7456
fix y-dimension register usage
omlins Dec 20, 2022
4051d6e
generalize substitute and in-expression check
omlins Dec 22, 2022
3b726c1
evaluate stencil offsets at parse time
omlins Dec 22, 2022
aebe72d
evaluate stencil offsets at parse time
omlins Dec 22, 2022
808f4f0
extract offset
omlins Dec 22, 2022
deed435
extract offset
omlins Dec 22, 2022
42f5452
generalize register queuing
omlins Jan 4, 2023
c145fb3
generalize register queuing
omlins Jan 5, 2023
9ddd3e0
handle incoherent stencilranges
omlins Jan 8, 2023
527ab36
generalize register queuing
omlins Jan 8, 2023
842acfb
generalize register queuing
omlins Jan 19, 2023
70b652c
improve check package error message
omlins Jan 20, 2023
3d5a6d7
generalize register queuing - clean up
omlins Jan 20, 2023
478af97
generalize register queuing - clean up
omlins Jan 20, 2023
200b490
generalize register queuing - clean up
omlins Jan 20, 2023
2900eac
add unit tests for loopopt
omlins Jan 20, 2023
6c908b2
update kernel language unit tests
omlins Jan 20, 2023
73dba08
update kernel language unit tests
omlins Jan 20, 2023
b9daa83
remove temporarily unused arguments package and number type
omlins Jan 23, 2023
80eee91
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
29cfda4
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
707d99c
accept offset argument in shared memory macro
omlins Feb 20, 2023
981afd4
add get and set name
omlins Feb 20, 2023
5377e71
add global loop optimization keyword argument
omlins Feb 20, 2023
56be3b0
add shared memory offset and function argument types
omlins Feb 20, 2023
32ff86a
create meta data storage
omlins Feb 20, 2023
109e49a
create meta data storage
omlins Feb 20, 2023
ac07238
add tests for two and three arrays
omlins Feb 20, 2023
cab850e
add shared memory offset handling for CPU
omlins Feb 20, 2023
e1cb5de
add shared memory offset handling for CPU
omlins Feb 20, 2023
9f55654
fix documentation
omlins Feb 21, 2023
d16175d
fix shared memory offset and do minor improvements
omlins Feb 21, 2023
ea74c35
add unit test for shared memory offset on cpu
omlins Feb 21, 2023
90b7502
add unit tests for optimization keyword arguments
omlins Feb 21, 2023
929dc3c
add shared memory loop entry and exit
omlins Mar 14, 2023
6d3ce51
add halo read optimization
omlins Mar 20, 2023
c8da94e
add halo read optimization
omlins Mar 20, 2023
5bcb0a4
add halo read optimization
omlins Mar 20, 2023
d3dc138
clean up
omlins Mar 20, 2023
76959a0
share indexing values between areas
omlins Mar 21, 2023
1516fc1
fix shared memory cpu macro
omlins Mar 24, 2023
ebc884d
enable macro name simplification
omlins Mar 24, 2023
65db8eb
fix loop optimization initialization
omlins Mar 24, 2023
040e68a
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
0766ea0
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
b853e3e
move stencil access tests to shared
omlins Mar 24, 2023
9270507
fix kernel language unit tests
omlins Mar 24, 2023
fc592f2
add tests for loopopt in initialization
omlins Mar 24, 2023
284fcf1
add tests for on-the-fly statements in parallel kernels
omlins Mar 24, 2023
db258d3
add register counting
omlins Mar 30, 2023
646fb20
add explicit shared memory and launch handling
omlins Mar 30, 2023
8793ec0
fix keyword argument handling for backends
omlins Apr 4, 2023
125d291
add unit tests for nx, ny, nz != x .* threads
omlins Apr 4, 2023
e472374
improve indexing casting
omlins Apr 12, 2023
d39d379
add support for shared memory without halo
omlins Apr 12, 2023
da1fb9b
fix kernel indexing
omlins Apr 13, 2023
80e47d8
fix kernel indexing
omlins Apr 13, 2023
6ece0f2
fix kernel indexing
omlins Apr 13, 2023
50094b5
fix kernel indexing
omlins Apr 13, 2023
a24a9ed
fix kernel indexing
omlins Apr 13, 2023
abd417a
fix kernel indexing
omlins Apr 13, 2023
4177324
improve optimization variable and ranges handling
omlins Apr 13, 2023
68e3939
change boolean array type
omlins Apr 27, 2023
179840e
introduce non-fullrange optranges defaults
omlins May 31, 2023
cd22a1c
add optimization defaults
omlins May 31, 2023
7ac7ee8
change loopopt to memopt
omlins May 31, 2023
df8d227
add documentation about number type omission
omlins May 31, 2023
a05fecd
fix @fill for AMDGPU
omlins Jun 7, 2023
3d5a5e3
add shared memory support for AMDGPU
omlins Jun 7, 2023
2feb70b
introduce loopdim
omlins Jun 7, 2023
44007ec
introduce loopdim
omlins Jun 7, 2023
c655372
introduce loopdim
omlins Jun 7, 2023
4b5bb32
fix boolean tests to be compatible with CUDA boolean array implementation
omlins Jun 7, 2023
4a75dc5
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 7, 2023
e37c7bf
adjust thread computation for memopt
omlins Jun 7, 2023
85f6909
introduce loopdim
omlins Jun 7, 2023
c4504b7
complete AMDGPU unit tests
omlins Jun 7, 2023
e53d462
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 8, 2023
a8e0b1a
add compatibility with CUDA
omlins Jun 8, 2023
e43cdd8
fix incremental compilation unit tests
omlins Jun 8, 2023
69af346
fix incremental compilation unit tests
omlins Jun 8, 2023
9bcf5ac
fix incremental compilation unit tests
omlins Jun 8, 2023
e43b628
fix shared memory integer promotion issue
omlins Jun 9, 2023
38e1f1e
remove kernel print tests not available in AMDGPU
omlins Jun 9, 2023
e99322e
fix formatting
omlins Jun 9, 2023
42e89a2
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
ca10e62
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
b851c9c
restore CI with Julia 1.9 and nightly
omlins Jun 9, 2023
0469330
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
d88be9b
remove the nightly CI failing due to AMDGPU being incompatible with LLVM
omlins Jun 12, 2023
src/ParallelKernel/kernel_language.jl: 12 changes (7 additions, 5 deletions)

@@ -53,11 +53,11 @@ macro sync_threads(args...) check_initialized(); checknoargs(args...); esc(sync_
 
 
 ##
-# NOTE: the optional offset parameter of cuDynamicSharedMem is currently not exposed.
 const SHAREDMEM_DOC = """
-    @sharedMem(T, dims)
+    @sharedMem(T, dims, offset::Integer=0)
 
-Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
+Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
+When multiple shared memory arrays are created within a kernel, then all arrays except for the first one typically need to define the `offset` to the base shared memory pointer in bytes (note that the CPU implementation does not require the `offset` and will simply ignore it when present).
 
 !!! note "Note"
     The amount of shared memory needs to specified when launching the kernel (keyword argument `shmem`).
@@ -93,7 +93,7 @@ function checknoargs(args...)
 end
 
 function checkargs_sharedMem(args...)
-    if (length(args) != 2) @ArgumentError("wrong number of arguments.") end
+    if !(2 <= length(args) <= 3) @ArgumentError("wrong number of arguments.") end
 end
 
 
@@ -167,7 +167,7 @@ end
 function pk_println(args...; package::Symbol=get_package())
     if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
     elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
-    elseif (package == PKG_THREADS) return :(Base.@println($(args...)))
+    elseif (package == PKG_THREADS) return :(Base.println($(args...)))
     else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
     end
 end
@@ -189,3 +189,5 @@ macro threadIdx_cpu() esc(:(ParallelStencil.ParallelKernel.Dim3(1, 1, 1))) end
 macro sync_threads_cpu() esc(:(begin end)) end
 
 macro sharedMem_cpu(T, dims) :(MArray{Tuple{$(esc(dims))...}, $(esc(T)), length($(esc(dims))), prod($(esc(dims)))}(undef)); end # Note: A macro is used instead of a function as a creating a type stable function is not really possible (dims can take any values and they become part of the MArray type...). MArray is not escaped in order not to have to import StaticArrays in the user code.
+
+macro sharedMem_cpu(T, dims, offset) :(ParallelStencil.ParallelKernel.@sharedMem_cpu($T, $dims)) end
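
For context, a minimal usage sketch of the extended @sharedMem signature follows. It is hypothetical (the kernel and array names are not from this PR) and is written with a GPU backend in mind, where the byte offset of the second array matters; as the docstring above notes, the CPU implementation simply ignores the offset.

# Hypothetical sketch (not from this PR): two shared-memory tiles in one kernel.
# The second tile passes a byte offset to skip past the first tile.
@parallel_indices (ix, iy) function stage!(A2, B2, A, B)
    tx  = @threadIdx().x
    ty  = @threadIdx().y
    bdx = @blockDim().x
    bdy = @blockDim().y
    sA = @sharedMem(eltype(A), (bdx, bdy))                             # first array: no offset needed
    sB = @sharedMem(eltype(B), (bdx, bdy), sizeof(eltype(A))*bdx*bdy)  # second array: offset in bytes
    sA[tx, ty] = A[ix, iy]
    sB[tx, ty] = B[ix, iy]
    @sync_threads()
    A2[ix, iy] = sA[tx, ty]
    B2[ix, iy] = sB[tx, ty]
    return
end

At launch, the combined size of both arrays would be passed via the shmem keyword argument mentioned in the docstring (the launch syntax itself is not part of this diff).
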
src/ParallelKernel/shared.jl: 21 changes (17 additions, 4 deletions)

@@ -57,7 +57,7 @@ const SUPPORTED_NUMBERTYPES = [Float16, Float32, Float64, Complex{Fl
 const PKNumber = Union{Float16, Float32, Float64, Complex{Float16}, Complex{Float32}, Complex{Float64}} # NOTE: this always needs to correspond to SUPPORTED_NUMBERTYPES!
 const NUMBERTYPE_NONE = DataType
 const ERRMSG_UNSUPPORTED_PACKAGE = "unsupported package for parallelization"
-const ERRMSG_CHECK_PACKAGE = "package has to be one of the following: $(join(SUPPORTED_PACKAGES,", "))"
+const ERRMSG_CHECK_PACKAGE = "package has to be functional and one of the following: $(join(SUPPORTED_PACKAGES,", "))"
 const ERRMSG_CHECK_NUMBERTYPE = "numbertype has to be one of the following: $(join(SUPPORTED_NUMBERTYPES,", "))"
 const ERRMSG_CHECK_LITERALTYPES = "the type given to 'literaltype' must be one of the following: $(join(SUPPORTED_LITERALTYPES,", "))"
 
@@ -144,9 +144,17 @@ end
 
 ## FUNCTIONS TO DEAL WITH KERNEL DEFINITIONS: SIGNATURES, BODY AND RETURN STATEMENT
 
-extract_kernel_args(kernel::Expr) = return (splitdef(kernel)[:args], splitdef(kernel)[:kwargs])
-get_body(kernel::Expr) = return kernel.args[2]
-set_body!(kernel::Expr, body::Expr) = ((kernel.args[2] = body); return kernel)
+extract_kernel_args(kernel::Expr) = return (splitdef(kernel)[:args], splitdef(kernel)[:kwargs])
+get_body(kernel::Expr) = return kernel.args[2]
+set_body!(kernel::Expr, body::Expr) = ((kernel.args[2] = body); return kernel)
+get_name(kernel::Expr) = return splitdef(kernel)[:name]
+
+function set_name(kernel::Expr, name::Symbol)
+    kernel_elems = splitdef(kernel)
+    kernel_elems[:name] = name
+    kernel = combinedef(kernel_elems)
+    return kernel
+end
 
 function push_to_signature!(kernel::Expr, arg::Expr)
     kernel_elems = splitdef(kernel)
@@ -271,6 +279,8 @@ function substitute(expr::Expr, old, new)
     end
 end
 
+substitute(expr, old, new) = (old == expr) ? new : expr
+
 function inexpr_walk(expr::Expr, s::Symbol; match_only_head=false)
     found = false
     postwalk(expr) do x
@@ -281,6 +291,9 @@ function inexpr_walk(expr::Expr, s::Symbol; match_only_head=false)
     return found
 end
 
+inexpr_walk(expr::Symbol, s::Symbol; match_only_head=false) = (s == expr)
+inexpr_walk(expr, s::Symbol; match_only_head=false) = false
+
 Base.unquoted(s::Symbol) = s
 
 function extract_tuple(t::Union{Expr,Symbol}) # NOTE: this could return a tuple, but would require to change all small arrays to tuples...
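
The substitute and inexpr_walk fallback methods added above form a small dispatch pattern: the Expr method recurses through the expression tree, while the non-Expr methods terminate the recursion at leaves (Symbols, literals). A standalone sketch of the idea, as a simplified reimplementation for illustration rather than the package's exact code:

# Recursive case: rebuild the Expr, substituting within each argument.
subst(ex::Expr, old, new) = Expr(ex.head, (subst(a, old, new) for a in ex.args)...)
# Leaf case: Symbols, numbers, etc. are replaced if they match `old`.
subst(ex, old, new) = (ex == old) ? new : ex

subst(:(A[ix] = B[ix]), :ix, :(ix + 1))  # recurses: returns :(A[ix + 1] = B[ix + 1])
subst(:ix, :ix, :(ix + 1))               # a bare Symbol input now also works
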
src/init_parallel_stencil.jl: 25 changes (19 additions, 6 deletions)

@@ -40,20 +40,22 @@ macro init_parallel_stencil(args...)
     if (length(posargs) == 3) package, numbertype_val, ndims_val = extract_posargs_init(__module__, posargs...)
     else package, numbertype_val, ndims_val = extract_kwargs_init(__module__, kwargs)
     end
+    loopopt_val = extract_kwargs_optional(__module__, kwargs)
     if (package == PKG_NONE) @ArgumentError("the package argument cannot be ommited.") end #TODO: this error message will disappear, once the package can be defined at runtime.
     if (ndims == NDIMS_NONE) @ArgumentError("the ndims argument cannot be ommited.") end #TODO: this error message will disappear, once the ndims can be defined at runtime.
-    check_already_initialized(package, numbertype_val, ndims_val)
-    esc(init_parallel_stencil(__module__, package, numbertype_val, ndims_val))
+    check_already_initialized(package, numbertype_val, ndims_val, loopopt_val)
+    esc(init_parallel_stencil(__module__, package, numbertype_val, ndims_val, loopopt_val))
 end
 
-function init_parallel_stencil(caller::Module, package::Symbol, numbertype::DataType, ndims::Integer)
+function init_parallel_stencil(caller::Module, package::Symbol, numbertype::DataType, ndims::Integer, loopopt::Bool)
     if (numbertype == NUMBERTYPE_NONE) datadoc_call = :(@doc replace(ParallelStencil.ParallelKernel.DATA_DOC_NUMBERTYPE_NONE, "@init_parallel_kernel" => "@init_parallel_stencil") Data)
     else datadoc_call = :(@doc replace(ParallelStencil.ParallelKernel.DATA_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") Data)
     end
     ParallelKernel.init_parallel_kernel(caller, package, numbertype; datadoc_call=datadoc_call)
     set_package(package)
     set_numbertype(numbertype)
     set_ndims(ndims)
+    set_loopopt(loopopt)
     set_initialized(true)
     return nothing
 end
@@ -63,12 +65,14 @@ macro is_initialized() is_initialized() end
 macro get_package() get_package() end
 macro get_numbertype() get_numbertype() end
 macro get_ndims() get_ndims() end
+macro get_loopopt() get_loopopt() end
 let
-    global is_initialized, set_initialized, set_package, get_package, set_numbertype, get_numbertype, set_ndims, get_ndims, check_initialized, check_already_initialized
+    global is_initialized, set_initialized, set_package, get_package, set_numbertype, get_numbertype, set_ndims, get_ndims, set_loopopt, get_loopopt, check_initialized, check_already_initialized
     _is_initialized::Bool = false
     package::Symbol = PKG_NONE
    numbertype::DataType = NUMBERTYPE_NONE
     ndims::Integer = NDIMS_NONE
+    loopopt::Bool = false
     set_initialized(flag::Bool) = (_is_initialized = flag)
     is_initialized() = _is_initialized
     set_package(pkg::Symbol) = (package = pkg)
@@ -77,11 +81,13 @@ let
     get_numbertype() = numbertype
     set_ndims(n::Integer) = (ndims = n)
     get_ndims() = ndims
+    set_loopopt(loopopt::Bool) = (loopopt = loopopt)
+    get_loopopt() = loopopt
     check_initialized() = if !is_initialized() @NotInitializedError("no macro or function of the module can be called before @init_parallel_stencil.") end
 
-    function check_already_initialized(package::Symbol, numbertype::DataType, ndims::Integer)
+    function check_already_initialized(package::Symbol, numbertype::DataType, ndims::Integer, loopopt::Bool)
         if is_initialized()
-            if package==get_package() && numbertype==get_numbertype() && ndims==get_ndims()
+            if package==get_package() && numbertype==get_numbertype() && ndims==get_ndims() && loopopt==get_loopopt()
                 @warn "ParallelStencil has already been initialized, with the same arguments. If you are using ParallelStencil interactively in the REPL, then you can ignore this message. If you are using ParallelStencil non-interactively, then you are likely using ParallelStencil in an inconsistent way: @init_parallel_stencil should only be called once, right after 'using ParallelStencil'."
             else
                 @IncoherentCallError("ParallelStencil has already been initialized, with different arguments. If you are using ParallelStencil interactively in the REPL and want to avoid restarting Julia, then you can call ParallelStencil.@reset_parallel_stencil() and rerun all parts of your code that use ParallelStencil features (including kernel definitions and array allocations). If you are using ParallelStencil non-interactively, then you are using ParallelStencil in an invalid way: @init_parallel_stencil should only be called once, right after 'using ParallelStencil'.")
@@ -104,3 +110,10 @@ function extract_kwargs_init(caller::Module, kwargs::Dict)
     end
     return package, numbertype_val, ndims_val
 end
+
+function extract_kwargs_optional(caller::Module, kwargs::Dict)
+    if (:loopopt in keys(kwargs)) loopopt_val = eval_arg(caller, kwargs[:loopopt]); check_loopopt(loopopt_val)
+    else loopopt_val = false
+    end
+    return loopopt_val
+end
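
Taken together, these changes thread the new loopopt flag through initialization and the module-level state. A hedged usage sketch follows; the argument style is assumed from the parsing code above, and note that a later commit in this PR renames loopopt to memopt:

using ParallelStencil
# Hypothetical initialization enabling the global loop optimization;
# when the keyword is omitted, extract_kwargs_optional defaults it to false.
@init_parallel_stencil(Threads, Float64, 3, loopopt=true)
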