
Generalize memopt and create and update GPU unit tests #81

Merged
100 commits merged on Jun 12, 2023
Commits (100)
7a0c376
fix AMDGPU allocator tests
omlins Dec 12, 2022
9851fd2
move macros into code and fix range start
omlins Dec 19, 2022
e74adc1
add error if too many positional args in loopopt
omlins Dec 19, 2022
991c70a
add error if maxsize is not evenly divisible by nthreads
omlins Dec 19, 2022
a654ca6
remove possibility for smaller block size at end
omlins Dec 19, 2022
4feb2a7
add error if maxsize is not evenly divisible by nthreads
omlins Dec 19, 2022
0286434
introduce stencilsize
omlins Dec 20, 2022
5dd5798
introduce stencilsize
omlins Dec 20, 2022
ed60173
move from stencilsize to stencilranges
omlins Dec 20, 2022
333f720
move from stencilsize to stencilranges
omlins Dec 20, 2022
20e7456
fix y-dimension register usage
omlins Dec 20, 2022
4051d6e
generalize substitute and in expression check
omlins Dec 22, 2022
3b726c1
evaluate stencil offsets at parse time
omlins Dec 22, 2022
aebe72d
evaluate stencil offsets at parse time
omlins Dec 22, 2022
808f4f0
extract offset
omlins Dec 22, 2022
deed435
extract offset
omlins Dec 22, 2022
42f5452
generalize register queuing
omlins Jan 4, 2023
c145fb3
generalize register queuing
omlins Jan 5, 2023
9ddd3e0
handle incoherent stencilranges
omlins Jan 8, 2023
527ab36
generalize register queuing
omlins Jan 8, 2023
842acfb
generalize register queuing
omlins Jan 19, 2023
70b652c
improve check package error message
omlins Jan 20, 2023
3d5a6d7
generalize register queuing - clean up
omlins Jan 20, 2023
478af97
generalize register queuing - clean up
omlins Jan 20, 2023
200b490
generalize register queuing - clean up
omlins Jan 20, 2023
2900eac
add unit tests for loopopt
omlins Jan 20, 2023
6c908b2
update kernel language unit tests
omlins Jan 20, 2023
73dba08
update kernel language unit tests
omlins Jan 20, 2023
b9daa83
remove temporarily unused arguments package and number type
omlins Jan 23, 2023
80eee91
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
29cfda4
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
707d99c
accept offset argument in shared memory macro
omlins Feb 20, 2023
981afd4
add get and set name
omlins Feb 20, 2023
5377e71
add global loop optimization keyword argument
omlins Feb 20, 2023
56be3b0
add shared memory offset and function argument types
omlins Feb 20, 2023
32ff86a
create meta data storage
omlins Feb 20, 2023
109e49a
create meta data storage
omlins Feb 20, 2023
ac07238
add tests for two and three arrays
omlins Feb 20, 2023
cab850e
add shared memory offset handling for CPU
omlins Feb 20, 2023
e1cb5de
add shared memory offset handling for CPU
omlins Feb 20, 2023
9f55654
fix documentation
omlins Feb 21, 2023
d16175d
fix shared memory offset and do minor improvements
omlins Feb 21, 2023
ea74c35
add unit test for shared memory offset on cpu
omlins Feb 21, 2023
90b7502
add unit tests for optimization keyword arguments
omlins Feb 21, 2023
929dc3c
add shared memory loop entry and exit
omlins Mar 14, 2023
6d3ce51
add halo read optimization
omlins Mar 20, 2023
c8da94e
add halo read optimization
omlins Mar 20, 2023
5bcb0a4
add halo read optimization
omlins Mar 20, 2023
d3dc138
clean up
omlins Mar 20, 2023
76959a0
share indexing values between areas
omlins Mar 21, 2023
1516fc1
fix shared memory cpu macro
omlins Mar 24, 2023
ebc884d
enable macro name simplification
omlins Mar 24, 2023
65db8eb
fix loop optimization initialization
omlins Mar 24, 2023
040e68a
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
0766ea0
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
b853e3e
move stencil access tests to shared
omlins Mar 24, 2023
9270507
fix kernel language unit tests
omlins Mar 24, 2023
fc592f2
add tests for loopopt in initialization
omlins Mar 24, 2023
284fcf1
add tests for on-the-fly statements in parallel kernels
omlins Mar 24, 2023
db258d3
add register counting
omlins Mar 30, 2023
646fb20
add explicit shared memory and launch handling
omlins Mar 30, 2023
8793ec0
fix keyword argument handling for backends
omlins Apr 4, 2023
125d291
add unit tests for nx, ny, nz != x .* threads
omlins Apr 4, 2023
e472374
improve indexing casting
omlins Apr 12, 2023
d39d379
add support for shared memory without halo
omlins Apr 12, 2023
da1fb9b
fix kernel indexing
omlins Apr 13, 2023
80e47d8
fix kernel indexing
omlins Apr 13, 2023
6ece0f2
fix kernel indexing
omlins Apr 13, 2023
50094b5
fix kernel indexing
omlins Apr 13, 2023
a24a9ed
fix kernel indexing
omlins Apr 13, 2023
abd417a
fix kernel indexing
omlins Apr 13, 2023
4177324
improve optimization variable and ranges handling
omlins Apr 13, 2023
68e3939
change boolean array type
omlins Apr 27, 2023
179840e
introduce non fullrange optranges defaults
omlins May 31, 2023
cd22a1c
add optimization defaults
omlins May 31, 2023
7ac7ee8
change loopopt to memopt
omlins May 31, 2023
df8d227
add documentation about number type omission
omlins May 31, 2023
a05fecd
fix @fill for AMDGPU
omlins Jun 7, 2023
3d5a5e3
add shared memory support for AMDGPU
omlins Jun 7, 2023
2feb70b
introduce loopdim
omlins Jun 7, 2023
44007ec
introduce loopdim
omlins Jun 7, 2023
c655372
introduce loopdim
omlins Jun 7, 2023
4b5bb32
fixing boolean tests to be compatible with CUDA boolean array impleme…
omlins Jun 7, 2023
4a75dc5
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 7, 2023
e37c7bf
adjust thread computation for memopt
omlins Jun 7, 2023
85f6909
introduce loopdim
omlins Jun 7, 2023
c4504b7
complete AMDGPU unit tests
omlins Jun 7, 2023
e53d462
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 8, 2023
a8e0b1a
add compatibility with CUDA
omlins Jun 8, 2023
e43cdd8
fix incremental compilation unit tests
omlins Jun 8, 2023
69af346
fix incremental compilation unit tests
omlins Jun 8, 2023
9bcf5ac
fix incremental compilation unit tests
omlins Jun 8, 2023
e43b628
fix shared memory integer promotion issue
omlins Jun 9, 2023
38e1f1e
remove kernel print tests not available in AMDGPU
omlins Jun 9, 2023
e99322e
fix formatting
omlins Jun 9, 2023
42e89a2
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
ca10e62
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
b851c9c
restore CI with Julia 1.9 and nightly
omlins Jun 9, 2023
0469330
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
d88be9b
remove the nightly CI failing due to AMDGPU not being compatible with LLVM
omlins Jun 12, 2023
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -13,8 +13,8 @@ jobs:
matrix:
version:
- '1.7' # Minimum required Julia version (due to CellArrays' AMDGPU dependency).
#- '1' # Latest stable 1.x release of Julia
#- 'nightly'
- '1' # Latest stable 1.x release of Julia
# - 'nightly'
os:
- ubuntu-latest
- macOS-latest
2 changes: 1 addition & 1 deletion Project.toml
@@ -14,7 +14,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
[compat]
AMDGPU = "0.4"
CellArrays = "0.1"
CUDA = "3.12"
CUDA = "3.12, 4"
MacroTools = "0.5"
StaticArrays = "1"
julia = "1.7"
6 changes: 3 additions & 3 deletions src/ParallelKernel/allocators.jl
@@ -416,8 +416,8 @@ falses_cpu(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}}
zeros_cuda(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype(T); CUDA.zeros(T, args...)) # (blocklength is ignored if neither celldims nor celltype is set)
ones_cuda(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype(T); CUDA.ones(T, args...)) # ...
rand_cuda(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = CuArray(rand_cpu(T, blocklength, args...)) # ...
falses_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.falses(args...) # ...
trues_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.trues(args...) # ...
falses_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.zeros(Bool, args...) # ...
trues_cuda(::Type{T}, blocklength, args...) where {T<:Bool} = CUDA.ones(Bool, args...) # ...
fill_cuda(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = CuArray(fill_cpu(T, blocklength, args...)) # ...

zeros_cuda(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype(T); fill_cuda(T, blocklength, 0, args...))
@@ -470,7 +470,7 @@ end

function fill_amdgpu(::Type{T}, ::Val{B}, x, args...) where {T <: Union{SArray,FieldArray}, B}
if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end
check_datatype(T, Bool)
check_datatype(T, Bool, Enum)
if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T)))
elseif (length(x) == length(T)) cell = convert(T, x)
else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.")
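For context, a minimal usage sketch of the allocation macros backed by the definitions changed above. The initialization call, array names, and sizes are illustrative and not taken from this PR; the lowering noted in the comments follows the new CUDA definitions in this diff:

    using ParallelStencil
    @init_parallel_stencil(CUDA, Float64, 3)   # Threads and AMDGPU are initialized analogously

    nx, ny, nz = 64, 64, 64
    mask  = @falses(nx, ny, nz)        # with CUDA, now lowers to CUDA.zeros(Bool, nx, ny, nz)
    flags = @trues(nx, ny, nz)         # with CUDA, now lowers to CUDA.ones(Bool, nx, ny, nz)
    T     = @fill(273.15, nx, ny, nz)  # scalar fill; 'celldims' and 'eltype' keywords are also supported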
2 changes: 1 addition & 1 deletion src/ParallelKernel/hide_communication.jl
@@ -117,7 +117,7 @@ function hide_communication_gpu(ranges_outer::Union{Symbol,Expr}, ranges_inner::
bc_and_commcalls = process_bc_and_commcalls(bc_and_commcalls)
quote
for i in 1:length($ranges_outer)
@parallel_async $ranges_outer[i] stream=ParallelStencil.ParallelKernel.@get_priority_stream(i) $(kwargs...) $compkernelcall #NOTE: it cannot directly go to ParallelStencil.ParallelKernel.@parallel_async as else it cannot honour ParallelStencil args as loopopt (fixing it to ParallelStencil is also not possible as it assumes, else the ParalellKernel hide_communication unit tests fail).
@parallel_async $ranges_outer[i] stream=ParallelStencil.ParallelKernel.@get_priority_stream(i) $(kwargs...) $compkernelcall #NOTE: it cannot directly go to ParallelStencil.ParallelKernel.@parallel_async as else it cannot honour ParallelStencil args as memopt (fixing it to ParallelStencil is also not possible as it assumes, else the ParalellKernel hide_communication unit tests fail).
end
for i in 1:length($ranges_inner)
@parallel_async $ranges_inner[i] stream=ParallelStencil.ParallelKernel.@get_stream(i) $(kwargs...) $compkernelcall #NOTE: ...
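The loop above launches one asynchronous kernel per outer (boundary) range on a high-priority stream before covering the inner ranges, which is what allows the halo exchange to proceed while the inner points are still being computed; the change renames the forwarded keyword from loopopt to memopt. A hedged sketch of how this is typically driven from user code, assuming a stencil kernel diffusion3D_step! defined elsewhere and ImplicitGlobalGrid's update_halo! for the communication part:

    @hide_communication (16, 2, 2) begin
        @parallel memopt=true diffusion3D_step!(T2, T, Ci, lam, dt, dx, dy, dz)
        update_halo!(T2)
    end

The tuple argument gives the width of the boundary region that is computed separately, so that the communication triggered by update_halo! can be hidden behind the computation of the inner points.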
20 changes: 13 additions & 7 deletions src/ParallelKernel/kernel_language.jl
@@ -53,14 +53,14 @@ macro sync_threads(args...) check_initialized(); checknoargs(args...); esc(sync_


##
# NOTE: the optional offset parameter of cuDynamicSharedMem is currently not exposed.
const SHAREDMEM_DOC = """
@sharedMem(T, dims)
@sharedMem(T, dims, offset::Integer=0)

Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
When multiple shared memory arrays are created within a kernel, then all arrays except for the first one typically need to define the `offset` to the base shared memory pointer in bytes (note that the CPU and AMDGPU implementation do not require the `offset` and will simply ignore it when present).

!!! note "Note"
The amount of shared memory needs to specified when launching the kernel (keyword argument `shmem`).
The amount of shared memory needs to be specified when launching the kernel (keyword argument `shmem`).
"""
@doc SHAREDMEM_DOC
macro sharedMem(args...) check_initialized(); checkargs_sharedMem(args...); esc(sharedMem(args...)); end
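To make the extended signature concrete: a minimal, hedged sketch of allocating two shared-memory arrays in one kernel under the new offset semantics documented above (the kernel name, tile size, and launch line are illustrative and not taken from this PR):

    @parallel_indices (ix, iy) function step!(A, B)
        A_l = @sharedMem(Float64, (34, 34))                          # first array: base of shared memory
        B_l = @sharedMem(Float64, (34, 34), 34*34*sizeof(Float64))   # second array: byte offset past A_l (ignored on CPU and AMDGPU)
        # ... cooperative loads, @sync_threads(), stencil computation ...
        return
    end

    # The total shared memory for both arrays is passed at launch via the shmem keyword:
    @parallel shmem=2*(34*34)*sizeof(Float64) step!(A, B)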
@@ -93,7 +93,7 @@ function checknoargs(args...)
end

function checkargs_sharedMem(args...)
if (length(args) != 2) @ArgumentError("wrong number of arguments.") end
if !(2 <= length(args) <= 3) @ArgumentError("wrong number of arguments.") end
end


@@ -147,12 +147,16 @@ end

function sharedMem(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.@cuDynamicSharedMem($(args...)))
elseif (package == PKG_AMDGPU) @KeywordArgumentError("not yet supported for AMDGPU.")
elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($(args...)))
elseif (package == PKG_THREADS) return :(ParallelStencil.ParallelKernel.@sharedMem_cpu($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
end

macro sharedMem_amdgpu(T, dims) esc(:(AMDGPU.@ROCDynamicLocalArray($T, $dims, false))) end

macro sharedMem_amdgpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($T, $dims))) end


## FUNCTIONS FOR PRINTING

@@ -167,7 +171,7 @@ end
function pk_println(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
elseif (package == PKG_THREADS) return :(Base.@println($(args...)))
elseif (package == PKG_THREADS) return :(Base.println($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
end
@@ -189,3 +193,5 @@ macro threadIdx_cpu() esc(:(ParallelStencil.ParallelKernel.Dim3(1, 1, 1))) end
macro sync_threads_cpu() esc(:(begin end)) end

macro sharedMem_cpu(T, dims) :(MArray{Tuple{$(esc(dims))...}, $(esc(T)), length($(esc(dims))), prod($(esc(dims)))}(undef)); end # Note: A macro is used instead of a function as a creating a type stable function is not really possible (dims can take any values and they become part of the MArray type...). MArray is not escaped in order not to have to import StaticArrays in the user code.

macro sharedMem_cpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_cpu($T, $dims))) end