Generalize memopt and create and update GPU unit tests #81

Merged on Jun 12, 2023 · 100 commits (the diff below shows changes from 40 commits)

Commits
7a0c376
fix AMDGPU allocator tests
omlins Dec 12, 2022
9851fd2
move macros into code and fix range start
omlins Dec 19, 2022
e74adc1
add error if too many positional args in loopopt
omlins Dec 19, 2022
991c70a
add error if maxsize is not divisible by nthreads without remainder
omlins Dec 19, 2022
a654ca6
remove possibility for smaller block size at end
omlins Dec 19, 2022
4feb2a7
add error if maxsize is not divisible by nthreads without remainder
omlins Dec 19, 2022
0286434
introduce stencilsize
omlins Dec 20, 2022
5dd5798
introduce stencilsize
omlins Dec 20, 2022
ed60173
move from stencilsize to stencilranges
omlins Dec 20, 2022
333f720
move from stencilsize to stencilranges
omlins Dec 20, 2022
20e7456
fix y-dimension register usage
omlins Dec 20, 2022
4051d6e
generalize substitute and in-expression check
omlins Dec 22, 2022
3b726c1
evaluate stencil offsets at parse time
omlins Dec 22, 2022
aebe72d
evaluate stencil offsets at parse time
omlins Dec 22, 2022
808f4f0
extract offset
omlins Dec 22, 2022
deed435
extract offset
omlins Dec 22, 2022
42f5452
generalize register queuing
omlins Jan 4, 2023
c145fb3
generalize register queuing
omlins Jan 5, 2023
9ddd3e0
handle incoherent stencilranges
omlins Jan 8, 2023
527ab36
generalize register queuing
omlins Jan 8, 2023
842acfb
generalize register queuing
omlins Jan 19, 2023
70b652c
improve check package error message
omlins Jan 20, 2023
3d5a6d7
generalize register queuing - clean up
omlins Jan 20, 2023
478af97
generalize register queuing - clean up
omlins Jan 20, 2023
200b490
generalize register queuing - clean up
omlins Jan 20, 2023
2900eac
add unit tests for loopopt
omlins Jan 20, 2023
6c908b2
update kernel language unit tests
omlins Jan 20, 2023
73dba08
update kernel language unit tests
omlins Jan 20, 2023
b9daa83
remove temporarily unused arguments package and number type
omlins Jan 23, 2023
80eee91
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
29cfda4
generalize register queuing - for multiple arrays
omlins Jan 27, 2023
707d99c
accept offset argument in shared memory macro
omlins Feb 20, 2023
981afd4
add get and set name
omlins Feb 20, 2023
5377e71
add global loop optimization keyword argument
omlins Feb 20, 2023
56be3b0
add shared memory offset and function argument types
omlins Feb 20, 2023
32ff86a
create meta data storage
omlins Feb 20, 2023
109e49a
create meta data storage
omlins Feb 20, 2023
ac07238
add tests for two and three arrays
omlins Feb 20, 2023
cab850e
add shared memory offset handling for CPU
omlins Feb 20, 2023
e1cb5de
add shared memory offset handling for CPU
omlins Feb 20, 2023
9f55654
fix documentation
omlins Feb 21, 2023
d16175d
fix shared memory offset and do minor improvements
omlins Feb 21, 2023
ea74c35
add unit test for shared memory offset on cpu
omlins Feb 21, 2023
90b7502
add unit tests for optimization keyword arguments
omlins Feb 21, 2023
929dc3c
add shared memory loop entry and exit
omlins Mar 14, 2023
6d3ce51
add halo read optimization
omlins Mar 20, 2023
c8da94e
add halo read optimization
omlins Mar 20, 2023
5bcb0a4
add halo read optimization
omlins Mar 20, 2023
d3dc138
clean up
omlins Mar 20, 2023
76959a0
share indexing values between areas
omlins Mar 21, 2023
1516fc1
fix shared memory cpu macro
omlins Mar 24, 2023
ebc884d
enable macro name simplification
omlins Mar 24, 2023
65db8eb
fix loop optimization initialization
omlins Mar 24, 2023
040e68a
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
0766ea0
enable on-the-fly assignments in parallel kernels
omlins Mar 24, 2023
b853e3e
move stencil access tests to shared
omlins Mar 24, 2023
9270507
fix kernel language unit tests
omlins Mar 24, 2023
fc592f2
add tests for loopopt in initialization
omlins Mar 24, 2023
284fcf1
add tests for on-the-fly statements in parallel kernels
omlins Mar 24, 2023
db258d3
add register counting
omlins Mar 30, 2023
646fb20
add explicit shared memory and launch handling
omlins Mar 30, 2023
8793ec0
fix keyword argument handling for backends
omlins Apr 4, 2023
125d291
add unit tests for nx, ny, nz != x .* threads
omlins Apr 4, 2023
e472374
improve indexing casting
omlins Apr 12, 2023
d39d379
add support for shared memory without halo
omlins Apr 12, 2023
da1fb9b
fix kernel indexing
omlins Apr 13, 2023
80e47d8
fix kernel indexing
omlins Apr 13, 2023
6ece0f2
fix kernel indexing
omlins Apr 13, 2023
50094b5
fix kernel indexing
omlins Apr 13, 2023
a24a9ed
fix kernel indexing
omlins Apr 13, 2023
abd417a
fix kernel indexing
omlins Apr 13, 2023
4177324
improve optimization variable and ranges handling
omlins Apr 13, 2023
68e3939
change boolean array type
omlins Apr 27, 2023
179840e
introduce non-fullrange optranges defaults
omlins May 31, 2023
cd22a1c
add optimization defaults
omlins May 31, 2023
7ac7ee8
change loopopt to memopt
omlins May 31, 2023
df8d227
add documentation about number type omission
omlins May 31, 2023
a05fecd
fix @fill for AMDGPU
omlins Jun 7, 2023
3d5a5e3
add shared memory support for AMDGPU
omlins Jun 7, 2023
2feb70b
introduce loopdim
omlins Jun 7, 2023
44007ec
introduce loopdim
omlins Jun 7, 2023
c655372
introduce loopdim
omlins Jun 7, 2023
4b5bb32
fix boolean tests to be compatible with CUDA boolean array implementation
omlins Jun 7, 2023
4a75dc5
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 7, 2023
e37c7bf
adjust thread computation for memopt
omlins Jun 7, 2023
85f6909
introduce loopdim
omlins Jun 7, 2023
c4504b7
complete AMDGPU unit tests
omlins Jun 7, 2023
e53d462
add AMDGPU shared memory support and partial support for 32-bit indexing
omlins Jun 8, 2023
a8e0b1a
add compatibility with CUDA
omlins Jun 8, 2023
e43cdd8
fix incremental compilation unit tests
omlins Jun 8, 2023
69af346
fix incremental compilation unit tests
omlins Jun 8, 2023
9bcf5ac
fix incremental compilation unit tests
omlins Jun 8, 2023
e43b628
fix shared memory integer promotion issue
omlins Jun 9, 2023
38e1f1e
remove kernel print tests not available in AMDGPU
omlins Jun 9, 2023
e99322e
fix formatting
omlins Jun 9, 2023
42e89a2
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
ca10e62
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
b851c9c
restore CI with Julia 1.9 and nightly
omlins Jun 9, 2023
0469330
make incremental compilation test compatible with julia 1.9
omlins Jun 9, 2023
d88be9b
remove the nightly CI failing due to AMDGPU being incompatible with LLVM
omlins Jun 12, 2023
src/ParallelKernel/kernel_language.jl: 12 changes (7 additions, 5 deletions)

@@ -53,11 +53,11 @@ macro sync_threads(args...) check_initialized(); checknoargs(args...); esc(sync_
 
 
 ##
-# NOTE: the optional offset parameter of cuDynamicSharedMem is currently not exposed.
 const SHAREDMEM_DOC = """
-    @sharedMem(T, dims)
+    @sharedMem(T, dims, offset::Integer=0)
 
-Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
+Create an array that is *shared* between the threads of a block (i.e. accessible only by the threads of a same block), with element type `T` and size specified by `dims`.
+When multiple shared memory arrays are created within a kernel, then all arrays except for the first one typically need to define the `offset` to the base shared memory pointer in bytes (note that the CPU implementation does not require the `offset` and will simply ignore it when present).
 
 !!! note "Note"
     The amount of shared memory needs to specified when launching the kernel (keyword argument `shmem`).
@@ -93,7 +93,7 @@ function checknoargs(args...)
 end
 
 function checkargs_sharedMem(args...)
-    if (length(args) != 2) @ArgumentError("wrong number of arguments.") end
+    if !(2 <= length(args) <= 3) @ArgumentError("wrong number of arguments.") end
 end
 
 
@@ -167,7 +167,7 @@ end
 function pk_println(args...; package::Symbol=get_package())
     if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
     elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
-    elseif (package == PKG_THREADS) return :(Base.@println($(args...)))
+    elseif (package == PKG_THREADS) return :(Base.println($(args...)))
     else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
     end
 end
@@ -189,3 +189,5 @@ macro threadIdx_cpu() esc(:(ParallelStencil.ParallelKernel.Dim3(1, 1, 1))) end
 macro sync_threads_cpu() esc(:(begin end)) end
 
 macro sharedMem_cpu(T, dims) :(MArray{Tuple{$(esc(dims))...}, $(esc(T)), length($(esc(dims))), prod($(esc(dims)))}(undef)); end # Note: A macro is used instead of a function as a creating a type stable function is not really possible (dims can take any values and they become part of the MArray type...). MArray is not escaped in order not to have to import StaticArrays in the user code.
+
+macro sharedMem_cpu(T, dims, offset) :(ParallelStencil.ParallelKernel.@sharedMem_cpu($T, $dims)) end
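
For context, a minimal usage sketch of the extended @sharedMem signature follows. It is hypothetical (the kernel and array names are not from this PR) and is written with a GPU backend in mind, where the byte offset of the second array matters; as the docstring above notes, the CPU implementation simply ignores the offset.

# Hypothetical sketch (not from this PR): two shared-memory tiles in one kernel.
# The second tile passes a byte offset to skip past the first tile.
@parallel_indices (ix, iy) function stage!(A2, B2, A, B)
    tx  = @threadIdx().x
    ty  = @threadIdx().y
    bdx = @blockDim().x
    bdy = @blockDim().y
    sA = @sharedMem(eltype(A), (bdx, bdy))                             # first array: no offset needed
    sB = @sharedMem(eltype(B), (bdx, bdy), sizeof(eltype(A))*bdx*bdy)  # second array: offset in bytes
    sA[tx, ty] = A[ix, iy]
    sB[tx, ty] = B[ix, iy]
    @sync_threads()
    A2[ix, iy] = sA[tx, ty]
    B2[ix, iy] = sB[tx, ty]
    return
end

At launch, the combined size of both arrays would be passed via the shmem keyword argument mentioned in the docstring (the launch syntax itself is not part of this diff).
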
src/ParallelKernel/shared.jl: 21 changes (17 additions, 4 deletions)

@@ -57,7 +57,7 @@ const SUPPORTED_NUMBERTYPES = [Float16, Float32, Float64, Complex{Fl
 const PKNumber = Union{Float16, Float32, Float64, Complex{Float16}, Complex{Float32}, Complex{Float64}} # NOTE: this always needs to correspond to SUPPORTED_NUMBERTYPES!
 const NUMBERTYPE_NONE = DataType
 const ERRMSG_UNSUPPORTED_PACKAGE = "unsupported package for parallelization"
-const ERRMSG_CHECK_PACKAGE = "package has to be one of the following: $(join(SUPPORTED_PACKAGES,", "))"
+const ERRMSG_CHECK_PACKAGE = "package has to be functional and one of the following: $(join(SUPPORTED_PACKAGES,", "))"
 const ERRMSG_CHECK_NUMBERTYPE = "numbertype has to be one of the following: $(join(SUPPORTED_NUMBERTYPES,", "))"
 const ERRMSG_CHECK_LITERALTYPES = "the type given to 'literaltype' must be one of the following: $(join(SUPPORTED_LITERALTYPES,", "))"
 
@@ -144,9 +144,17 @@ end
 
 ## FUNCTIONS TO DEAL WITH KERNEL DEFINITIONS: SIGNATURES, BODY AND RETURN STATEMENT
 
-extract_kernel_args(kernel::Expr) = return (splitdef(kernel)[:args], splitdef(kernel)[:kwargs])
-get_body(kernel::Expr) = return kernel.args[2]
-set_body!(kernel::Expr, body::Expr) = ((kernel.args[2] = body); return kernel)
+extract_kernel_args(kernel::Expr) = return (splitdef(kernel)[:args], splitdef(kernel)[:kwargs])
+get_body(kernel::Expr) = return kernel.args[2]
+set_body!(kernel::Expr, body::Expr) = ((kernel.args[2] = body); return kernel)
+get_name(kernel::Expr) = return splitdef(kernel)[:name]
+
+function set_name(kernel::Expr, name::Symbol)
+    kernel_elems = splitdef(kernel)
+    kernel_elems[:name] = name
+    kernel = combinedef(kernel_elems)
+    return kernel
+end
 
 function push_to_signature!(kernel::Expr, arg::Expr)
     kernel_elems = splitdef(kernel)
@@ -271,6 +279,8 @@ function substitute(expr::Expr, old, new)
     end
 end
 
+substitute(expr, old, new) = (old == expr) ? new : expr
+
 function inexpr_walk(expr::Expr, s::Symbol; match_only_head=false)
     found = false
     postwalk(expr) do x
@@ -281,6 +291,9 @@ function inexpr_walk(expr::Expr, s::Symbol; match_only_head=false)
     return found
 end
 
+inexpr_walk(expr::Symbol, s::Symbol; match_only_head=false) = (s == expr)
+inexpr_walk(expr, s::Symbol; match_only_head=false) = false
+
 Base.unquoted(s::Symbol) = s
 
 function extract_tuple(t::Union{Expr,Symbol}) # NOTE: this could return a tuple, but would require to change all small arrays to tuples...
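
The substitute and inexpr_walk fallback methods added above form a small dispatch pattern: the Expr method recurses through the expression tree, while the non-Expr methods terminate the recursion at leaves (Symbols, literals). A standalone sketch of the idea, as a simplified reimplementation for illustration rather than the package's exact code:

# Recursive case: rebuild the Expr, substituting within each argument.
subst(ex::Expr, old, new) = Expr(ex.head, (subst(a, old, new) for a in ex.args)...)
# Leaf case: Symbols, numbers, etc. are replaced if they match `old`.
subst(ex, old, new) = (ex == old) ? new : ex

subst(:(A[ix] = B[ix]), :ix, :(ix + 1))  # recurses: returns :(A[ix + 1] = B[ix + 1])
subst(:ix, :ix, :(ix + 1))               # a bare Symbol input now also works
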
src/init_parallel_stencil.jl: 25 changes (19 additions, 6 deletions)

@@ -40,20 +40,22 @@ macro init_parallel_stencil(args...)
     if (length(posargs) == 3) package, numbertype_val, ndims_val = extract_posargs_init(__module__, posargs...)
     else package, numbertype_val, ndims_val = extract_kwargs_init(__module__, kwargs)
     end
+    loopopt_val = extract_kwargs_optional(__module__, kwargs)
     if (package == PKG_NONE) @ArgumentError("the package argument cannot be ommited.") end #TODO: this error message will disappear, once the package can be defined at runtime.
     if (ndims == NDIMS_NONE) @ArgumentError("the ndims argument cannot be ommited.") end #TODO: this error message will disappear, once the ndims can be defined at runtime.
-    check_already_initialized(package, numbertype_val, ndims_val)
-    esc(init_parallel_stencil(__module__, package, numbertype_val, ndims_val))
+    check_already_initialized(package, numbertype_val, ndims_val, loopopt_val)
+    esc(init_parallel_stencil(__module__, package, numbertype_val, ndims_val, loopopt_val))
 end
 
-function init_parallel_stencil(caller::Module, package::Symbol, numbertype::DataType, ndims::Integer)
+function init_parallel_stencil(caller::Module, package::Symbol, numbertype::DataType, ndims::Integer, loopopt::Bool)
     if (numbertype == NUMBERTYPE_NONE) datadoc_call = :(@doc replace(ParallelStencil.ParallelKernel.DATA_DOC_NUMBERTYPE_NONE, "@init_parallel_kernel" => "@init_parallel_stencil") Data)
     else datadoc_call = :(@doc replace(ParallelStencil.ParallelKernel.DATA_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") Data)
     end
     ParallelKernel.init_parallel_kernel(caller, package, numbertype; datadoc_call=datadoc_call)
     set_package(package)
     set_numbertype(numbertype)
     set_ndims(ndims)
+    set_loopopt(loopopt)
     set_initialized(true)
     return nothing
 end
@@ -63,12 +65,14 @@ macro is_initialized() is_initialized() end
 macro get_package() get_package() end
 macro get_numbertype() get_numbertype() end
 macro get_ndims() get_ndims() end
+macro get_loopopt() get_loopopt() end
 let
-    global is_initialized, set_initialized, set_package, get_package, set_numbertype, get_numbertype, set_ndims, get_ndims, check_initialized, check_already_initialized
+    global is_initialized, set_initialized, set_package, get_package, set_numbertype, get_numbertype, set_ndims, get_ndims, set_loopopt, get_loopopt, check_initialized, check_already_initialized
     _is_initialized::Bool = false
     package::Symbol = PKG_NONE
    numbertype::DataType = NUMBERTYPE_NONE
     ndims::Integer = NDIMS_NONE
+    loopopt::Bool = false
     set_initialized(flag::Bool) = (_is_initialized = flag)
     is_initialized() = _is_initialized
     set_package(pkg::Symbol) = (package = pkg)
@@ -77,11 +81,13 @@ let
     get_numbertype() = numbertype
     set_ndims(n::Integer) = (ndims = n)
     get_ndims() = ndims
+    set_loopopt(loopopt::Bool) = (loopopt = loopopt)
+    get_loopopt() = loopopt
     check_initialized() = if !is_initialized() @NotInitializedError("no macro or function of the module can be called before @init_parallel_stencil.") end
 
-    function check_already_initialized(package::Symbol, numbertype::DataType, ndims::Integer)
+    function check_already_initialized(package::Symbol, numbertype::DataType, ndims::Integer, loopopt::Bool)
         if is_initialized()
-            if package==get_package() && numbertype==get_numbertype() && ndims==get_ndims()
+            if package==get_package() && numbertype==get_numbertype() && ndims==get_ndims() && loopopt==get_loopopt()
                 @warn "ParallelStencil has already been initialized, with the same arguments. If you are using ParallelStencil interactively in the REPL, then you can ignore this message. If you are using ParallelStencil non-interactively, then you are likely using ParallelStencil in an inconsistent way: @init_parallel_stencil should only be called once, right after 'using ParallelStencil'."
             else
                 @IncoherentCallError("ParallelStencil has already been initialized, with different arguments. If you are using ParallelStencil interactively in the REPL and want to avoid restarting Julia, then you can call ParallelStencil.@reset_parallel_stencil() and rerun all parts of your code that use ParallelStencil features (including kernel definitions and array allocations). If you are using ParallelStencil non-interactively, then you are using ParallelStencil in an invalid way: @init_parallel_stencil should only be called once, right after 'using ParallelStencil'.")
@@ -104,3 +110,10 @@ function extract_kwargs_init(caller::Module, kwargs::Dict)
     end
     return package, numbertype_val, ndims_val
 end
+
+function extract_kwargs_optional(caller::Module, kwargs::Dict)
+    if (:loopopt in keys(kwargs)) loopopt_val = eval_arg(caller, kwargs[:loopopt]); check_loopopt(loopopt_val)
+    else loopopt_val = false
+    end
+    return loopopt_val
+end
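
Taken together, these changes thread the new loopopt flag through initialization and the module-level state. A hedged usage sketch follows; the argument style is assumed from the parsing code above, and note that a later commit in this PR renames loopopt to memopt:

using ParallelStencil
# Hypothetical initialization enabling the global loop optimization;
# when the keyword is omitted, extract_kwargs_optional defaults it to false.
@init_parallel_stencil(Threads, Float64, 3, loopopt=true)
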