diff --git a/Manifest.toml b/Manifest.toml index 6225d8d8a7..2d634a920b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,9 +2,9 @@ [[AbstractFFTs]] deps = ["ChainRulesCore", "LinearAlgebra"] -git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409" +git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.2.1" +version = "1.3.1" [[Adapt]] deps = ["LinearAlgebra", "Requires"] @@ -18,6 +18,12 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + [[BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" @@ -64,9 +70,9 @@ version = "0.1.6" [[Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04" +git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.6.0" +version = "4.6.1" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -87,9 +93,9 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[ExprTools]] -git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.8" +version = "0.1.9" [[GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] @@ -105,9 +111,9 @@ version = "0.1.4" [[GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.17.2" +version = "0.17.3" [[InteractiveUtils]] deps = ["Markdown"] @@ -130,6 +136,12 @@ git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.4.1" +[[KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "17d0bb94eef881b09c57967be12cca70fefb3304" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.0" + [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04" @@ -178,6 +190,12 @@ version = "0.3.23" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.10" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -253,6 +271,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SnoopPrecompile]] +deps = ["Preferences"] +git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c" +uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c" +version = "1.0.3" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -266,6 +290,17 @@ git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "2.2.0" 
+[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.5.16" + +[[StaticArraysCore]] +git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.0" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -295,6 +330,17 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +[[UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.0" + [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" diff --git a/Project.toml b/Project.toml index b18100dac1..4e35c695f5 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "4.0.1" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" @@ -14,6 +15,7 @@ CompilerSupportLibraries_jll = "e66e0078-7015-5450-92f7-15fbd957f2ae" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -28,10 +30,13 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" +UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] AbstractFFTs = "0.4, 0.5, 1.0" Adapt = "3.3" +Atomix = "0.1" BFloat16s = "0.2, 0.3, 0.4" CEnum = "0.2, 0.3, 0.4" CUDA_Driver_jll = "0.2" @@ -47,4 +52,5 @@ RandomNumbers = "1.5.3" Reexport = "0.2, 1.0" Requires = "0.5, 1.0" SpecialFunctions = "1.3, 2" +UnsafeAtomicsLLVM = "0.1" julia = "1.6" diff --git a/lib/cublas/linalg.jl b/lib/cublas/linalg.jl index ead34d4262..5779bc7c22 100644 --- a/lib/cublas/linalg.jl +++ b/lib/cublas/linalg.jl @@ -44,7 +44,7 @@ function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2} val = CUDA.reduce_block(+, local_val, zero(T), shuffle) if threadIdx().x == 1i32 # NOTE: introduces nondeterminism - @inbounds CUDA.@atomic res[] += val + @inbounds CUDA.@atomic res[1i32] += val end return diff --git a/src/CUDA.jl b/src/CUDA.jl index 5ddccec286..0e918eae14 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -107,6 +107,10 @@ include("../lib/nvml/NVML.jl") const has_nvml = NVML.has_nvml export NVML, has_nvml +# KernelAbstractions +include("CUDAKernels.jl") +export CUDABackend + include("precompile.jl") end diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl new file mode 100644 index 0000000000..628f528f75 --- /dev/null +++ b/src/CUDAKernels.jl @@ -0,0 +1,253 @@ +module CUDAKernels + +import KernelAbstractions +import CUDA + 
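+# A minimal usage sketch (assuming the KernelAbstractions `@kernel`/`@index` API and a
+# functional CUDA device; the `mul2!` kernel below is hypothetical):
+#
+#     using CUDA, KernelAbstractions
+#
+#     @kernel function mul2!(A)
+#         I = @index(Global)
+#         @inbounds A[I] *= 2
+#     end
+#
+#     A = CUDA.ones(1024)
+#     mul2!(CUDABackend())(A; ndrange=length(A))
+#     KernelAbstractions.synchronize(CUDABackend())
+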
+struct CUDABackend <: KernelAbstractions.GPU + prefer_blocks::Bool + always_inline::Bool +end +CUDABackend(;prefer_blocks=false, always_inline=false) = CUDABackend(prefer_blocks, always_inline) + +export CUDABackend + +KernelAbstractions.allocate(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.CuArray{T}(undef, dims) +KernelAbstractions.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims) +KernelAbstractions.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims) + +# Import through parent +import KernelAbstractions: StaticArrays, Adapt +import .StaticArrays: MArray + +KernelAbstractions.get_backend(::CUDA.CuArray) = CUDABackend() +KernelAbstractions.get_backend(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDABackend() + +KernelAbstractions.synchronize(::CUDABackend) = CUDA.synchronize() + +### +# copyto! +### +# - IdDict does not free the memory +# - WeakRef dict does not unique the key by objectid +const __pinned_memory = Dict{UInt64, WeakRef}() + +function __pin!(a) + # use pointer instead of objectid? + oid = objectid(a) + if haskey(__pinned_memory, oid) && __pinned_memory[oid].value !== nothing + return nothing + end + ad = CUDA.Mem.register(CUDA.Mem.Host, pointer(a), sizeof(a)) + finalizer(_ -> CUDA.Mem.unregister(ad), a) + __pinned_memory[oid] = WeakRef(a) + return nothing +end + +function KernelAbstractions.copyto!(::CUDABackend, A, B) + A isa Array && __pin!(A) + B isa Array && __pin!(B) + + GC.@preserve A B begin + destptr = pointer(A) + srcptr = pointer(B) + N = length(A) + unsafe_copyto!(destptr, srcptr, N, async=true) + end + return A +end + +import KernelAbstractions: Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config + +### +# Kernel launch +### +function launch_config(kernel::Kernel{CUDABackend}, ndrange, workgroupsize) + if ndrange isa Integer + ndrange = (ndrange,) + end + if workgroupsize isa Integer + workgroupsize = (workgroupsize, ) + end + + # partition checked that the ndrange's agreed + if KernelAbstractions.ndrange(kernel) <: StaticSize + ndrange = nothing + end + + iterspace, dynamic = if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && + workgroupsize === nothing + # use ndrange as preliminary workgroupsize for autotuning + partition(kernel, ndrange, ndrange) + else + partition(kernel, ndrange, workgroupsize) + end + + return ndrange, workgroupsize, iterspace, dynamic +end + +function threads_to_workgroupsize(threads, ndrange) + total = 1 + return map(ndrange) do n + x = min(div(threads, total), n) + total *= x + return x + end +end + +function (obj::Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=nothing) + backend = KernelAbstractions.backend(obj) + + ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) + # this might not be the final context, since we may tune the workgroupsize + ctx = mkcontext(obj, ndrange, iterspace) + + # If the kernel is statically sized we can tell the compiler about that + if KernelAbstractions.workgroupsize(obj) <: StaticSize + maxthreads = prod(KernelAbstractions.get(KernelAbstractions.workgroupsize(obj))) + else + maxthreads = nothing + end + + kernel = CUDA.@cuda launch=false always_inline=backend.always_inline maxthreads=maxthreads obj.f(ctx, args...) 
+ + # figure out the optimal workgroupsize automatically + if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing + config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange)) + if backend.prefer_blocks + # Prefer blocks over threads + threads = min(prod(ndrange), config.threads) + # XXX: Some kernels performs much better with all blocks active + cu_blocks = max(cld(prod(ndrange), threads), config.blocks) + threads = cld(prod(ndrange), cu_blocks) + else + threads = config.threads + end + + workgroupsize = threads_to_workgroupsize(threads, ndrange) + iterspace, dynamic = partition(obj, ndrange, workgroupsize) + ctx = mkcontext(obj, ndrange, iterspace) + end + + nblocks = length(blocks(iterspace)) + threads = length(workitems(iterspace)) + + if nblocks == 0 + return nothing + end + + # Launch kernel + kernel(ctx, args...; threads=threads, blocks=nblocks) + + return nothing +end + +# list of overrides (only for Julia 1.6) +const overrides = Expr[] + +import GPUCompiler +macro device_override(ex) + ex = macroexpand(__module__, ex) + if Meta.isexpr(ex, :call) + @show ex = eval(ex) + error() + end + code = quote + $GPUCompiler.@override($CUDA.method_table, $ex) + end + if isdefined(Base.Experimental, Symbol("@overlay")) + return esc(code) + else + push!(overrides, code) + return + end +end + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + precompiling && return + # register device overrides + eval(Expr(:block, overrides...)) + empty!(overrides) +end + +import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices +import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print +import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds + +function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace) + CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace) +end + +@device_override @inline function __index_Local_Linear(ctx) + return CUDA.threadIdx().x +end + +@device_override @inline function __index_Group_Linear(ctx) + return CUDA.blockIdx().x +end + +@device_override @inline function __index_Global_Linear(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + # TODO: This is unfortunate, can we get the linear index cheaper + @inbounds LinearIndices(__ndrange(ctx))[I] +end + +@device_override @inline function __index_Local_Cartesian(ctx) + @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x] +end + +@device_override @inline function __index_Group_Cartesian(ctx) + @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x] +end + +@device_override @inline function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) +end + +@device_override @inline function __validindex(ctx) + if __dynamic_checkbounds(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + return I in __ndrange(ctx) + else + return true + end +end + +import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace +import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize, __size + +### +# GPU implementation of shared memory +### + +@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} + CUDA.CuStaticSharedArray(T, Dims) +end + +### +# GPU 
implementation of scratch memory +# - private memory for each workitem +### + +@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} + MArray{__size(Dims), T}(undef) +end + +@device_override @inline function __synchronize() + CUDA.sync_threads() +end + +@device_override @inline function __print(args...) + CUDA._cuprint(args...) +end + +### +# GPU implementation of const memory +### + +Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental.Const(a) + +# Argument conversion +KernelAbstractions.argconvert(k::Kernel{CUDABackend}, arg) = CUDA.cudaconvert(arg) + +end diff --git a/src/compiler/gpucompiler.jl b/src/compiler/gpucompiler.jl index bd39307b3a..a6364ebbbc 100644 --- a/src/compiler/gpucompiler.jl +++ b/src/compiler/gpucompiler.jl @@ -15,7 +15,7 @@ function device_properties(dev) cap = maximum(caps) # select the PTX ISA we assume to be available - # (we actually only need 6.2, but NVPTX doesn't support that) + # 6.3 introduced `atom.cas.b16` ptx = v"6.3" # we need to take care emitting LLVM instructions like `unreachable`, which diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl index 19e9f66d3d..4a1cb4a36d 100644 --- a/src/compiler/reflection.jl +++ b/src/compiler/reflection.jl @@ -125,7 +125,7 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) function $method(io::IO, @nospecialize(func), @nospecialize(types); kernel::Bool=false, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, always_inline::Bool=false, - kwargs...) + cap=capability(device()), kwargs...) source = FunctionSpec(func, Base.to_tuple_type(types), kernel) target = CUDACompilerTarget(device(); minthreads, maxthreads, blocks_per_sm, maxregs) params = CUDACompilerParams() diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl index 443a7fd420..1bf3887b62 100644 --- a/src/device/intrinsics.jl +++ b/src/device/intrinsics.jl @@ -3,6 +3,28 @@ # special intrinsics for writing version-dependent code include("intrinsics/version.jl") +abstract type SyncScope end +struct SystemScope <: SyncScope end +struct DeviceScope <: SyncScope end +struct BlockScope <: SyncScope end + +const system_scope = SystemScope() +const device_scope = DeviceScope() +const block_scope = BlockScope() + +import UnsafeAtomics +using UnsafeAtomics.Internal: LLVMOrdering +using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst + +struct AtomicUnsupported{T} <: Exception end +struct AtomicOrderUnsupported{Ordering} <: Exception + order::Ordering +end + +# Note CUDA C++ has also consume ordering which LLVM does not support +# monotonic -> relaxed +# unordered -> ??? maybe weak + # extensions to the C language include("intrinsics/memory_shared.jl") include("intrinsics/indexing.jl") diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 15b8c10e39..876a456652 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -1,5 +1,7 @@ # Atomic Functions (B.12) +# TODO replace the below with UnsafeAtomicsLLVM if possible + # # Low-level intrinsics # @@ -357,117 +359,352 @@ This operation is only supported for values of type Int32. """ atomic_dec! 
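+# The helpers below (`asm`, `suffix`, `reg`, `addr_space`) assemble the inline-PTX strings used
+# by the ordered load/store/cas definitions that follow. As a worked example (illustrative
+# only), an acquire load at device scope of a 4-byte value in global memory expands to
+#     ld.acquire.gpu.global.b32 $0, [$1];
+# with the constraint string "=r,l,~{memory}".
+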
+asm(::Type{LLVMOrdering{:monotonic}}) = :relaxed +asm(::Type{LLVMOrdering{Order}}) where Order = Order + +asm(::Type{SystemScope}) = :sys +asm(::Type{DeviceScope}) = :gpu +asm(::Type{BlockScope}) = :cta + +function suffix(sz) + if sz == 1 + "b8" + elseif sz == 2 + "b16" + elseif sz == 4 + "b32" + elseif sz == 8 + "b64" + end +end +function reg(sz) + if sz == 1 + "r" + elseif sz == 2 + "h" + elseif sz == 4 + "r" + elseif sz == 8 + "l" + end +end -# -# High-level interface -# +function addr_space(A) + if A == AS.Global + as = ".global" + elseif A == AS.Shared + as = ".shared" + else + as = "" + end +end -# prototype of a high-level interface for performing atomic operations on arrays -# -# this design could be generalized by having atomic {field,array}{set,ref} accessors, as -# well as acquire/release operations to implement the fallback functionality where any -# operation can be applied atomically. - -if VERSION <= v"1.7-" -export @atomic -end - -const inplace_ops = Dict( - :(+=) => :(+), - :(-=) => :(-), - :(*=) => :(*), - :(/=) => :(/), - :(\=) => :(\), - :(%=) => :(%), - :(^=) => :(^), - :(&=) => :(&), - :(|=) => :(|), - :(⊻=) => :(⊻), - :(>>>=) => :(>>>), - :(>>=) => :(>>), - :(<<=) => :(<<), -) +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end -struct AtomicError <: Exception - msg::AbstractString +# Handle byte sized load +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) + end end -Base.showerror(io::IO, err::AtomicError) = - print(io, "AtomicError: ", err.msg) +@inline __load(ptr::LLVMPtr{T}, order, scope) where T = + __load(Val(sizeof(T)), ptr, order, scope) -""" - @atomic a[I] = op(a[I], val) - @atomic a[I] ...= val - -Atomically perform a sequence of operations that loads an array element `a[I]`, performs the -operation `op` on that value and a second value `val`, and writes the result back to the -array. This sequence can be written out as a regular assignment, in which case the same -array element should be used in the left and right hand side of the assignment, or as an -in-place application of a known operator. In both cases, the array reference should be pure -and not induce any side-effects. - -!!! warn - This interface is experimental, and might change without warning. Use the lower-level - `atomic_...!` functions for a stable API, albeit one limited to natively-supported ops. 
-""" -macro atomic(ex) - # decode assignment and call - if ex.head == :(=) - ref = ex.args[1] - rhs = ex.args[2] - Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call")) - op = rhs.args[1] - if rhs.args[2] != ref - throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side")) +for (A, sz) in Iterators.product( + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld.volatile$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end + +# Handle byte sized load +for (A) in (AS.Generic, AS.Global, AS.Shared) + instruction = "ld.volatile$(addr_space(A)).b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) + end +end + +@inline __load_volatile(ptr::LLVMPtr{T}) where {T} = + __load_volatile(Val(sizeof(T)), ptr) + +@inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == release + throw(AtomicOrderUnsupported(order)) + end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == monotonic + val = __load(ptr, monotonic, scope) + return val + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load(ptr, acquire, scope) + return val + elseif compute_capability() >= sv"6.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load_volatile(ptr) + if order == monotonic + return val + end + atomic_thread_fence(order, scope) + return val + else + throw(AtomicUnsupported{T}()) + end +end + +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:release}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) +end + +@inline __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T = + __store!(Val(sizeof(T)), ptr, val, order, scope) + +for (A, sz) in Iterators.product( + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store_volatile!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) +end + +# Could be done using LLVM. 
+@inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} = + __store_volatile!(Val(sizeof(T)), ptr, val) + +@inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == acquire # || order == consume + throw(AtomicOrderUnsupported(order)) + end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == release + __store!(ptr, val, release, scope) + return + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) end - val = rhs.args[3] - elseif haskey(inplace_ops, ex.head) - op = inplace_ops[ex.head] - ref = ex.args[1] - val = ex.args[2] + __store!(ptr, val, monotonic, scope) + elseif compute_capability() >= sv"6.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + __store_volatile!(ptr, val) else - throw(AtomicError("unknown @atomic expression")) + throw(AtomicUnsupported{T}()) + end +end + +order(::LLVMOrdering{:monotonic}) = 1 +# order(::Consume) = 2 +order(::LLVMOrdering{:acquire}) = 3 +order(::LLVMOrdering{:release}) = 4 +order(::LLVMOrdering{:acq_rel}) = 5 +order(::LLVMOrdering{:seq_cst}) = 6 + +Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) + +@inline function stronger_order(a::LLVMOrdering, b::LLVMOrdering) + m = max(a, b) + if m != release + return m + end + # maximum is release, what is the other one? + other = min(a, b) + if other == monotonic + return release + # elseif other == Consume() + # return Acq_Rel() + elseif other == acquire + return acq_rel + elseif other == release + return release + end + Base.llvmcall("unreachable", Cvoid, Tuple{}) + @assert(false) +end + +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (2, 4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +end + +# Handle byte sized cas +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "atom.$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) end +end + +@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = + __cas!(Val(sizeof(T)), ptr, old, new, order, scope) + +for (order, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared), + (2, 4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($instruction, $constraint, 
true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +end - # decode array expression - Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression")) - array = ref.args[1] - indices = Expr(:tuple, ref.args[2:end]...) - - esc(quote - $atomic_arrayset($array, $indices, $op, $val) - end) -end - -# FIXME: make this respect the indexing style -@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} = - atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val)) - -# native atomics -for (op,impl,typ) in [(+, atomic_add!, [UInt32,Int32,UInt64,Int64,Float32]), - (-, atomic_sub!, [UInt32,Int32,UInt64,Int64,Float32]), - (&, atomic_and!, [UInt32,Int32,UInt64,Int64]), - (|, atomic_or!, [UInt32,Int32,UInt64,Int64]), - (⊻, atomic_xor!, [UInt32,Int32,UInt64,Int64]), - (max, atomic_max!, [UInt32,Int32,UInt64,Int64]), - (min, atomic_min!, [UInt32,Int32,UInt64,Int64])] - @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), - val::T) where {T<:Union{$(typ...)}} = - $impl(pointer(A, I), val) -end - -# native atomics that are not supported on all devices -@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::typeof(+), - val::T) where {T <: Union{Float64}} - ptr = pointer(A, I) - if compute_capability() >= sv"6.0" - atomic_add!(ptr, val) +# Handle byte sized cas +for (order, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "atom.$(addr_space(A)).cas.$(asm(order)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas_old!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) + end +end + +@inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = + __cas_old!(Val(sizeof(T)), ptr, old, new, scope) + +@inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T + order = stronger_order(success_order, failure_order) + if sizeof(T) > 8 || sizeof(T) < 2 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + if order == seq_cst # order == consume + order = acquire + end + old = __cas!(ptr, expected, new, order, scope) + elseif compute_capability() >= sv"6.0" + if order == seq_cst || order == acq_rel || order == release + atomic_thread_fence(seq_cst, scope) + end + old = __cas_old!(ptr, expected, new, scope) + if order == seq_cst || order == acq_rel || order == acquire # order == consume + atomic_thread_fence(seq_cst, scope) + end else - atomic_op!(ptr, op, val) + throw(AtomicUnsupported{T}()) + end + return old +end + +# +# High-level interface +# +import Atomix: @atomic, @atomicswap, @atomicreplace +# import UnsafeAtomicsLLVM + +if VERSION <= v"1.7" + export @atomic +end + +using Atomix: Atomix, IndexableRef + +const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable} + +@inline function Atomix.get(ref::CuIndexableRef, order) + atomic_load(Atomix.pointer(ref), order) +end + +@inline function Atomix.set!(ref::CuIndexableRef, v, order) + v = convert(eltype(ref), v) + atomic_store!(Atomix.pointer(ref), v, order) +end + +@inline function Atomix.replace!(ref::CuIndexableRef, expected, desired, 
+ success_ordering, failure_ordering) + ptr = Atomix.pointer(ref) + expected = convert(eltype(ref), expected) + desired = convert(eltype(ref), desired) + return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering) +end + +@inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP} + old = atomic_load(ptr, order) + while true + expected = old + new = op(expected, x) + old = atomic_cas!(ptr, expected, new, order, monotonic) + if old === expected + return expected => new + end end end -# fallback using compare-and-swap -@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} = - atomic_op!(pointer(A, I), op, val) +@inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} + x = convert(eltype(ref), x) + ptr = Atomix.pointer(ref) + # TODO: Support hardware variants + # old = if op === (+) + # atomic_add!(ptr, x) + # elseif op === (-) + # atomic_sub!(ptr, x) + # elseif op === (&) + # atomic_and!(ptr, x) + # elseif op === (|) + # atomic_or!(ptr, x) + # elseif op === xor + # atomic_xor!(ptr, x) + # elseif op === min + # atomic_min!(ptr, x) + # elseif op === max + # atomic_max!(ptr, x) + # else + return modify!(ptr, op, x, order) + # end + # return old => op(old, x) +end diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index 9c76737e36..2cf19e7cf4 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -83,39 +83,126 @@ the warp. Cvoid, Tuple{UInt32}, convert(UInt32, mask)) end +@inline threadfence(::BlockScope) = threadfence_block() +@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) +@inline threadfence_sc_block() = @asmcall("fence.sc.cta;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_block() = @asmcall("fence.acq_rel.cta;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::BlockScope) + if compute_capability() >= sv"7.0" + if order == seq_cst + threadfence_sc_block() + elseif order == acquire || order == acq_rel || order == release # || order == consume + threadfence_acq_rel_block() + else + throw(AtomicOrderUnsupported(order)) + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_block() + else + throw(AtomicOrderUnsupported(order)) + end + end +end + +@inline threadfence(::DeviceScope=device_scope) = threadfence_device() +@inline threadfence_device() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) +@inline threadfence_sc_device() = @asmcall("fence.sc.gpu;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_device() = @asmcall("fence.acq_rel.gpu;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::DeviceScope=device_scope) + if compute_capability() >= sv"7.0" + if order == seq_cst + + threadfence_sc_device() + elseif order == acquire || + # order == consume || + order == acq_rel || + order == release + + threadfence_acq_rel_device() + else + throw(AtomicOrderUnsupported(order)) + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_device() + else + throw(AtomicOrderUnsupported(order)) + end + end +end + +@inline threadfence(::SystemScope) = threadfence_system() +@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) +@inline threadfence_sc_system() = @asmcall("fence.sc.sys;", "~{memory}", 
true, Cvoid, Tuple{})
+@inline threadfence_acq_rel_system() = @asmcall("fence.acq_rel.sys;", "~{memory}", true, Cvoid, Tuple{})
+
+function atomic_thread_fence(order, scope::SystemScope)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+
+            threadfence_sc_system()
+        elseif order == acquire ||
+               # order == consume ||
+               order == acq_rel ||
+               order == release
+
+            threadfence_acq_rel_system()
+        else
+            throw(AtomicOrderUnsupported(order))
+        end
+    else
+        if order == seq_cst ||
+           # order == consume ||
+           order == acquire ||
+           order == acq_rel ||
+           order == release
+
+            threadfence_system()
+        else
+            throw(AtomicOrderUnsupported(order))
+        end
+    end
+end
+
 """
-    threadfence_block()
+    threadfence(::SyncScope=device_scope)
 
 A memory fence that ensures that:
-- All writes to all memory made by the calling thread before the call to `threadfence_block()`
-  are observed by all threads in the block of the calling thread as occurring before all writes
-  to all memory made by the calling thread after the call to `threadfence_block()`
-- All reads from all memory made by the calling thread before the call to `threadfence_block()`
-  are ordered before all reads from all memory made by the calling thread after the call to `threadfence_block()`.
-"""
-@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+- All writes to all memory made by the calling thread before the call to `threadfence(scope)`
+  are observed by all threads in the scope of the calling thread as occurring before all writes
+  to all memory made by the calling thread after the call to `threadfence(scope)`
+- All reads from all memory made by the calling thread before the call to `threadfence(scope)`
+  are ordered before all reads from all memory made by the calling thread after the call to `threadfence(scope)`.
 
-"""
-    threadfence()
+SyncScope can be one of `block_scope`, `device_scope`, or `system_scope`.
+  - `block_scope` orders reads and writes on the *same* block.
+  - `device_scope` orders reads and writes on the *same* device.
+  - `system_scope` orders reads and writes across all threads in the device,
+    host threads, and all threads in peer devices.
 
-A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the
-calling thread and also ensures that no writes to all memory made by the calling thread after
-the call to `threadfence()` are observed by any thread in the device as occurring before any
-write to all memory made by the calling thread before the call to `threadfence()`.
+See [`atomic_thread_fence`](@ref) for a variant that takes atomic orderings.
 
-Note that for this ordering guarantee to be true, the observing threads must truly observe the
-memory and not cached versions of it; this is requires the use of volatile loads and stores,
-which is not available from Julia right now.
-"""
-@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+!!! note
+    Note that for this ordering guarantee to be true, the observing threads must truly observe the
+    memory and not cached versions of it; this requires the use of atomic loads and stores.
""" - threadfence_system() +function threadfence end -A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the -calling thread and also ensures that all writes to all memory made by the calling thread -before the call to `threadfence_system()` are observed by all threads in the device, -host threads, and all threads in peer devices as occurring before all writes to all -memory made by the calling thread after the call to `threadfence_system()`. """ -@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) + atomic_thread_fence(order::Atomicx.Ordering, ::SyncScope=device) +""" +function atomic_thread_fence end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 95252c2eda..ec5354dea7 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -7,7 +7,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -15,6 +17,7 @@ REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 29810defe7..da331c0474 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -1,410 +1,182 @@ -# TODO: unify with Base.@atomic -using CUDA: @atomic -using BFloat16s: BFloat16 - -@testset "atomics (low-level)" begin - -# tested on all natively-supported atomics - -@testset "atomic_add" begin - types = [Int32, Int64, UInt32, UInt64, Float32] - capability(device()) >= v"6.0" && push!(types, Float64) - capability(device()) >= v"7.0" && push!(types, Float16) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_add!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_sub" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[2048]) - - function kernel(a, b) - CUDA.atomic_sub!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_inc" begin - @testset for T in [Int32] - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_inc!(pointer(a), b) - return - end - - @cuda threads=768 kernel(a, T(512)) - @test Array(a)[1] == 255 - end -end - -@testset "atomic_dec" begin - @testset for T in [Int32] - a = CuArray(T[1024]) - - function kernel(a, b) - CUDA.atomic_dec!(pointer(a), b) - return - end - - @cuda threads=256 kernel(a, T(512)) - @test Array(a)[1] == 257 - end -end - -@testset "atomic_xchg" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray([zero(T)]) - - function kernel(a, b) - CUDA.atomic_xchg!(pointer(a), b) - return - end - - @cuda threads=1024 
kernel(a, one(T)) - @test Array(a)[1] == one(T) - end -end - -@testset "atomic_and" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - 1 - k = 1 - for i = 1:i - k *= 2 - end - b = 1023 - k # 1023 - 2^i - CUDA.atomic_and!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == zero(T) - end -end - -@testset "atomic_or" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[0]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_or!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 1023 - end -end - -@testset "atomic_xor" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_xor!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 0 - end -end - -@testset "atomic_cas" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16]) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b, c) - CUDA.atomic_cas!(pointer(a), b, c) - return - end - - @cuda threads=1024 kernel(a, zero(T), one(T)) - @test Array(a)[1] == 1 - end -end - -@testset "atomic_max" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray([zero(T)]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_max!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_min" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[1024]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_min!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1 - end -end - -@testset "shared memory" begin - function kernel() - shared = CuStaticSharedArray(Float32, 1) - @atomic shared[threadIdx().x] += 0f0 - return - end - - CUDA.@sync @cuda kernel() -end - -end +using CUDA: @atomic, @atomicswap, @atomicreplace @testset "atomics (high-level)" begin - -# tested on all types supported by atomic_cas! (which empowers the fallback definition) - -@testset "add" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + # tested on all types supported by atomic_cas! 
(which empowers the fallback definition) + + @testset "add" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] + 1 @atomic a[1] += 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 2048 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "sub" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[2048]) - + + @testset "sub" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] - 1 @atomic a[1] -= 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 0 + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "mul" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1]) - + + @testset "mul" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] * 2 @atomic a[1] *= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1024 + @testset for T in types + a = CuArray(T[1]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "div" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1024]) - + + @testset "div" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] / 2 @atomic a[1] /= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray(T[32]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 1 + end end -end - -@testset "and" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([~zero(T), ~zero(T)]) - + + @testset "and" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = ~(T(1) << (i-1)) - @atomic a[1] = a[1] & mask - @atomic a[2] &= mask + @atomic a[1] &= mask return end - - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == zero(T) - @test Array(a)[2] == zero(T) + + @testset for T in types + a = CuArray([~zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == zero(T) + end end -end - -@testset "or" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([zero(T), zero(T)]) + + @testset "or" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) function kernel(T, a) i = threadIdx().x mask = T(1) << (i-1) - @atomic a[1] = a[1] | mask - @atomic a[2] |= mask + 
@atomic a[1] |= mask return end - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == ~zero(T) - @test Array(a)[2] == ~zero(T) + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == ~zero(T) + end end -end - -@testset "xor" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([zero(T), zero(T)]) - + + @testset "xor" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = T(1) << ((i-1)%(8*sizeof(T))) - @atomic a[1] = a[1] ⊻ mask - @atomic a[2] ⊻= mask + @atomic a[1] ⊻= mask return end - nb = 4 - @cuda threads=(8*sizeof(T)+nb) kernel(T, a) - @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) - @test Array(a)[2] == ~zero(T) & ~((one(T) << nb) - one(T)) + @testset for T in types + a = CuArray([zero(T)]) + nb = 4 + @cuda threads=(8*sizeof(T)+nb) kernel(T, a) + @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) + end end -end - -@testset "max" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + + @testset "max" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = max(a[1], i) + @atomic a[1] max i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 32 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=32 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "min" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([typemax(T)]) - + + @testset "min" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = min(a[1], i) + @atomic a[1] min i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray([typemax(T)]) + @cuda threads=32 kernel(T, a) + @test Array(a)[1] == 1 + end end -end - -@testset "shift" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([one(T)]) - + + @testset "shift" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) @atomic a[1] <<= 1 return end - @cuda threads=8 kernel(T, a) - @test Array(a)[1] == 1<<8 + @testset for T in types + a = CuArray([one(T)]) + @cuda threads=8 kernel(T, a) + @test Array(a)[1] == 1<<8 + end end -end - -@testset "macro" begin - + @testset "NaN" begin f(x,y) = 3x + 2y function kernel(x) - CUDA.@atomic x[1] = f(x[1],42f0) + @inbounds CUDA.@atomic x[1] f 42f0 nothing end @@ -417,69 +189,15 @@ end @test isnan(Array(a)[1]) end - using CUDA: AtomicError - - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = 1 - end - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = b ? 
1 : 2 - end - - @test_throws_macro AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side") @macroexpand begin - @atomic a[1] = a[2] + 1 - end - - @test_throws_macro AtomicError("unknown @atomic expression") @macroexpand begin - @atomic wat(a[1]) - end - - @test_throws_macro AtomicError("@atomic should be applied to an array reference expression") @macroexpand begin - @atomic a = a + 1 - end -end - -@testset "shared memory" begin - # test that atomic operations on shared memory work - # https://github.com/JuliaGPU/CUDA.jl/issues/311 - - function kernel(a) - b = CUDA.CuStaticSharedArray(Int, 1) - - if threadIdx().x == 1 - b[] = a[] + @testset "macro" begin + @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin + @atomic wat(a[1]) end - sync_threads() - - CUDA.atomic_add!(pointer(b), 1) - sync_threads() - - if threadIdx().x == 1 - a[] = b[] + + @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin + @atomic a = a + 1 end - return - end - - a = CuArray([0]) - @cuda threads=16 kernel(a) - @test Array(a) == [16] -end - -@testset "shared memory bug" begin - # shared memory atomics resulted in illegal memory accesses - # https://github.com/JuliaGPU/CUDA.jl/issues/558 - - function kernel() - tid = threadIdx().x - shared = CuStaticSharedArray(Float32, 4) - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - return end - - @cuda threads=2 kernel() - synchronize() -end - + end + \ No newline at end of file diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl new file mode 100644 index 0000000000..0db2c30527 --- /dev/null +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -0,0 +1,303 @@ +using BFloat16s: BFloat16 + +@testset "atomics (low-level) with order" begin + +@testset "atomic_load" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + BFloat16, Float16, Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] + + function kernel(a, order, scope) + CUDA.atomic_load(pointer(a), order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, order, scope) + @test Array(a)[1] == 0 + end + end +end + +@testset "atomic_store!" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + BFloat16, Float16, Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] + + function kernel(a, val, order, scope) + CUDA.atomic_store!(pointer(a), val, order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, one(T), order, scope) + @test Array(a)[1] == one(T) + end + end +end + +@testset "atomic_cas!" 
begin + if capability(device()) >= v"6.0" + # TODO size(T) in (1, 2) + types = [Int32, Int64, + UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] + + function kernel(a, expected, desired, success_order, failure_order, scope) + CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope) + return + end + + @testset for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope) + @test Array(a)[1] == one(T) + end + end +end + +end # atomics (low-level) with order + +@testset "atomics (low-level)" begin + +# tested on all natively-supported atomics + +@testset "atomic_add" begin + types = [Int32, Int64, UInt32, UInt64, Float32] + capability(device()) >= v"6.0" && push!(types, Float64) + capability(device()) >= v"7.0" && push!(types, Float16) + + function kernel(a, b) + CUDA.atomic_add!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[0]) + + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_sub" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, b) + CUDA.atomic_sub!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_inc" begin + function kernel(a, b) + CUDA.atomic_inc!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[0]) + @cuda threads=768 kernel(a, T(512)) + @test Array(a)[1] == 255 + end +end + +@testset "atomic_dec" begin + function kernel(a, b) + CUDA.atomic_dec!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[1024]) + @cuda threads=256 kernel(a, T(512)) + @test Array(a)[1] == 257 + end +end + +@testset "atomic_xchg" begin + function kernel(a, b) + CUDA.atomic_xchg!(pointer(a), b) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == one(T) + end +end + +@testset "atomic_and" begin + function kernel(a, T) + i = threadIdx().x - 1 + k = 1 + for i = 1:i + k *= 2 + end + b = 1023 - k # 1023 - 2^i + CUDA.atomic_and!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == zero(T) + end +end + +@testset "atomic_or" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_or!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[0]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 1023 + end +end + +@testset "atomic_xor" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_xor!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 0 + end +end + +@testset "atomic_cas" begin + types = [Int32, Int64, UInt32, UInt64] + capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16]) + + function kernel(a, b, c) + CUDA.atomic_cas!(pointer(a), b, c) + return + end + + @testset for T in types + a = CuArray(T[0]) + @cuda threads=1024 kernel(a, zero(T), 
one(T)) + @test Array(a)[1] == 1 + end +end + +@testset "atomic_max" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, T) + i = threadIdx().x + CUDA.atomic_max!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_min" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, T) + i = threadIdx().x + CUDA.atomic_min!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray(T[1024]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1 + end +end + +@testset "shared memory" begin + @testset "simple" begin + function kernel() + shared = CuStaticSharedArray(Float32, 1) + CUDA.atomic_add!(pointer(shared, threadIdx().x), 0f0) + return + end + + CUDA.@sync @cuda kernel() + end + + @testset "shared memory reduction" begin + # test that atomic operations on shared memory work + # https://github.com/JuliaGPU/CUDA.jl/issues/311 + + function kernel(a) + b = CUDA.CuStaticSharedArray(Int, 1) + + if threadIdx().x == 1 + b[] = a[] + end + sync_threads() + + CUDA.atomic_add!(pointer(b), 1) + sync_threads() + + if threadIdx().x == 1 + a[] = b[] + end + return + end + + a = CuArray([0]) + @cuda threads=16 kernel(a) + @test Array(a) == [16] + end + + @testset "shared memory bug" begin + # shared memory atomics resulted in illegal memory accesses + # https://github.com/JuliaGPU/CUDA.jl/issues/558 + + function kernel() + tid = threadIdx().x + shared = CuStaticSharedArray(Float32, 4) + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + sync_threads() + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + return + end + + @cuda threads=2 kernel() + synchronize() + end +end + +end # low-level atomics diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl new file mode 100644 index 0000000000..05f5d36978 --- /dev/null +++ b/test/kernelabstractions.jl @@ -0,0 +1,16 @@ +import KernelAbstractions +using Test + +include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl")) + +using CUDA +using CUDA.CUDAKernels + +if CUDA.functional() + CUDA.versioninfo() + CUDA.allowscalar(false) + Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CUDA.CuDeviceArray) + for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false)) + Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CUDA.CuDeviceArray) + end +end