diff --git a/Manifest.toml b/Manifest.toml index 6225d8d8a7..2d634a920b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,9 +2,9 @@ [[AbstractFFTs]] deps = ["ChainRulesCore", "LinearAlgebra"] -git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409" +git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.2.1" +version = "1.3.1" [[Adapt]] deps = ["LinearAlgebra", "Requires"] @@ -18,6 +18,12 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + [[BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" @@ -64,9 +70,9 @@ version = "0.1.6" [[Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04" +git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.6.0" +version = "4.6.1" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -87,9 +93,9 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[ExprTools]] -git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.8" +version = "0.1.9" [[GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] @@ -105,9 +111,9 @@ version = "0.1.4" [[GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.17.2" +version = "0.17.3" [[InteractiveUtils]] deps = ["Markdown"] @@ -130,6 +136,12 @@ git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.4.1" +[[KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "17d0bb94eef881b09c57967be12cca70fefb3304" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.0" + [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04" @@ -178,6 +190,12 @@ version = "0.3.23" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.10" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -253,6 +271,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SnoopPrecompile]] +deps = ["Preferences"] +git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c" +uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c" +version = "1.0.3" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -266,6 +290,17 @@ git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "2.2.0" 
+[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.5.16" + +[[StaticArraysCore]] +git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.0" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -295,6 +330,17 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +[[UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.0" + [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" diff --git a/Project.toml b/Project.toml index b18100dac1..4e35c695f5 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "4.0.1" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" @@ -14,6 +15,7 @@ CompilerSupportLibraries_jll = "e66e0078-7015-5450-92f7-15fbd957f2ae" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -28,10 +30,13 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" +UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] AbstractFFTs = "0.4, 0.5, 1.0" Adapt = "3.3" +Atomix = "0.1" BFloat16s = "0.2, 0.3, 0.4" CEnum = "0.2, 0.3, 0.4" CUDA_Driver_jll = "0.2" @@ -47,4 +52,5 @@ RandomNumbers = "1.5.3" Reexport = "0.2, 1.0" Requires = "0.5, 1.0" SpecialFunctions = "1.3, 2" +UnsafeAtomicsLLVM = "0.1" julia = "1.6" diff --git a/lib/cublas/linalg.jl b/lib/cublas/linalg.jl index ead34d4262..5779bc7c22 100644 --- a/lib/cublas/linalg.jl +++ b/lib/cublas/linalg.jl @@ -44,7 +44,7 @@ function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2} val = CUDA.reduce_block(+, local_val, zero(T), shuffle) if threadIdx().x == 1i32 # NOTE: introduces nondeterminism - @inbounds CUDA.@atomic res[] += val + @inbounds CUDA.@atomic res[1i32] += val end return diff --git a/src/CUDA.jl b/src/CUDA.jl index 5ddccec286..0e918eae14 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -107,6 +107,10 @@ include("../lib/nvml/NVML.jl") const has_nvml = NVML.has_nvml export NVML, has_nvml +# KernelAbstractions +include("CUDAKernels.jl") +export CUDABackend + include("precompile.jl") end diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl new file mode 100644 index 0000000000..628f528f75 --- /dev/null +++ b/src/CUDAKernels.jl @@ -0,0 +1,253 @@ +module CUDAKernels + +import KernelAbstractions +import CUDA + 
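+# A minimal usage sketch (assuming the KernelAbstractions `@kernel`/`@index` API and a
+# functional CUDA device; the `mul2!` kernel below is hypothetical):
+#
+#     using CUDA, KernelAbstractions
+#
+#     @kernel function mul2!(A)
+#         I = @index(Global)
+#         @inbounds A[I] *= 2
+#     end
+#
+#     A = CUDA.ones(1024)
+#     mul2!(CUDABackend())(A; ndrange=length(A))
+#     KernelAbstractions.synchronize(CUDABackend())
+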
+struct CUDABackend <: KernelAbstractions.GPU + prefer_blocks::Bool + always_inline::Bool +end +CUDABackend(;prefer_blocks=false, always_inline=false) = CUDABackend(prefer_blocks, always_inline) + +export CUDABackend + +KernelAbstractions.allocate(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.CuArray{T}(undef, dims) +KernelAbstractions.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims) +KernelAbstractions.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims) + +# Import through parent +import KernelAbstractions: StaticArrays, Adapt +import .StaticArrays: MArray + +KernelAbstractions.get_backend(::CUDA.CuArray) = CUDABackend() +KernelAbstractions.get_backend(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDABackend() + +KernelAbstractions.synchronize(::CUDABackend) = CUDA.synchronize() + +### +# copyto! +### +# - IdDict does not free the memory +# - WeakRef dict does not unique the key by objectid +const __pinned_memory = Dict{UInt64, WeakRef}() + +function __pin!(a) + # use pointer instead of objectid? + oid = objectid(a) + if haskey(__pinned_memory, oid) && __pinned_memory[oid].value !== nothing + return nothing + end + ad = CUDA.Mem.register(CUDA.Mem.Host, pointer(a), sizeof(a)) + finalizer(_ -> CUDA.Mem.unregister(ad), a) + __pinned_memory[oid] = WeakRef(a) + return nothing +end + +function KernelAbstractions.copyto!(::CUDABackend, A, B) + A isa Array && __pin!(A) + B isa Array && __pin!(B) + + GC.@preserve A B begin + destptr = pointer(A) + srcptr = pointer(B) + N = length(A) + unsafe_copyto!(destptr, srcptr, N, async=true) + end + return A +end + +import KernelAbstractions: Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config + +### +# Kernel launch +### +function launch_config(kernel::Kernel{CUDABackend}, ndrange, workgroupsize) + if ndrange isa Integer + ndrange = (ndrange,) + end + if workgroupsize isa Integer + workgroupsize = (workgroupsize, ) + end + + # partition checked that the ndrange's agreed + if KernelAbstractions.ndrange(kernel) <: StaticSize + ndrange = nothing + end + + iterspace, dynamic = if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && + workgroupsize === nothing + # use ndrange as preliminary workgroupsize for autotuning + partition(kernel, ndrange, ndrange) + else + partition(kernel, ndrange, workgroupsize) + end + + return ndrange, workgroupsize, iterspace, dynamic +end + +function threads_to_workgroupsize(threads, ndrange) + total = 1 + return map(ndrange) do n + x = min(div(threads, total), n) + total *= x + return x + end +end + +function (obj::Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=nothing) + backend = KernelAbstractions.backend(obj) + + ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) + # this might not be the final context, since we may tune the workgroupsize + ctx = mkcontext(obj, ndrange, iterspace) + + # If the kernel is statically sized we can tell the compiler about that + if KernelAbstractions.workgroupsize(obj) <: StaticSize + maxthreads = prod(KernelAbstractions.get(KernelAbstractions.workgroupsize(obj))) + else + maxthreads = nothing + end + + kernel = CUDA.@cuda launch=false always_inline=backend.always_inline maxthreads=maxthreads obj.f(ctx, args...) 
+ + # figure out the optimal workgroupsize automatically + if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing + config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange)) + if backend.prefer_blocks + # Prefer blocks over threads + threads = min(prod(ndrange), config.threads) + # XXX: Some kernels performs much better with all blocks active + cu_blocks = max(cld(prod(ndrange), threads), config.blocks) + threads = cld(prod(ndrange), cu_blocks) + else + threads = config.threads + end + + workgroupsize = threads_to_workgroupsize(threads, ndrange) + iterspace, dynamic = partition(obj, ndrange, workgroupsize) + ctx = mkcontext(obj, ndrange, iterspace) + end + + nblocks = length(blocks(iterspace)) + threads = length(workitems(iterspace)) + + if nblocks == 0 + return nothing + end + + # Launch kernel + kernel(ctx, args...; threads=threads, blocks=nblocks) + + return nothing +end + +# list of overrides (only for Julia 1.6) +const overrides = Expr[] + +import GPUCompiler +macro device_override(ex) + ex = macroexpand(__module__, ex) + if Meta.isexpr(ex, :call) + @show ex = eval(ex) + error() + end + code = quote + $GPUCompiler.@override($CUDA.method_table, $ex) + end + if isdefined(Base.Experimental, Symbol("@overlay")) + return esc(code) + else + push!(overrides, code) + return + end +end + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + precompiling && return + # register device overrides + eval(Expr(:block, overrides...)) + empty!(overrides) +end + +import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices +import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print +import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds + +function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace) + CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace) +end + +@device_override @inline function __index_Local_Linear(ctx) + return CUDA.threadIdx().x +end + +@device_override @inline function __index_Group_Linear(ctx) + return CUDA.blockIdx().x +end + +@device_override @inline function __index_Global_Linear(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + # TODO: This is unfortunate, can we get the linear index cheaper + @inbounds LinearIndices(__ndrange(ctx))[I] +end + +@device_override @inline function __index_Local_Cartesian(ctx) + @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x] +end + +@device_override @inline function __index_Group_Cartesian(ctx) + @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x] +end + +@device_override @inline function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) +end + +@device_override @inline function __validindex(ctx) + if __dynamic_checkbounds(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + return I in __ndrange(ctx) + else + return true + end +end + +import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace +import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize, __size + +### +# GPU implementation of shared memory +### + +@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} + CUDA.CuStaticSharedArray(T, Dims) +end + +### +# GPU 
implementation of scratch memory +# - private memory for each workitem +### + +@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} + MArray{__size(Dims), T}(undef) +end + +@device_override @inline function __synchronize() + CUDA.sync_threads() +end + +@device_override @inline function __print(args...) + CUDA._cuprint(args...) +end + +### +# GPU implementation of const memory +### + +Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental.Const(a) + +# Argument conversion +KernelAbstractions.argconvert(k::Kernel{CUDABackend}, arg) = CUDA.cudaconvert(arg) + +end diff --git a/src/compiler/gpucompiler.jl b/src/compiler/gpucompiler.jl index bd39307b3a..a6364ebbbc 100644 --- a/src/compiler/gpucompiler.jl +++ b/src/compiler/gpucompiler.jl @@ -15,7 +15,7 @@ function device_properties(dev) cap = maximum(caps) # select the PTX ISA we assume to be available - # (we actually only need 6.2, but NVPTX doesn't support that) + # 6.3 introduced `atom.cas.b16` ptx = v"6.3" # we need to take care emitting LLVM instructions like `unreachable`, which diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl index 19e9f66d3d..4a1cb4a36d 100644 --- a/src/compiler/reflection.jl +++ b/src/compiler/reflection.jl @@ -125,7 +125,7 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) function $method(io::IO, @nospecialize(func), @nospecialize(types); kernel::Bool=false, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, always_inline::Bool=false, - kwargs...) + cap=capability(device()), kwargs...) source = FunctionSpec(func, Base.to_tuple_type(types), kernel) target = CUDACompilerTarget(device(); minthreads, maxthreads, blocks_per_sm, maxregs) params = CUDACompilerParams() diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl index 443a7fd420..1bf3887b62 100644 --- a/src/device/intrinsics.jl +++ b/src/device/intrinsics.jl @@ -3,6 +3,28 @@ # special intrinsics for writing version-dependent code include("intrinsics/version.jl") +abstract type SyncScope end +struct SystemScope <: SyncScope end +struct DeviceScope <: SyncScope end +struct BlockScope <: SyncScope end + +const system_scope = SystemScope() +const device_scope = DeviceScope() +const block_scope = BlockScope() + +import UnsafeAtomics +using UnsafeAtomics.Internal: LLVMOrdering +using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst + +struct AtomicUnsupported{T} <: Exception end +struct AtomicOrderUnsupported{Ordering} <: Exception + order::Ordering +end + +# Note CUDA C++ has also consume ordering which LLVM does not support +# monotonic -> relaxed +# unordered -> ??? maybe weak + # extensions to the C language include("intrinsics/memory_shared.jl") include("intrinsics/indexing.jl") diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 15b8c10e39..876a456652 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -1,5 +1,7 @@ # Atomic Functions (B.12) +# TODO replace the below with UnsafeAtomicsLLVM if possible + # # Low-level intrinsics # @@ -357,117 +359,352 @@ This operation is only supported for values of type Int32. """ atomic_dec! 
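+# The helpers below (`asm`, `suffix`, `reg`, `addr_space`) assemble the inline-PTX strings used
+# by the ordered load/store/cas definitions that follow. As a worked example (illustrative
+# only), an acquire load at device scope of a 4-byte value in global memory expands to
+#     ld.acquire.gpu.global.b32 $0, [$1];
+# with the constraint string "=r,l,~{memory}".
+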
+asm(::Type{LLVMOrdering{:monotonic}}) = :relaxed +asm(::Type{LLVMOrdering{Order}}) where Order = Order + +asm(::Type{SystemScope}) = :sys +asm(::Type{DeviceScope}) = :gpu +asm(::Type{BlockScope}) = :cta + +function suffix(sz) + if sz == 1 + "b8" + elseif sz == 2 + "b16" + elseif sz == 4 + "b32" + elseif sz == 8 + "b64" + end +end +function reg(sz) + if sz == 1 + "r" + elseif sz == 2 + "h" + elseif sz == 4 + "r" + elseif sz == 8 + "l" + end +end -# -# High-level interface -# +function addr_space(A) + if A == AS.Global + as = ".global" + elseif A == AS.Shared + as = ".shared" + else + as = "" + end +end -# prototype of a high-level interface for performing atomic operations on arrays -# -# this design could be generalized by having atomic {field,array}{set,ref} accessors, as -# well as acquire/release operations to implement the fallback functionality where any -# operation can be applied atomically. - -if VERSION <= v"1.7-" -export @atomic -end - -const inplace_ops = Dict( - :(+=) => :(+), - :(-=) => :(-), - :(*=) => :(*), - :(/=) => :(/), - :(\=) => :(\), - :(%=) => :(%), - :(^=) => :(^), - :(&=) => :(&), - :(|=) => :(|), - :(⊻=) => :(⊻), - :(>>>=) => :(>>>), - :(>>=) => :(>>), - :(<<=) => :(<<), -) +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end -struct AtomicError <: Exception - msg::AbstractString +# Handle byte sized load +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) + end end -Base.showerror(io::IO, err::AtomicError) = - print(io, "AtomicError: ", err.msg) +@inline __load(ptr::LLVMPtr{T}, order, scope) where T = + __load(Val(sizeof(T)), ptr, order, scope) -""" - @atomic a[I] = op(a[I], val) - @atomic a[I] ...= val - -Atomically perform a sequence of operations that loads an array element `a[I]`, performs the -operation `op` on that value and a second value `val`, and writes the result back to the -array. This sequence can be written out as a regular assignment, in which case the same -array element should be used in the left and right hand side of the assignment, or as an -in-place application of a known operator. In both cases, the array reference should be pure -and not induce any side-effects. - -!!! warn - This interface is experimental, and might change without warning. Use the lower-level - `atomic_...!` functions for a stable API, albeit one limited to natively-supported ops. 
-""" -macro atomic(ex) - # decode assignment and call - if ex.head == :(=) - ref = ex.args[1] - rhs = ex.args[2] - Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call")) - op = rhs.args[1] - if rhs.args[2] != ref - throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side")) +for (A, sz) in Iterators.product( + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld.volatile$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end + +# Handle byte sized load +for (A) in (AS.Generic, AS.Global, AS.Shared) + instruction = "ld.volatile$(addr_space(A)).b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) + end +end + +@inline __load_volatile(ptr::LLVMPtr{T}) where {T} = + __load_volatile(Val(sizeof(T)), ptr) + +@inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == release + throw(AtomicOrderUnsupported(order)) + end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == monotonic + val = __load(ptr, monotonic, scope) + return val + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load(ptr, acquire, scope) + return val + elseif compute_capability() >= sv"6.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load_volatile(ptr) + if order == monotonic + return val + end + atomic_thread_fence(order, scope) + return val + else + throw(AtomicUnsupported{T}()) + end +end + +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:release}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) +end + +@inline __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T = + __store!(Val(sizeof(T)), ptr, val, order, scope) + +for (A, sz) in Iterators.product( + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store_volatile!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) +end + +# Could be done using LLVM. 
+@inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} = + __store_volatile!(Val(sizeof(T)), ptr, val) + +@inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == acquire # || order == consume + throw(AtomicOrderUnsupported(order)) + end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == release + __store!(ptr, val, release, scope) + return + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) end - val = rhs.args[3] - elseif haskey(inplace_ops, ex.head) - op = inplace_ops[ex.head] - ref = ex.args[1] - val = ex.args[2] + __store!(ptr, val, monotonic, scope) + elseif compute_capability() >= sv"6.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + __store_volatile!(ptr, val) else - throw(AtomicError("unknown @atomic expression")) + throw(AtomicUnsupported{T}()) + end +end + +order(::LLVMOrdering{:monotonic}) = 1 +# order(::Consume) = 2 +order(::LLVMOrdering{:acquire}) = 3 +order(::LLVMOrdering{:release}) = 4 +order(::LLVMOrdering{:acq_rel}) = 5 +order(::LLVMOrdering{:seq_cst}) = 6 + +Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) + +@inline function stronger_order(a::LLVMOrdering, b::LLVMOrdering) + m = max(a, b) + if m != release + return m + end + # maximum is release, what is the other one? + other = min(a, b) + if other == monotonic + return release + # elseif other == Consume() + # return Acq_Rel() + elseif other == acquire + return acq_rel + elseif other == release + return release + end + Base.llvmcall("unreachable", Cvoid, Tuple{}) + @assert(false) +end + +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (2, 4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +end + +# Handle byte sized cas +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "atom.$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) end +end + +@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = + __cas!(Val(sizeof(T)), ptr, old, new, order, scope) + +for (order, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared), + (2, 4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($instruction, $constraint, 
true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +end - # decode array expression - Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression")) - array = ref.args[1] - indices = Expr(:tuple, ref.args[2:end]...) - - esc(quote - $atomic_arrayset($array, $indices, $op, $val) - end) -end - -# FIXME: make this respect the indexing style -@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} = - atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val)) - -# native atomics -for (op,impl,typ) in [(+, atomic_add!, [UInt32,Int32,UInt64,Int64,Float32]), - (-, atomic_sub!, [UInt32,Int32,UInt64,Int64,Float32]), - (&, atomic_and!, [UInt32,Int32,UInt64,Int64]), - (|, atomic_or!, [UInt32,Int32,UInt64,Int64]), - (⊻, atomic_xor!, [UInt32,Int32,UInt64,Int64]), - (max, atomic_max!, [UInt32,Int32,UInt64,Int64]), - (min, atomic_min!, [UInt32,Int32,UInt64,Int64])] - @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), - val::T) where {T<:Union{$(typ...)}} = - $impl(pointer(A, I), val) -end - -# native atomics that are not supported on all devices -@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::typeof(+), - val::T) where {T <: Union{Float64}} - ptr = pointer(A, I) - if compute_capability() >= sv"6.0" - atomic_add!(ptr, val) +# Handle byte sized cas +for (order, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "atom.$(addr_space(A)).cas.$(asm(order)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas_old!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) + end +end + +@inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = + __cas_old!(Val(sizeof(T)), ptr, old, new, scope) + +@inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T + order = stronger_order(success_order, failure_order) + if sizeof(T) > 8 || sizeof(T) < 2 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + if order == seq_cst # order == consume + order = acquire + end + old = __cas!(ptr, expected, new, order, scope) + elseif compute_capability() >= sv"6.0" + if order == seq_cst || order == acq_rel || order == release + atomic_thread_fence(seq_cst, scope) + end + old = __cas_old!(ptr, expected, new, scope) + if order == seq_cst || order == acq_rel || order == acquire # order == consume + atomic_thread_fence(seq_cst, scope) + end else - atomic_op!(ptr, op, val) + throw(AtomicUnsupported{T}()) + end + return old +end + +# +# High-level interface +# +import Atomix: @atomic, @atomicswap, @atomicreplace +# import UnsafeAtomicsLLVM + +if VERSION <= v"1.7" + export @atomic +end + +using Atomix: Atomix, IndexableRef + +const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable} + +@inline function Atomix.get(ref::CuIndexableRef, order) + atomic_load(Atomix.pointer(ref), order) +end + +@inline function Atomix.set!(ref::CuIndexableRef, v, order) + v = convert(eltype(ref), v) + atomic_store!(Atomix.pointer(ref), v, order) +end + +@inline function Atomix.replace!(ref::CuIndexableRef, expected, desired, 
+ success_ordering, failure_ordering) + ptr = Atomix.pointer(ref) + expected = convert(eltype(ref), expected) + desired = convert(eltype(ref), desired) + return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering) +end + +@inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP} + old = atomic_load(ptr, order) + while true + expected = old + new = op(expected, x) + old = atomic_cas!(ptr, expected, new, order, monotonic) + if old === expected + return expected => new + end end end -# fallback using compare-and-swap -@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} = - atomic_op!(pointer(A, I), op, val) +@inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} + x = convert(eltype(ref), x) + ptr = Atomix.pointer(ref) + # TODO: Support hardware variants + # old = if op === (+) + # atomic_add!(ptr, x) + # elseif op === (-) + # atomic_sub!(ptr, x) + # elseif op === (&) + # atomic_and!(ptr, x) + # elseif op === (|) + # atomic_or!(ptr, x) + # elseif op === xor + # atomic_xor!(ptr, x) + # elseif op === min + # atomic_min!(ptr, x) + # elseif op === max + # atomic_max!(ptr, x) + # else + return modify!(ptr, op, x, order) + # end + # return old => op(old, x) +end diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index 9c76737e36..2cf19e7cf4 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -83,39 +83,126 @@ the warp. Cvoid, Tuple{UInt32}, convert(UInt32, mask)) end +@inline threadfence(::BlockScope) = threadfence_block() +@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) +@inline threadfence_sc_block() = @asmcall("fence.sc.cta;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_block() = @asmcall("fence.acq_rel.cta;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::BlockScope) + if compute_capability() >= sv"7.0" + if order == seq_cst + threadfence_sc_block() + elseif order == acquire || order == acq_rel || order == release # || order == consume + threadfence_acq_rel_block() + else + throw(AtomicOrderUnsupported(order)) + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_block() + else + throw(AtomicOrderUnsupported(order)) + end + end +end + +@inline threadfence(::DeviceScope=device_scope) = threadfence_device() +@inline threadfence_device() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) +@inline threadfence_sc_device() = @asmcall("fence.sc.gpu;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_device() = @asmcall("fence.acq_rel.gpu;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::DeviceScope=device_scope) + if compute_capability() >= sv"7.0" + if order == seq_cst + + threadfence_sc_device() + elseif order == acquire || + # order == consume || + order == acq_rel || + order == release + + threadfence_acq_rel_device() + else + throw(AtomicOrderUnsupported(order)) + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_device() + else + throw(AtomicOrderUnsupported(order)) + end + end +end + +@inline threadfence(::SystemScope) = threadfence_system() +@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) +@inline threadfence_sc_system() = @asmcall("fence.sc.sys;", "~{memory}", 
true, Cvoid, Tuple{})
+@inline threadfence_acq_rel_system() = @asmcall("fence.acq_rel.sys;", "~{memory}", true, Cvoid, Tuple{})
+
+function atomic_thread_fence(order, scope::SystemScope)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+
+            threadfence_sc_system()
+        elseif order == acquire ||
+               # order == consume ||
+               order == acq_rel ||
+               order == release
+
+            threadfence_acq_rel_system()
+        else
+            throw(AtomicOrderUnsupported(order))
+        end
+    else
+        if order == seq_cst ||
+           # order == consume ||
+           order == acquire ||
+           order == acq_rel ||
+           order == release
+
+            threadfence_system()
+        else
+            throw(AtomicOrderUnsupported(order))
+        end
+    end
+end
+
 """
-    threadfence_block()
+    threadfence(::SyncScope=device_scope)
 
 A memory fence that ensures that:
-- All writes to all memory made by the calling thread before the call to `threadfence_block()`
-  are observed by all threads in the block of the calling thread as occurring before all writes
-  to all memory made by the calling thread after the call to `threadfence_block()`
-- All reads from all memory made by the calling thread before the call to `threadfence_block()`
-  are ordered before all reads from all memory made by the calling thread after the call to `threadfence_block()`.
-"""
-@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+- All writes to all memory made by the calling thread before the call to `threadfence(scope)`
+  are observed by all threads in the scope of the calling thread as occurring before all writes
+  to all memory made by the calling thread after the call to `threadfence(scope)`
+- All reads from all memory made by the calling thread before the call to `threadfence(scope)`
+  are ordered before all reads from all memory made by the calling thread after the call to `threadfence(scope)`.
 
-"""
-    threadfence()
+SyncScope can be one of `block_scope`, `device_scope`, or `system_scope`.
+  - `block_scope` orders reads and writes on the *same* block.
+  - `device_scope` orders reads and writes on the *same* device.
+  - `system_scope` orders reads and writes across all threads in the device,
+    host threads, and all threads in peer devices.
 
-A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the
-calling thread and also ensures that no writes to all memory made by the calling thread after
-the call to `threadfence()` are observed by any thread in the device as occurring before any
-write to all memory made by the calling thread before the call to `threadfence()`.
+See [`atomic_thread_fence`](@ref) for a variant that takes atomic orderings.
 
-Note that for this ordering guarantee to be true, the observing threads must truly observe the
-memory and not cached versions of it; this is requires the use of volatile loads and stores,
-which is not available from Julia right now.
-"""
-@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+!!! note
+    Note that for this ordering guarantee to be true, the observing threads must truly observe the
+    memory and not cached versions of it; this requires the use of atomic loads and stores.
""" - threadfence_system() +function threadfence end -A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the -calling thread and also ensures that all writes to all memory made by the calling thread -before the call to `threadfence_system()` are observed by all threads in the device, -host threads, and all threads in peer devices as occurring before all writes to all -memory made by the calling thread after the call to `threadfence_system()`. """ -@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) + atomic_thread_fence(order::Atomicx.Ordering, ::SyncScope=device) +""" +function atomic_thread_fence end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 95252c2eda..ec5354dea7 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -7,7 +7,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -15,6 +17,7 @@ REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 29810defe7..da331c0474 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -1,410 +1,182 @@ -# TODO: unify with Base.@atomic -using CUDA: @atomic -using BFloat16s: BFloat16 - -@testset "atomics (low-level)" begin - -# tested on all natively-supported atomics - -@testset "atomic_add" begin - types = [Int32, Int64, UInt32, UInt64, Float32] - capability(device()) >= v"6.0" && push!(types, Float64) - capability(device()) >= v"7.0" && push!(types, Float16) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_add!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_sub" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[2048]) - - function kernel(a, b) - CUDA.atomic_sub!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_inc" begin - @testset for T in [Int32] - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_inc!(pointer(a), b) - return - end - - @cuda threads=768 kernel(a, T(512)) - @test Array(a)[1] == 255 - end -end - -@testset "atomic_dec" begin - @testset for T in [Int32] - a = CuArray(T[1024]) - - function kernel(a, b) - CUDA.atomic_dec!(pointer(a), b) - return - end - - @cuda threads=256 kernel(a, T(512)) - @test Array(a)[1] == 257 - end -end - -@testset "atomic_xchg" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray([zero(T)]) - - function kernel(a, b) - CUDA.atomic_xchg!(pointer(a), b) - return - end - - @cuda threads=1024 
kernel(a, one(T)) - @test Array(a)[1] == one(T) - end -end - -@testset "atomic_and" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - 1 - k = 1 - for i = 1:i - k *= 2 - end - b = 1023 - k # 1023 - 2^i - CUDA.atomic_and!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == zero(T) - end -end - -@testset "atomic_or" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[0]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_or!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 1023 - end -end - -@testset "atomic_xor" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_xor!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 0 - end -end - -@testset "atomic_cas" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16]) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b, c) - CUDA.atomic_cas!(pointer(a), b, c) - return - end - - @cuda threads=1024 kernel(a, zero(T), one(T)) - @test Array(a)[1] == 1 - end -end - -@testset "atomic_max" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray([zero(T)]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_max!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_min" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[1024]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_min!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1 - end -end - -@testset "shared memory" begin - function kernel() - shared = CuStaticSharedArray(Float32, 1) - @atomic shared[threadIdx().x] += 0f0 - return - end - - CUDA.@sync @cuda kernel() -end - -end +using CUDA: @atomic, @atomicswap, @atomicreplace @testset "atomics (high-level)" begin - -# tested on all types supported by atomic_cas! (which empowers the fallback definition) - -@testset "add" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + # tested on all types supported by atomic_cas! 
(which empowers the fallback definition) + + @testset "add" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] + 1 @atomic a[1] += 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 2048 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "sub" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[2048]) - + + @testset "sub" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] - 1 @atomic a[1] -= 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 0 + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "mul" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1]) - + + @testset "mul" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] * 2 @atomic a[1] *= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1024 + @testset for T in types + a = CuArray(T[1]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "div" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1024]) - + + @testset "div" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] / 2 @atomic a[1] /= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray(T[32]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 1 + end end -end - -@testset "and" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([~zero(T), ~zero(T)]) - + + @testset "and" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = ~(T(1) << (i-1)) - @atomic a[1] = a[1] & mask - @atomic a[2] &= mask + @atomic a[1] &= mask return end - - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == zero(T) - @test Array(a)[2] == zero(T) + + @testset for T in types + a = CuArray([~zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == zero(T) + end end -end - -@testset "or" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([zero(T), zero(T)]) + + @testset "or" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) function kernel(T, a) i = threadIdx().x mask = T(1) << (i-1) - @atomic a[1] = a[1] | mask - @atomic a[2] |= mask + 
@atomic a[1] |= mask return end - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == ~zero(T) - @test Array(a)[2] == ~zero(T) + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == ~zero(T) + end end -end - -@testset "xor" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([zero(T), zero(T)]) - + + @testset "xor" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = T(1) << ((i-1)%(8*sizeof(T))) - @atomic a[1] = a[1] ⊻ mask - @atomic a[2] ⊻= mask + @atomic a[1] ⊻= mask return end - nb = 4 - @cuda threads=(8*sizeof(T)+nb) kernel(T, a) - @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) - @test Array(a)[2] == ~zero(T) & ~((one(T) << nb) - one(T)) + @testset for T in types + a = CuArray([zero(T)]) + nb = 4 + @cuda threads=(8*sizeof(T)+nb) kernel(T, a) + @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) + end end -end - -@testset "max" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + + @testset "max" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = max(a[1], i) + @atomic a[1] max i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 32 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=32 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "min" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([typemax(T)]) - + + @testset "min" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = min(a[1], i) + @atomic a[1] min i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray([typemax(T)]) + @cuda threads=32 kernel(T, a) + @test Array(a)[1] == 1 + end end -end - -@testset "shift" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([one(T)]) - + + @testset "shift" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) @atomic a[1] <<= 1 return end - @cuda threads=8 kernel(T, a) - @test Array(a)[1] == 1<<8 + @testset for T in types + a = CuArray([one(T)]) + @cuda threads=8 kernel(T, a) + @test Array(a)[1] == 1<<8 + end end -end - -@testset "macro" begin - + @testset "NaN" begin f(x,y) = 3x + 2y function kernel(x) - CUDA.@atomic x[1] = f(x[1],42f0) + @inbounds CUDA.@atomic x[1] f 42f0 nothing end @@ -417,69 +189,15 @@ end @test isnan(Array(a)[1]) end - using CUDA: AtomicError - - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = 1 - end - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = b ? 
1 : 2 - end - - @test_throws_macro AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side") @macroexpand begin - @atomic a[1] = a[2] + 1 - end - - @test_throws_macro AtomicError("unknown @atomic expression") @macroexpand begin - @atomic wat(a[1]) - end - - @test_throws_macro AtomicError("@atomic should be applied to an array reference expression") @macroexpand begin - @atomic a = a + 1 - end -end - -@testset "shared memory" begin - # test that atomic operations on shared memory work - # https://github.com/JuliaGPU/CUDA.jl/issues/311 - - function kernel(a) - b = CUDA.CuStaticSharedArray(Int, 1) - - if threadIdx().x == 1 - b[] = a[] + @testset "macro" begin + @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin + @atomic wat(a[1]) end - sync_threads() - - CUDA.atomic_add!(pointer(b), 1) - sync_threads() - - if threadIdx().x == 1 - a[] = b[] + + @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin + @atomic a = a + 1 end - return - end - - a = CuArray([0]) - @cuda threads=16 kernel(a) - @test Array(a) == [16] -end - -@testset "shared memory bug" begin - # shared memory atomics resulted in illegal memory accesses - # https://github.com/JuliaGPU/CUDA.jl/issues/558 - - function kernel() - tid = threadIdx().x - shared = CuStaticSharedArray(Float32, 4) - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - return end - - @cuda threads=2 kernel() - synchronize() -end - + end + \ No newline at end of file diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl new file mode 100644 index 0000000000..0db2c30527 --- /dev/null +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -0,0 +1,303 @@ +using BFloat16s: BFloat16 + +@testset "atomics (low-level) with order" begin + +@testset "atomic_load" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + BFloat16, Float16, Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] + + function kernel(a, order, scope) + CUDA.atomic_load(pointer(a), order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, order, scope) + @test Array(a)[1] == 0 + end + end +end + +@testset "atomic_store!" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + BFloat16, Float16, Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] + + function kernel(a, val, order, scope) + CUDA.atomic_store!(pointer(a), val, order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, one(T), order, scope) + @test Array(a)[1] == one(T) + end + end +end + +@testset "atomic_cas!" 
begin + if capability(device()) >= v"6.0" + # TODO size(T) in (1, 2) + types = [Int32, Int64, + UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] + + function kernel(a, expected, desired, success_order, failure_order, scope) + CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope) + return + end + + @testset for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope) + @test Array(a)[1] == one(T) + end + end +end + +end # atomics (low-level) with order + +@testset "atomics (low-level)" begin + +# tested on all natively-supported atomics + +@testset "atomic_add" begin + types = [Int32, Int64, UInt32, UInt64, Float32] + capability(device()) >= v"6.0" && push!(types, Float64) + capability(device()) >= v"7.0" && push!(types, Float16) + + function kernel(a, b) + CUDA.atomic_add!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[0]) + + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_sub" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, b) + CUDA.atomic_sub!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_inc" begin + function kernel(a, b) + CUDA.atomic_inc!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[0]) + @cuda threads=768 kernel(a, T(512)) + @test Array(a)[1] == 255 + end +end + +@testset "atomic_dec" begin + function kernel(a, b) + CUDA.atomic_dec!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[1024]) + @cuda threads=256 kernel(a, T(512)) + @test Array(a)[1] == 257 + end +end + +@testset "atomic_xchg" begin + function kernel(a, b) + CUDA.atomic_xchg!(pointer(a), b) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == one(T) + end +end + +@testset "atomic_and" begin + function kernel(a, T) + i = threadIdx().x - 1 + k = 1 + for i = 1:i + k *= 2 + end + b = 1023 - k # 1023 - 2^i + CUDA.atomic_and!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == zero(T) + end +end + +@testset "atomic_or" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_or!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[0]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 1023 + end +end + +@testset "atomic_xor" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_xor!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 0 + end +end + +@testset "atomic_cas" begin + types = [Int32, Int64, UInt32, UInt64] + capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16]) + + function kernel(a, b, c) + CUDA.atomic_cas!(pointer(a), b, c) + return + end + + @testset for T in types + a = CuArray(T[0]) + @cuda threads=1024 kernel(a, zero(T), 
one(T)) + @test Array(a)[1] == 1 + end +end + +@testset "atomic_max" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, T) + i = threadIdx().x + CUDA.atomic_max!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_min" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, T) + i = threadIdx().x + CUDA.atomic_min!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray(T[1024]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1 + end +end + +@testset "shared memory" begin + @testset "simple" begin + function kernel() + shared = CuStaticSharedArray(Float32, 1) + CUDA.atomic_add!(pointer(shared, threadIdx().x), 0f0) + return + end + + CUDA.@sync @cuda kernel() + end + + @testset "shared memory reduction" begin + # test that atomic operations on shared memory work + # https://github.com/JuliaGPU/CUDA.jl/issues/311 + + function kernel(a) + b = CUDA.CuStaticSharedArray(Int, 1) + + if threadIdx().x == 1 + b[] = a[] + end + sync_threads() + + CUDA.atomic_add!(pointer(b), 1) + sync_threads() + + if threadIdx().x == 1 + a[] = b[] + end + return + end + + a = CuArray([0]) + @cuda threads=16 kernel(a) + @test Array(a) == [16] + end + + @testset "shared memory bug" begin + # shared memory atomics resulted in illegal memory accesses + # https://github.com/JuliaGPU/CUDA.jl/issues/558 + + function kernel() + tid = threadIdx().x + shared = CuStaticSharedArray(Float32, 4) + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + sync_threads() + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + return + end + + @cuda threads=2 kernel() + synchronize() + end +end + +end # low-level atomics diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl new file mode 100644 index 0000000000..05f5d36978 --- /dev/null +++ b/test/kernelabstractions.jl @@ -0,0 +1,16 @@ +import KernelAbstractions +using Test + +include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl")) + +using CUDA +using CUDA.CUDAKernels + +if CUDA.functional() + CUDA.versioninfo() + CUDA.allowscalar(false) + Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CUDA.CuDeviceArray) + for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false)) + Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CUDA.CuDeviceArray) + end +end