Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Atomic - causes LLVM error on CUDA #36

Open
pxl-th opened this issue Apr 11, 2023 · 0 comments
Open

Atomic - causes LLVM error on CUDA #36

pxl-th opened this issue Apr 11, 2023 · 0 comments

Comments

@pxl-th
Copy link

pxl-th commented Apr 11, 2023

Ref FluxML/NNlib.jl#487.

Consider this kernel:

# Scatter kernel: for each work item `i`, combine `src[i]` into `dst` at the
# location given by `idxs[i]`, using `op` through Atomix's modify operation.
@kernel function _scatter!(op::OP, dst, src, idxs) where OP
    # Index of the current work item (KernelAbstractions `@index(Global)`).
    i = @index(Global)
    # Convert the index entry to a tuple so it can be used as the index of the IndexableRef.
    idx = Tuple(idxs[i])
    # Update `dst` at `idx` with `op` and `src[i]` via Atomix.modify!.
    # Per this report, the kernel fails to compile on CUDA when `op` is `-`.
    Atomix.modify!(Atomix.IndexableRef(dst, idx), op, src[i])
end

On Julia 1.9-rc2, this kernel fails on CUDA for the - op, but works for +, *, /, min, max:

julia> using CUDA, NNlib

julia> x = CUDA.ones(Float32, 3, 4);

julia> idxs= cu([1 2 3 4; 4 3 2 1; 3 5 5 3]);

julia> y = NNlib.scatter(+, x, idxs);

julia> y = NNlib.scatter(-, x, idxs);
ERROR: LLVM error: Cannot select: 0x817b8d0: f32,ch = <<Unknown DAG Node>><(load store seq_cst (s32) on %ir.33, addrspace 1)> 0x817bad8:1, 0x7efa020, 0x817bad8, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:255 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:255 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:359 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/internal.jl:20 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
  0x7efa020: i64 = add 0x7ef93f0, Constant:i64<-4>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
    0x7ef93f0: i64 = add 0x817b798, 0x7c80840, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
      0x817b798: i64,ch = CopyFromReg 0x7569e78, Register:i64 %0, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
        0x817bd48: i64 = Register %0
      0x7c80840: i64 = shl 0x817b528, Constant:i32<2>, int.jl:88 @[ abstractarray.jl:1247 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ]
        0x817b528: i64,ch = CopyFromReg 0x7569e78, Register:i64 %9, int.jl:88 @[ abstractarray.jl:1247 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ]
          0x817bc10: i64 = Register %9
        0x817bc78: i32 = Constant<2>
    0x817b868: i64 = Constant<-4>
  0x817bad8: f32,ch = load<(load (s32) from %ir.28, !tbaa !203, addrspace 1)> 0x7569e78, 0x817be80, undef:i64, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
    0x817be80: i64 = add nuw 0x7ef9f50, Constant:i64<4>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
      0x7ef9f50: i64 = add 0x7ef9e80, 0x7ef9db0, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
        0x7ef9e80: i64,ch = CopyFromReg 0x7569e78, Register:i64 %1, /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:80 @[ none:0 ]
          0x817b9a0: i64 = Register %1
        0x7ef9db0: i64 = shl 0x7efa0f0, Constant:i32<2>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
          0x7efa0f0: i64,ch = CopyFromReg 0x7569e78, Register:i64 %8, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
            0x817bf50: i64 = Register %8
          0x817bc78: i32 = Constant<2>
      0x817b4c0: i64 = Constant<4>
    0x817b5f8: i64 = undef
In function: _Z13gpu__scatter_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE1_13CuDeviceArrayI7Float32Li1ELi1EES8_IS9_Li2ELi1EES8_IS5_Li2ELi1EE
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/TLGyi/src/core/context.jl:118
  [2] LLVMTargetMachineEmitToMemoryBuffer
    @ ~/.julia/packages/LLVM/TLGyi/lib/13/libLLVM_h.jl:947 [inlined]
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/TLGyi/src/targetmachine.jl:45
  [4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/mcgen.jl:73
  [5] macro expansion
    @ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:424 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
  [8] macro expansion
    @ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:421 [inlined]
  [9] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/utils.jl:83
 [10] emit_asm
    @ ~/.julia/packages/GPUCompiler/HQBY9/src/utils.jl:77 [inlined]
 [11] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing, ctx::LLVM.ThreadSafeContext)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:140
 [12] codegen
    @ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:94 [inlined]
 [13] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, ctx::LLVM.ThreadSafeContext)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:90
 [14] compile
    @ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:81 [inlined]
 [15] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
    @ CUDA ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:105
 [16] #203
    @ ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:100 [inlined]
 [17] LLVM.ThreadSafeContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ LLVM ~/.julia/packages/LLVM/TLGyi/src/executionengine/ts_module.jl:14
 [18] JuliaContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:35
 [19] compile
    @ ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:99 [inlined]
 [20] actual_compilation(cache::Dict{UInt64, Any}, key::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/cache.jl:53
 [21] cached_compilation(cache::Dict{UInt64, Any}, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/cache.jl:37
 [22] macro expansion
    @ ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:310 [inlined]
 [23] macro expansion
    @ ./lock.jl:267 [inlined]
 [24] cufunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(-), CuDeviceVector{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Int64, 1}}}; kwargs::Base.Pairs{Symbol, Union{Nothing, Bool}, Tuple{Symbol, Symbol}, NamedTuple{(:always_inline, :maxthreads), Tuple{Bool, Nothing}}})
    @ CUDA ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:306
 [25] macro expansion
    @ ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:104 [inlined]
 [26] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/is36v/src/CUDAKernels.jl:116
 [27] Kernel
    @ ~/.julia/packages/CUDA/is36v/src/CUDAKernels.jl:102 [inlined]
 [28] scatter!
    @ ~/code/NNlib.jl/src/scatter.jl:104 [inlined]
 [29] scatter(op::typeof(-), src::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, idx::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/code/NNlib.jl/src/scatter.jl:177
 [30] scatter(op::typeof(-), src::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, idx::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer})
    @ NNlib ~/code/NNlib.jl/src/scatter.jl:168
 [31] top-level scope
    @ REPL[7]:1
 [32] top-level scope
    @ ~/.julia/packages/CUDA/is36v/src/initialization.jl:162

Additionally, irrespective of backend, replacing:

Atomix.modify!(Atomix.IndexableRef(dst, idx), op, src[i])

with:

@atomic dst[idx...] = op(dst[idx...], src[i])

results in op not being performed atomically.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant