From a6fc211f54a6ab9bb3eb19d1d2447554c8209465 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 27 Feb 2020 10:46:22 +0100 Subject: [PATCH 1/3] Use a custom generator function for control on back-edges. --- src/execution.jl | 235 ++++++++++++++++++++++----------------- test/device/execution.jl | 20 ++++ 2 files changed, 151 insertions(+), 104 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 90269042..079943ac 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -61,23 +61,6 @@ function assign_args!(code, args) return vars, var_exprs end -# fast lookup of global world age -world_age() = ccall(:jl_get_tls_world_age, UInt, ()) - -# slow lookup of local method age -function method_age(f, t)::UInt - for m in Base._methods(f, t, 1, typemax(UInt)) - if VERSION >= v"1.2.0-DEV.573" - return m[3].primary_world - else - return m[3].min_world - end - end - - tt = Base.to_tuple_type(t) - throw(MethodError(f, tt)) -end - ## high-level @cuda interface @@ -334,8 +317,87 @@ end ## host-side API -const agecache = Dict{UInt, UInt}() +using Core.Compiler: retrieve_code_info, CodeInfo, MethodInstance, SSAValue, SlotNumber +using Base: _methods_by_ftype + +# actual compilation +function cufunction_slow(f, tt, spec; name=nothing, kwargs...) + start = time_ns() + + # compile to PTX + ctx = context() + dev = device(ctx) + cap = supported_capability(dev) + asm, kernel_fn, undefined_fns = + compile(:ptx, cap, f, tt; name=name, strict=true, kwargs...) + + # settings to JIT based on Julia's debug setting + jit_options = Dict{CUDAdrv.CUjit_option,Any}() + if Base.JLOptions().debug_level == 1 + jit_options[CUDAdrv.JIT_GENERATE_LINE_INFO] = true + elseif Base.JLOptions().debug_level >= 2 + jit_options[CUDAdrv.JIT_GENERATE_DEBUG_INFO] = true + end + + # link the CUDA device library + image = asm + # linking the device runtime library requires use of the CUDA linker, + # which in turn switches compilation to device relocatable code (-rdc) mode. + # + # even if not doing any actual calls that need -rdc (i.e., calls to the runtime + # library), this significantly hurts performance, so don't do it unconditionally + intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail", + "__nvvm_reflect" #= TODO: should have been optimized away =#] + if !isempty(setdiff(undefined_fns, intrinsic_fns)) + @timeit_debug to "device runtime library" begin + linker = CUDAdrv.CuLink(jit_options) + CUDAdrv.add_file!(linker, libcudadevrt[], CUDAdrv.JIT_INPUT_LIBRARY) + CUDAdrv.add_data!(linker, kernel_fn, asm) + image = CUDAdrv.complete(linker) + end + end + + # JIT into an executable kernel object + mod = CuModule(image, jit_options) + fun = CuFunction(mod, kernel_fn) + kernel = HostKernel{f,tt}(ctx, mod, fun) + + create_exceptions!(mod) + + stop = time_ns() + @debug begin + ver = version(kernel) + mem = memory(kernel) + reg = registers(kernel) + fn = something(name, nameof(f)) + """Compiled $fn($(join(tt.parameters, ", "))) to PTX $(ver.ptx) for SM $(ver.binary) in $(round((time_ns() - start) / 1000000; digits=2)) ms. + Kernel uses $reg registers, and $(Base.format_bytes(mem.local)) local, $(Base.format_bytes(mem.shared)) shared, and $(Base.format_bytes(mem.constant)) constant memory.""" + end + + return kernel +end + +# cached compilation const compilecache = Dict{UInt, HostKernel}() +@inline function cufunction_fast(f, tt, spec; name=nothing, kwargs...) 
+ # generate a key for indexing the compilation cache + ctx = context() + key = hash(spec) + key = hash(pointer_from_objref(ctx), key) # contexts are unique, but handles might alias + # TODO: implement this as a hash function in CUDAdrv + key = hash(name, key) + key = hash(kwargs, key) + for nf in 1:nfields(f) + # mix in the values of any captured variable + key = hash(getfield(f, nf), key) + end + + return get!(compilecache, key) do + cufunction_slow(f, tt, spec; name=name, kwargs...) + end::HostKernel{f,tt} +end + +specialization_counter = 0 """ cufunction(f, tt=Tuple{}; kwargs...) @@ -356,92 +418,57 @@ The output of this function is automatically cached, i.e. you can simply call `c in a hot path without degrading performance. New code will be generated automatically, when when function changes, or when different types or keyword arguments are provided. """ -@generated function cufunction(f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) - tt = Base.to_tuple_type(tt.parameters[1]) - sig = Base.signature_type(f, tt) - t = Tuple(tt.parameters) - - precomp_key = hash(sig) # precomputable part of the keys - quote - Base.@_inline_meta - - # look-up the method age - key = hash(world_age(), $precomp_key) - if haskey(agecache, key) - age = agecache[key] - else - age = method_age(f, $t) - agecache[key] = age - end - - # generate a key for indexing the compilation cache - ctx = context() - key = hash(age, $precomp_key) - key = hash(pointer_from_objref(ctx), key) # contexts are unique, but handles might alias - key = hash(name, key) - key = hash(kwargs, key) - for nf in 1:nfields(f) - # mix in the values of any captured variable - key = hash(getfield(f, nf), key) - end - - # compile the function - if !haskey(compilecache, key) - start = time_ns() - - # compile to PTX - dev = device(ctx) - cap = supported_capability(dev) - asm, kernel_fn, undefined_fns = - compile(:ptx, cap, f, tt; name=name, strict=true, kwargs...) - - # settings to JIT based on Julia's debug setting - jit_options = Dict{CUDAdrv.CUjit_option,Any}() - if Base.JLOptions().debug_level == 1 - jit_options[CUDAdrv.JIT_GENERATE_LINE_INFO] = true - elseif Base.JLOptions().debug_level >= 2 - jit_options[CUDAdrv.JIT_GENERATE_DEBUG_INFO] = true - end - - # link the CUDA device library - image = asm - # linking the device runtime library requires use of the CUDA linker, - # which in turn switches compilation to device relocatable code (-rdc) mode. - # - # even if not doing any actual calls that need -rdc (i.e., calls to the runtime - # library), this significantly hurts performance, so don't do it unconditionally - intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail", - "__nvvm_reflect" #= TODO: should have been optimized away =#] - if !isempty(setdiff(undefined_fns, intrinsic_fns)) - @timeit_debug to "device runtime library" begin - linker = CUDAdrv.CuLink(jit_options) - CUDAdrv.add_file!(linker, libcudadevrt[], CUDAdrv.JIT_INPUT_LIBRARY) - CUDAdrv.add_data!(linker, kernel_fn, asm) - image = CUDAdrv.complete(linker) - end - end - - # JIT into an executable kernel object - mod = CuModule(image, jit_options) - fun = CuFunction(mod, kernel_fn) - kernel = HostKernel{f,tt}(ctx, mod, fun) - - create_exceptions!(mod) - - compilecache[key] = kernel - stop = time_ns() - @debug begin - ver = version(kernel) - mem = memory(kernel) - reg = registers(kernel) - fn = something(name, nameof(f)) - """Compiled $fn($(join(tt.parameters, ", "))) to PTX $(ver.ptx) for SM $(ver.binary) in $(round((time_ns() - start) / 1000000; digits=2)) ms. 
- Kernel uses $reg registers, and $(Base.format_bytes(mem.local)) local, $(Base.format_bytes(mem.shared)) shared, and $(Base.format_bytes(mem.constant)) constant memory.""" - end - end - - return compilecache[key]::HostKernel{f,tt} - end +@generated function cufunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) + # generated function that crafts a custom code info to call the actual cufunction impl. + # this gives us the flexibility to insert manual back edges for automatic recompilation. + tt = tt.parameters[1] + + # get a hold of the method and code info of the kernel function + sig = Tuple{f, tt.parameters...} + mthds = _methods_by_ftype(sig, -1, typemax(UInt)) + Base.isdispatchtuple(tt) || return(:(error("$tt is not a dispatch tuple"))) + length(mthds) == 1 || return (:(throw(MethodError(f,tt)))) + mtypes, msp, m = mthds[1] + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, (Any, Any, Any), m, mtypes, msp) + mi.def.isva && return :(error("varargs kernel methods are not supported")) + ci = retrieve_code_info(mi) + @assert isa(ci, CodeInfo) + + # generate a unique id to represent this specialization + global specialization_counter + id = (specialization_counter += 1) + # TODO: save the mi/ci here (or embed it in the AST to pass to cufunction) + # and use that to drive compilation + + # prepare a new code info + new_ci = copy(ci) + empty!(new_ci.code) + empty!(new_ci.codelocs) + empty!(new_ci.linetable) + empty!(new_ci.ssaflags) + new_ci.ssavaluetypes = 0 + new_ci.edges = MethodInstance[mi] + # XXX: setting this edge does not give us proper method invalidation, see + # JuliaLang/julia#34962 which demonstrates we also need to "call" the kernel. + # invoking `code_llvm` also does the necessary codegen, as does calling the + # underlying C methods -- which CUDAnative does, so everything Just Works. + + # prepare the slots + new_ci.slotnames = Symbol[:kwfunc, :kwargs, Symbol("#self#"), :f, :tt] + new_ci.slotflags = UInt8[0x00 for i = 1:5] + kwargs = SlotNumber(2) + f = SlotNumber(4) + tt = SlotNumber(5) + + # call the compiler + append!(new_ci.code, [Expr(:call, Core.kwfunc, cufunction_fast), + Expr(:call, merge, NamedTuple(), kwargs), + Expr(:call, SSAValue(1), SSAValue(2), cufunction_fast, f, tt, id), + Expr(:return, SSAValue(3))]) + append!(new_ci.codelocs, [0, 0, 0, 0]) + new_ci.ssavaluetypes += 4 + + return new_ci end # https://github.com/JuliaLang/julia/issues/14919 diff --git a/test/device/execution.jl b/test/device/execution.jl index e7292d30..e9c2bebc 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -316,6 +316,26 @@ end end +@testset "automatic recompilation (bis)" begin + arr = CuArray(zeros(Int)) + + @eval doit(ptr) = unsafe_store!(ptr, 1) + + function kernel(ptr) + doit(ptr) + return + end + + @cuda kernel(pointer(arr)) + @test Array(arr)[] == 1 + + @eval doit(ptr) = unsafe_store!(ptr, 2) + + @cuda kernel(pointer(arr)) + @test Array(arr)[] == 2 +end + + @testset "non-isbits arguments" begin function kernel1(T, i) sink(i) From 4201c4e912fd031b6d11b2f8192c0bc58a4621f8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 27 Feb 2020 10:46:36 +0100 Subject: [PATCH 2/3] Simplify at-cuda implementation. --- src/execution.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 079943ac..ed83c1b1 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -103,16 +103,11 @@ A device-side launch, aka. dynamic parallelism, is similar but more restricted: """ macro cuda(ex...) 
# destructure the `@cuda` expression - if length(ex) > 0 && ex[1].head == :tuple - error("The tuple argument to @cuda has been replaced by keywords: `@cuda threads=... fun(args...)`") - end call = ex[end] kwargs = ex[1:end-1] # destructure the kernel call - if call.head != :call - throw(ArgumentError("second argument to @cuda should be a function call")) - end + Meta.isexpr(call, :call) || throw(ArgumentError("second argument to @cuda should be a function call")) f = call.args[1] args = call.args[2:end] From c16685f53ef94d753f3769ce6de763bb02a1b656 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 3 Mar 2020 10:50:29 +0100 Subject: [PATCH 3/3] Leave one line table entry. --- src/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution.jl b/src/execution.jl index ed83c1b1..2abcd2a1 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -439,7 +439,7 @@ when function changes, or when different types or keyword arguments are provided new_ci = copy(ci) empty!(new_ci.code) empty!(new_ci.codelocs) - empty!(new_ci.linetable) + resize!(new_ci.linetable, 1) # codegen assumes at least one entry empty!(new_ci.ssaflags) new_ci.ssavaluetypes = 0 new_ci.edges = MethodInstance[mi]
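
Note (not part of the patches above): the per-specialization cache key built in
`cufunction_fast` folds in the values of any variables captured by the kernel
function, so closures of the same type but with different captured data do not
alias in `compilecache`. A minimal standalone sketch of that key composition,
runnable on its own (the helper name `cache_key` is hypothetical and only
illustrates the hashing loop from the patch):

    # sketch: compose a cache key the way cufunction_fast does, mixing every
    # captured field of the (possibly closure) kernel function into the hash
    function cache_key(f, name, kwargs)
        key = hash(name, hash(kwargs))
        for nf in 1:nfields(f)
            key = hash(getfield(f, nf), key)  # captured variables
        end
        return key
    end

    make_adder(a) = x -> x + a
    g1 = make_adder(1)
    g2 = make_adder(2)   # same closure type, different captured value
    @assert cache_key(g1, nothing, NamedTuple()) != cache_key(g2, nothing, NamedTuple())

In the real implementation the key additionally mixes in the hashed launch
specification and the context pointer, as shown in the first patch.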