From bd386ffd68160fd35e9c9008e44d9a8d939e16e6 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 6 Mar 2018 17:26:46 -0500
Subject: [PATCH] cfunction macro: extend cfunction capabilities

Provide static support for handling dynamic calls and closures
---
 base/c.jl                                    |  44 +-
 base/compiler/abstractinterpretation.jl      |  25 +-
 base/compiler/optimize.jl                    |  15 +-
 base/compiler/validation.jl                  |  11 +-
 base/deprecated.jl                           |   6 +
 base/exports.jl                              |   2 +-
 base/libuv.jl                                |  19 +-
 base/threadcall.jl                           |  61 +--
 doc/src/base/c.md                            |   2 +-
 doc/src/manual/calling-c-and-fortran-code.md | 102 ++--
 src/ast.c                                    |   8 +-
 src/ccall.cpp                                |  85 ++--
 src/cgutils.cpp                              |  26 +-
 src/codegen.cpp                              | 462 +++++++++++++++----
 src/julia-syntax.scm                         |  11 +-
 src/julia_internal.h                         |  10 +-
 src/method.c                                 |  53 ++-
 src/runtime_ccall.cpp                        | 142 ++++++
 src/toplevel.c                               |   9 +-
 stdlib/FileWatching/src/FileWatching.jl      |  15 +-
 stdlib/LibGit2/src/callbacks.jl              |  12 +-
 stdlib/LibGit2/src/tree.jl                   |  65 ++-
 stdlib/LibGit2/test/libgit2.jl               |   2 +-
 stdlib/LibGit2/test/online.jl                |   2 +-
 stdlib/Libdl/src/Libdl.jl                    |   8 +-
 stdlib/Sockets/src/Sockets.jl                |  12 +-
 test/ambiguous.jl                            |   9 +-
 test/ccall.jl                                | 197 ++++++--
 test/core.jl                                 |  21 +-
 test/embedding/embedding.c                   |   2 +-
 test/misc.jl                                 |  15 +-
 test/reflection.jl                           |   8 +-
 test/spawn.jl                                |   2 +-
 test/staged.jl                               |   7 +-
 test/threads.jl                              |  40 +-
 35 files changed, 1105 insertions(+), 405 deletions(-)

diff --git a/base/c.jl b/base/c.jl
index 6a7403678771c5..80fafcd844b5c7 100644
--- a/base/c.jl
+++ b/base/c.jl
@@ -16,12 +16,29 @@ respectively.
 """
 cglobal
 
+struct CFunction
+    ptr::Ptr{Cvoid} # C-callable entry point; this is what `unsafe_convert` hands to C
+    f::Any
+    _1::Ptr{Cvoid}
+    _2::Ptr{Cvoid}
+    let constructor = false end # NOTE(review): presumably suppresses the default constructors - confirm
+end
+unsafe_convert(::Type{Ptr{Cvoid}}, cf::CFunction) = cf.ptr
+
 """
-    cfunction(f::Function, returntype::Type, argtypes::Type) -> Ptr{Cvoid}
+    @cfunction(callable, ReturnType, (ArgumentTypes...,)) -> Ptr{Cvoid}
+    @cfunction(\$callable, ReturnType, (ArgumentTypes...,)) -> CFunction
+
+Generate a C-callable function pointer from the Julia function `callable`
+for the given type signature.
+
+Note that the argument type tuple must be a literal tuple, and not a tuple-valued variable or expression
+(although it can include a splat expression). Also note that these arguments will be evaluated in global
+scope at compile-time (not deferred until runtime).
+Adding a `\$` in front of the function argument changes this to instead create a runtime closure
+over the local variable `callable`.
 
-Generate C-callable function pointer from the Julia function `f`. Type annotation of the return
-value in the callback function is a must for situations where Julia cannot infer the return
-type automatically.
+See [manual section on ccall and cfunction usage](@ref Calling-C-and-Fortran-Code).
 
 # Examples
 ```julia-repl
@@ -29,11 +46,26 @@ julia> function foo(x::Int, y::Int)
            return x + y
        end
 
-julia> cfunction(foo, Int, Tuple{Int,Int})
+julia> @cfunction(foo, Int, (Int, Int))
 Ptr{Cvoid} @0x000000001b82fcd0
 ```
 """
-cfunction(f, r, a) = ccall(:jl_function_ptr, Ptr{Cvoid}, (Any, Any, Any), f, r, a)
+macro cfunction(f, rt, at)
+    if !(isa(at, Expr) && at.head === :tuple)
+        throw(ArgumentError("@cfunction argument types must be a literal tuple"))
+    end
+    at.head = :call # turn the literal `(T1, T2, ...)` into a `Core.svec(T1, T2, ...)` call
+    pushfirst!(at.args, GlobalRef(Core, :svec))
+    if isa(f, Expr) && f.head === :$
+        fptr = f.args[1] # `$callable` form: pass the expression through, result is a CFunction
+        typ = CFunction
+    else
+        fptr = QuoteNode(f) # plain form: quote the callable, result is a Ptr{Cvoid}
+        typ = Ptr{Cvoid}
+    end
+    cfun = Expr(:cfunction, typ, fptr, rt, at, QuoteNode(:ccall)) # (result type, callable, rt, at, cconv)
+    return esc(cfun)
+end
 
 if ccall(:jl_is_char_signed, Ref{Bool}, ())
     const Cchar = Int8
diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl
index 2acb39c1a9e5e3..8a3cbf51902652 100644
--- a/base/compiler/abstractinterpretation.jl
+++ b/base/compiler/abstractinterpretation.jl
@@ -663,8 +663,8 @@ function abstract_call(@nospecialize(f), fargs::Union{Tuple{},Vector{Any}}, argt
     return abstract_call_gf_by_type(f, argtypes, atype, sv)
 end
 
-function abstract_eval_call(e::Expr, vtypes::VarTable, sv::InferenceState)
-    argtypes = Any[abstract_eval(a, vtypes, sv) for a in e.args]
+# wrapper around `abstract_call` for first computing if `f` is available
+function abstract_eval_call(fargs::Union{Tuple{},Vector{Any}}, argtypes::Vector{Any}, vtypes::VarTable, sv::InferenceState)
     #print("call ", e.args[1], argtypes, "\n\n")
     for x in argtypes
         x === Bottom && return Bottom
@@ -689,7 +689,7 @@ function abstract_eval_call(e::Expr, vtypes::VarTable, sv::InferenceState)
         end
         return abstract_call_gf_by_type(nothing, argtypes, argtypes_to_type(argtypes), sv)
     end
-    return abstract_call(f, e.args, argtypes, vtypes, sv)
+    return abstract_call(f, fargs, argtypes, vtypes, sv)
 end
 
 function sp_type_rewrap(@nospecialize(T), linfo::MethodInstance, isreturn::Bool)
@@ -730,6 +730,18 @@ function sp_type_rewrap(@nospecialize(T), linfo::MethodInstance, isreturn::Bool)
     return T
 end
 
+function abstract_eval_cfunction(e::Expr, vtypes::VarTable, sv::InferenceState)
+    # e.args[3] (the declared return type) is not needed to infer the call itself
+    at = Any[abstract_eval(e.args[2], vtypes, sv)] # callee type goes first
+    for argt in e.args[4]::SimpleVector
+        push!(at, sp_type_rewrap(argt, sv.linfo, false))
+    end
+    # this may be the wrong world for the call, but some of the result is
+    # likely to be valid anyways and that may help generate better codegen
+    abstract_eval_call((), at, vtypes, sv)
+    return nothing
+end
+
 function abstract_eval(@nospecialize(e), vtypes::VarTable, sv::InferenceState)
     if isa(e, QuoteNode)
         return AbstractEvalConstant((e::QuoteNode).value)
@@ -748,7 +760,8 @@ function abstract_eval(@nospecialize(e), vtypes::VarTable, sv::InferenceState)
     end
     e = e::Expr
     if e.head === :call
-        t = abstract_eval_call(e, vtypes, sv)
+        argtypes = Any[ abstract_eval(a, vtypes, sv) for a in e.args ]
+        t = abstract_eval_call(e.args, argtypes, vtypes, sv)
     elseif e.head === :new
         t = instanceof_tfunc(abstract_eval(e.args[1], vtypes, sv))[1]
         for i = 2:length(e.args)
@@ -767,6 +780,10 @@ function abstract_eval(@nospecialize(e), vtypes::VarTable, sv::InferenceState)
                 t = Bottom
             end
         end
+    elseif e.head === :cfunction
+        t = e.args[1]
+        isa(t, Type) || (t = Any)
+        abstract_eval_cfunction(e, vtypes, sv)
     elseif e.head === :static_parameter
         n = e.args[1]
         t = Any
diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl
index 1ed62775d08aa6..3940fc67d176c0 100644
--- a/base/compiler/optimize.jl
+++ b/base/compiler/optimize.jl
@@ -769,17 +769,26 @@ function substitute!(
         head = e.head
         if head === :static_parameter
             return quoted(spvals[e.args[1]])
+        elseif head === :cfunction
+            @assert !isa(spsig, UnionAll) || !isempty(spvals)
+            if !(e.args[2] isa QuoteNode) # very common no-op
+                e.args[2] = substitute!(e.args[2], na, argexprs, spsig, spvals, offset, boundscheck)
+            end
+            e.args[3] = ccall(:jl_instantiate_type_in_env, Any, (Any, Any, Ptr{Any}), e.args[3], spsig, spvals)
+            e.args[4] = svec(Any[
+                ccall(:jl_instantiate_type_in_env, Any, (Any, Any, Ptr{Any}), argt, spsig, spvals)
+                for argt
+                in e.args[4] ]...)
         elseif head === :foreigncall
             @assert !isa(spsig, UnionAll) || !isempty(spvals)
             for i = 1:length(e.args)
                 if i == 2
                     e.args[2] = ccall(:jl_instantiate_type_in_env, Any, (Any, Any, Ptr{Any}), e.args[2], spsig, spvals)
                 elseif i == 3
-                    argtuple = Any[
+                    e.args[3] = svec(Any[
                         ccall(:jl_instantiate_type_in_env, Any, (Any, Any, Ptr{Any}), argt, spsig, spvals)
                         for argt
-                        in e.args[3] ]
-                    e.args[3] = svec(argtuple...)
+                        in e.args[3] ]...)
                 elseif i == 4
                     @assert isa((e.args[4]::QuoteNode).value, Symbol)
                 elseif i == 5
diff --git a/base/compiler/validation.jl b/base/compiler/validation.jl
index 90d08481a94205..6dbe181f74c759 100644
--- a/base/compiler/validation.jl
+++ b/base/compiler/validation.jl
@@ -22,6 +22,7 @@ const VALID_EXPR_HEADS = IdDict{Any,Any}(
     :meta => 0:typemax(Int),
     :global => 1:1,
     :foreigncall => 3:typemax(Int),
+    :cfunction => 6:6,
     :isdefined => 1:1,
     :simdloop => 0:0,
     :gc_preserve_begin => 0:typemax(Int),
@@ -139,9 +140,11 @@ function validate_code!(errors::Vector{>:InvalidCodeError}, c::CodeInfo, is_top_
                 end
                 validate_val!(x.args[1])
             elseif head === :call || head === :invoke || head == :gc_preserve_end || head === :meta ||
-                head === :inbounds || head === :foreigncall || head === :const || head === :enter ||
-                head === :leave || head === :method || head === :global || head === :static_parameter ||
-                head === :new || head === :thunk || head === :simdloop || head === :throw_undef_if_not || head === :unreachable
+                head === :inbounds || head === :foreigncall || head === :cfunction ||
+                head === :const || head === :enter || head === :leave ||
+                head === :method || head === :global || head === :static_parameter ||
+                head === :new || head === :thunk || head === :simdloop ||
+                head === :throw_undef_if_not || head === :unreachable
                 validate_val!(x)
             else
                 push!(errors, InvalidCodeError("invalid statement", x))
@@ -221,7 +224,7 @@ end
 
 function is_valid_rvalue(lhs, x)
     is_valid_argument(x) && return true
-    if isa(x, Expr) && x.head in (:new, :the_exception, :isdefined, :call, :invoke, :foreigncall, :gc_preserve_begin)
+    if isa(x, Expr) && x.head in (:new, :the_exception, :isdefined, :call, :invoke, :foreigncall, :cfunction, :gc_preserve_begin)
         return true
         # TODO: disallow `globalref = call` when .typ field is removed
         #return isa(lhs, SSAValue) || isa(lhs, Slot)
diff --git a/base/deprecated.jl b/base/deprecated.jl
index 65275475841182..c34ec99b6552b5 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -495,6 +495,12 @@ end
 
 # PR #23066
 @deprecate cfunction(f, r, a::Tuple) cfunction(f, r, Tuple{a...})
+@noinline function cfunction(@nospecialize(f), @nospecialize(r), @nospecialize(a))
+    depwarn("The function `cfunction` is now written as a macro `@cfunction`.", :cfunction)
+    fptr = ccall(:jl_function_ptr, Ptr{Cvoid}, (Any, Any, Any), f, r, a)
+    return fptr
+end
+export cfunction
 
 # PR 23341
 @eval GMP @deprecate gmp_version() version() false
diff --git a/base/exports.jl b/base/exports.jl
index 15de30529884fc..9c7c4360ec9933 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -924,7 +924,7 @@ export
     withenv,
 
 # C interface
-    cfunction,
+    @cfunction,
     cglobal,
     disable_sigint,
     pointer,
diff --git a/base/libuv.jl b/base/libuv.jl
index 3012755e0982df..0c5123b73d6306 100644
--- a/base/libuv.jl
+++ b/base/libuv.jl
@@ -85,13 +85,20 @@ function process_events(block::Bool)
     end
 end
 
+function uv_alloc_buf end
+function uv_readcb end
+function uv_writecb_task end
+function uv_return_spawn end
+function uv_asynccb end
+function uv_timercb end
+
 function reinit_stdio()
-    global uv_jl_alloc_buf     = cfunction(uv_alloc_buf, Cvoid, Tuple{Ptr{Cvoid}, Csize_t, Ptr{Cvoid}})
-    global uv_jl_readcb        = cfunction(uv_readcb, Cvoid, Tuple{Ptr{Cvoid}, Cssize_t, Ptr{Cvoid}})
-    global uv_jl_writecb_task  = cfunction(uv_writecb_task, Cvoid, Tuple{Ptr{Cvoid}, Cint})
-    global uv_jl_return_spawn  = cfunction(uv_return_spawn, Cvoid, Tuple{Ptr{Cvoid}, Int64, Int32})
-    global uv_jl_asynccb       = cfunction(uv_asynccb, Cvoid, Tuple{Ptr{Cvoid}})
-    global uv_jl_timercb       = cfunction(uv_timercb, Cvoid, Tuple{Ptr{Cvoid}})
+    global uv_jl_alloc_buf     = @cfunction(uv_alloc_buf, Cvoid, (Ptr{Cvoid}, Csize_t, Ptr{Cvoid}))
+    global uv_jl_readcb        = @cfunction(uv_readcb, Cvoid, (Ptr{Cvoid}, Cssize_t, Ptr{Cvoid}))
+    global uv_jl_writecb_task  = @cfunction(uv_writecb_task, Cvoid, (Ptr{Cvoid}, Cint))
+    global uv_jl_return_spawn  = @cfunction(uv_return_spawn, Cvoid, (Ptr{Cvoid}, Int64, Int32))
+    global uv_jl_asynccb       = @cfunction(uv_asynccb, Cvoid, (Ptr{Cvoid},))
+    global uv_jl_timercb       = @cfunction(uv_timercb, Cvoid, (Ptr{Cvoid},))
 
     global uv_eventloop = ccall(:jl_global_event_loop, Ptr{Cvoid}, ())
     global stdin = init_stdio(ccall(:jl_stdin_stream, Ptr{Cvoid}, ()))
diff --git a/base/threadcall.jl b/base/threadcall.jl
index f6e7a2be05ddfe..901699b7a4d5b5 100644
--- a/base/threadcall.jl
+++ b/base/threadcall.jl
@@ -4,12 +4,6 @@ const max_ccall_threads = parse(Int, get(ENV, "UV_THREADPOOL_SIZE", "4"))
 const thread_notifiers = Union{Condition, Nothing}[nothing for i in 1:max_ccall_threads]
 const threadcall_restrictor = Semaphore(max_ccall_threads)
 
-function notify_fun(idx)
-    global thread_notifiers
-    notify(thread_notifiers[idx])
-    return
-end
-
 """
     @threadcall((cfunc, clib), rettype, (argtypes...), argvals...)
 
@@ -36,62 +30,71 @@ macro threadcall(f, rettype, argtypes, argvals...)
     argvals = map(esc, argvals)
 
     # construct non-allocating wrapper to call C function
-    wrapper = :(function wrapper(args_ptr::Ptr{Cvoid}, retval_ptr::Ptr{Cvoid})
+    wrapper = :(function (args_ptr::Ptr{Cvoid}, retval_ptr::Ptr{Cvoid})
         p = args_ptr
+        # the rest of the body is created below
     end)
     body = wrapper.args[2].args
     args = Symbol[]
-    for (i,T) in enumerate(argtypes)
+    for (i, T) in enumerate(argtypes)
         arg = Symbol("arg", i)
         push!(body, :($arg = unsafe_load(convert(Ptr{$T}, p))))
-        push!(body, :(p += sizeof($T)))
+        push!(body, :(p += Core.sizeof($T)))
         push!(args, arg)
     end
     push!(body, :(ret = ccall($f, $rettype, ($(argtypes...),), $(args...))))
     push!(body, :(unsafe_store!(convert(Ptr{$rettype}, retval_ptr), ret)))
-    push!(body, :(return sizeof($rettype)))
+    push!(body, :(return Int(Core.sizeof($rettype))))
 
     # return code to generate wrapper function and send work request thread queue
-    :(let
-        $wrapper
-        do_threadcall(wrapper, $rettype, Any[$(argtypes...)], Any[$(argvals...)])
+    wrapper = Expr(Symbol("hygienic-scope"), wrapper, @__MODULE__)
+    return :(let fun_ptr = @cfunction($wrapper, Int, (Ptr{Cvoid}, Ptr{Cvoid}))
+        do_threadcall(fun_ptr, $rettype, Any[$(argtypes...)], Any[$(argvals...)])
     end)
 end
 
-function do_threadcall(wrapper::Function, rettype::Type, argtypes::Vector, argvals::Vector)
+function do_threadcall(fun_ptr::Ptr{Cvoid}, rettype::Type, argtypes::Vector, argvals::Vector)
     # generate function pointer
-    fun_ptr = cfunction(wrapper, Int, Tuple{Ptr{Cvoid}, Ptr{Cvoid}})
-    c_notify_fun = cfunction(notify_fun, Cvoid, Tuple{Cint})
+    c_notify_fun = @cfunction(
+        function notify_fun(idx)
+            global thread_notifiers
+            notify(thread_notifiers[idx])
+            return
+        end, Cvoid, (Cint,))
 
     # cconvert, root and unsafe_convert arguments
     roots = Any[]
-    args_size = isempty(argtypes) ? 0 : sum(sizeof, argtypes)
+    args_size = isempty(argtypes) ? 0 : sum(Core.sizeof, argtypes)
     args_arr = Vector{UInt8}(undef, args_size)
     ptr = pointer(args_arr)
     for (T, x) in zip(argtypes, argvals)
+        isbits(T) || throw(ArgumentError("threadcall requires isbits argument types"))
         y = cconvert(T, x)
         push!(roots, y)
-        unsafe_store!(convert(Ptr{T}, ptr), unsafe_convert(T, y))
-        ptr += sizeof(T)
+        unsafe_store!(convert(Ptr{T}, ptr), unsafe_convert(T, y)::T)
+        ptr += Core.sizeof(T)
     end
 
     # create return buffer
-    ret_arr = Vector{UInt8}(undef, sizeof(rettype))
+    ret_arr = Vector{UInt8}(undef, Core.sizeof(rettype))
 
     # wait for a worker thread to be available
     acquire(threadcall_restrictor)
     idx = findfirst(isequal(nothing), thread_notifiers)::Int
     thread_notifiers[idx] = Condition()
 
-    # queue up the work to be done
-    ccall(:jl_queue_work, Cvoid,
-        (Ptr{Cvoid}, Ptr{UInt8}, Ptr{UInt8}, Ptr{Cvoid}, Cint),
-        fun_ptr, args_arr, ret_arr, c_notify_fun, idx)
+    GC.@preserve args_arr ret_arr roots begin
+        # queue up the work to be done
+        ccall(:jl_queue_work, Cvoid,
+            (Ptr{Cvoid}, Ptr{UInt8}, Ptr{UInt8}, Ptr{Cvoid}, Cint),
+            fun_ptr, args_arr, ret_arr, c_notify_fun, idx)
 
-    # wait for a result & return it
-    wait(thread_notifiers[idx])
-    thread_notifiers[idx] = nothing
-    release(threadcall_restrictor)
+        # wait for a result & return it
+        wait(thread_notifiers[idx])
+        thread_notifiers[idx] = nothing
+        release(threadcall_restrictor)
 
-    unsafe_load(convert(Ptr{rettype}, pointer(ret_arr)))
+        r = unsafe_load(convert(Ptr{rettype}, pointer(ret_arr)))
+    end
+    return r
 end
diff --git a/doc/src/base/c.md b/doc/src/base/c.md
index 5b46e431c0624a..a09b916c7d7261 100644
--- a/doc/src/base/c.md
+++ b/doc/src/base/c.md
@@ -3,7 +3,7 @@
 ```@docs
 ccall
 Core.Intrinsics.cglobal
-Base.cfunction
+Base.@cfunction
 Base.unsafe_convert
 Base.cconvert
 Base.unsafe_load
diff --git a/doc/src/manual/calling-c-and-fortran-code.md b/doc/src/manual/calling-c-and-fortran-code.md
index 07d7f2fe484bdf..affc89b7657fee 100644
--- a/doc/src/manual/calling-c-and-fortran-code.md
+++ b/doc/src/manual/calling-c-and-fortran-code.md
@@ -155,25 +155,28 @@ It is possible to pass Julia functions to native C functions that accept functio
 For example, to match C prototypes of the form:
 
 ```c
-typedef returntype (*functiontype)(argumenttype,...)
+typedef returntype (*functiontype)(argumenttype, ...)
 ```
 
-The function [`cfunction`](@ref) generates the C-compatible function pointer for a call to a
-Julia function. Arguments to [`cfunction`](@ref) are as follows:
+The macro [`@cfunction`](@ref) generates the C-compatible function pointer for a call to a
+Julia function. Arguments to [`@cfunction`](@ref) are as follows:
 
 1. A Julia Function
 2. Return type
-3. A tuple type of input types
+3. A literal tuple of input types
 
-Only platform-default C calling convention is supported. `cfunction`-generated pointers cannot
-be used in calls where WINAPI expects `stdcall` function on 32-bit windows, but can be used on WIN64
-(where `stdcall` is unified with C calling convention).
+Like ccall, all of these arguments will be evaluated at compile-time, when the containing method is defined.
+
+Currently, only the platform-default C calling convention is supported. This means that
+`@cfunction`-generated pointers cannot be used in calls where WINAPI expects `stdcall`
+function on 32-bit windows, but can be used on WIN64 (where `stdcall` is unified with the
+C calling convention).
 
 A classic example is the standard C library `qsort` function, declared as:
 
 ```c
 void qsort(void *base, size_t nmemb, size_t size,
-           int(*compare)(const void *a, const void *b));
+           int (*compare)(const void*, const void*));
 ```
 
 The `base` argument is a pointer to an array of length `nmemb`, with elements of `size` bytes
@@ -182,26 +185,26 @@ an integer less/greater than zero if `a` should appear before/after `b` (or zero
 is permitted). Now, suppose that we have a 1d array `A` of values in Julia that we want to sort
 using the `qsort` function (rather than Julia's built-in `sort` function). Before we worry about
 calling `qsort` and passing arguments, we need to write a comparison function that works for some
-arbitrary type T:
+arbitrary objects (which define `<`):
 
 ```jldoctest mycompare
-julia> function mycompare(a::T, b::T) where T
-           return convert(Cint, a < b ? -1 : a > b ? +1 : 0)::Cint
+julia> function mycompare(a, b)::Cint
+           return (a < b) ? -1 : ((a > b) ? +1 : 0)
        end
 mycompare (generic function with 1 method)
 ```
 
 Notice that we have to be careful about the return type: `qsort` expects a function returning
-a C `int`, so we must be sure to return `Cint` via a call to `convert` and a `typeassert`.
+a C `int`, so we annotate the return type of the function to be sure it returns a `Cint`.
 
-In order to pass this function to C, we obtain its address using the function `cfunction`:
+In order to pass this function to C, we obtain its address using the macro `@cfunction`:
 
 ```jldoctest mycompare
-julia> const mycompare_c = cfunction(mycompare, Cint, Tuple{Ref{Cdouble}, Ref{Cdouble}});
+julia> mycompare_c = @cfunction(mycompare, Cint, (Ref{Cdouble}, Ref{Cdouble}));
 ```
 
-[`cfunction`](@ref) accepts three arguments: the Julia function (`mycompare`), the return type
-(`Cint`), and a tuple type of the input argument types, in this case to sort an array of `Cdouble`
+[`@cfunction`](@ref) requires three arguments: the Julia function (`mycompare`), the return type
+(`Cint`), and a literal tuple of the input argument types, in this case to sort an array of `Cdouble`
 ([`Float64`](@ref)) elements.
 
 The final call to `qsort` looks like this:
@@ -227,7 +230,7 @@ julia> A
 
 As can be seen, `A` is changed to the sorted array `[-2.7, 1.3, 3.1, 4.4]`. Note that Julia
 knows how to convert an array into a `Ptr{Cdouble}`, how to compute the size of a type in bytes
-(identical to C's `sizeof` operator), and so on. For fun, try inserting a `println("mycompare($a,$b)")`
+(identical to C's `sizeof` operator), and so on. For fun, try inserting a `println("mycompare($a, $b)")`
 line into `mycompare`, which will allow you to see the comparisons that `qsort` is performing
 (and to verify that it is really calling the Julia function that you passed to it).
 
@@ -518,16 +521,18 @@ unsafe_string(str + Core.sizeof(Cint), len)
 
 ### Type Parameters
 
-The type arguments to `ccall` are evaluated statically, when the method containing the `ccall` is defined.
-They therefore must take the form of a literal tuple, not a variable, and cannot reference local variables.
+The type arguments to `ccall` and `@cfunction` are evaluated statically,
+when the method containing the usage is defined.
+They therefore must take the form of a literal tuple, not a variable,
+and cannot reference local variables.
 
 This may sound like a strange restriction,
 but remember that since C is not a dynamic language like Julia,
 its functions can only accept argument types with a statically-known, fixed signature.
 
-However, while the type layout must be known statically to compute the `ccall` ABI,
+However, while the type layout must be known statically to compute the intended C ABI,
 the static parameters of the function are considered to be part of this static environment.
-The static parameters of the function may be used as type parameters in the `ccall` signature,
+The static parameters of the function may be used as type parameters in the call signature,
 as long as they don't affect the layout of the type.
 For example, `f(x::T) where {T} = ccall(:valid, Ptr{T}, (Ptr{T},), x)`
 is valid, since `Ptr` is always a word-size primitive type.
@@ -603,7 +608,7 @@ Fortran subroutines, or a `T` for Fortran functions returning the type `T`.
 
 ## Mapping C Functions to Julia
 
-### `ccall`/`cfunction` argument translation guide
+### `ccall` / `@cfunction` argument translation guide
 
 For translating a C argument list to Julia:
 
@@ -626,28 +631,27 @@ For translating a C argument list to Julia:
 
       * `Any`
       * argument value must be a valid Julia object
-      * currently unsupported by [`cfunction`](@ref)
   * `jl_value_t**`
 
       * `Ref{Any}`
       * argument value must be a valid Julia object (or `C_NULL`)
-      * currently unsupported by [`cfunction`](@ref)
   * `T*`
 
       * `Ref{T}`, where `T` is the Julia type corresponding to `T`
       * argument value will be copied if it is an `isbits` type otherwise, the value must be a valid Julia
         object
-  * `(T*)(...)` (e.g. a pointer to a function)
+  * `T (*)(...)` (e.g. a pointer to a function)
 
-      * `Ptr{Cvoid}` (you may need to use [`cfunction`](@ref) explicitly to create this pointer)
+      * `Ptr{Cvoid}` (you may need to use [`@cfunction`](@ref) explicitly to create this pointer)
   * `...` (e.g. a vararg)
 
       * `T...`, where `T` is the Julia type
+      * currently unsupported by `@cfunction`
   * `va_arg`
 
-      * not supported
+      * not supported by `ccall` or `@cfunction`
 
-### `ccall`/`cfunction` return type translation guide
+### `ccall` / `@cfunction` return type translation guide
 
 For translating a C return type to Julia:
 
@@ -675,7 +679,7 @@ For translating a C return type to Julia:
       * argument value must be a valid Julia object
   * `jl_value_t**`
 
-      * `Ref{Any}`
+      * `Ptr{Any}` (`Ref{Any}` is invalid as a return type)
       * argument value must be a valid Julia object (or `C_NULL`)
   * `T*`
 
@@ -683,14 +687,14 @@ For translating a C return type to Julia:
 
           * `Ref{T}`, where `T` is the Julia type corresponding to `T`
           * a return type of `Ref{Any}` is invalid, it should either be `Any` (corresponding to `jl_value_t*`)
-            or `Ptr{Any}` (corresponding to `Ptr{Any}`)
+            or `Ptr{Any}` (corresponding to `jl_value_t**`)
           * C **MUST NOT** modify the memory returned via `Ref{T}` if `T` is an `isbits` type
       * If the memory is owned by C:
 
           * `Ptr{T}`, where `T` is the Julia type corresponding to `T`
-  * `(T*)(...)` (e.g. a pointer to a function)
+  * `T (*)(...)` (e.g. a pointer to a function)
 
-      * `Ptr{Cvoid}` (you may need to use [`cfunction`](@ref) explicitly to create this pointer)
+      * `Ptr{Cvoid}` (you may need to use [`@cfunction`](@ref) explicitly to create this pointer)
 
 ### Passing Pointers for Modifying Inputs
 
@@ -880,8 +884,10 @@ expression, which is then evaluated. Keep in mind that `eval` only operates at t
 so within this expression local variables will not be available (unless their values are substituted
 with `$`). For this reason, `eval` is typically only used to form top-level definitions, for example
 when wrapping libraries that contain many similar functions.
+A similar example can be constructed for [`@cfunction`](@ref).
 
-If your usage is more dynamic, use indirect calls as described in the next section.
+However, doing this will also be very slow and leak memory, so you should usually avoid this and instead keep reading.
+The next section discusses how to use indirect calls to efficiently accomplish a similar effect.
 
 ## Indirect Calls
 
@@ -911,6 +917,34 @@ mylibvar = Libdl.dlopen("mylib")
 ccall(@dlsym("myfunc", mylibvar), Cvoid, ())
 ```
 
+## Closure cfunctions
+
+The first argument to [`@cfunction`](@ref) can be marked with a `$`, in which case
+the return value will instead be a `struct CFunction` which closes over the argument.
+You must ensure that this return object is kept alive until all uses of it are done.
+The contents and code at the cfunction pointer will be erased via a [`finalizer`](@ref)
+when this reference is dropped, and also at exit. This is not usually needed, since this
+functionality is not present in C, but can be useful for dealing with ill-designed APIs
+which don't provide a separate closure environment parameter.
+
+```julia
+function qsort(a::Vector{T}, cmp) where T
+    isbits(T) || throw(ArgumentError("this method can only qsort isbits arrays"))
+    callback = @cfunction $cmp Cint (Ref{T}, Ref{T})
+    # Here, `callback` isa Base.CFunction, which will be converted to Ptr{Cvoid}
+    # (and protected against finalization) by the ccall
+    ccall(:qsort, Cvoid, (Ptr{T}, Csize_t, Csize_t, Ptr{Cvoid}),
+        a, length(a), Base.elsize(a), callback)
+    # We could instead use:
+    #    GC.@preserve callback begin
+    #        use(Base.unsafe_convert(Ptr{Cvoid}, callback))
+    #    end
+    # if we needed to use it outside of a `ccall`
+    return a
+end
+```
+
+
 ## Closing a Library
 
 It is sometimes useful to close (unload) a library so that it can be reloaded.
@@ -922,7 +956,7 @@ and load in the new changes. One can either restart Julia or use the
 ```julia
 lib = Libdl.dlopen("./my_lib.so") # Open the library explicitly.
 sym = Libdl.dlsym(lib, :my_fcn)   # Get a symbol for the function to call.
-ccall(sym, ...) # Use the symbol instead of the (symbol, library) tuple (remaining arguments are the same).
+ccall(sym, ...) # Use the pointer `sym` instead of the (symbol, library) tuple (remaining arguments are the same).
 Libdl.dlclose(lib) # Close the library explicitly.
 ```
 
diff --git a/src/ast.c b/src/ast.c
index ffb2a289f10874..f277cb209a08d6 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -45,8 +45,11 @@ jl_sym_t *exc_sym;     jl_sym_t *error_sym;
 jl_sym_t *new_sym;     jl_sym_t *using_sym;
 jl_sym_t *const_sym;   jl_sym_t *thunk_sym;
 jl_sym_t *underscore_sym;
-jl_sym_t *abstracttype_sym; jl_sym_t *primtype_sym;
-jl_sym_t *structtype_sym; jl_sym_t *foreigncall_sym;
+jl_sym_t *abstracttype_sym;
+jl_sym_t *primtype_sym;
+jl_sym_t *structtype_sym;
+jl_sym_t *foreigncall_sym;
+jl_sym_t *cfunction_sym;
 jl_sym_t *global_sym; jl_sym_t *list_sym;
 jl_sym_t *dot_sym;    jl_sym_t *newvar_sym;
 jl_sym_t *boundscheck_sym; jl_sym_t *inbounds_sym;
@@ -321,6 +324,7 @@ void jl_init_frontend(void)
     call_sym = jl_symbol("call");
     invoke_sym = jl_symbol("invoke");
     foreigncall_sym = jl_symbol("foreigncall");
+    cfunction_sym = jl_symbol("cfunction");
     quote_sym = jl_symbol("quote");
     inert_sym = jl_symbol("inert");
     top_sym = jl_symbol("top");
diff --git a/src/ccall.cpp b/src/ccall.cpp
index cfaccf4e960799..f075e8a4d8f6b9 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -472,7 +472,7 @@ static Value *llvm_type_rewrite(
 
 // --- argument passing and scratch space utilities ---
 
-static Value *runtime_apply_type(jl_codectx_t &ctx, jl_value_t *ty, jl_unionall_t *unionall)
+static Value *runtime_apply_type_env(jl_codectx_t &ctx, jl_value_t *ty)
 {
     // box if concrete type was not statically known
     Value *args[] = {
@@ -515,7 +515,7 @@ static void typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_val
                 emit_typecheck(ctx, jvinfo, jlto, msg);
             }
             else {
-                jl_cgval_t jlto_runtime = mark_julia_type(ctx, runtime_apply_type(ctx, jlto, jlto_env), true, jl_any_type);
+                jl_cgval_t jlto_runtime = mark_julia_type(ctx, runtime_apply_type_env(ctx, jlto), true, jl_any_type);
                 Value *vx = boxed(ctx, jvinfo);
                 Value *istype = ctx.builder.CreateICmpNE(
                         ctx.builder.CreateCall(prepare_call(jlisa_func), { vx, boxed(ctx, jlto_runtime) }),
@@ -1140,23 +1140,30 @@ static jl_cgval_t emit_llvmcall(jl_codectx_t &ctx, jl_value_t **args, size_t nar
 
 // --- code generator for ccall itself ---
 
-static jl_cgval_t mark_or_box_ccall_result(jl_codectx_t &ctx, Value *result, bool isboxed, jl_value_t *rt, jl_unionall_t *unionall, bool static_rt)
+static Value *box_ccall_result(jl_codectx_t &ctx, Value *result, Value *runtime_dt, jl_value_t *rt)
 {
-    if (!static_rt) {
-        assert(!isboxed && ctx.spvals_ptr && unionall && jl_is_datatype(rt));
-        Value *runtime_dt = runtime_apply_type(ctx, rt, unionall);
-        // TODO: is this concrete check actually necessary, or is it structurally guaranteed?
-        emit_concretecheck(ctx, runtime_dt, "ccall: return type must be a concrete DataType");
+    // XXX: need to handle parameterized zero-byte types (singleton)
 #if JL_LLVM_VERSION >= 40000
-        const DataLayout &DL = jl_data_layout;
+    const DataLayout &DL = jl_data_layout;
 #else
-        const DataLayout &DL = jl_ExecutionEngine->getDataLayout();
+    const DataLayout &DL = jl_ExecutionEngine->getDataLayout();
 #endif
-        unsigned nb = DL.getTypeStoreSize(result->getType());
-        MDNode *tbaa = jl_is_mutable(rt) ? tbaa_mutab : tbaa_immut;
-        Value *strct = emit_allocobj(ctx, nb, runtime_dt);
-        init_bits_value(ctx, strct, result, tbaa);
-        return mark_julia_type(ctx, strct, true, rt);
+    unsigned nb = DL.getTypeStoreSize(result->getType());
+    MDNode *tbaa = jl_is_mutable(rt) ? tbaa_mutab : tbaa_immut;
+    Value *strct = emit_allocobj(ctx, nb, runtime_dt);
+    init_bits_value(ctx, strct, result, tbaa);
+    return strct;
+}
+
+static jl_cgval_t mark_or_box_ccall_result(jl_codectx_t &ctx, Value *result, bool isboxed, jl_value_t *rt, jl_unionall_t *unionall, bool static_rt)
+{
+    if (!static_rt) { // non-static return type: compute the type at runtime and box `result` under it
+        assert(!isboxed && jl_is_datatype(rt) && ctx.spvals_ptr && unionall);
+        Value *runtime_dt = runtime_apply_type_env(ctx, rt); // runtime value of `rt`, used as the box's type below
+        // TODO: skip this check if rt is not a Tuple
+        emit_concretecheck(ctx, runtime_dt, "ccall: return type must be a concrete DataType");
+        Value *strct = box_ccall_result(ctx, result, runtime_dt, rt);
+        return mark_julia_type(ctx, strct, true, rt); // TODO: jl_rewrap_unionall(rt, unionall)
+    }
     return mark_julia_type(ctx, result, isboxed, rt);
 }
@@ -1383,11 +1390,10 @@ static std::pair<CallingConv::ID, bool> convert_cconv(jl_sym_t *lhd)
     jl_errorf("ccall: invalid calling convention %s", jl_symbol_name(lhd));
 }
 
-static bool verify_ref_type(jl_codectx_t &ctx, jl_value_t* &rt, jl_unionall_t *unionall_env, int n, const char *fname)
+static bool verify_ref_type(jl_codectx_t &ctx, jl_value_t* ref, jl_unionall_t *unionall_env, int n, const char *fname)
 {
     // emit verification that the tparam for Ref isn't Any or a TypeVar
-    jl_value_t *ref = jl_tparam0(rt);
-    const char rt_err_msg_notany[] = " type Ref{Any} is invalid. use Ptr{Any} instead.";
+    const char rt_err_msg_notany[] = " type Ref{Any} is invalid. Use Any or Ptr{Any} instead.";
     if (ref == (jl_value_t*)jl_any_type && n == 0) {
         emit_error(ctx, make_errmsg(fname, n, rt_err_msg_notany));
         return false;
@@ -1416,6 +1422,7 @@ static bool verify_ref_type(jl_codectx_t &ctx, jl_value_t* &rt, jl_unionall_t *u
                     }
                     break;
                 }
+                ua = (jl_unionall_t*)ua->body;
             }
         }
         if (always_error) {
@@ -1566,7 +1573,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
         : NULL;
 
     if (jl_is_abstract_ref_type(rt)) {
-        if (!verify_ref_type(ctx, rt, unionall, 0, "ccall")) {
+        if (!verify_ref_type(ctx, jl_tparam0(rt), unionall, 0, "ccall")) {
             JL_GC_POP();
             return jl_cgval_t();
         }
@@ -1601,7 +1608,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
         if (jl_is_vararg_type(tti))
             tti = jl_unwrap_vararg(tti);
         if (jl_is_abstract_ref_type(tti)) {
-            if (!verify_ref_type(ctx, tti, unionall, i + 1, "ccall")) {
+            if (!verify_ref_type(ctx, jl_tparam0(tti), unionall, i + 1, "ccall")) {
                 JL_GC_POP();
                 return jl_cgval_t();
             }
@@ -1771,44 +1778,6 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
         ctx.builder.SetInsertPoint(contBB);
         return ghostValue(jl_void_type);
     }
-    else if (is_libjulia_func(jl_function_ptr)) {
-        assert(!isVa && !llvmcall && nargt == 3);
-        assert(lrt == T_size);
-        jl_value_t *ft = argv[0].typ;
-        jl_value_t *frt = argv[1].constant;
-        if (!frt) {
-            if (jl_is_type_type(argv[1].typ) && !jl_has_free_typevars(argv[1].typ))
-                frt = jl_tparam0(argv[1].typ);
-        }
-        if (ft != jl_bottom_type && frt) {
-            jl_value_t *fargt = argv[2].constant;;
-            JL_GC_PUSH1(&fargt);
-            if (!fargt && jl_is_type_type(argv[2].typ)) {
-                if (!jl_has_free_typevars(argv[2].typ))
-                    fargt = jl_tparam0(argv[2].typ);
-            }
-            else if (fargt && jl_is_tuple(fargt)) {
-                // TODO: maybe deprecation warning, better checking
-                fargt = (jl_value_t*)jl_apply_tuple_type_v((jl_value_t**)jl_data_ptr(fargt), jl_nfields(fargt));
-            }
-            if (fargt && jl_is_tuple_type(fargt)) {
-                Value *llvmf = NULL;
-                JL_TRY {
-                    llvmf = jl_cfunction_cache(ft, frt, (jl_tupletype_t*)fargt);
-                }
-                JL_CATCH {
-                    llvmf = NULL;
-                }
-                if (llvmf) {
-                    JL_GC_POP();
-                    JL_GC_POP();
-                    Value *fptr = ctx.builder.CreatePtrToInt(prepare_call(llvmf), lrt);
-                    return mark_or_box_ccall_result(ctx, fptr, retboxed, rt, unionall, static_rt);
-                }
-            }
-            JL_GC_POP();
-        }
-    }
     else if (is_libjulia_func(jl_array_isassigned) &&
              argv[1].typ == (jl_value_t*)jl_ulong_type) {
         assert(!isVa && !llvmcall && nargt == 2 && !addressOf.at(0) && !addressOf.at(1));
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 828d5c79e86c7f..5b5842e28d464e 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -1135,15 +1135,20 @@ static void emit_typecheck(jl_codectx_t &ctx, const jl_cgval_t &x, jl_value_t *t
     }
 }
 
-static void emit_concretecheck(jl_codectx_t &ctx, Value *typ, const std::string &msg)
+static Value *emit_isconcrete(jl_codectx_t &ctx, Value *typ)
 {
-    assert(typ->getType() == T_prjlvalue);
-    emit_typecheck(ctx, mark_julia_type(ctx, typ, true, jl_any_type), (jl_value_t*)jl_datatype_type, msg);
     Value *isconcrete;
     isconcrete = ctx.builder.CreateConstInBoundsGEP1_32(T_int8, emit_bitcast(ctx, decay_derived(typ), T_pint8), offsetof(jl_datatype_t, isconcretetype));
     isconcrete = ctx.builder.CreateLoad(isconcrete, tbaa_const);
     isconcrete = ctx.builder.CreateTrunc(isconcrete, T_int1);
-    error_unless(ctx, isconcrete, msg);
+    return isconcrete;
+}
+
+static void emit_concretecheck(jl_codectx_t &ctx, Value *typ, const std::string &msg)
+{ // emits two checks on `typ`, both using `msg`: a typecheck against jl_datatype_type, then the isconcretetype flag
+    assert(typ->getType() == T_prjlvalue);
+    emit_typecheck(ctx, mark_julia_type(ctx, typ, true, jl_any_type), (jl_value_t*)jl_datatype_type, msg);
+    error_unless(ctx, emit_isconcrete(ctx, typ), msg); // emitted after the typecheck: emit_isconcrete loads a jl_datatype_t field
+}
 
 #define CHECK_BOUNDS 1
@@ -2338,6 +2343,15 @@ static Value *emit_allocobj(jl_codectx_t &ctx, size_t static_size, Value *jt)
     return call;
 }
 
+// allocation for an object of unknown (runtime) type `jt` from an untracked pointer `pval`, via a call to jl_newbits_func
+static Value *emit_new_bits(jl_codectx_t &ctx, Value *jt, Value *pval)
+{
+    pval = ctx.builder.CreateBitCast(pval, T_pint8); // the data pointer is passed to the callee as T_pint8
+    auto call = ctx.builder.CreateCall(prepare_call(jl_newbits_func), { jt, pval });
+    call->setAttributes(jl_newbits_func->getAttributes()); // copy the declaration's attributes onto this call site
+    return call;
+}
+
 // if ptr is NULL this emits a write barrier _back_
 static void emit_write_barrier(jl_codectx_t &ctx, Value *parent, Value *ptr)
 {
@@ -2496,8 +2510,8 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
             if (jl_field_isptr(sty, i)) {
                 tbaa_decorate(strctinfo.tbaa, ctx.builder.CreateStore(
                         ConstantPointerNull::get(cast<PointerType>(T_prjlvalue)),
-                            ctx.builder.CreateGEP(T_prjlvalue, emit_bitcast(ctx, strct, T_pprjlvalue),
-                                ConstantInt::get(T_size, jl_field_offset(sty, i) / sizeof(void*)))));
+                        ctx.builder.CreateGEP(T_prjlvalue, emit_bitcast(ctx, strct, T_pprjlvalue),
+                            ConstantInt::get(T_size, jl_field_offset(sty, i) / sizeof(void*)))));
             }
         }
         for (size_t i = nargs; i < nf; i++) {
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 915474b6904166..b1f8ee184c8f56 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -299,6 +299,7 @@ static Function *jlenter_func;
 static Function *jlleave_func;
 static Function *jlegal_func;
 static Function *jl_alloc_obj_func;
+static Function *jl_newbits_func;
 static Function *jl_typeof_func;
 static Function *jl_write_barrier_func;
 static Function *jlisa_func;
@@ -329,6 +330,7 @@ static Function *jldepwarnpi_func;
 //static Function *jlgetnthfield_func;
 static Function *jlgetnthfieldchecked_func;
 //static Function *jlsetnthfield_func;
+static Function *jlgetcfunctiontrampoline_func;
 #ifdef _OS_WINDOWS_
 static Function *resetstkoflw_func;
 #if defined(_CPU_X86_64_)
@@ -421,7 +423,8 @@ struct jl_cgval_t {
         assert(!(isboxed && TIndex != NULL));
         assert(TIndex == NULL || TIndex->getType() == T_int8);
     }
-    jl_cgval_t(jl_value_t *typ) : // ghost value constructor
+    explicit jl_cgval_t(jl_value_t *typ) : // ghost value constructor
+        // mark explicit to avoid being used implicitly for conversion from NULL (use jl_cgval_t() instead)
         V(NULL),
         Vboxed(NULL),
         TIndex(NULL),
@@ -1418,28 +1421,14 @@ jl_generic_fptr_t jl_generate_fptr(jl_method_instance_t *li, const char *F, size
     return fptr;
 }
 
-static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tupletype_t *argt);
-
-// Get a pointer to the cache for the C-callable entry point for the given argument types.
-// here argt does not include the leading function type argument
-static Function *jl_cfunction_object(jl_value_t *f, jl_value_t *declrt, jl_tupletype_t *argt)
-{
-    jl_value_t *ft;
-    if (jl_is_type(f))
-        ft = (jl_value_t*)jl_wrap_Type(f);
-    else
-        ft = jl_typeof(f);
-    return jl_cfunction_cache(ft, declrt, argt);
-}
+static Function *jl_cfunction_object(jl_value_t *f, jl_value_t *declrt, jl_tupletype_t *argt);
 
 // get the address of a C-callable entry point for a function
 extern "C" JL_DLLEXPORT
 void *jl_function_ptr(jl_function_t *f, jl_value_t *rt, jl_value_t *argt)
 {
-    JL_GC_PUSH1(&argt);
     JL_LOCK(&codegen_lock);
     Function *llvmf = jl_cfunction_object(f, rt, (jl_tupletype_t*)argt);
-    JL_GC_POP();
     void *ptr = (void*)getAddressForFunction(llvmf->getName());
     JL_UNLOCK(&codegen_lock);
     return ptr;
@@ -3703,6 +3692,8 @@ static void emit_assignment(jl_codectx_t &ctx, jl_value_t *l, jl_value_t *r)
 
 // --- convert expression to code ---
 
+static jl_cgval_t emit_cfunction(jl_codectx_t &ctx, jl_value_t *output_type, const jl_cgval_t &fexpr, jl_value_t *rt, jl_svec_t *argt);
+
 static Value *emit_condition(jl_codectx_t &ctx, const jl_cgval_t &condV, const std::string &msg)
 {
     bool isbool = (condV.typ == (jl_value_t*)jl_bool_type);
@@ -3879,6 +3870,10 @@ static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr)
     else if (head == foreigncall_sym) {
         return emit_ccall(ctx, args, jl_array_dim0(ex->args));
     }
+    else if (head == cfunction_sym) {
+        jl_cgval_t fexpr_rt = emit_expr(ctx, args[1]);
+        return emit_cfunction(ctx, args[0], fexpr_rt, args[2], (jl_svec_t*)args[3]);
+    }
     else if (head == assign_sym) {
         emit_assignment(ctx, args[0], args[1]);
         return ghostValue(jl_void_type);
@@ -4184,22 +4179,25 @@ static void emit_cfunc_invalidate(
     }
 }
 
-static Function*
-gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
-    jl_typemap_entry_t *sf, jl_value_t *declrt, jl_tupletype_t *sigt)
+static Function* gen_cfun_wrapper(
+    Module *into,
+    const function_sig_t &sig, jl_value_t *ff,
+    jl_typemap_entry_t *sf, jl_value_t *declrt, jl_tupletype_t *sigt,
+    jl_unionall_t *unionall_env, jl_svec_t *sparam_vals, jl_array_t **closure_types)
 {
     // Generate a c-callable wrapper
     size_t nargs = sig.nargs;
     const char *name = "cfunction";
     size_t world = jl_world_counter;
+    bool nest = (!ff || unionall_env);
     // try to look up this function for direct invoking
-    jl_method_instance_t *lam = jl_get_specialization1((jl_tupletype_t*)sigt, world);
+    jl_method_instance_t *lam = sigt ? jl_get_specialization1((jl_tupletype_t*)sigt, world) : NULL;
     jl_value_t *astrt = (jl_value_t*)jl_any_type;
     // infer it first, if necessary
     if (lam) {
         name = jl_symbol_name(lam->def.method->name);
         jl_code_info_t *src = NULL;
-        if (!lam->inferred) // TODO: this isn't ideal to be unconditionally calling type inference from here
+        if (!into && !lam->inferred) // TODO: this isn't ideal to be unconditionally calling type inference from here
             src = jl_type_infer(&lam, world, 0);
         jl_compile_linfo(&lam, src, world, &jl_default_cgparams);
         if (lam->jlcall_api != JL_API_CONST) {
@@ -4224,20 +4222,39 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
     std::stringstream funcName;
     funcName << "jlcapi_" << name << "_" << globalUnique++;
 
-    Module *M = new Module(name, jl_LLVMContext);
-    jl_setup_module(M);
-    FunctionType *functype = sig.functype();
+    Module *M = into;
+    if (!M) {
+        M = new Module(name, jl_LLVMContext);
+        jl_setup_module(M);
+    }
+#if JL_LLVM_VERSION >= 50000
+    AttributeList attributes = sig.attributes;
+#else
+    AttributeSet attributes = sig.attributes;
+#endif
+    FunctionType *functype;
+    if (nest) {
+        // add nest parameter (pointer to jl_value_t* data array) after sret arg
+        assert(closure_types);
+        std::vector<Type*> fargt_sig(sig.fargt_sig);
+        fargt_sig.insert(fargt_sig.begin() + sig.sret, T_pprjlvalue);
+        functype = FunctionType::get(sig.sret ? T_void : sig.prt, fargt_sig, sig.isVa);
+        attributes = attributes.addAttribute(jl_LLVMContext, 1 + sig.sret, Attribute::Nest);
+    }
+    else {
+        functype = sig.functype();
+    }
     Function *cw = Function::Create(functype,
             GlobalVariable::ExternalLinkage,
             funcName.str(), M);
     jl_init_function(cw);
-    cw->setAttributes(sig.attributes);
+    cw->setAttributes(attributes);
 #ifdef JL_DISABLE_FPO
     cw->addFnAttr("no-frame-pointer-elim", "true");
 #endif
-    Function *cw_proto = function_proto(cw);
+    Function *cw_proto = into ? cw : function_proto(cw);
     // Save the Function object reference
-    {
+    if (sf) {
         jl_value_t *oldsf = sf->func.value;
         size_t i, oldlen = jl_svec_len(oldsf);
         jl_value_t *newsf = (jl_value_t*)jl_alloc_svec(oldlen + 2);
@@ -4256,6 +4273,8 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
     ctx.linfo = lam;
     ctx.world = world;
     ctx.params = &jl_default_cgparams;
+    ctx.name = name;
+    ctx.funcName = name;
 
     BasicBlock *b0 = BasicBlock::Create(jl_LLVMContext, "top", cw);
     ctx.builder.SetInsertPoint(b0);
@@ -4292,27 +4311,54 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
     // first emit code to record the arguments
     Function::arg_iterator AI = cw->arg_begin();
     Value *sretPtr = sig.sret ? &*AI++ : NULL;
+    Value *nestPtr = nest ? &*AI++ : NULL;
     jl_cgval_t *inputargs = (jl_cgval_t*)alloca(sizeof(jl_cgval_t) * (nargs + 1));
-    // we need to pass the function object even if (even though) it is a singleton
-    inputargs[0] = mark_julia_const(ff);
+    if (ff) {
+        // we need to pass the function object even if (even though) it is a singleton
+        inputargs[0] = mark_julia_const(ff);
+    }
+    else {
+        assert(nest && nestPtr);
+        Value *ff = ctx.builder.CreateLoad(T_prjlvalue, nestPtr);
+        inputargs[0] = mark_julia_type(ctx, ff, true, jl_any_type);
+    }
+    // XXX: these values may need to be rooted until the end of the function
+    jl_value_t *rt1 = NULL;
+    jl_value_t *rt2 = NULL;
+    JL_GC_PUSH2(&rt1, &rt2);
     for (size_t i = 0; i < nargs; ++i, ++AI) {
+        // figure out how to unpack this argument type
         Value *val = &*AI;
-        jl_value_t *jargty = jl_svecref(sig.at, i);
-        // figure out how to unpack this type
+        assert(sig.fargt_sig.at(i + sig.sret) == val->getType());
         jl_cgval_t &inputarg = inputargs[i + 1];
-        if (jl_is_abstract_ref_type(jargty)) {
-            // a pointer to a value
+        jl_value_t *jargty = jl_svecref(sig.at, i);
+        bool aref = jl_is_abstract_ref_type(jargty);
+        if (aref) // a pointer to a value
             jargty = jl_tparam0(jargty);
+
+        // if we know the outer function sparams, try to fill those in now
+        // so that the julia_to_native type checks are more likely to be doable (e.g. concrete types) at compile-time
+        jl_value_t *jargty_proper = jargty;
+        bool static_at = !(unionall_env && jl_has_typevar_from_unionall(jargty, unionall_env));
+        if (!static_at) {
+            if (sparam_vals) {
+                jargty_proper = rt1 = jl_instantiate_type_in_env(jargty, unionall_env, jl_svec_data(sparam_vals));
+                assert(jargty_proper != jargty);
+                jargty = jargty_proper;
+                static_at = true;
+            }
+            else {
+                jargty_proper = rt1 = jl_rewrap_unionall(jargty, (jl_value_t*)unionall_env);
+            }
+        }
+
+        if (aref) {
             if (jargty == (jl_value_t*)jl_any_type) {
                 inputarg = mark_julia_type(ctx,
                         ctx.builder.CreateLoad(emit_bitcast(ctx, val, T_pprjlvalue)),
-                        true, jargty);
+                        true, jl_any_type);
             }
-            else if (!jl_justbits(jargty)) {
-                // must be a jl_value_t* (because it's mutable or contains gc roots)
-                inputarg = mark_julia_type(ctx, maybe_decay_untracked(emit_bitcast(ctx, val, T_prjlvalue)), true, jargty);
-            }
-            else {
+            else if (static_at && jl_justbits(jargty)) { // anything that can be stored unboxed
                 bool isboxed;
                 Type *T = julia_type_to_llvm(jargty, &isboxed);
                 assert(!isboxed);
@@ -4326,44 +4372,92 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
                     inputarg = mark_julia_type(ctx, val, false, jargty);
                 }
             }
+            else if (static_at || (!jl_is_typevar(jargty) && !jl_is_immutable_datatype(jargty))) {
+                // must be a jl_value_t* (because it's mutable or contains gc roots)
+                inputarg = mark_julia_type(ctx, maybe_decay_untracked(emit_bitcast(ctx, val, T_prjlvalue)), true, jargty_proper);
+            }
+            else {
+                // allocate val into a new box, if it might not be boxed
+                // otherwise preserve / reuse the existing box identity
+                // TODO: could inspect `jargty` and eliminate some of these cases
+                if (!*closure_types)
+                    *closure_types = jl_alloc_vec_any(0);
+                jl_array_ptr_1d_push(*closure_types, jargty);
+                Value *runtime_dt = ctx.builder.CreateLoad(T_prjlvalue,
+                        ctx.builder.CreateConstGEP1_32(T_prjlvalue, nestPtr, jl_array_len(*closure_types)));
+                BasicBlock *boxedBB = BasicBlock::Create(jl_LLVMContext, "isboxed", cw);
+                BasicBlock *loadBB = BasicBlock::Create(jl_LLVMContext, "need-load", cw);
+                BasicBlock *unboxedBB = BasicBlock::Create(jl_LLVMContext, "maybe-unboxed", cw);
+                BasicBlock *isanyBB = BasicBlock::Create(jl_LLVMContext, "any", cw);
+                BasicBlock *afterBB = BasicBlock::Create(jl_LLVMContext, "after", cw);
+                Value *isrtboxed = ctx.builder.CreateIsNull(val);
+                ctx.builder.CreateCondBr(isrtboxed, boxedBB, loadBB);
+                ctx.builder.SetInsertPoint(boxedBB);
+                Value *p1 = ctx.builder.CreateBitCast(val, T_pjlvalue);
+                p1 = maybe_decay_untracked(p1);
+                ctx.builder.CreateBr(afterBB);
+                ctx.builder.SetInsertPoint(loadBB);
+                Value *isrtany = ctx.builder.CreateICmpEQ(
+                        literal_pointer_val(ctx, (jl_value_t*)jl_any_type),
+                        ctx.builder.CreateBitCast(val, T_pjlvalue));
+                ctx.builder.CreateCondBr(isrtany, isanyBB, unboxedBB);
+                ctx.builder.SetInsertPoint(isanyBB);
+                Value *p2 = ctx.builder.CreateLoad(T_prjlvalue, ctx.builder.CreateBitCast(val, T_pprjlvalue));
+                ctx.builder.CreateBr(afterBB);
+                ctx.builder.SetInsertPoint(unboxedBB);
+                Value *p3 = emit_new_bits(ctx, runtime_dt, val);
+                ctx.builder.CreateBr(afterBB);
+                ctx.builder.SetInsertPoint(afterBB);
+                PHINode *p = ctx.builder.CreatePHI(T_prjlvalue, 3);
+                p->addIncoming(p1, boxedBB);
+                p->addIncoming(p2, isanyBB);
+                p->addIncoming(p3, unboxedBB);
+                inputarg = mark_julia_type(ctx, p, true, jargty_proper);
+            }
         }
         else {
-            bool argboxed;
-            (void)julia_struct_to_llvm(jargty, NULL, &argboxed);
+            bool argboxed = sig.fargt_isboxed.at(i);
             if (argboxed) {
                 // a jl_value_t*, even when represented as a struct
-                inputarg = mark_julia_type(ctx, val, true, jargty);
+                inputarg = mark_julia_type(ctx, val, true, jargty_proper);
             }
             else {
                 // something of type T
                 // undo whatever we might have done to this poor argument
+                assert(jl_is_datatype(jargty));
                 if (sig.byRefList.at(i)) {
                     assert(cast<PointerType>(val->getType())->getElementType() == sig.fargt[i]);
                     val = ctx.builder.CreateAlignedLoad(val, 1); // unknown alignment from C
                 }
                 else {
-                    bool issigned = jl_signed_type && jl_subtype(jargty, (jl_value_t*)jl_signed_type);
+                    bool issigned = jl_signed_type && jl_subtype(jargty_proper, (jl_value_t*)jl_signed_type);
                     val = llvm_type_rewrite(ctx, val, sig.fargt[i], issigned);
                 }
-                bool isboxed;
-                (void)julia_type_to_llvm(jargty, &isboxed);
-                if (isboxed) {
-                    // passed an unboxed T, but want something boxed
-                    Value *mem = emit_allocobj(ctx, jl_datatype_size(jargty),
-                                               literal_pointer_val(ctx, (jl_value_t*)jargty));
-                    tbaa_decorate(jl_is_mutable(jargty) ? tbaa_mutab : tbaa_immut,
-                                  ctx.builder.CreateAlignedStore(val,
-                                                             emit_bitcast(ctx, mem, val->getType()->getPointerTo()),
-                                                             16)); // julia's gc gives 16-byte aligned addresses
-                    inputarg = mark_julia_type(ctx, mem, true, jargty);
+                // passed an unboxed T, but may need something boxed (not valid to be unboxed)
+                if (static_at) {
+                    bool isboxed;
+                    assert(jargty == jargty_proper);
+                    (void)julia_type_to_llvm(jargty, &isboxed);
+                    if (isboxed)
+                        inputarg = mark_julia_type(ctx,
+                                box_ccall_result(ctx, val, literal_pointer_val(ctx, jargty), jargty),
+                                true, jargty_proper);
+                    else
+                        inputarg = mark_julia_type(ctx, val, false, jargty);
                 }
                 else {
-                    // mark that this is an unboxed T
-                    inputarg = mark_julia_type(ctx, val, false, jargty);
+                    if (!*closure_types)
+                        *closure_types = jl_alloc_vec_any(0);
+                    jl_array_ptr_1d_push(*closure_types, jargty);
+                    Value *runtime_dt = ctx.builder.CreateLoad(T_prjlvalue,
+                            ctx.builder.CreateConstGEP1_32(T_prjlvalue, nestPtr, jl_array_len(*closure_types)));
+                    Value *strct = box_ccall_result(ctx, val, runtime_dt, jargty);
+                    inputarg = mark_julia_type(ctx, strct, true, jargty_proper);
                 }
             }
         }
     }
+    JL_GC_POP();
     assert(AI == cw->arg_end());
 
     // Create the call
@@ -4505,8 +4599,9 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
         retval = mark_julia_type(ctx, ret, true, astrt);
     }
 
-    // inline a call to typeassert here
+    // inline a call to typeassert here, if required
     emit_typecheck(ctx, retval, declrt, "cfunction");
+    retval = update_julia_type(ctx, retval, declrt);
 
     // Prepare the return value
     Value *r;
@@ -4524,8 +4619,7 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
         if (sig.sret)
             prt = sig.fargt_sig[0]->getContainedType(0); // sret is a PointerType
         bool issigned = jl_signed_type && jl_subtype(declrt, (jl_value_t*)jl_signed_type);
-        Value *v = julia_to_native(ctx, sig.lrt, sig.retboxed, declrt, NULL, retval,
-                                   false, 0, NULL);
+        Value *v = emit_unbox(ctx, sig.lrt, retval, retval.typ);
         r = llvm_type_rewrite(ctx, v, prt, issigned);
         if (sig.sret) {
             ctx.builder.CreateStore(r, sretPtr);
@@ -4533,7 +4627,6 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
         }
     }
     else {
-        assert(type_is_ghost(sig.lrt));
         r = NULL;
     }
 
@@ -4543,28 +4636,213 @@ gen_cfun_wrapper(const function_sig_t &sig, jl_value_t *ff,
     ctx.builder.SetCurrentDebugLocation(noDbg);
     ctx.builder.ClearInsertionPoint();
 
-    jl_finalize_module(M, true);
+    if (nest) {
+        funcName << "make";
+        Function *cw_make = Function::Create(
+                FunctionType::get(T_pint8, { T_pint8, T_ppjlvalue }, false),
+                GlobalVariable::ExternalLinkage,
+                funcName.str(), M);
+        jl_init_function(cw_make);
+#ifdef JL_DISABLE_FPO
+        cw_make->addFnAttr("no-frame-pointer-elim", "true");
+#endif
+        BasicBlock *b0 = BasicBlock::Create(jl_LLVMContext, "top", cw_make);
+        IRBuilder<> cwbuilder(b0);
+        Function::arg_iterator AI = cw_make->arg_begin();
+        Argument *Tramp = &*AI; ++AI;
+        Argument *NVal = &*AI; ++AI;
+        Function *init_trampoline = Intrinsic::getDeclaration(cw_make->getParent(), Intrinsic::init_trampoline);
+        Function *adjust_trampoline = Intrinsic::getDeclaration(cw_make->getParent(), Intrinsic::adjust_trampoline);
+        cwbuilder.CreateCall(init_trampoline, {
+                Tramp,
+                cwbuilder.CreateBitCast(cw, T_pint8),
+                cwbuilder.CreateBitCast(NVal, T_pint8)
+            });
+        cwbuilder.CreateRet(cwbuilder.CreateCall(adjust_trampoline, { Tramp }));
+        cw_proto = into ? cw_make : function_proto(cw_make);
+    }
+
+    if (!into)
+        jl_finalize_module(M, true);
 
     return cw_proto;
 }
 
+// Get the LLVM Function* for the C-callable entry point for a certain function
+// and argument types.
+// here argt does not include the leading function type argument
+static jl_cgval_t emit_cfunction(jl_codectx_t &ctx, jl_value_t *output_type, const jl_cgval_t &fexpr_rt, jl_value_t *declrt, jl_svec_t *argt)
+{
+    jl_unionall_t *unionall_env = (jl_is_method(ctx.linfo->def.method) && jl_is_unionall(ctx.linfo->def.method->sig))
+        ? (jl_unionall_t*)ctx.linfo->def.method->sig
+        : NULL;
+    jl_svec_t *sparam_vals = NULL;
+    if (ctx.spvals_ptr == NULL && jl_svec_len(ctx.linfo->sparam_vals) > 0)
+        sparam_vals = ctx.linfo->sparam_vals;
+
+    jl_value_t *rt = declrt;
+    if (jl_is_abstract_ref_type(declrt)) {
+        declrt = jl_tparam0(declrt);
+        if (!verify_ref_type(ctx, declrt, unionall_env, 0, "cfunction")) {
+            return jl_cgval_t();
+        }
+        if (unionall_env)
+            declrt = jl_rewrap_unionall(declrt, (jl_value_t*)unionall_env);
+        rt = (jl_value_t*)jl_any_type; // convert return type to jl_value_t*
+    }
+
+    // some sanity checking and check whether there's a vararg
+    jl_array_t *closure_types = NULL;
+    jl_value_t *sigt = NULL; // dispatch-sig = type signature with Ref{} annotations removed and applied to the env
+    JL_GC_PUSH4(&declrt, &sigt, &rt, &closure_types);
+    bool isVa;
+    size_t nargt;
+    Type *lrt;
+    bool retboxed;
+    bool static_rt;
+    const std::string err = verify_ccall_sig(
+            /* inputs:  */
+            0, rt, (jl_value_t*)argt, unionall_env,
+            sparam_vals,
+            "cfunction",
+            /* outputs: */
+            nargt, isVa, lrt, retboxed, static_rt);
+    if (!err.empty()) {
+        emit_error(ctx, "cfunction " + err);
+        JL_GC_POP();
+        return jl_cgval_t();
+    }
+    if (rt != declrt && rt != (jl_value_t*)jl_any_type)
+        jl_add_method_root(ctx, rt);
+
+    function_sig_t sig("cfunction", lrt, rt, retboxed, argt, unionall_env, nargt, isVa, CallingConv::C, false);
+    if (sig.err_msg.empty() && (sig.isVa || sig.fargt.size() + sig.sret != sig.fargt_sig.size()))
+        sig.err_msg = "cfunction: Vararg syntax not allowed for argument list";
+    if (!sig.err_msg.empty()) {
+        emit_error(ctx, sig.err_msg);
+        JL_GC_POP();
+        return jl_cgval_t();
+    }
+
+    // compute+verify the dispatch signature, and see if it depends on the environment sparams
+    bool approx = false;
+    sigt = (jl_value_t*)jl_alloc_svec(nargt + 1);
+    jl_svecset(sigt, 0, fexpr_rt.typ);
+    if (!fexpr_rt.constant && (!jl_is_concrete_type(fexpr_rt.typ) || jl_is_kind(fexpr_rt.typ)))
+        approx = true;
+    for (size_t i = 0; i < nargt; i++) {
+        jl_value_t *jargty = jl_svecref(argt, i);
+        if (jl_is_abstract_ref_type(jargty)) {
+            jargty = jl_tparam0(jargty);
+            if (!verify_ref_type(ctx, jargty, unionall_env, i + 1, "cfunction")) {
+                JL_GC_POP();
+                return jl_cgval_t();
+            }
+        }
+        if (unionall_env && jl_has_typevar_from_unionall(jargty, unionall_env)) {
+            if (sparam_vals)
+                jargty = jl_instantiate_type_in_env(jargty, unionall_env, jl_svec_data(sparam_vals));
+            else
+                approx = true;
+        }
+        jl_svecset(sigt, i + 1, jargty);
+    }
+    if (approx) {
+        sigt = NULL;
+    }
+    else {
+        sigt = (jl_value_t*)jl_apply_tuple_type((jl_svec_t*)sigt);
+    }
+    if (sigt && !(unionall_env && jl_has_typevar_from_unionall(rt, unionall_env))) {
+        unionall_env = NULL;
+    }
+
+    bool nest = (!fexpr_rt.constant || unionall_env);
+    Value *F = gen_cfun_wrapper(
+            jl_Module,
+            sig, fexpr_rt.constant,
+            NULL, declrt, (jl_tupletype_t*)sigt,
+            unionall_env, sparam_vals, &closure_types);
+    bool outboxed;
+    if (nest) {
+        // F is actually an init_trampoline function that returns the real address
+        // Now fill in the nest parameters
+        Value *fobj = boxed(ctx, fexpr_rt);
+        jl_svec_t *fill = jl_emptysvec; // stays empty unless closure types were collected
+        if (closure_types) {
+            assert(ctx.spvals_ptr);
+            size_t n = jl_array_len(closure_types);
+            fill = jl_alloc_svec_uninit(n); // assign the outer `fill`; re-declaring it here would shadow it
+            for (size_t i = 0; i < n; i++) {
+                jl_svecset(fill, i, jl_array_ptr_ref(closure_types, i));
+            }
+            jl_add_method_root(ctx, (jl_value_t*)fill); // root it: its address is emitted as a literal below
+        }
+        std::stringstream cname;
+        cname << "trampolines" << globalUnique++; // unique global name for this call site
+        Type *T_htable = ArrayType::get(T_size, sizeof(htable_t) / sizeof(void*));
+        Value *cache = new GlobalVariable(*jl_Module, T_htable, false,
+                               GlobalVariable::InternalLinkage,
+                               ConstantAggregateZero::get(T_htable),
+                               cname.str()); // zero-initialized storage handed to the runtime as `cache`
+        F = ctx.builder.CreateCall(prepare_call(jlgetcfunctiontrampoline_func), {
+                 fobj,
+                 literal_pointer_val(ctx, output_type),
+                 ctx.builder.CreateBitCast(cache, T_pint8),
+                 literal_pointer_val(ctx, (jl_value_t*)fill), // must be the populated svec, not jl_emptysvec
+                 F,
+                 closure_types ? literal_pointer_val(ctx, (jl_value_t*)unionall_env) : V_null,
+                 closure_types ? ctx.spvals_ptr : ConstantPointerNull::get(cast<PointerType>(T_pprjlvalue))
+             });
+        outboxed = true; // the runtime call returns a boxed object
+    }
+    else {
+        F = ctx.builder.CreatePtrToInt(F, T_size);
+        outboxed = (output_type != (jl_value_t*)jl_voidpointer_type);
+        if (outboxed) {
+            assert(jl_datatype_size(output_type) == sizeof(void*) * 4);
+            Value *strct = emit_allocobj(ctx, jl_datatype_size(output_type),
+                                         literal_pointer_val(ctx, (jl_value_t*)output_type));
+            Value *derived_strct = emit_bitcast(ctx, decay_derived(strct), T_psize);
+            MDNode *tbaa = best_tbaa(output_type);
+            tbaa_decorate(tbaa, ctx.builder.CreateStore(F, derived_strct));
+            tbaa_decorate(tbaa, ctx.builder.CreateStore(
+                ctx.builder.CreatePtrToInt(literal_pointer_val(ctx, fexpr_rt.constant), T_size),
+                ctx.builder.CreateConstGEP1_32(T_size, derived_strct, 1)));
+            Value *zero = ConstantInt::get(T_size, 0);
+            tbaa_decorate(tbaa, ctx.builder.CreateStore(zero,
+                    ctx.builder.CreateConstGEP1_32(T_size, derived_strct, 2)));
+            tbaa_decorate(tbaa, ctx.builder.CreateStore(zero,
+                    ctx.builder.CreateConstGEP1_32(T_size, derived_strct, 3)));
+            F = strct;
+        }
+    }
+    JL_GC_POP();
+    return mark_julia_type(ctx, F, outboxed, output_type);
+}
+
 const struct jl_typemap_info cfunction_cache = {
     1, (jl_datatype_t**)&jl_array_any_type
 };
 
 jl_array_t *jl_cfunction_list;
 
-static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tupletype_t *argt)
+static Function *jl_cfunction_object(jl_value_t *ff, jl_value_t *declrt, jl_tupletype_t *argt)
 {
     // Assumes the codegen lock is acquired. The caller is responsible for that.
-    jl_value_t *sigt = NULL; // dispatch sig: type signature (argt) with Ref{} annotations removed and ft added
-    JL_GC_PUSH2(&sigt, &ft);
+    jl_ptls_t ptls = jl_get_ptls_states();
+    if (ptls->in_pure_callback)
+        jl_error("cfunction cannot be used in a generated function");
 
     // validate and unpack the arguments
-    JL_TYPECHK(cfunction, type, (jl_value_t*)ft);
     JL_TYPECHK(cfunction, type, declrt);
-    if (!jl_is_tuple_type(argt))
+    if (!jl_is_tuple_type(argt)) // the C API requires that argt Tuple type actually be an svec
         jl_type_error("cfunction", (jl_value_t*)jl_anytuple_type_type, (jl_value_t*)argt);
+    // trampolines are not supported here:
+    // check that f is a guaranteed singleton type
+    jl_value_t *ft = jl_typeof(ff);
+    if (((jl_datatype_t*)ft)->instance != ff)
+        jl_error("cfunction: use `@cfunction` to make closures");
 
     // check the cache structure
     // this has three levels (for the 3 parameters above)
@@ -4588,7 +4866,6 @@ static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tuple
                 for (i = 0; i < l; i += 2) {
                     jl_value_t *ti = jl_svecref(sf, i);
                     if (jl_egal(ti, declrt)) {
-                        JL_GC_POP();
                         return (Function*)jl_unbox_voidpointer(jl_svecref(sf, i + 1));
                     }
                 }
@@ -4606,20 +4883,6 @@ static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tuple
             jl_cfunction_list = jl_eqtable_put(jl_cfunction_list, ft, insert.unknown, NULL);
     }
 
-    // try to avoid needing a trampoline,
-    // if we know that `ft` is some sort of
-    // guaranteed singleton type
-    jl_value_t *ff = NULL;
-    if (jl_is_datatype(ft))
-        ff = ((jl_datatype_t*)ft)->instance;
-    if (jl_is_type_type(ft)) {
-        jl_value_t *tp0 = jl_tparam0(ft);
-        if (jl_is_concrete_type(tp0))
-            ff = tp0;
-    }
-    if (!ff)
-        jl_error("cfunction: function closures not yet supported");
-
     // compute / validate return type
     jl_value_t *crt = declrt;
     if (jl_is_abstract_ref_type(declrt)) {
@@ -4638,6 +4901,8 @@ static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tuple
         lcrt = T_prjlvalue;
 
     // compute / validate method signature
+    jl_value_t *sigt = NULL; // dispatch sig: type signature (argt) with Ref{} annotations removed and ft added
+    JL_GC_PUSH1(&sigt);
     size_t i, nargs = jl_nparams(argt);
     sigt = (jl_value_t*)jl_alloc_svec(nargs + 1);
     jl_svecset(sigt, 0, ft);
@@ -4666,9 +4931,9 @@ static Function *jl_cfunction_cache(jl_value_t *ft, jl_value_t *declrt, jl_tuple
             err = NULL;
         }
         else {
-            auto f = gen_cfun_wrapper(sig, ff, cache_l3, declrt, (jl_tupletype_t*)sigt);
+            Function *F = gen_cfun_wrapper(NULL, sig, ff, cache_l3, declrt, (jl_tupletype_t*)sigt, NULL, NULL, NULL);
             JL_GC_POP();
-            return f;
+            return F;
         }
     }
     if (err)
@@ -6844,6 +7109,16 @@ static void init_julia_llvm_env(Module *m)
     add_return_attr(jl_alloc_obj_func, Attribute::NonNull);
     add_named_global(jl_alloc_obj_func, (void*)NULL, /*dllimport*/false);
 
+    std::vector<Type*> newbits_args(0);
+    newbits_args.push_back(T_prjlvalue);
+    newbits_args.push_back(T_pint8);
+    jl_newbits_func = Function::Create(FunctionType::get(T_prjlvalue, newbits_args, false),
+                                         Function::ExternalLinkage,
+                                         "jl_new_bits");
+    add_return_attr(jl_newbits_func, Attribute::NoAlias);
+    add_return_attr(jl_newbits_func, Attribute::NonNull);
+    add_named_global(jl_newbits_func, (void*)jl_new_bits);
+
     jl_typeof_func = Function::Create(FunctionType::get(T_prjlvalue, {T_prjlvalue}, false),
                                       Function::ExternalLinkage,
                                       "julia.typeof");
@@ -6873,6 +7148,21 @@ static void init_julia_llvm_env(Module *m)
                          "jl_load_and_lookup", m);
     add_named_global(jldlsym_func, &jl_load_and_lookup);
 
+    std::vector<Type *> getcfunctiontrampoline_args(0);
+    getcfunctiontrampoline_args.push_back(T_prjlvalue); // f (object)
+    getcfunctiontrampoline_args.push_back(T_pjlvalue); // result
+    getcfunctiontrampoline_args.push_back(T_pint8); // cache
+    getcfunctiontrampoline_args.push_back(T_pjlvalue); // fill
+    getcfunctiontrampoline_args.push_back(FunctionType::get(T_pint8, { T_pint8, T_ppjlvalue }, false)->getPointerTo()); // trampoline
+    getcfunctiontrampoline_args.push_back(T_pjlvalue); // env
+    getcfunctiontrampoline_args.push_back(T_pprjlvalue); // vals
+    jlgetcfunctiontrampoline_func =
+        Function::Create(FunctionType::get(T_prjlvalue, getcfunctiontrampoline_args, false),
+                         Function::ExternalLinkage,
+                         "jl_get_cfunction_trampoline", m);
+    add_return_attr(jlgetcfunctiontrampoline_func, Attribute::NonNull);
+    add_named_global(jlgetcfunctiontrampoline_func, &jl_get_cfunction_trampoline);
+
     std::vector<Type *> getnthfld_args(0);
     getnthfld_args.push_back(T_prjlvalue);
     getnthfld_args.push_back(T_size);
@@ -6881,13 +7171,13 @@ static void init_julia_llvm_env(Module *m)
                          Function::ExternalLinkage,
                          "jl_get_nth_field_checked", m);
     add_return_attr(jlgetnthfieldchecked_func, Attribute::NonNull);
-    add_named_global(jlgetnthfieldchecked_func, *jl_get_nth_field_checked);
+    add_named_global(jlgetnthfieldchecked_func, &jl_get_nth_field_checked);
 
     diff_gc_total_bytes_func =
         Function::Create(FunctionType::get(T_int64, false),
                          Function::ExternalLinkage,
                          "jl_gc_diff_total_bytes", m);
-    add_named_global(diff_gc_total_bytes_func, *jl_gc_diff_total_bytes);
+    add_named_global(diff_gc_total_bytes_func, &jl_gc_diff_total_bytes);
 
     std::vector<Type*> array_owner_args(0);
     array_owner_args.push_back(T_prjlvalue);
@@ -6898,7 +7188,7 @@ static void init_julia_llvm_env(Module *m)
     jlarray_data_owner_func->addFnAttr(Attribute::ReadOnly);
     jlarray_data_owner_func->addFnAttr(Attribute::NoUnwind);
     add_return_attr(jlarray_data_owner_func, Attribute::NonNull);
-    add_named_global(jlarray_data_owner_func, jl_array_data_owner);
+    add_named_global(jlarray_data_owner_func, &jl_array_data_owner);
 
     gcroot_flush_func = Function::Create(FunctionType::get(T_void, false),
                                          Function::ExternalLinkage,
diff --git a/src/julia-syntax.scm b/src/julia-syntax.scm
index d48189f0481a98..54a4446bb77f14 100644
--- a/src/julia-syntax.scm
+++ b/src/julia-syntax.scm
@@ -3053,7 +3053,7 @@ f(x) = yt(x)
                                (memq (car e) '(quote top core line inert local local-def unnecessary
                                                meta inbounds boundscheck simdloop decl
                                                implicit-global global globalref outerref
-                                               const = null method call foreigncall ssavalue
+                                               const = null method call foreigncall cfunction ssavalue
                                                gc_preserve_begin gc_preserve_end))))
                          (lam:body lam))))
                (unused (map cadr (filter (lambda (x) (memq (car x) '(method =)))
@@ -3400,7 +3400,7 @@ f(x) = yt(x)
   (or (ssavalue? lhs)
       (valid-ir-argument? e)
       (and (symbol? lhs) (pair? e)
-           (memq (car e) '(new the_exception isdefined call invoke foreigncall gc_preserve_begin)))))
+           (memq (car e) '(new the_exception isdefined call invoke foreigncall cfunction gc_preserve_begin)))))
 
 (define (valid-ir-return? e)
   ;; returning lambda directly is needed for @generated
@@ -3574,7 +3574,7 @@ f(x) = yt(x)
                   ((and (pair? e1) (eq? (car e1) 'globalref)) (emit e1) #f) ;; keep globals for undefined-var checking
                   (else #f)))
           (case (car e)
-            ((call new foreigncall)
+            ((call new foreigncall cfunction)
              (let* ((args
                      (cond ((eq? (car e) 'foreigncall)
                             (for-each (lambda (a)
@@ -3592,6 +3592,11 @@ f(x) = yt(x)
                                         (list (cadr e)))
                                     (list-head (cddr e) 4)
                                     (compile-args (list-tail e 6) break-labels linearize-args)))
+                           ;; NOTE: arguments of cfunction must be left in place
+                           ;;       except for argument 2 (fptr)
+                           ((eq? (car e) 'cfunction)
+                            (let ((fptr (car (compile-args (list (caddr e)) break-labels linearize-args))))
+                              (cons (cadr e) (cons fptr (cdddr e)))))
                            ;; TODO: evaluate first argument to cglobal some other way
                            ((and (length> e 2)
                                  (or (eq? (cadr e) 'cglobal)
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 3155c202f202bf..d65980556190a9 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -743,6 +743,12 @@ extern void *jl_winsock_handle;
 void *jl_get_library(const char *f_lib);
 JL_DLLEXPORT void *jl_load_and_lookup(const char *f_lib, const char *f_name,
                                       void **hnd);
+JL_DLLEXPORT jl_value_t *jl_get_cfunction_trampoline(
+    jl_value_t *fobj, jl_datatype_t *result, htable_t *cache, jl_svec_t *fill,
+    void *(*init_trampoline)(void *tramp, void **nval),
+    jl_unionall_t *env, jl_value_t **vals);
+
+
 // Windows only
 #define JL_EXE_LIBNAME ((const char*)1)
 #define JL_DL_LIBNAME ((const char*)2)
@@ -1012,7 +1018,9 @@ extern jl_sym_t *lambda_sym;  extern jl_sym_t *assign_sym;
 extern jl_sym_t *method_sym;  extern jl_sym_t *slot_sym;
 extern jl_sym_t *enter_sym;   extern jl_sym_t *leave_sym;
 extern jl_sym_t *exc_sym;     extern jl_sym_t *new_sym;
-extern jl_sym_t *compiler_temp_sym; extern jl_sym_t *foreigncall_sym;
+extern jl_sym_t *compiler_temp_sym;
+extern jl_sym_t *foreigncall_sym;
+extern jl_sym_t *cfunction_sym;
 extern jl_sym_t *const_sym;   extern jl_sym_t *thunk_sym;
 extern jl_sym_t *underscore_sym; extern jl_sym_t *colon_sym;
 extern jl_sym_t *abstracttype_sym; extern jl_sym_t *primtype_sym;
diff --git a/src/method.c b/src/method.c
index 2ad7cdf7f80923..a7ad458bee04f5 100644
--- a/src/method.c
+++ b/src/method.c
@@ -42,6 +42,51 @@ static jl_value_t *resolve_globals(jl_value_t *expr, jl_module_t *module, jl_sve
         }
         else {
             size_t i = 0, nargs = jl_array_len(e->args);
+            if (e->head == cfunction_sym) {
+                JL_NARGS(cfunction method definition, 5, 5); // (type, func, rt, at, cc)
+                jl_value_t *typ = jl_exprarg(e, 0);
+                if (!jl_is_type(typ))
+                    jl_error("first parameter to :cfunction must be a type");
+                if (typ == (jl_value_t*)jl_voidpointer_type) {
+                    jl_value_t *a = jl_exprarg(e, 1);
+                    JL_TYPECHK(cfunction method definition, quotenode, a);
+                    *(jl_value_t**)a = jl_toplevel_eval(module, *(jl_value_t**)a);
+                    jl_gc_wb(a, *(jl_value_t**)a);
+                }
+                jl_value_t *rt = jl_exprarg(e, 2);
+                jl_value_t *at = jl_exprarg(e, 3);
+                if (!jl_is_type(rt)) {
+                    JL_TRY {
+                        rt = jl_interpret_toplevel_expr_in(module, rt, NULL, sparam_vals);
+                    }
+                    JL_CATCH {
+                        if (jl_typeis(jl_exception_in_transit, jl_errorexception_type))
+                            jl_error("could not evaluate cfunction return type (it might depend on a local variable)");
+                        else
+                            jl_rethrow();
+                    }
+                    jl_exprargset(e, 2, rt);
+                }
+                if (!jl_is_svec(at)) {
+                    JL_TRY {
+                        at = jl_interpret_toplevel_expr_in(module, at, NULL, sparam_vals);
+                    }
+                    JL_CATCH {
+                        if (jl_typeis(jl_exception_in_transit, jl_errorexception_type))
+                            jl_error("could not evaluate cfunction argument type (it might depend on a local variable)");
+                        else
+                            jl_rethrow();
+                    }
+                    jl_exprargset(e, 3, at);
+                }
+                if (jl_is_svec(rt))
+                    jl_error("cfunction: missing return type");
+                JL_TYPECHK(cfunction method definition, type, rt);
+                JL_TYPECHK(cfunction method definition, simplevector, at);
+                JL_TYPECHK(cfunction method definition, quotenode, jl_exprarg(e, 4));
+                JL_TYPECHK(cfunction method definition, symbol, *(jl_value_t**)jl_exprarg(e, 4));
+                return expr;
+            }
             if (e->head == foreigncall_sym) {
                 JL_NARGSV(ccall method definition, 5); // (fptr, rt, at, cc, narg)
                 jl_value_t *rt = jl_exprarg(e, 1);
@@ -79,7 +124,7 @@ static jl_value_t *resolve_globals(jl_value_t *expr, jl_module_t *module, jl_sve
                 JL_TYPECHK(ccall method definition, long, jl_exprarg(e, 4));
             }
             if (e->head == method_sym || e->head == abstracttype_sym || e->head == structtype_sym ||
-                e->head == primtype_sym || e->head == module_sym) {
+                     e->head == primtype_sym || e->head == module_sym) {
                 i++;
             }
             for (; i < nargs; i++) {
@@ -527,11 +572,10 @@ static jl_method_t *jl_new_method(
     JL_GC_PUSH1(&root);
 
     m = jl_new_method_uninit(inmodule);
-    m->sparam_syms = sparam_syms;
     root = (jl_value_t*)m;
-    m->min_world = ++jl_world_counter;
-    m->name = name;
     m->sig = (jl_value_t*)sig;
+    m->sparam_syms = sparam_syms;
+    m->name = name;
     m->isva = isva;
     m->nargs = nargs;
     jl_method_set_source(m, definition);
@@ -547,6 +591,7 @@ static jl_method_t *jl_new_method(
     }
 
     JL_GC_POP();
+    m->min_world = ++jl_world_counter;
     return m;
 }
 
diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp
index e87ce1ec43667f..e7f04b76b9967e 100644
--- a/src/runtime_ccall.cpp
+++ b/src/runtime_ccall.cpp
@@ -10,6 +10,13 @@
 #include "processor.h"
 #include "julia_assert.h"
 
+#ifndef _OS_WINDOWS_
+#include <sys/mman.h>
+#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif
+
 using namespace llvm;
 
 // --- library symbol lookup ---
@@ -97,3 +104,138 @@ jl_value_t *jl_get_JIT(void)
     const std::string& HostJITName = "ORCJIT";
     return jl_pchar_to_string(HostJITName.data(), HostJITName.size());
 }
+
+
+static void *trampoline_freelist;
+
+static void *trampoline_alloc() // pop one sz-byte slot off the free list, mapping a new page when it is empty
+{
+    const int sz = 64; // oversized for most platforms. todo: use precise value?
+    if (!trampoline_freelist) {
+#ifdef _OS_WINDOWS_
+        void *mem = VirtualAlloc(NULL, jl_page_size,
+                MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+        if (mem == NULL) jl_throw(jl_memory_exception); // VirtualAlloc reports failure as NULL
+#else
+        void *mem = mmap(0, jl_page_size, PROT_READ | PROT_WRITE | PROT_EXEC,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (mem == MAP_FAILED) jl_throw(jl_memory_exception); // mmap reports failure as MAP_FAILED, not NULL
+#endif
+        for (size_t i = 0; i + sz <= jl_page_size; i += sz) { // carve the page into sz-byte slots
+            void **curr = (void**)((char*)mem + i);
+            *curr = trampoline_freelist; // first word links to the previous head (NULL for the first slot)
+            trampoline_freelist = (void*)curr;
+        }
+    }
+    void *tramp = trampoline_freelist;
+    trampoline_freelist = *(void**)tramp; // unlink the head slot
+    return tramp;
+}
+
+static void trampoline_free(void *tramp) // push a slot back onto trampoline_freelist
+{
+    *(void**)tramp = trampoline_freelist; // the slot's first word links to the previous head
+    trampoline_freelist = tramp; // the slot becomes the new head
+}
+
+static void trampoline_deleter(void **f)
+{
+    // `f` addresses the words of a trampoline object as filled in by
+    // jl_get_cfunction_trampoline: [0] slot, [1] fobj, [2] cache, [3] nval
+    void *slot = f[0], *callee = f[1], *env = f[3];
+    htable_t *owner = (htable_t*)f[2];
+    // clear the released fields up front ([1] is left untouched)
+    f[0] = f[2] = f[3] = NULL;
+    if (slot != NULL)
+        trampoline_free(slot);
+    // drop the cache entry keyed by fobj, when both are known
+    if (callee != NULL && owner != NULL)
+        ptrhash_remove(owner, callee);
+    // free(NULL) is a no-op, so no guard is needed here
+    free(env);
+}
+
+// TODO: need a thread lock around the cache access parts of this function
+extern "C" JL_DLLEXPORT
+jl_value_t *jl_get_cfunction_trampoline(
+    // dynamic inputs:
+    jl_value_t *fobj, // callee object: the cache key, also stored as nval[0]
+    jl_datatype_t *result_type, // type of the object that is returned
+    // call-site constants:
+    htable_t *cache, // weakref htable indexed by (fobj, vals)
+    jl_svec_t *fill, // types instantiated in (env, vals) to produce nval[1..n]
+    void *(*init_trampoline)(void *tramp, void **nval), // invoked once on each newly allocated slot
+    jl_unionall_t *env, // environment handed to jl_instantiate_type_in_env
+    jl_value_t **vals) // values for env; the pointer itself keys the outer cache level
+{
+    // lookup (fobj, vals) in cache; a hit returns the previously built object
+    if (!cache->table)
+        htable_new(cache, 1); // table not set up yet: initialize it on first use
+    if (fill != jl_emptysvec) { // non-empty fill: switch to a nested table selected by `vals`
+        htable_t **cache2 = (htable_t**)ptrhash_bp(cache, (void*)vals);
+        cache = *cache2;
+        if (cache == HT_NOTFOUND) {
+            cache = htable_new((htable_t*)malloc(sizeof(htable_t)), 1); // NOTE(review): malloc result is not checked
+            *cache2 = cache;
+        }
+    }
+    void *tramp = ptrhash_get(cache, (void*)fobj);
+    if (tramp != HT_NOTFOUND) {
+        assert((jl_datatype_t*)jl_typeof(tramp) == result_type);
+        return (jl_value_t*)tramp;
+    }
+
+    // not found, allocate a new one
+    size_t n = jl_svec_len(fill);
+    void **nval = (void**)malloc(sizeof(void**) * (n + 1)); // n + 1 pointer slots; NOTE(review): element type is void*, and malloc is unchecked
+    nval[0] = (void*)fobj;
+    jl_value_t *result;
+    JL_TRY {
+        for (size_t i = 0; i < n; i++) {
+            jl_value_t *sparam_val = jl_instantiate_type_in_env(jl_svecref(fill, i), env, vals);
+            if (sparam_val != (jl_value_t*)jl_any_type)
+                if (!jl_is_concrete_type(sparam_val) || !jl_is_immutable(sparam_val))
+                    sparam_val = NULL; // only Any or a concrete immutable type is kept; anything else becomes NULL
+            nval[i + 1] = (void*)sparam_val;
+        }
+        int permanent = // permanent results are malloc'd and get no finalizer
+            (result_type == jl_voidpointer_type) ||
+            jl_is_concrete_type(fobj) ||
+            (((jl_datatype_t*)jl_typeof(fobj))->instance == fobj); // fobj is the singleton instance of its type
+        if (jl_is_unionall(fobj)) {
+            jl_value_t *uw = jl_unwrap_unionall(fobj);
+            if (jl_is_datatype(uw) && ((jl_datatype_t*)uw)->name->wrapper == fobj)
+                permanent = true; // fobj is the wrapper of its own type name
+        }
+        if (permanent) {
+            result = jl_valueof(malloc(sizeof(jl_taggedvalue_t) + jl_datatype_size(result_type))); // tag word + payload; NOTE(review): malloc unchecked
+            jl_set_typeof(result, result_type);
+            memset(result, 0, jl_datatype_size(result_type)); // zero the payload
+        }
+        else {
+            result = jl_new_struct_uninit(result_type); // every word is assigned below
+        }
+        if (result_type != jl_voidpointer_type) {
+            assert(jl_datatype_size(result_type) == sizeof(void*) * 4);
+            ((void**)result)[1] = (void*)fobj; // word 1: the callee object
+        }
+        if (!permanent) {
+            void *ptr_finalizer[2] = { // stack pair laid out as a type tag followed by a pointer value
+                    (void*)jl_voidpointer_type,
+                    (void*)&trampoline_deleter
+                };
+            jl_gc_add_finalizer(result, (jl_value_t*)&ptr_finalizer[1]);
+            ((void**)result)[2] = (void*)cache; // word 2: table the deleter removes fobj from
+            ((void**)result)[3] = (void*)nval; // word 3: buffer the deleter frees
+        }
+    }
+    JL_CATCH {
+        free(nval); // release the buffer before propagating the error
+        jl_rethrow();
+    }
+    tramp = trampoline_alloc();
+    ((void**)result)[0] = tramp; // word 0: the trampoline slot address
+    tramp = init_trampoline(tramp, nval); // NOTE(review): the returned value is not read afterwards - confirm that is intended
+    ptrhash_put(cache, (void*)fobj, result); // remember the object for later lookups with the same fobj
+    return result;
+}
diff --git a/src/toplevel.c b/src/toplevel.c
index 5fc25aa4b9b614..d5468c42e91460 100644
--- a/src/toplevel.c
+++ b/src/toplevel.c
@@ -340,6 +340,10 @@ static void expr_attributes(jl_value_t *v, int *has_intrinsics, int *has_defs)
              head == structtype_sym || jl_is_toplevel_only_expr(v)) {
         *has_defs = 1;
     }
+    else if (head == cfunction_sym) {
+        *has_intrinsics = 1;
+        return;
+    }
     else if (head == foreigncall_sym) {
         *has_intrinsics = 1;
         return;
@@ -829,12 +833,15 @@ jl_value_t *jl_toplevel_eval_flex(jl_module_t *m, jl_value_t *e, int fast, int e
         // worthwhile and also unsound (see #24316).
         // TODO: This is still not correct since an `eval` can happen elsewhere, but it
         // helps in common cases.
+        size_t last_age = ptls->world_age;
+        size_t world = jl_world_counter;
+        ptls->world_age = world;
         if (!has_defs) {
-            size_t world = jl_get_ptls_states()->world_age;
             jl_type_infer(&li, world, 0);
         }
         jl_value_t *dummy_f_arg = NULL;
         result = jl_call_method_internal(li, &dummy_f_arg, 1);
+        ptls->world_age = last_age;
     }
     else {
         // use interpreter
diff --git a/stdlib/FileWatching/src/FileWatching.jl b/stdlib/FileWatching/src/FileWatching.jl
index 973c82dd52e6ce..d2a9c4edd37241 100644
--- a/stdlib/FileWatching/src/FileWatching.jl
+++ b/stdlib/FileWatching/src/FileWatching.jl
@@ -311,13 +311,6 @@ function _uv_hook_close(uv::FolderMonitor)
     nothing
 end
 
-function __init__()
-    global uv_jl_pollcb = cfunction(uv_pollcb, Cvoid, Tuple{Ptr{Cvoid}, Cint, Cint})
-    global uv_jl_fspollcb = cfunction(uv_fspollcb, Cvoid, Tuple{Ptr{Cvoid}, Cint, Ptr{Cvoid}, Ptr{Cvoid}})
-    global uv_jl_fseventscb_file = cfunction(uv_fseventscb_file, Cvoid, Tuple{Ptr{Cvoid}, Ptr{Int8}, Int32, Int32})
-    global uv_jl_fseventscb_folder = cfunction(uv_fseventscb_folder, Cvoid, Tuple{Ptr{Cvoid}, Ptr{Int8}, Int32, Int32})
-end
-
 function uv_fseventscb_file(handle::Ptr{Cvoid}, filename::Ptr, events::Int32, status::Int32)
     t = @handle_as handle FileMonitor
     if status != 0
@@ -373,6 +366,14 @@ function uv_fspollcb(handle::Ptr{Cvoid}, status::Int32, prev::Ptr, curr::Ptr)
     nothing
 end
 
+function __init__() # create the C-callable callback pointers and store them in module globals
+    global uv_jl_pollcb = @cfunction(uv_pollcb, Cvoid, (Ptr{Cvoid}, Cint, Cint))
+    global uv_jl_fspollcb = @cfunction(uv_fspollcb, Cvoid, (Ptr{Cvoid}, Cint, Ptr{Cvoid}, Ptr{Cvoid}))
+    global uv_jl_fseventscb_file = @cfunction(uv_fseventscb_file, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32))
+    global uv_jl_fseventscb_folder = @cfunction(uv_fseventscb_folder, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32))
+    nothing # return nothing instead of the last pointer assigned above
+end
+
 function start_watching(t::_FDWatcher)
     t.handle == C_NULL && return throw(ArgumentError("FDWatcher is closed"))
     readable = t.refcount[1] > 0
diff --git a/stdlib/LibGit2/src/callbacks.jl b/stdlib/LibGit2/src/callbacks.jl
index 8885c8a242d0b2..8d31b38a0b811b 100644
--- a/stdlib/LibGit2/src/callbacks.jl
+++ b/stdlib/LibGit2/src/callbacks.jl
@@ -260,8 +260,8 @@ For addition details see the LibGit2 guide on
 [authenticating against a server](https://libgit2.github.com/docs/guides/authentication/).
 """
 function credentials_callback(libgit2credptr::Ptr{Ptr{Cvoid}}, url_ptr::Cstring,
-                              username_ptr::Cstring,
-                              allowed_types::Cuint, p::CredentialPayload)
+                              username_ptr::Cstring, allowed_types::Cuint,
+                              p::CredentialPayload)
     err = Cint(0)
 
     # Parse URL only during the first call to this function. Future calls will use the
@@ -340,7 +340,7 @@ function credentials_callback(libgit2credptr::Ptr{Ptr{Cvoid}}, url_ptr::Cstring,
                               username_ptr::Cstring, allowed_types::Cuint,
                               payloads::Dict)
     p = payloads[:credentials]
-    credentials_callback(libgit2credptr, url_ptr, username_ptr, allowed_types, p)
+    return credentials_callback(libgit2credptr, url_ptr, username_ptr, allowed_types, p)
 end
 
 function fetchhead_foreach_callback(ref_name::Cstring, remote_url::Cstring,
@@ -352,8 +352,8 @@ function fetchhead_foreach_callback(ref_name::Cstring, remote_url::Cstring,
 end
 
 "C function pointer for `mirror_callback`"
-mirror_cb() = cfunction(mirror_callback, Cint, Tuple{Ptr{Ptr{Cvoid}}, Ptr{Cvoid}, Cstring, Cstring, Ptr{Cvoid}})
+mirror_cb() = @cfunction(mirror_callback, Cint, (Ptr{Ptr{Cvoid}}, Ptr{Cvoid}, Cstring, Cstring, Ptr{Cvoid}))
 "C function pointer for `credentials_callback`"
-credentials_cb() = cfunction(credentials_callback, Cint, Tuple{Ptr{Ptr{Cvoid}}, Cstring, Cstring, Cuint, Any})
+credentials_cb() = @cfunction(credentials_callback, Cint, (Ptr{Ptr{Cvoid}}, Cstring, Cstring, Cuint, Any))
 "C function pointer for `fetchhead_foreach_callback`"
-fetchhead_foreach_cb() = cfunction(fetchhead_foreach_callback, Cint, Tuple{Cstring, Cstring, Ptr{GitHash}, Cuint, Ptr{Cvoid}})
+fetchhead_foreach_cb() = @cfunction(fetchhead_foreach_callback, Cint, (Cstring, Cstring, Ptr{GitHash}, Cuint, Ptr{Cvoid}))
diff --git a/stdlib/LibGit2/src/tree.jl b/stdlib/LibGit2/src/tree.jl
index d991908445d032..b9d652bead90fa 100644
--- a/stdlib/LibGit2/src/tree.jl
+++ b/stdlib/LibGit2/src/tree.jl
@@ -1,7 +1,7 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 """
-    treewalk(f::Function, tree::GitTree, payload=Any[], post::Bool=false)
+    treewalk(f, tree::GitTree, post::Bool=false)
 
 Traverse the entries in `tree` and its subtrees in post or pre order. Preorder
 means beginning at the root and then traversing the leftmost subtree (and
@@ -12,18 +12,25 @@ subtree, traversing upwards through it, then traversing the next right subtree
 
 The function parameter `f` should have following signature:
 
-    (Cstring, Ptr{Cvoid}, Ptr{Cvoid}) -> Cint
+    (String, GitTreeEntry) -> Cint
 
 A negative value returned from `f` stops the tree walk. A positive value means
 that the entry will be skipped if `post` is `false`.
 """
-function treewalk(f::Function, tree::GitTree, payload=Any[], post::Bool = false)
-    cbf = cfunction(f, Cint, Tuple{Cstring, Ptr{Cvoid}, Ptr{Cvoid}})
-    cbf_payload = Ref{typeof(payload)}(payload)
+function treewalk(f, tree::GitTree, post::Bool = false)
     # NOTE: don't use @check/GitError directly, because the code can be arbitrary
+    payload = Any[ tree, f ]
+    cbf = @cfunction(function treewalk_callback(root_cstr::Cstring, entry_ptr::Ptr{Cvoid}, payload::Vector{Any})::Cint
+            # decode arguments
+            root = unsafe_string(root_cstr)
+            tree = payload[1]::GitTree
+            f = payload[2]
+            entry = GitTreeEntry(tree, entry_ptr, false)
+            return f(root, entry)
+        end, Cint, (Cstring, Ptr{Cvoid}, Ref{Vector{Any}}))
     err = ccall((:git_tree_walk, :libgit2), Cint,
-                (Ptr{Cvoid}, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
-                tree.ptr, post, cbf, cbf_payload)
+                (Ptr{Cvoid}, Cint, Ptr{Cvoid}, Any),
+                tree.ptr, post, cbf, payload)
     if err < 0
         err_class, _ = Error.last_error()
         if err_class != Error.Callback
@@ -31,8 +38,7 @@ function treewalk(f::Function, tree::GitTree, payload=Any[], post::Bool = false)
             throw(GitError(err))
         end
     end
-
-    return cbf_payload
+    nothing
 end
 
 repository(tree::GitTree) = tree.owner
@@ -141,7 +147,7 @@ the case of a file, or another [`GitTree`](@ref) if looking up a directory).
 # Examples
 ```julia
 tree = LibGit2.GitTree(repo, "HEAD^{tree}")
-readme = tree["README.md]
+readme = tree["README.md"]
 subtree = tree["test"]
 runtests = subtree["runtests.jl"]
 ```
@@ -155,31 +161,20 @@ function Base.getindex(tree::GitTree, target::AbstractString)
         return tree
     end
 
-    payload = Any[tree, target, nothing]
-    treewalk(_getindex_callback, tree, payload)
-    oid = payload[3]
-    if oid === nothing
-        throw(KeyError(target))
+    local oid = nothing
+    function _getindex_callback(root::String, entry::GitTreeEntry)::Cint
+        path = joinpath(root, filename(entry))
+        if path == target
+            # we found the target, save the oid and stop the walk
+            oid = entryid(entry)
+            return -1
+        elseif entrytype(entry) == GitTree && !startswith(target, path)
+            # this subtree isn't relevant, so skip it
+            return 1
+        end
+        return 0
     end
+    treewalk(_getindex_callback, tree)
+    oid === nothing && throw(KeyError(target))
     return GitObject(repository(tree), oid)
 end
-
-function _getindex_callback(root_cstr, entry_ptr, payload_ptr)
-    # decode arguments
-    root = unsafe_string(root_cstr)
-    payload = Base.unsafe_pointer_to_objref(payload_ptr)
-    tree = payload[1]
-    target = payload[2]
-    entry = GitTreeEntry(tree, entry_ptr, false)
-
-    path = joinpath(root, filename(entry))
-    if path == target
-        # we found the target, save the oid and stop the walk
-        payload[3] = entryid(entry)
-        return Cint(-1)
-    elseif entrytype(entry) == GitTree && !startswith(target, path)
-        # this subtree isn't relevant, so skip it
-        return Cint(1)
-    end
-    return Cint(0)
-end
diff --git a/stdlib/LibGit2/test/libgit2.jl b/stdlib/LibGit2/test/libgit2.jl
index 2d22b5d289d0e5..32be56fd390265 100644
--- a/stdlib/LibGit2/test/libgit2.jl
+++ b/stdlib/LibGit2/test/libgit2.jl
@@ -4,7 +4,7 @@ module LibGit2Tests
 
 import LibGit2
 using Test
-using Random, Serialization
+using Random, Serialization, Sockets
 
 const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test")
 isdefined(Main, :TestHelpers) || @eval Main include(joinpath($(BASE_TEST_PATH), "TestHelpers.jl"))
diff --git a/stdlib/LibGit2/test/online.jl b/stdlib/LibGit2/test/online.jl
index d40e42677b0c0f..6de50a66a34925 100644
--- a/stdlib/LibGit2/test/online.jl
+++ b/stdlib/LibGit2/test/online.jl
@@ -39,7 +39,7 @@ mktempdir() do dir
             status = Ref((current=0, total=-1))
             callbacks = LibGit2.Callbacks(
                 :transfer_progress => (
-                    cfunction(transfer_progress, Cint, Tuple{Ptr{LibGit2.TransferProgress}, Any}),
+                    @cfunction(transfer_progress, Cint, (Ptr{LibGit2.TransferProgress}, Any)),
                     status,
                 )
             )
diff --git a/stdlib/Libdl/src/Libdl.jl b/stdlib/Libdl/src/Libdl.jl
index bc1831125f0fda..e9616e933f2400 100644
--- a/stdlib/Libdl/src/Libdl.jl
+++ b/stdlib/Libdl/src/Libdl.jl
@@ -236,8 +236,8 @@ function dllist()
     dynamic_libraries = Vector{AbstractString}()
 
     @static if Sys.islinux()
-        callback = cfunction(dl_phdr_info_callback, Cint,
-                             Tuple{Ref{dl_phdr_info}, Csize_t, Ref{Vector{AbstractString}}})
+        callback = @cfunction(dl_phdr_info_callback, Cint,
+                              (Ref{dl_phdr_info}, Csize_t, Ref{Vector{AbstractString}}))
         ccall(:dl_iterate_phdr, Cint, (Ptr{Cvoid}, Ref{Vector{AbstractString}}), callback, dynamic_libraries)
     end
 
@@ -256,8 +256,8 @@ function dllist()
     end
 
     @static if Sys.isbsd() && !Sys.isapple()
-        callback = cfunction(dl_phdr_info_callback, Cint,
-                             Tuple{Ref{dl_phdr_info}, Csize_t, Ref{Vector{AbstractString}}})
+        callback = @cfunction(dl_phdr_info_callback, Cint,
+                              (Ref{dl_phdr_info}, Csize_t, Ref{Vector{AbstractString}}))
         ccall(:dl_iterate_phdr, Cint, (Ptr{Cvoid}, Ref{Vector{AbstractString}}), callback, dynamic_libraries)
         popfirst!(dynamic_libraries)
     end
diff --git a/stdlib/Sockets/src/Sockets.jl b/stdlib/Sockets/src/Sockets.jl
index e947da8fd48cc1..4ca5383c3a5e73 100644
--- a/stdlib/Sockets/src/Sockets.jl
+++ b/stdlib/Sockets/src/Sockets.jl
@@ -645,12 +645,12 @@ include("PipeServer.jl")
 # libuv callback handles
 
 function __init__()
-    global uv_jl_getaddrinfocb = cfunction(uv_getaddrinfocb, Cvoid, Tuple{Ptr{Cvoid}, Cint, Ptr{Cvoid}})
-    global uv_jl_getnameinfocb = cfunction(uv_getnameinfocb, Cvoid, Tuple{Ptr{Cvoid}, Cint, Cstring, Cstring})
-    global uv_jl_recvcb        = cfunction(uv_recvcb, Cvoid, Tuple{Ptr{Cvoid}, Cssize_t, Ptr{Cvoid}, Ptr{Cvoid}, Cuint})
-    global uv_jl_sendcb        = cfunction(uv_sendcb, Cvoid, Tuple{Ptr{Cvoid}, Cint})
-    global uv_jl_connectioncb  = cfunction(uv_connectioncb, Cvoid, Tuple{Ptr{Cvoid}, Cint})
-    global uv_jl_connectcb     = cfunction(uv_connectcb, Cvoid, Tuple{Ptr{Cvoid}, Cint})
+    global uv_jl_getaddrinfocb = @cfunction(uv_getaddrinfocb, Cvoid, (Ptr{Cvoid}, Cint, Ptr{Cvoid}))
+    global uv_jl_getnameinfocb = @cfunction(uv_getnameinfocb, Cvoid, (Ptr{Cvoid}, Cint, Cstring, Cstring))
+    global uv_jl_recvcb        = @cfunction(uv_recvcb, Cvoid, (Ptr{Cvoid}, Cssize_t, Ptr{Cvoid}, Ptr{Cvoid}, Cuint))
+    global uv_jl_sendcb        = @cfunction(uv_sendcb, Cvoid, (Ptr{Cvoid}, Cint))
+    global uv_jl_connectioncb  = @cfunction(uv_connectioncb, Cvoid, (Ptr{Cvoid}, Cint))
+    global uv_jl_connectcb     = @cfunction(uv_connectcb, Cvoid, (Ptr{Cvoid}, Cint))
 end
 
 # deprecations
diff --git a/test/ambiguous.jl b/test/ambiguous.jl
index 674f549ce3ba86..5212525ecf8196 100644
--- a/test/ambiguous.jl
+++ b/test/ambiguous.jl
@@ -1,5 +1,7 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
+world_counter() = ccall(:jl_get_world_counter, UInt, ())
+
 # DO NOT ALTER ORDER OR SPACING OF METHODS BELOW
 const lineoffset = @__LINE__
 ambig(x, y) = 1
@@ -69,7 +71,7 @@ end
 # Test that non-ambiguous cases work
 let io = IOBuffer()
     @test precompile(ambig, (Int, Int)) == true
-    cf = cfunction(ambig, Int, Tuple{Int, Int})
+    cf = @eval @cfunction(ambig, Int, (Int, Int))
     @test ccall(cf, Int, (Int, Int), 1, 2) == 4
     @test length(code_lowered(ambig, (Int, Int))) == 1
     @test length(code_typed(ambig, (Int, Int))) == 1
@@ -78,8 +80,9 @@ end
 # Test that ambiguous cases fail appropriately
 let io = IOBuffer()
     @test precompile(ambig, (UInt8, Int)) == false
-    cf = cfunction(ambig, Int, Tuple{UInt8, Int})  # test for a crash (doesn't throw an error)
-    @test_throws MethodError ccall(cf, Int, (UInt8, Int), 1, 2)
+    cf = @eval @cfunction(ambig, Int, (UInt8, Int))  # test for a crash (doesn't throw an error)
+    @test_throws(MethodError(ambig, (UInt8(1), Int(2)), world_counter()),
+                 ccall(cf, Int, (UInt8, Int), 1, 2))
     @test_throws(ErrorException("no unique matching method found for the specified argument types"),
                  which(ambig, (UInt8, Int)))
     @test length(code_typed(ambig, (UInt8, Int))) == 0
diff --git a/test/ccall.jl b/test/ccall.jl
index 5f129ed3b3cae3..ff6e3ade8c4d4c 100644
--- a/test/ccall.jl
+++ b/test/ccall.jl
@@ -755,96 +755,194 @@ end
 ## cfunction roundtrip
 
 verbose && Libc.flush_cstdio()
+verbose && println("Testing cfunction closures: ")
+
+# helper Type for testing that constructors work
+# with cfunction and that object identity is preserved
+mutable struct IdentityTestKV{K, V}
+    (T::Type{<:IdentityTestKV})(S) = (@test T === S; T)
+end
+
+@noinline function testclosure(f, a::T, permanent::Bool=false, tt::Type{S}=Any) where {T, S}
+    @nospecialize(f, a, tt)
+    # generic API 1
+    cf = @cfunction $f Ref{T} (Ref{T},)
+    @test cf.ptr != C_NULL
+    @test cf.f === f
+    @test (cf._1 == C_NULL) == permanent
+    @test (cf._2 == C_NULL) == permanent
+    @assert cf === Base.cconvert(Ptr{Cvoid}, cf)
+    GC.@preserve cf begin
+        fptr = Base.unsafe_convert(Ptr{Cvoid}, cf)
+        b = ccall(fptr, Ref{T}, (Ref{T},), a)
+    end
+    # generic API 2
+    cf2 = @cfunction $f Any (Ref{S},)
+    @test cf2.ptr != C_NULL
+    @test cf2.f === f
+    @test (cf2._1 == C_NULL) == permanent
+    @test (cf2._2 == C_NULL) == permanent
+    @assert cf2 === Base.cconvert(Ptr{Cvoid}, cf2)
+    GC.@preserve cf2 begin
+        fptr = Base.unsafe_convert(Ptr{Cvoid}, cf2)
+        b = ccall(fptr, Any, (Ref{S},), a)
+    end
+    return b
+end
+
+# We can't (currently) execute some of these signatures (without compile-all),
+# but we can at least look at some of the generated code
+function check_code_trampoline(f, t, n::Int)
+    @nospecialize(f, t)
+    @test Base.return_types(f, t) == Any[Any]
+    llvm = sprint(code_llvm, f, t)
+    @test count(x -> true, eachmatch(r"@jl_get_cfunction_trampoline\(", llvm)) == n
+end
+check_code_trampoline(testclosure, (Any, Any, Bool, Type), 2)
+check_code_trampoline(testclosure, (Any, Int, Bool, Type{Int}), 2)
+check_code_trampoline(testclosure, (Any, String, Bool, Type{String}), 2)
+check_code_trampoline(testclosure, (typeof(identity), Any, Bool, Type), 2)
+check_code_trampoline(testclosure, (typeof(identity), Int, Bool, Type{Int}), 0)
+check_code_trampoline(testclosure, (typeof(identity), String, Bool, Type{String}), 0)
+
+function g(i)
+    x = -332210 + i
+    y = "foo"
+    a(z) = x
+    b(z) = y
+    c(z) = (y = z)
+    IdentityTestVK{V, K} = IdentityTestKV{K, V}
+    @test IdentityTestVK !== IdentityTestKV
+    @test IdentityTestVK == IdentityTestKV
+    for _ = 1:5
+        @test testclosure(a, 23) == -332210 + i
+        @test testclosure(b, "bar") == "foo"
+        @test testclosure(c, "bar") == "bar"
+        @test testclosure(b, "foo") == "bar"
+        @test testclosure(c, "foo") == "foo"
+        @test testclosure(identity, IdentityTestKV, true) === IdentityTestKV
+        @test testclosure(identity, IdentityTestVK, true) === IdentityTestVK
+        @test testclosure(IdentityTestKV, IdentityTestKV, true) === IdentityTestKV
+        @test testclosure(IdentityTestVK, IdentityTestVK, false) === IdentityTestVK
+    end
+end
+g(1)
+g(2)
+g(3)
+
 verbose && println("Testing cfunction roundtrip: ")
 
 cf64 = 2.84+5.2im
 cf32 = 3.34f0+53.2f0im
-ci32 = Complex{Int32}(Int32(10),Int32(31))
-ci64 = Complex{Int64}(Int64(20),Int64(51))
+ci32 = Complex{Int32}(Int32(10), Int32(31))
+ci64 = Complex{Int64}(Int64(20), Int64(51))
 s1 = Struct1(352.39422f23, 19.287577)
-==(a::Struct1,b::Struct1) = a.x == b.x && a.y == b.y
-
-for (t,v) in ((Complex{Int32},:ci32),(Complex{Int64},:ci64),
-              (ComplexF32,:cf32),(ComplexF64,:cf64),(Struct1,:s1))
-    fname = Symbol("foo",v)
-    fname1 = Symbol("foo1",v)
+==(a::Struct1, b::Struct1) = (a.x == b.x && a.y == b.y)
+
+for (t, v) in ((Complex{Int32}, :ci32), (Complex{Int64}, :ci64),
+              (ComplexF32, :cf32), (ComplexF64, :cf64), (Struct1, :s1))
+    fname = Symbol("foo", v)
+    fname1 = Symbol("foo1", v)
+    a = copy(@eval $v)
+    verbose && println(t)
+    verbose && println("A: ", a)
     @eval begin
-        verbose && println($t)
-        a = copy($v)
-        verbose && println("A: ",a)
-        function $fname1(s::$t)
-            verbose && println("B: ",s)
+        global function $fname1(s::$t)
+            verbose && println("B: ", s)
             @test s == $v
-            @test s === a
+            @test s === $a
             global c = s
-            s
+            return s
         end
-        function $fname1(s)
+        global function $fname1(s)
             @assert false
         end
-        function $fname(s::$t)
-            verbose && println("B: ",s)
+        global function $fname(s::$t)
+            verbose && println("B: ", s)
             @test s == $v
             if($(t).mutable)
-                @test !(s === a)
+                @test !(s === $a)
             end
             global c = s
-            s
+            return s
         end
-        function $fname(s)
+        global function $fname(s)
             @assert false
         end
-        b = ccall(cfunction($fname1, Ref{$t}, Tuple{Ref{$t}}), Ref{$t}, (Ref{$t},), a)
-        verbose && println("C: ",b)
+    end
+    @eval let a = $a, b
+        b = testclosure($fname1, a, true)
+        verbose && println("C: ", b)
         @test b == $v
         @test b === a
         @test b === c
-        b = ccall(cfunction($fname, $t, Tuple{$t}), $t, ($t,), a)
+        let cf = @cfunction($fname1, Ref{$t}, (Ref{$t},))
+            b = ccall(cf, Ref{$t}, (Ref{$t},), a)
+        end
+        verbose && println("C: ", b)
+        @test b == $v
+        @test b === a
+        @test b === c
+        let cf = @cfunction($fname, $t, ($t,))
+            b = ccall(cf, $t, ($t,), a)
+        end
         verbose && println("C: ",b)
         @test b == $v
         if ($(t).mutable)
             @test !(b === c)
             @test !(b === a)
         end
-        b = ccall(cfunction($fname1, $t, Tuple{Ref{$t}}), $t, (Ref{$t},), a)
+        let cf = @cfunction($fname1, $t, (Ref{$t},))
+            b = ccall(cf, $t, (Ref{$t},), a)
+        end
         verbose && println("C: ",b)
         @test b == $v
         if ($(t).mutable)
             @test !(b === c)
             @test !(b === a)
         end
-        b = ccall(cfunction($fname, Ref{$t}, Tuple{$t}), Ref{$t}, ($t,), a)
+        let cf = @cfunction($fname, Ref{$t}, ($t,))
+            b = ccall(cf, Ref{$t}, ($t,), a)
+        end
         verbose && println("C: ",b)
         @test b == $v
         @test b === c
         if ($(t).mutable)
             @test !(b === a)
         end
-        b = ccall(cfunction($fname, Any, Tuple{Ref{$t}}), Any, (Ref{$t},), $v)
+        let cf = @cfunction($fname, Any, (Ref{$t},))
+            b = ccall(cf, Any, (Ref{$t},), $v)
+        end
         verbose && println("C: ",b)
         @test b == $v
         @test b === c
         if ($(t).mutable)
             @test !(b === a)
         end
-        b = ccall(cfunction($fname, Any, Tuple{Ref{Any}}), Any, (Ref{Any},), $v)
+        let cf = @cfunction($fname, Any, (Ref{Any},))
+            b = ccall(cf, Any, (Ref{Any},), $v)
+        end
         @test b == $v
         @test b === c
         if ($(t).mutable)
             @test !(b === a)
         end
-        @test_throws TypeError ccall(cfunction($fname, Ref{AbstractString}, Tuple{Ref{Any}}), Any, (Ref{Any},), $v)
-        @test_throws TypeError ccall(cfunction($fname, AbstractString, Tuple{Ref{Any}}), Any, (Ref{Any},), $v)
+        let cf = @cfunction($fname, Ref{AbstractString}, (Ref{Any},))
+            @test_throws TypeError ccall(cf, Any, (Ref{Any},), $v)
+        end
+        let cf = @cfunction($fname, AbstractString, (Ref{Any},))
+            @test_throws TypeError ccall(cf, Any, (Ref{Any},), $v)
+        end
     end
 end
 
 # issue 13031
 foo13031(x) = Cint(1)
-foo13031p = cfunction(foo13031, Cint, Tuple{Ref{Tuple{}}})
+foo13031p = @cfunction(foo13031, Cint, (Ref{Tuple{}},))
 ccall(foo13031p, Cint, (Ref{Tuple{}},), ())
 
 foo13031(x,y,z) = z
-foo13031p = cfunction(foo13031, Cint, Tuple{Ref{Tuple{}}, Ref{Tuple{}}, Cint})
+foo13031p = @cfunction(foo13031, Cint, (Ref{Tuple{}}, Ref{Tuple{}}, Cint))
 ccall(foo13031p, Cint, (Ref{Tuple{}},Ref{Tuple{}},Cint), (), (), 8)
 
 # issue 17219
@@ -996,10 +1094,9 @@ if Sys.ARCH === :x86_64
         T = NTuple{4, VecElement{s}}
         @eval function rt_sse(a1::$T, a2::$T, a3::$T, a4::$T)
             return ccall(
-                cfunction(foo_ams, $T, Tuple{$T, $T, $T, $T}),
-                $T,
-                ($T, $T, $T, $T),
-                a1,  a2,  a3, a4)
+                @cfunction(foo_ams, $T, ($T, $T, $T, $T)),
+                $T, ($T, $T, $T, $T),
+                     a1, a2, a3, a4)
         end
 
         a1 = VecReg(ntuple(i -> VecElement(s(1i)), 4))
@@ -1146,7 +1243,7 @@ function f17204(a)
     end
     return b
 end
-@test ccall(cfunction(f17204, Vector{Any}, Tuple{Vector{Any}}),
+@test ccall(@cfunction(f17204, Vector{Any}, (Vector{Any},)),
             Vector{Any}, (Vector{Any},), Any[1:10;]) == Any[11:20;]
 
 # This used to trigger incorrect ccall callee inlining.
@@ -1273,7 +1370,7 @@ end
 struct CallableSingleton
 end
 (::CallableSingleton)(x, y) = x + y
-@test ccall(cfunction(CallableSingleton(), Int, Tuple{Int,Int}),
+@test ccall(@cfunction(CallableSingleton(), Int, (Int, Int)),
             Int, (Int, Int), 1, 2) === 3
 
 # 19805
@@ -1283,21 +1380,21 @@ end
 
 evalf_callback_19805(ci::callinfos_19805{FUNC_FT}) where {FUNC_FT} = ci.f(0.5)::Float64
 
-evalf_callback_c_19805(ci::callinfos_19805{FUNC_FT}) where {FUNC_FT} = cfunction(
-    evalf_callback_19805, Float64, Tuple{callinfos_19805{FUNC_FT}})
+evalf_callback_c_19805(ci::callinfos_19805{FUNC_FT}) where {FUNC_FT} = @cfunction(
+    evalf_callback_19805, Float64, (callinfos_19805{FUNC_FT},))
 
 @test_throws(ErrorException("cfunction argument 1 doesn't correspond to a C type"),
              evalf_callback_c_19805( callinfos_19805(sin) ))
 @test_throws(ErrorException("cfunction argument 2 doesn't correspond to a C type"),
-             cfunction(+, Int, Tuple{Int, Nothing}))
-@test_throws(ErrorException("cfunction: Vararg syntax not allowed for cfunction argument list"),
-             cfunction(+, Int, Tuple{Vararg{Int}}))
-@test_throws(ErrorException("cfunction: argument type Ref should have an element type, not Ref{<:T}"),
-             cfunction(+, Int, Tuple{Ref{T}, Ref{T}}) where T)
-@test_throws(ErrorException("cfunction: return type Ref should have an element type, not Ref{<:T}"),
-             cfunction(+, Ref{T}, Tuple{Int, Int}) where T)
-@test_throws(ErrorException("cfunction: return type Ref{Any} is invalid. Use Any or Ptr{Any} instead."),
-             cfunction(+, Ref{Any}, Tuple{Int, Int}))
+             @cfunction(+, Int, (Int, Nothing)))
+@test_throws(ErrorException("cfunction: Vararg syntax not allowed for argument list"),
+             @cfunction(+, Int, (Vararg{Int},)))
+@test_throws(ErrorException("could not evaluate cfunction argument type (it might depend on a local variable)"),
+             @eval () -> @cfunction(+, Int, (Ref{T}, Ref{T})) where T)
+@test_throws(ErrorException("could not evaluate cfunction return type (it might depend on a local variable)"),
+             @eval () -> @cfunction(+, Ref{T}, (Int, Int)) where T)
+@test_throws(ErrorException("cfunction return type Ref{Any} is invalid. Use Any or Ptr{Any} instead."),
+             @cfunction(+, Ref{Any}, (Int, Int)))
 
 # test Ref{abstract_type} calling parameter passes a heap box
 abstract type Abstract22734 end
@@ -1310,7 +1407,7 @@ function cb22734(ptr::Ptr{Cvoid})
     obj = unsafe_pointer_to_objref(ptr)::Bits22734
     obj.x + obj.y
 end
-ptr22734 = cfunction(cb22734, Float64, Tuple{Ptr{Cvoid}})
+ptr22734 = @cfunction(cb22734, Float64, (Ptr{Cvoid},))
 function caller22734(ptr)
     obj = Bits22734(12, 20)
     ccall(ptr, Float64, (Ref{Abstract22734},), obj)
diff --git a/test/core.jl b/test/core.jl
index 1353c1b99b4339..a6f8be32705cba 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -4747,21 +4747,6 @@ let
     @test k(1) == 1
 end
 
-# PR #18054: compilation of cfunction leaves IRBuilder in bad state,
-#            causing heap-use-after-free when compiling f18054
-function f18054()
-    return Cint(0)
-end
-cfunction(f18054, Cint, Tuple{})
-
-# issue #18986: the ccall optimization of cfunction leaves JL_TRY stack in bad state
-dummy18996() = return nothing
-function main18986()
-    cfunction(dummy18986, Cvoid, ())
-    ccall((:dummy2, "this_is_a_nonexisting_library"), Cvoid, ())
-end
-@test_throws ErrorException main18986()
-
 # issue #18085
 f18085(a, x...) = (0, )
 for (f, g) in ((:asin, :sin), (:acos, :cos))
@@ -4773,7 +4758,7 @@ end
 # issue #18236 constant VecElement in ast triggers codegen assertion/undef
 # VecElement of scalar
 v18236 = VecElement(1.0)
-ptr18236 = cfunction(identity, VecElement{Float64}, Tuple{VecElement{Float64}})
+ptr18236 = @cfunction(identity, VecElement{Float64}, (VecElement{Float64},))
 @eval @noinline f18236(ptr) = ccall(ptr, VecElement{Float64},
                                     (VecElement{Float64},), $v18236)
 @test f18236(ptr18236) === v18236
@@ -4785,8 +4770,8 @@ ptr18236 = cfunction(identity, VecElement{Float64}, Tuple{VecElement{Float64}})
 # We should be at least testing this on some platforms.
 # Not sure if there's a better way to trigger unboxing in codegen.
 v18236_2 = VecElement((Int8(1), Int8(2)))
-ptr18236_2 = cfunction(identity, VecElement{NTuple{2,Int8}},
-                       Tuple{VecElement{NTuple{2,Int8}}})
+ptr18236_2 = @cfunction(identity, VecElement{NTuple{2,Int8}},
+                        (VecElement{NTuple{2,Int8}},))
 @eval @noinline f18236_2(ptr) = ccall(ptr, VecElement{NTuple{2,Int8}},
                                       (VecElement{NTuple{2,Int8}},),
                                       $v18236_2)
diff --git a/test/embedding/embedding.c b/test/embedding/embedding.c
index 12dde4be3bde87..5496355ef08d1a 100644
--- a/test/embedding/embedding.c
+++ b/test/embedding/embedding.c
@@ -146,7 +146,7 @@ int main()
         );
 
         typedef void (*Func_VOID__VOID)(void);
-        jl_value_t *pbar = jl_eval_string("cfunction(bar_from_c, Cvoid, Tuple{})");
+        jl_value_t *pbar = jl_eval_string("@cfunction(bar_from_c, Cvoid, ())");
         Func_VOID__VOID bar = (Func_VOID__VOID)jl_unbox_voidpointer(pbar);
         bar();
         checked_eval_string("bar() = println(\"calling new bar\")");
diff --git a/test/misc.jl b/test/misc.jl
index 0548d9206fb6fb..ef8249ff6f9f3b 100644
--- a/test/misc.jl
+++ b/test/misc.jl
@@ -396,16 +396,15 @@ let a = [1,2,3]
 end
 
 # Test that we can VirtualProtect jitted code to writable
-if Sys.iswindows()
-    @noinline function WeVirtualProtectThisToRWX(x, y)
-        x+y
-    end
-
-    let addr = cfunction(WeVirtualProtectThisToRWX, UInt64, Tuple{UInt64, UInt64})
-        addr = addr-(UInt64(addr)%4096)
+@noinline function WeVirtualProtectThisToRWX(x, y)
+    return x + y
+end
+@static if Sys.iswindows()
+    let addr = @cfunction(WeVirtualProtectThisToRWX, UInt64, (UInt64, UInt64))
+        addr = addr - (UInt64(addr) % 4096)
         PAGE_EXECUTE_READWRITE = 0x40
         oldPerm = Ref{UInt32}()
-        err18083 = ccall(:VirtualProtect,stdcall,Cint,
+        err18083 = ccall(:VirtualProtect, stdcall, Cint,
             (Ptr{Cvoid}, Csize_t, UInt32, Ptr{UInt32}),
             addr, 4096, PAGE_EXECUTE_READWRITE, oldPerm)
         err18083 == 0 && error(Libc.GetLastError())
diff --git a/test/reflection.jl b/test/reflection.jl
index 09d109e7fb8f6f..7b223149abccbb 100644
--- a/test/reflection.jl
+++ b/test/reflection.jl
@@ -395,7 +395,9 @@ end
 tracefoo(x, y) = x+y
 didtrace = false
 tracer(x::Ptr{Cvoid}) = (@test isa(unsafe_pointer_to_objref(x), Core.MethodInstance); global didtrace = true; nothing)
-ccall(:jl_register_method_tracer, Cvoid, (Ptr{Cvoid},), cfunction(tracer, Cvoid, Tuple{Ptr{Cvoid}}))
+let ctracer = @cfunction(tracer, Cvoid, (Ptr{Cvoid},))
+    ccall(:jl_register_method_tracer, Cvoid, (Ptr{Cvoid},), ctracer)
+end
 meth = which(tracefoo,Tuple{Any,Any})
 ccall(:jl_trace_method, Cvoid, (Any,), meth)
 @test tracefoo(1, 2) == 3
@@ -408,7 +410,9 @@ ccall(:jl_register_method_tracer, Cvoid, (Ptr{Cvoid},), C_NULL)
 
 # Method Tracing test
 methtracer(x::Ptr{Cvoid}) = (@test isa(unsafe_pointer_to_objref(x), Method); global didtrace = true; nothing)
-ccall(:jl_register_newmeth_tracer, Cvoid, (Ptr{Cvoid},), cfunction(methtracer, Cvoid, Tuple{Ptr{Cvoid}}))
+let cmethtracer = @cfunction(methtracer, Cvoid, (Ptr{Cvoid},))
+    ccall(:jl_register_newmeth_tracer, Cvoid, (Ptr{Cvoid},), cmethtracer)
+end
 tracefoo2(x, y) = x*y
 @test didtrace
 didtrace = false
diff --git a/test/spawn.jl b/test/spawn.jl
index 93e16ed435592c..9e8cb688076f93 100644
--- a/test/spawn.jl
+++ b/test/spawn.jl
@@ -267,7 +267,7 @@ let fname = tempname(), p
     oldhandle = OLD_STDERR.handle
     OLD_STDERR.status = Base.StatusClosing
     OLD_STDERR.handle = C_NULL
-    ccall(:uv_close, Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}), oldhandle, cfunction(thrash, Cvoid, Tuple{Ptr{Cvoid}}))
+    ccall(:uv_close, Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}), oldhandle, @cfunction(thrash, Cvoid, (Ptr{Cvoid},)))
     sleep(1)
     import Base.zzzInvalidIdentifier
     """
diff --git a/test/staged.jl b/test/staged.jl
index 966bac8333e313..53553a1af2067f 100644
--- a/test/staged.jl
+++ b/test/staged.jl
@@ -148,11 +148,14 @@ module TestGeneratedThrow
     end
 
     foo() = (bar(rand() > 0.5 ? 1 : 1.0); error("foo"))
+    inited = false
     function __init__()
-        code_typed(foo,(); optimize = false)
-        cfunction(foo,Cvoid,Tuple{})
+        code_typed(foo, (); optimize = false)
+        @cfunction(foo, Cvoid, ())
+        global inited = true
     end
 end
+@test TestGeneratedThrow.inited
 
 # @generated functions including inner functions
 @generated function _g_f_with_inner(x)
diff --git a/test/threads.jl b/test/threads.jl
index 7c943265ff2e5f..8e208876968774 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -383,20 +383,38 @@ for period in (0.06, Dates.Millisecond(60))
     end
 end
 
-complex_cfunction = function(a)
-    s = zero(eltype(a))
-    @inbounds @simd for i in a
-        s += muladd(a[i], a[i], -2)
-    end
-    return s
-end
 function test_thread_cfunction()
+    # ensure a runtime call to `get_trampoline` will be created
+    # TODO: get_trampoline is not thread-safe (as this test shows)
+    function complex_cfunction(a)
+        s = zero(eltype(a))
+        @inbounds @simd for i in a
+            s += muladd(a[i], a[i], -2)
+        end
+        return s
+    end
+    fs = [ let a = zeros(10)
+            () -> complex_cfunction(a)
+        end for i in 1:1000 ]
+    @noinline cf(f) = @cfunction $f Float64 ()
+    cfs = Vector{Base.CFunction}(undef, length(fs))
+    cf1 = cf(fs[1])
     @threads for i in 1:1000
-        # Make sure this is not inferrable
-        # and a runtime call to `jl_function_ptr` will be created
-        ccall(:jl_function_ptr, Ptr{Cvoid}, (Any, Any, Any),
-              complex_cfunction, Float64, Tuple{Ref{Vector{Float64}}})
+        cfs[i] = cf(fs[i])
+    end
+    @test cfs[1] == cf1
+    @test cfs[2] == cf(fs[2])
+    @test length(unique(cfs)) == 1000
+    ok = zeros(Int, nthreads())
+    @threads for i in 1:10000
+        i = mod1(i, 1000)
+        fi = fs[i]
+        cfi = cf(fi)
+        GC.@preserve cfi begin
+            ok[threadid()] += (cfi === cfs[i])
+        end
     end
+    @test sum(ok) == 10000
 end
 test_thread_cfunction()