
Generate c signature when possible #11306

Closed
wants to merge 1 commit from the jl_value_t-spec branch

Conversation

yuyichao
Contributor

Any reason to have been conservative about using the C signature before?

@vtjnash
Fix #11304
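
For reference, the difference at issue is between the generic jlcall convention and a specialized C signature. A minimal C sketch of the contrast (the jl_fptr_t typedef follows julia.h of the time; the specialized declarations are illustrative, mirroring the "After" IR below):

#include <stdint.h>

typedef struct _jl_value_t jl_value_t;

/* Generic jlcall convention: any method can be called through this,
 * but every argument must be boxed and spilled into an array first. */
typedef jl_value_t *(*jl_fptr_t)(jl_value_t *f, jl_value_t **args,
                                 uint32_t nargs);

/* Specialized C signatures this PR generates when possible:
 * arguments are passed directly, with no argument array or GC frame. */
jl_value_t *julia_f_1arg(jl_value_t *a);                /* f(a)    */
jl_value_t *julia_f_2arg(jl_value_t *a, jl_value_t *b); /* f(a, b) */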

@yuyichao
Contributor Author

The original commit that introduced this (9a68a05) doesn't have an explanation for these checks.

@JeffBezanson

@yuyichao
Contributor Author

Test script:

# g returns an unboxed Int; both f methods return the type Int, a boxed jl_value_t*
@noinline g(a) = 1
@noinline f(a) = Int
@noinline f(a, b) = Int

@code_llvm g(Int)
@code_llvm f(Int)
@code_llvm f(Int, Int)

k1() = g(Int)
k2() = f(Int)
k3() = f(Int, Int)

@code_llvm k1()
@code_llvm k2()
@code_llvm k3()

function time_func(f::Function, args...)
    println(f)
    f(args...)  # force compilation before timing
    gc()
    @time for i in 1:100000000
        f(args...)
    end
    gc()
end

time_func(k1)
time_func(k2)
time_func(k3)

Before:

define i64 @julia_g_44381(%jl_value_t*) {
top:
  ret i64 1
}

define %jl_value_t* @julia_f_44384(%jl_value_t*, %jl_value_t**, i32) {
top:
  %3 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  ret %jl_value_t* %3
}

define %jl_value_t* @julia_f_44386(%jl_value_t*, %jl_value_t**, i32) {
top:
  %3 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  ret %jl_value_t* %3
}

define i64 @julia_k1_44390() {
top:
  %0 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  %1 = call i64 @julia_g_44381(%jl_value_t* %0)
  ret i64 %1
}

define %jl_value_t* @julia_k2_44391() {
top:
  %0 = alloca [3 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [3 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [3 x %jl_value_t*]* %0, i64 0, i64 2
  %2 = bitcast [3 x %jl_value_t*]* %0 to i64*
  store i64 2, i64* %2, align 8
  %3 = getelementptr [3 x %jl_value_t*]* %0, i64 0, i64 1
  %4 = bitcast %jl_value_t** %3 to %jl_value_t***
  %5 = load %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t** %5, %jl_value_t*** %4, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t* null, %jl_value_t** %1, align 8
  %6 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  store %jl_value_t* %6, %jl_value_t** %1, align 8
  %7 = call %jl_value_t* @julia_f_44384(%jl_value_t* inttoptr (i64 140245917780336 to %jl_value_t*), %jl_value_t** %1, i32 1)
  %8 = load %jl_value_t*** %4, align 8
  store %jl_value_t** %8, %jl_value_t*** @jl_pgcstack, align 8
  ret %jl_value_t* %7
}

define %jl_value_t* @julia_k3_44392() {
top:
  %0 = alloca [4 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [4 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [4 x %jl_value_t*]* %0, i64 0, i64 2
  %2 = bitcast [4 x %jl_value_t*]* %0 to i64*
  store i64 4, i64* %2, align 8
  %3 = getelementptr [4 x %jl_value_t*]* %0, i64 0, i64 1
  %4 = bitcast %jl_value_t** %3 to %jl_value_t***
  %5 = load %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t** %5, %jl_value_t*** %4, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t* null, %jl_value_t** %1, align 8
  %6 = getelementptr [4 x %jl_value_t*]* %0, i64 0, i64 3
  store %jl_value_t* null, %jl_value_t** %6, align 8
  %7 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  store %jl_value_t* %7, %jl_value_t** %1, align 8
  %8 = load %jl_value_t** inttoptr (i64 140245884996296 to %jl_value_t**), align 8
  store %jl_value_t* %8, %jl_value_t** %6, align 8
  %9 = call %jl_value_t* @julia_f_44386(%jl_value_t* inttoptr (i64 140245917882576 to %jl_value_t*), %jl_value_t** %1, i32 2)
  %10 = load %jl_value_t*** %4, align 8
  store %jl_value_t** %10, %jl_value_t*** @jl_pgcstack, align 8
  ret %jl_value_t* %9
}
k1
   1.298 seconds     
k2
   1.403 seconds     
k3
   1.465 seconds     

After:

define i64 @julia_g_67425(%jl_value_t*) {
top:
  ret i64 1
}

define %jl_value_t* @julia_f_67428(%jl_value_t*) {
top:
  %1 = load %jl_value_t** inttoptr (i64 139685242901352 to %jl_value_t**), align 8
  ret %jl_value_t* %1
}

define %jl_value_t* @julia_f_67430(%jl_value_t*, %jl_value_t*) {
top:
  %2 = load %jl_value_t** inttoptr (i64 139685242901352 to %jl_value_t**), align 8
  ret %jl_value_t* %2
}

define i64 @julia_k1_67434() {
top:
  %0 = load %jl_value_t** inttoptr (i64 139685242901352 to %jl_value_t**), align 8
  %1 = call i64 @julia_g_67425(%jl_value_t* %0)
  ret i64 %1
}

define %jl_value_t* @julia_k2_67435() {
top:
  %0 = load %jl_value_t** inttoptr (i64 139685242901352 to %jl_value_t**), align 8
  %1 = call %jl_value_t* @julia_f_67428(%jl_value_t* %0)
  ret %jl_value_t* %1
}

define %jl_value_t* @julia_k3_67436() {
top:
  %0 = load %jl_value_t** inttoptr (i64 139685242901352 to %jl_value_t**), align 8
  %1 = call %jl_value_t* @julia_f_67430(%jl_value_t* %0, %jl_value_t* %0)
  ret %jl_value_t* %1
}
k1
   1.288 seconds     
k2
   1.123 seconds     
k3
   1.109 seconds     

I don't really understand why k2 and k3 are even faster than k1 now... (it's consistent, and it's not because of the order I run them in...). Maybe something like boxing of the result?

Edit: making the k* functions return nothing makes k1 as fast as k2 and k3, so I guess it's something related to boxing an integer...

@yuyichao
Contributor Author

So I used the following script to benchmark compilation time, and it is indeed ~20-30% slower for these simple functions. That is about the same factor as the speedup in execution time, so IMHO the optimization is worth doing.

This also increases the system image size (sys.so after stripping) by ~2%, which I guess might be related to generating the wrappers?

macro test_recompile(args, ex)
    batch_num = 100
    new_ex = quote
        function gen_function()
            $(esc(ex))
        end
        function test_once()
            types = Base.typesof($args...)
            # create batch_num distinct copies of the function so that
            # each one has to be compiled separately
            funcs = $(Expr(:tuple, repeated(:(gen_function()), batch_num)...))
            for i in 1:$batch_num
                code_typed(funcs[i], types)  # run type inference up front
            end
            tic()
            for i in 1:$batch_num
                # trigger codegen via jl_get_llvmf, so only LLVM IR
                # generation is inside the timed region
                ccall(:jl_get_llvmf, Ptr{Void}, (Any, Any, Bool),
                      funcs[i], types, false)
            end
            toq()
        end
        function tester()
            t = 0.0
            for i in 1:10
                t += test_once()
            end
            println("elapsed time: ", t, " seconds")
        end
        tester()
    end
end

@test_recompile (Int,) f(a) = (a,)
@test_recompile (Int, Int) f(a, b) = (a, b)
@test_recompile (Int, Int, Int) f(a, b, c) = (a, b, c)
@test_recompile (Int, Int, Int, Int) f(a, b, c, d) = (a, b, c, d)

@test_recompile (Int,) f(a) = Int
@test_recompile (Int, Int) f(a, b) = Int
@test_recompile (Int, Int, Int) f(a, b, c) = Int
@test_recompile (Int, Int, Int, Int) f(a, b, c, d) = Int

@yuyichao
Contributor Author

Ping.

Is there any guideline on how much to specialize?

@JeffBezanson I noticed that 01c871c enables more aggressive specialization and AFAIK has an even bigger impact (at least on the size of sys.so) than this one. What's the rule of thumb for determining how much to specialize?

@JeffBezanson
Member

I think a good way to go here might be to allow signatures with 0, 1, 2, or 3 jl_value_t* arguments as part of the jlcall convention. In other words, we would need a tag along with every jl_fptr_t to tell us what arguments it takes. Then we wouldn't need so many jlcall wrappers.

Doing this well requires some thought. Ideally we would not literally add a tag to jl_fptr_t, since that would be a lot of extra indirection. We could instead use a lambda_info's cFunction directly, for example. In fact the entire jl_function_t type is kind of useless at this point, and needs a redesign. See discussion in #10269 for example.
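
A minimal C sketch of what that could look like; the tag representation and every name here (jl_arity_tag_t, jl_tagged_fptr_t, the jl_fptrN_t typedefs) are hypothetical illustrations, not the actual runtime API:

#include <stdint.h>

typedef struct _jl_value_t jl_value_t;

/* Existing generic jlcall signature (per julia.h of the time). */
typedef jl_value_t *(*jl_fptr_t)(jl_value_t*, jl_value_t**, uint32_t);

/* Hypothetical specialized entry points for small arities. */
typedef jl_value_t *(*jl_fptr0_t)(void);
typedef jl_value_t *(*jl_fptr1_t)(jl_value_t*);
typedef jl_value_t *(*jl_fptr2_t)(jl_value_t*, jl_value_t*);
typedef jl_value_t *(*jl_fptr3_t)(jl_value_t*, jl_value_t*, jl_value_t*);

/* Hypothetical tag stored next to the pointer (e.g. in the lambda_info,
 * per the suggestion above) telling callers which signature to use. */
typedef enum {
    JL_CALL_GENERIC = -1, /* full jlcall convention */
    JL_CALL_0 = 0,
    JL_CALL_1 = 1,
    JL_CALL_2 = 2,
    JL_CALL_3 = 3
} jl_arity_tag_t;

typedef struct {
    void *fptr;
    jl_arity_tag_t tag;
} jl_tagged_fptr_t;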

@yuyichao
Contributor Author

> I think a good way to go here might be to allow signatures with 0, 1, 2, or 3 jl_value_t* arguments as part of the jlcall convention. In other words, we would need a tag along with every jl_fptr_t to tell us what arguments it takes. Then we wouldn't need so many jlcall wrappers.

I think I understand this part, although I don't really understand why it is currently necessary to generate jlcall wrappers, how this would avoid that, or how much overhead the wrapper actually causes. (Well, I guess I know that we need a wrapper if we need to call a function with jl_apply, but I'm not sure when that is typically needed.)

> Doing this well requires some thought. Ideally we would not literally add a tag to jl_fptr_t, since that would be a lot of extra indirection. We could instead use a lambda_info's cFunction directly, for example. In fact the entire jl_function_t type is kind of useless at this point, and needs a redesign. See discussion in #10269 for example.

I guess I'm missing something trivial here, but how could jl_apply use a lambda_info if it is only passed a function pointer?

@JeffBezanson
Member

> I guess I'm missing something trivial here, but how could jl_apply use a lambda_info if it is only passed a function pointer?

The redesign would need to address that. Usually you have a jl_function_t, not just a jl_fptr_t.
We currently have Function, LambdaStaticData, MethodTable, and Method, and that seems like too many types to me.

jl_apply is needed to support late-bound calls, where all we have is a function object and a list of arguments, and we want to try calling it. For example, f(args...) where args is an Any array. We might be trying to pass too many arguments, but there still needs to be an ABI for doing that (probably!).
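
To make the late-bound path concrete, a hedged sketch of how such a caller could dispatch on the arity tag from the sketch above, falling back to the generic convention; all names are hypothetical (the actual jl_apply of the time always went through the generic jl_fptr_t):

#include <stdint.h>

typedef struct _jl_value_t jl_value_t;
typedef jl_value_t *(*jl_fptr_t)(jl_value_t*, jl_value_t**, uint32_t);
typedef jl_value_t *(*jl_fptr1_t)(jl_value_t*);
typedef jl_value_t *(*jl_fptr2_t)(jl_value_t*, jl_value_t*);

typedef struct { void *fptr; int tag; /* -1 = generic */ } jl_tagged_fptr_t;

/* Hypothetical: raise a Julia MethodError; does not return. */
extern void jl_no_method_error(jl_value_t *f, jl_value_t **args, uint32_t n);

jl_value_t *call_late_bound(jl_tagged_fptr_t entry, jl_value_t *fobj,
                            jl_value_t **args, uint32_t nargs)
{
    /* Arity mismatch (e.g. too many arguments): report it as usual. */
    if (entry.tag >= 0 && (uint32_t)entry.tag != nargs)
        jl_no_method_error(fobj, args, nargs);
    switch (entry.tag) {
    case 1: return ((jl_fptr1_t)entry.fptr)(args[0]);
    case 2: return ((jl_fptr2_t)entry.fptr)(args[0], args[1]);
    /* ... cases 0 and 3 elided ... */
    default: /* generic jlcall convention */
        return ((jl_fptr_t)entry.fptr)(fobj, args, nargs);
    }
}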

@yuyichao
Contributor Author

If I understand correctly, once the change you proposed is implemented, there won't be a difference between jlcall and ccall for functions with fewer than 4 arguments, and a jlcall wrapper will not be necessary. In that case, what's the plan for functions with more arguments? AFAICT, generating a C signature can reduce the overhead for functions that are known at codegen time, although codegen for them with a wrapper can still be expensive.

Another crazy thought: could we just codegen (and cache) generic wrappers, i.e. the reverse of what the jl_call* functions are doing? This way, no special wrapper would need to be generated for the functions relevant to this PR, as in the sketch below. If it is generalized to arguments that can be lowered, it could probably save some codegen for those functions as well. (I guess that's similar to generating a call that is specialized on a generic function and all of its arguments.) (Hopefully I have described what I am thinking clearly enough...)
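
A hedged sketch of that idea: one shared wrapper per arity, carrying the generic jlcall signature and forwarding to the method's specialized pointer, instead of one generated wrapper per method. The lookup helper and all names are hypothetical; it relies on the function object being passed as the first jlcall argument (see the next comment):

#include <stdint.h>

typedef struct _jl_value_t jl_value_t;
typedef jl_value_t *(*jl_fptr1_t)(jl_value_t*);
typedef jl_value_t *(*jl_fptr2_t)(jl_value_t*, jl_value_t*);

/* Hypothetical: recover the specialized entry point from the function
 * object passed as the first jlcall argument. */
extern void *jl_specptr(jl_value_t *fobj);

/* Shared wrappers with the generic jlcall signature; only one per
 * arity needs to exist in the whole system, so nothing method-specific
 * has to be codegen'ed. */
jl_value_t *generic_wrapper_1(jl_value_t *fobj, jl_value_t **args,
                              uint32_t nargs)
{
    return ((jl_fptr1_t)jl_specptr(fobj))(args[0]);
}

jl_value_t *generic_wrapper_2(jl_value_t *fobj, jl_value_t **args,
                              uint32_t nargs)
{
    return ((jl_fptr2_t)jl_specptr(fobj))(args[0], args[1]);
}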

@yuyichao
Contributor Author

And could the generic wrapper idea be implemented now? IIRC, the function is passed as the first argument?

@yuyichao
Contributor Author

I've updated the PR to specialize the call signature based on the number of arguments.

@JeffBezanson @vtjnash

P.S. AppVeyor seems broken now?

@tkelman
Contributor

tkelman commented Jun 25, 2015

There may have been a race condition or something where the build number did not increment correctly. I think I just fixed it.

yuyichao added the performance (Must go faster) label on Jun 30, 2015
yuyichao force-pushed the jl_value_t-spec branch 2 times, most recently from a34cd9d to 5bf0652, on October 9, 2015 at 18:07
yuyichao added the compiler:codegen (Generation of LLVM IR and native code) label on Oct 9, 2015
@vtjnash
Member

vtjnash commented Mar 8, 2016

it's probably worth revisiting this now (i suspect we are almost always generating the specsig since jb/functions was merged), but we probably don't particularly need this PR

yuyichao deleted the jl_value_t-spec branch on March 8, 2016 at 13:40
Linked issue: jlcall generated for function with only jl_value_t* arguments (#11304)