From e577f8e763d81fd8fa24e37859881d45c145b857 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 5 Jan 2017 22:27:14 -0500 Subject: [PATCH] only use accurate powf function The powi intrinsic optimization over calling powf is that it is inaccurate. We don't need that. When it is equally accurate (e.g. tiny constant powers), LLVM will already recognize and optimize any call to a function named `powf`, and produce the same speedup. fix #19872 --- base/fastmath.jl | 6 +++--- base/inference.jl | 1 - base/math.jl | 23 +++++++++++------------ base/special/exp.jl | 8 ++++++-- src/codegen.cpp | 23 ----------------------- src/intrinsics.cpp | 28 ---------------------------- src/intrinsics.h | 1 - src/julia_internal.h | 1 - src/runtime_intrinsics.c | 25 ------------------------- test/math.jl | 7 +++++++ 10 files changed, 27 insertions(+), 96 deletions(-) diff --git a/base/fastmath.jl b/base/fastmath.jl index 36d975506e48c..b02ed7d84a1af 100644 --- a/base/fastmath.jl +++ b/base/fastmath.jl @@ -23,7 +23,7 @@ module FastMath export @fastmath -import Core.Intrinsics: powi_llvm, sqrt_llvm_fast, neg_float_fast, +import Core.Intrinsics: sqrt_llvm_fast, neg_float_fast, add_float_fast, sub_float_fast, mul_float_fast, div_float_fast, rem_float_fast, eq_float_fast, ne_float_fast, lt_float_fast, le_float_fast @@ -243,8 +243,8 @@ end # builtins -pow_fast{T<:FloatTypes}(x::T, y::Integer) = pow_fast(x, Int32(y)) -pow_fast{T<:FloatTypes}(x::T, y::Int32) = Base.powi_llvm(x, y) +pow_fast(x::Float32, y::Integer) = ccall("llvm.powi.f32", llvmcall, Float32, (Float32, Int32), x, y) +pow_fast(x::Float64, y::Integer) = ccall("llvm.powi.f64", llvmcall, Float64, (Float64, Int32), x, y) # TODO: Change sqrt_llvm intrinsic to avoid nan checking; add nan # checking to sqrt in math.jl; remove sqrt_llvm_fast intrinsic diff --git a/base/inference.jl b/base/inference.jl index db597a6023510..a65cb444ca357 100644 --- a/base/inference.jl +++ b/base/inference.jl @@ -447,7 +447,6 @@ add_tfunc(floor_llvm, 1, 1, math_tfunc) add_tfunc(trunc_llvm, 1, 1, math_tfunc) add_tfunc(rint_llvm, 1, 1, math_tfunc) add_tfunc(sqrt_llvm, 1, 1, math_tfunc) -add_tfunc(powi_llvm, 2, 2, math_tfunc) add_tfunc(sqrt_llvm_fast, 1, 1, math_tfunc) ## same-type comparisons ## cmp_tfunc(x::ANY, y::ANY) = Bool diff --git a/base/math.jl b/base/math.jl index 523f7fec89299..21e646088e7f5 100644 --- a/base/math.jl +++ b/base/math.jl @@ -32,7 +32,7 @@ using Base: sign_mask, exponent_mask, exponent_one, exponent_bias, exponent_half, exponent_max, exponent_raw_max, fpinttype, significand_mask, significand_bits, exponent_bits -using Core.Intrinsics: sqrt_llvm, powi_llvm +using Core.Intrinsics: sqrt_llvm # non-type specific math functions @@ -308,6 +308,8 @@ exp10(x::Float32) = 10.0f0^x exp10(x::Integer) = exp10(float(x)) # utility for converting NaN return to DomainError +# the branch in nan_dom_err prevents its callers from inlining, so be sure to force it +# until the heuristics can be improved @inline nan_dom_err(f, x) = isnan(f) & !isnan(x) ? throw(DomainError()) : f # functions that return NaN on non-NaN argument for domain error @@ -414,9 +416,9 @@ log1p(x) for f in (:sin, :cos, :tan, :asin, :acos, :acosh, :atanh, :log, :log2, :log10, :lgamma, :log1p) @eval begin - ($f)(x::Float64) = nan_dom_err(ccall(($(string(f)),libm), Float64, (Float64,), x), x) - ($f)(x::Float32) = nan_dom_err(ccall(($(string(f,"f")),libm), Float32, (Float32,), x), x) - ($f)(x::Real) = ($f)(float(x)) + @inline ($f)(x::Float64) = nan_dom_err(ccall(($(string(f)),libm), Float64, (Float64,), x), x) + @inline ($f)(x::Float32) = nan_dom_err(ccall(($(string(f, "f")), libm), Float32, (Float32,), x), x) + @inline ($f)(x::Real) = ($f)(float(x)) end end @@ -677,14 +679,11 @@ function modf(x::Float64) f, _modf_temp[] end -^(x::Float64, y::Float64) = nan_dom_err(ccall((:pow,libm), Float64, (Float64,Float64), x, y), x+y) -^(x::Float32, y::Float32) = nan_dom_err(ccall((:powf,libm), Float32, (Float32,Float32), x, y), x+y) - -^(x::Float64, y::Integer) = x^Int32(y) -^(x::Float64, y::Int32) = powi_llvm(x, y) -^(x::Float32, y::Integer) = x^Int32(y) -^(x::Float32, y::Int32) = powi_llvm(x, y) -^(x::Float16, y::Integer) = Float16(Float32(x)^y) +@inline ^(x::Float64, y::Float64) = nan_dom_err(ccall("llvm.pow.f64", llvmcall, Float64, (Float64, Float64), x, y), x + y) +@inline ^(x::Float32, y::Float32) = nan_dom_err(ccall("llvm.pow.f32", llvmcall, Float32, (Float32, Float32), x, y), x + y) +@inline ^(x::Float64, y::Integer) = x ^ Float64(y) +@inline ^(x::Float32, y::Integer) = x ^ Float32(y) +@inline ^(x::Float16, y::Integer) = Float16(Float32(x) ^ Float32(y)) function angle_restrict_symm(theta) const P1 = 4 * 7.8539812564849853515625e-01 diff --git a/base/special/exp.jl b/base/special/exp.jl index 9d7012a98034f..b90e7987eccf5 100644 --- a/base/special/exp.jl +++ b/base/special/exp.jl @@ -65,6 +65,10 @@ exp_small_thres(::Type{Float32}) = 2.0f0^-13 Compute the natural base exponential of `x`, in other words ``e^x``. """ function exp{T<:Union{Float32,Float64}}(x::T) + ^ = Base.FastMath.pow_fast + # we don't need the accurate power function here since we only compute 2^c, + # where c is a constant + xa = reinterpret(Unsigned, x) & ~sign_mask(T) xsb = signbit(x) @@ -114,13 +118,13 @@ function exp{T<:Union{Float32,Float64}}(x::T) # scale back if k > -significand_bits(T) # multiply by 2.0 first to prevent overflow, which helps extends the range - k == exponent_max(T) && return y*T(2.0)*T(2.0)^(exponent_max(T) - 1) + k == exponent_max(T) && return y * T(2.0) * T(2.0)^(exponent_max(T) - 1) twopk = reinterpret(T, rem(exponent_bias(T) + k, fpinttype(T)) << significand_bits(T)) return y*twopk else # add significand_bits(T) + 1 to lift the range outside the subnormals twopk = reinterpret(T, rem(exponent_bias(T) + significand_bits(T) + 1 + k, fpinttype(T)) << significand_bits(T)) - return y*twopk*T(2.0)^(-significand_bits(T) - 1) + return y * twopk * T(2.0)^(-significand_bits(T) - 1) end elseif xa < reinterpret(Unsigned, exp_small_thres(T)) # |x| < exp_small_thres # Taylor approximation for small x diff --git a/src/codegen.cpp b/src/codegen.cpp index 76d8bdd511f38..3545f023aaf1b 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -395,10 +395,6 @@ static Function *jldlsym_func; static Function *jlnewbits_func; static Function *jltypeassert_func; static Function *jldepwarnpi_func; -#if JL_LLVM_VERSION < 30600 -static Function *jlpow_func; -static Function *jlpowf_func; -#endif //static Function *jlgetnthfield_func; static Function *jlgetnthfieldchecked_func; //static Function *jlsetnthfield_func; @@ -5985,25 +5981,6 @@ static void init_julia_llvm_env(Module *m) "jl_gc_diff_total_bytes", m); add_named_global(diff_gc_total_bytes_func, *jl_gc_diff_total_bytes); -#if JL_LLVM_VERSION < 30600 - Type *powf_type[2] = { T_float32, T_float32 }; - jlpowf_func = Function::Create(FunctionType::get(T_float32, powf_type, false), - Function::ExternalLinkage, - "powf", m); - add_named_global(jlpowf_func, &powf, false); - - Type *pow_type[2] = { T_float64, T_float64 }; - jlpow_func = Function::Create(FunctionType::get(T_float64, pow_type, false), - Function::ExternalLinkage, - "pow", m); - add_named_global(jlpow_func, -#ifdef _COMPILER_MICROSOFT_ - static_cast(&pow), -#else - &pow, -#endif - false); -#endif std::vector array_owner_args(0); array_owner_args.push_back(T_pjlvalue); jlarray_data_owner_func = diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 27cc70985c070..b694e917a7256 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -71,7 +71,6 @@ static void jl_init_intrinsic_functions_codegen(Module *m) float_func[rint_llvm] = true; float_func[sqrt_llvm] = true; float_func[sqrt_llvm_fast] = true; - float_func[powi_llvm] = true; } extern "C" @@ -915,33 +914,6 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs, return mark_julia_type(ans, false, x.typ, ctx); } - case powi_llvm: { - const jl_cgval_t &x = argv[0]; - const jl_cgval_t &y = argv[1]; - if (!jl_is_bitstype(x.typ) || !jl_is_bitstype(y.typ) || jl_datatype_size(y.typ) != 4) - return emit_runtime_call(f, argv, nargs, ctx); - Type *xt = FLOATT(bitstype_to_llvm(x.typ)); - Type *yt = T_int32; - if (!xt) - return emit_runtime_call(f, argv, nargs, ctx); - - Value *xv = emit_unbox(xt, x, x.typ); - Value *yv = emit_unbox(yt, y, y.typ); -#if JL_LLVM_VERSION >= 30600 - Value *powi = Intrinsic::getDeclaration(jl_Module, Intrinsic::powi, makeArrayRef(xt)); -#if JL_LLVM_VERSION >= 30700 - Value *ans = builder.CreateCall(powi, {xv, yv}); -#else - Value *ans = builder.CreateCall2(powi, xv, yv); -#endif -#else - // issue #6506 - Value *ans = builder.CreateCall2(prepare_call(xt == T_float64 ? jlpow_func : jlpowf_func), - xv, builder.CreateSIToFP(yv, xt)); -#endif - return mark_julia_type(ans, false, x.typ, ctx); - } - default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; diff --git a/src/intrinsics.h b/src/intrinsics.h index 479b048b940a5..db22de2fa5a5c 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -91,7 +91,6 @@ ADD_I(trunc_llvm, 1) \ ADD_I(rint_llvm, 1) \ ADD_I(sqrt_llvm, 1) \ - ADD_I(powi_llvm, 2) \ ALIAS(sqrt_llvm_fast, sqrt_llvm) \ /* pointer access */ \ ADD_I(pointerref, 3) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index e0e2c8282799b..0b6b16bed095f 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -677,7 +677,6 @@ JL_DLLEXPORT jl_value_t *jl_floor_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_trunc_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_rint_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_sqrt_llvm(jl_value_t *a); -JL_DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_abs_float(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b); diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index d69b1be21e6db..4f4d6bce39fd6 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -956,31 +956,6 @@ un_fintrinsic(trunc_float,trunc_llvm) un_fintrinsic(rint_float,rint_llvm) un_fintrinsic(sqrt_float,sqrt_llvm) -JL_DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - jl_value_t *ty = jl_typeof(a); - if (!jl_is_bitstype(ty)) - jl_error("powi_llvm: a is not a bitstype"); - if (!jl_is_bitstype(jl_typeof(b)) || jl_datatype_size(jl_typeof(b)) != 4) - jl_error("powi_llvm: b is not a 32-bit bitstype"); - int sz = jl_datatype_size(ty); - jl_value_t *newv = jl_gc_alloc(ptls, sz, ty); - void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv); - switch (sz) { - /* choose the right size c-type operation */ - case 4: - *(float*)pr = powf(*(float*)pa, (float)jl_unbox_int32(b)); - break; - case 8: - *(double*)pr = pow(*(double*)pa, (double)jl_unbox_int32(b)); - break; - default: - jl_error("powi_llvm: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); - } - return newv; -} - JL_DLLEXPORT jl_value_t *jl_select_value(jl_value_t *isfalse, jl_value_t *a, jl_value_t *b) { JL_TYPECHK(isfalse, bool, isfalse); diff --git a/test/math.jl b/test/math.jl index b83a2bdba876c..e35ac7d016e89 100644 --- a/test/math.jl +++ b/test/math.jl @@ -996,6 +996,13 @@ end end end +@testset "issue #19872" begin + f19872(x) = x ^ 3 + @test issubnormal(2.0 ^ (-1024)) + @test f19872(2.0) === 8.0 + @test !issubnormal(0.0) +end + @test Base.Math.f32(complex(1.0,1.0)) == complex(Float32(1.),Float32(1.)) @test Base.Math.f16(complex(1.0,1.0)) == complex(Float16(1.),Float16(1.))