diff --git a/base/inference.jl b/base/inference.jl
index e4dce78d1af54f..775c3baa9bd52e 100644
--- a/base/inference.jl
+++ b/base/inference.jl
@@ -504,6 +504,8 @@ add_tfunc(sdiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(udiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(srem_int, 2, 2, math_tfunc, 30)
 add_tfunc(urem_int, 2, 2, math_tfunc, 30)
+add_tfunc(add_ptr, 2, 2, math_tfunc, 1)
+add_tfunc(sub_ptr, 2, 2, math_tfunc, 1)
 add_tfunc(neg_float, 1, 1, math_tfunc, 1)
 add_tfunc(add_float, 2, 2, math_tfunc, 1)
 add_tfunc(sub_float, 2, 2, math_tfunc, 1)
diff --git a/base/pointer.jl b/base/pointer.jl
index b2197d21db8c08..2daa2e4a4408a8 100644
--- a/base/pointer.jl
+++ b/base/pointer.jl
@@ -147,8 +147,8 @@ eltype(::Type{Ptr{T}}) where {T} = T
 isless(x::Ptr, y::Ptr) = isless(UInt(x), UInt(y))
 -(x::Ptr, y::Ptr) = UInt(x) - UInt(y)
-+(x::Ptr, y::Integer) = oftype(x, (UInt(x) + (y % UInt) % UInt))
--(x::Ptr, y::Integer) = oftype(x, (UInt(x) - (y % UInt) % UInt))
++(x::Ptr, y::Integer) = oftype(x, Intrinsics.add_ptr(UInt(x), (y % UInt) % UInt))
+-(x::Ptr, y::Integer) = oftype(x, Intrinsics.sub_ptr(UInt(x), (y % UInt) % UInt))
 +(x::Integer, y::Ptr) = y + x
 
 """
diff --git a/deps/llvm.mk b/deps/llvm.mk
index 7f44868aba359e..07b77b15298e16 100644
--- a/deps/llvm.mk
+++ b/deps/llvm.mk
@@ -460,6 +460,7 @@ $(eval $(call LLVM_PATCH,llvm-D32593))
$(eval $(call LLVM_PATCH,llvm-D33179))
$(eval $(call LLVM_PATCH,llvm-PR29010-i386-xmm)) # Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-3.9.0-D37576-NVPTX-sm_70)) # NVPTX, Remove for 6.0
+$(eval $(call LLVM_PATCH,llvm-D37939-Mem2Reg-Also-handle-memcpy))
 else ifeq ($(LLVM_VER_SHORT),4.0)
 # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
 $(eval $(call LLVM_PATCH,llvm-4.0.0_threads))
diff --git a/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch
new file mode 100644
index 00000000000000..b8753b0439ba0b
--- /dev/null
+++ b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch
@@ -0,0 +1,365 @@
+From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001
+From: Keno Fischer
+Date: Fri, 15 Sep 2017 18:30:59 -0400
+Subject: [PATCH] [Mem2Reg] Also handle memcpy
+
+Summary:
+In julia, when we know we're moving data between two memory locations,
+we always emit that as a memcpy rather than a load/store pair. However,
+this can give worse optimization results in certain cases because some
+optimizations that can handle load/store pairs cannot handle memcpys.
+Mem2reg is one of these optimizations. This patch adds rudimentary
+support for mem2reg for recognizing memcpys that cover the whole alloca
+we're promoting. While several more sophisticated passes (SROA, GVN)
+can get similar optimizations, it is preferable to have these kinds
+of cases caught early to expose optimization opportunities before
+getting to these later passes. The approach taken here is to split
+the memcpy into a load/store pair early (after legality analysis)
+and retain the rest of the analysis only on loads/stores. It would
+be possible of course to leave the memcpy as is and generate the
+leftover load or store only on demand. However, that would entail
+a significantly larger patch for unclear benefit.
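+
+As a minimal sketch of the transformation (mirroring the test_cpy_same
+case in the new test file below): given
+
+  %a = alloca i64
+  %b = alloca i64
+  store i64 %0, i64 *%a
+  call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
+  %loaded = load i64, i64 *%b
+
+the memcpy is split into the equivalent load/store pair
+
+  %val = load i64, i64 *%a
+  store i64 %val, i64 *%b
+
+after which the usual mem2reg analysis promotes both allocas and the
+function simply returns %0.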
+
+Reviewers: chandlerc, dberlin
+
+Subscribers: llvm-commits
+
+Differential Revision: https://reviews.llvm.org/D37939
+---
+ lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++---
+ test/Transforms/Mem2Reg/memcpy.ll                | 101 ++++++++++++++
+ 2 files changed, 251 insertions(+), 16 deletions(-)
+ create mode 100644 test/Transforms/Mem2Reg/memcpy.ll
+
+diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+index ac28f59..b08a0a1 100644
+--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
++++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
+ STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
+ STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+ 
++static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) {
++  // Punt if this alloca is an array allocation
++  if (AI->isArrayAllocation())
++    return false;
++  if (MCI->isVolatile())
++    return false;
++  Value *Length = MCI->getLength();
++  if (!isa<ConstantInt>(Length))
++    return false;
++  // Anything less than the full alloca, we leave for SROA
++  const DataLayout &DL = AI->getModule()->getDataLayout();
++  size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType());
++  if (cast<ConstantInt>(Length)->getZExtValue() != AIElSize)
++    return false;
++  // If the other argument is also an alloca, we need to be sure that either
++  // the types are bitcastable, or the other alloca is not eligible for
++  // promotion (e.g. because the memcpy is for less than the whole size of
++  // that alloca), otherwise we risk turning an allocatable alloca into a
++  // non-allocatable one when splitting the memcpy.
++  AllocaInst *OtherAI = dyn_cast<AllocaInst>(
++      AI == MCI->getSource() ? MCI->getDest() : MCI->getSource());
++  if (OtherAI) {
++    if (!CastInst::isBitCastable(AI->getAllocatedType(),
++                                 OtherAI->getAllocatedType()) &&
++        DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize)
++      return false;
++  }
++  return true;
++}
++
++/// Look at the result of a bitcast and see if it's only used by lifetime
++/// intrinsics or splittable memcpys. This is needed, because IRBuilder
++/// will always insert a bitcast to i8* for these intrinsics.
++static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) {
++  for (const User *U : V->users()) {
++    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
++    if (!II)
++      return false;
++
++    if (isa<MemCpyInst>(II)) {
++      if (!isSplittableMemCpy(cast<MemCpyInst>(II), AI))
++        return false;
++      continue;
++    }
++
++    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
++        II->getIntrinsicID() != Intrinsic::lifetime_end)
++      return false;
++  }
++  return true;
++}
++
+ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+   // FIXME: If the memory unit is of pointer or integer type, we can permit
+   // assignments to subsections of the memory unit.
+@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+       // not have any meaning for a local alloca.
+       if (SI->isVolatile())
+         return false;
++    } else if (const MemCpyInst *MCI = dyn_cast<MemCpyInst>(U)) {
++      if (!isSplittableMemCpy(MCI, AI))
++        return false;
+     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+       if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+           II->getIntrinsicID() != Intrinsic::lifetime_end)
+@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+         return false;
+-      if (!onlyUsedByLifetimeMarkers(BCI))
++      if (!onlyHasCanonicalizableUsers(AI, BCI))
+         return false;
+     } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+       if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+@@ -181,7 +235,13 @@ public:
+   /// This code only looks at accesses to allocas.
+   static bool isInterestingInstruction(const Instruction *I) {
++    if (isa<MemCpyInst>(I)) {
++      const MemCpyInst *MCI = cast<MemCpyInst>(I);
++      return isa<AllocaInst>(MCI->getSource()) ||
++             isa<AllocaInst>(MCI->getDest());
++    } else {
+     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+            (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
++    }
+   }
+ 
+   /// Get or calculate the index of the specified instruction.
+@@ -208,6 +264,25 @@ public:
+     return It->second;
+   }
+ 
++  // When we split a memcpy intrinsic, we need to update the numbering in this
++  // struct. To make sure the relative ordering remains the same, we give both
++  // the LI and the SI the number that the MCI used to have (if they are both
++  // interesting). This means that they will have equal numbers, which usually
++  // can't happen. However, since they can never reference the same alloca
++  // (since memcpy operands may not overlap), this is fine, because we will
++  // never compare instruction indices for instructions that operate on
++  // distinct allocas.
++  void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
++    DenseMap<const Instruction *, unsigned>::iterator It =
++        InstNumbers.find(MCI);
++    if (It == InstNumbers.end())
++      return;
++    unsigned MemCpyNumber = It->second;
++    InstNumbers[LI] = MemCpyNumber;
++    InstNumbers[SI] = MemCpyNumber;
++    deleteValue(MCI);
++  }
++
+   void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+ 
+   void clear() { InstNumbers.clear(); }
+@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+   AC->registerAssumption(CI);
+ }
+ 
+-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+-  // Knowing that this alloca is promotable, we know that it's safe to kill all
+-  // instructions except for load and store.
++/// Split a memcpy instruction into the corresponding load/store. It is a little
++/// more complicated than one might imagine, because we need to deal with the
++/// fact that the side of the copy we're not currently processing might also
++/// be a promotable alloca. We need to be careful to not break the promotable
++/// predicate for that other alloca (if any).
++static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI,
++                          AllocaInst *AI) {
++  AAMDNodes AA;
++  MCI->getAAMetadata(AA);
++  Value *MCISrc = MCI->getSource();
++  Type *LoadType = AI->getAllocatedType();
++  AllocaInst *SrcAI = dyn_cast<AllocaInst>(MCISrc);
++  if (SrcAI && SrcAI->getType() != AI->getType()) {
++    if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType))
++      LoadType = SrcAI->getAllocatedType();
++  }
++  if (cast<PointerType>(MCISrc->getType())->getElementType() != LoadType)
++    MCISrc = CastInst::Create(
++        Instruction::BitCast, MCISrc,
++        LoadType->getPointerTo(
++            cast<PointerType>(MCISrc->getType())->getAddressSpace()),
++        "", MCI);
++  // This might add to the end of the use list, but that's fine. At worst,
++  // we'd not visit the instructions we insert here, but we don't care
++  // about them in this loop anyway.
++  LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(),
++                              MCI->getAlignment(), MCI);
++  Value *Val = LI;
++  Value *MCIDest = MCI->getDest();
++  AllocaInst *DestAI = dyn_cast<AllocaInst>(MCIDest);
++  Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType();
++  if (LI->getType() != DestElTy &&
++      CastInst::isBitCastable(LI->getType(), DestElTy))
++    Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI);
++  if (cast<PointerType>(MCIDest->getType())->getElementType() != Val->getType())
++    MCIDest = CastInst::Create(
++        Instruction::BitCast, MCIDest,
++        Val->getType()->getPointerTo(
++            cast<PointerType>(MCIDest->getType())->getAddressSpace()),
++        "", MCI);
++  StoreInst *SI =
++      new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI);
++  LI->setAAMetadata(AA);
++  SI->setAAMetadata(AA);
++  LBI.splitMemCpy(MCI, LI, SI);
++  MCI->eraseFromParent();
++}
++
++static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
++  // Knowing that this alloca is promotable, we know that it's safe to split
++  // MTIs into load/store and to kill all other instructions except for
++  // load and store.
+ 
+   for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
+     Instruction *I = cast<Instruction>(*UI);
+@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+     if (isa<LoadInst>(I) || isa<StoreInst>(I))
+       continue;
+ 
++    if (isa<MemCpyInst>(I)) {
++      MemCpyInst *MCI = cast<MemCpyInst>(I);
++      doMemCpySplit(LBI, MCI, AI);
++      continue;
++    }
++
+     if (!I->getType()->isVoidTy()) {
+-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+-      // Follow the use/def chain to erase them now instead of leaving it for
+-      // dead code elimination later.
++      // The only users of this bitcast/GEP instruction are lifetime/memcpy
++      // intrinsics. Split memcpys and delete lifetime intrinsics.
+       for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
+         Instruction *Inst = cast<Instruction>(*UUI);
+         ++UUI;
+-        Inst->eraseFromParent();
++        if (isa<MemCpyInst>(Inst)) {
++          doMemCpySplit(LBI, cast<MemCpyInst>(Inst), AI);
++        } else {
++          // Must be a lifetime intrinsic
++          Inst->eraseFromParent();
++        }
+       }
+     }
+     I->eraseFromParent();
+@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() {
+     assert(AI->getParent()->getParent() == &F &&
+            "All allocas should be in the same function, which is same as DF!");
+ 
+-    removeLifetimeIntrinsicUsers(AI);
++    canonicalizeUsers(LBI, AI);
+ 
+     if (AI->use_empty()) {
+       // If there are no uses of the alloca, just delete it now.
+diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll
+new file mode 100644
+index 0000000..fbc4096
+--- /dev/null
++++ b/test/Transforms/Mem2Reg/memcpy.ll
+@@ -0,0 +1,101 @@
++; RUN: opt < %s -mem2reg -S | FileCheck %s
++
++target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
++
++declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1)
++
++define i128 @test_cpy_different(i64) {
++; CHECK-LABEL: @test_cpy_different
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++  %a = alloca i64
++  %b = alloca i128
++  store i128 0, i128 *%b
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load i128, i128 *%b
++  ret i128 %loaded
++}
++
++define i64 @test_cpy_same(i64) {
++; CHECK-LABEL: @test_cpy_same
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++  %a = alloca i64
++  %b = alloca i64
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%b
++  ret i64 %loaded
++}
++
++define double @test_cpy_different_type(i64) {
++; CHECK-LABEL: @test_cpy_different_type
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++  %a = alloca i64
++  %b = alloca double
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load double, double *%b
++  ret double %loaded
++}
++
++define i128 @test_cpy_differenti8(i64) {
++; CHECK-LABEL: @test_cpy_differenti8
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++  %a = alloca i64
++  %b = alloca i128
++  store i128 0, i128 *%b
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i128* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load i128, i128 *%b
++  ret i128 %loaded
++}
++
++define i64 @test_cpy_samei8(i64) {
++; CHECK-LABEL: @test_cpy_samei8
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++  %a = alloca i64
++  %b = alloca i64
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i64* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%b
++  ret i64 %loaded
++}
++
++define double @test_cpy_different_typei8(i64) {
++; CHECK-LABEL: @test_cpy_different_typei8
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++  %a = alloca i64
++  %b = alloca double
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast double* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load double, double *%b
++  ret double %loaded
++}
++
++define i64 @test_cpy_differenti8_reverse(i128) {
++; CHECK-LABEL: @test_cpy_differenti8_reverse
++; CHECK-NOT: alloca i64
++  %a = alloca i64
++  %b = alloca i128
++  store i128 %0, i128 *%b
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i128* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%a
++  ret i64 %loaded
++}
+-- 
+2.9.3
+
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index af9f99826528a0..c6f2006974043f 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -235,7 +235,7 @@ static Value *emit_pointer_from_objref(jl_codectx_t &ctx, Value *V)
 #else
     Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
 #endif
-    return Call;
+    return ctx.builder.CreatePtrToInt(Call, T_size);
 }
 
 // --- emitting pointers directly into code ---
@@ -368,6 +368,12 @@ static Value *emit_bitcast(jl_codectx_t &ctx, Value *v, Type *jl_value)
     }
 }
 
+static Value *maybe_bitcast(jl_codectx_t &ctx, Value *V, Type *to) {
+    if (to != V->getType())
+        return emit_bitcast(ctx, V, to);
+    return V;
+}
+
 static Value *julia_binding_gv(jl_codectx_t &ctx, Value *bv)
 {
     Value *offset = ConstantInt::get(T_size, offsetof(jl_binding_t, value) / sizeof(size_t));
@@ -1250,8 +1256,8 @@ static void typed_store(jl_codectx_t &ctx,
     }
     else {
         data = ptr;
     }
-    Instruction *store = ctx.builder.CreateAlignedStore(r, ctx.builder.CreateGEP(data,
-        idx_0based), isboxed ? alignment : julia_alignment(jltype, alignment));
+    Instruction *store = ctx.builder.CreateAlignedStore(r, idx_0based ? ctx.builder.CreateGEP(data,
+        idx_0based) : data, isboxed ? alignment : julia_alignment(jltype, alignment));
     if (tbaa)
         tbaa_decorate(tbaa, store);
 }
@@ -1267,7 +1273,7 @@ static Value *julia_bool(jl_codectx_t &ctx, Value *cond)
 // --- accessing the representations of built-in data types ---
 
 static Constant *julia_const_to_llvm(jl_value_t *e);
-static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype = T_ppjlvalue)
+static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x)
 {
     Value *data = x.V;
     if (x.constant) {
@@ -1279,9 +1285,7 @@ static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype
             data = boxed(ctx, x);
         }
     }
-    if (astype && data->getType() != astype)
-        data = emit_bitcast(ctx, data, astype);
-    return decay_derived(data);
+    return data;
 }
 
 static void emit_memcpy_llvm(jl_codectx_t &ctx, Value *dst, Value *src,
@@ -1342,7 +1346,7 @@ static Value *get_value_ptr(jl_codectx_t&, Value *ptr)
 
 static Value *get_value_ptr(jl_codectx_t &ctx, const jl_cgval_t &v)
 {
-    return data_pointer(ctx, v, nullptr);
+    return data_pointer(ctx, v);
 }
 
 template<typename T1>
@@ -1372,7 +1376,9 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
             Value *fld = tbaa_decorate(strct.tbaa,
                 maybe_mark_load_dereferenceable(
                     ctx.builder.CreateLoad(
-                        ctx.builder.CreateBitCast(ctx.builder.CreateGEP(decay_derived(data_pointer(ctx, strct)), idx),
+                        ctx.builder.CreateBitCast(
+                            ctx.builder.CreateGEP(decay_derived(
+                                emit_bitcast(ctx, data_pointer(ctx, strct), T_pprjlvalue)), idx),
                             PointerType::get(T_prjlvalue, AddressSpace::Derived))),
                     maybe_null, minimum_field_size));
             if (maybe_null)
@@ -1384,11 +1390,11 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
         assert(nfields > 0); // nf == 0 trapped by all_pointers case
         jl_value_t *jt = jl_field_type(stt, 0);
         idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds);
-        Value *ptr = data_pointer(ctx, strct);
+        Value *ptr = decay_derived(data_pointer(ctx, strct));
         if (!stt->mutabl) {
             // just compute the pointer and let user load it when necessary
             Type *fty = julia_type_to_llvm(jt);
-            Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(ptr), PointerType::get(fty,0)), idx);
+            Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, ptr, PointerType::get(fty,0)), idx);
             *ret = mark_julia_slot(addr, jt, NULL, strct.tbaa);
             ret->isimmutable = strct.isimmutable;
             return true;
@@ -1441,28 +1447,34 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
         return ghostValue(jfty);
     Value *fldv = NULL;
     if (strct.ispointer()) {
-        Value *addr;
+        Value *addr = decay_derived(data_pointer(ctx, strct));
         bool isboxed;
         Type *lt = julia_type_to_llvm((jl_value_t*)jt, &isboxed);
         if (isboxed) {
-            Value *ptr = decay_derived(data_pointer(ctx, strct, T_pint8));
-            Value *llvm_idx = ConstantInt::get(T_size, jl_field_offset(jt, idx));
-            addr = ctx.builder.CreateGEP(ptr, llvm_idx);
+            size_t byte_offset = jl_field_offset(jt, idx);
+            // byte_offset == 0 is an important special case here, e.g.
+            // for single field wrapper types. Introducing the bitcast
+            // can pessimize mem2reg
+            if (byte_offset > 0) {
+                addr = ctx.builder.CreateGEP(
+                    emit_bitcast(ctx, addr, T_pint8),
+                    ConstantInt::get(T_size, byte_offset));
+            }
         }
         else {
             if (VectorType *vlt = dyn_cast<VectorType>(lt)) {
                 // doesn't have the struct wrapper, so this must have been a VecElement
                 // cast to the element type so that it can be addressed with GEP
                 lt = vlt->getElementType();
-                Value *ptr = data_pointer(ctx, strct, lt->getPointerTo());
+                Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo());
                 Value *llvm_idx = ConstantInt::get(T_size, idx);
                 addr = ctx.builder.CreateGEP(lt, ptr, llvm_idx);
             }
             else if (lt->isSingleValueType()) {
-                addr = data_pointer(ctx, strct, lt->getPointerTo());
+                addr = emit_bitcast(ctx, addr, lt->getPointerTo());
             }
             else {
-                Value *ptr = data_pointer(ctx, strct, lt->getPointerTo());
+                Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo());
                 addr = ctx.builder.CreateStructGEP(lt, ptr, idx);
             }
         }
@@ -1503,7 +1515,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
             fieldval.isimmutable = strct.isimmutable;
             return fieldval;
         }
-        return typed_load(ctx, addr, ConstantInt::get(T_size, 0), jfty, strct.tbaa, true, align);
+        return typed_load(ctx, addr, NULL, jfty, strct.tbaa, true, align);
     }
     else if (isa<UndefValue>(strct.V)) {
         return jl_cgval_t();
@@ -2152,13 +2164,15 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src
                 emit_unbox(ctx, store_ty, src, typ, dest, isVolatile);
             }
             else {
-                Value *src_ptr = data_pointer(ctx, src, T_pint8);
-                if (dest->getType() != T_pint8)
-                    dest = emit_bitcast(ctx, dest, T_pint8);
-                if (skip) // copy dest -> dest to simulate an undef value / conditional copy
-                    src_ptr = ctx.builder.CreateSelect(skip, dest, src_ptr);
+                Value *src_ptr = data_pointer(ctx, src);
+                unsigned nb = jl_datatype_size(typ);
                 unsigned alignment = julia_alignment(typ, 0);
-                emit_memcpy(ctx, dest, src_ptr, jl_datatype_size(typ), alignment, isVolatile, tbaa);
+                Value *nbytes = ConstantInt::get(T_size, nb);
+                if (skip) // copy dest -> dest to simulate an undef value / conditional copy
+                    nbytes = ctx.builder.CreateSelect(skip,
+                            ConstantInt::get(T_size, 0),
+                            nbytes);
+                emit_memcpy(ctx, dest, src_ptr, nbytes, alignment, isVolatile, tbaa);
             }
         }
     }
@@ -2166,9 +2180,8 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src
         Value *tindex = ctx.builder.CreateAnd(src.TIndex, ConstantInt::get(T_int8, 0x7f));
         if (skip)
             tindex = ctx.builder.CreateSelect(skip, ConstantInt::get(T_int8, 0), tindex);
-        Value *src_ptr = data_pointer(ctx, src, T_pint8);
-        if (dest->getType() != T_pint8)
-            dest = emit_bitcast(ctx, dest, T_pint8);
+        Value *src_ptr = maybe_bitcast(ctx, data_pointer(ctx, src), T_pint8);
+        dest = maybe_bitcast(ctx, dest, T_pint8);
         BasicBlock *defaultBB = BasicBlock::Create(jl_LLVMContext, "union_move_skip", ctx.f);
         SwitchInst *switchInst = ctx.builder.CreateSwitch(tindex, defaultBB);
         BasicBlock *postBB = BasicBlock::Create(jl_LLVMContext, "post_union_move", ctx.f);
@@ -2288,8 +2301,13 @@ static void emit_setfield(jl_codectx_t &ctx,
 {
     if (sty->mutabl || !checked) {
         assert(strct.ispointer());
-        Value *addr = ctx.builder.CreateGEP(data_pointer(ctx, strct, T_pint8),
-            ConstantInt::get(T_size, jl_field_offset(sty, idx0)));
+        size_t byte_offset = jl_field_offset(sty, idx0);
+        Value *addr = data_pointer(ctx, strct);
+        if (byte_offset > 0) {
+            addr = ctx.builder.CreateGEP(
+                emit_bitcast(ctx, decay_derived(addr), T_pint8),
+                ConstantInt::get(T_size, byte_offset));
+        }
         jl_value_t *jfty = jl_svecref(sty->types, idx0);
         if (jl_field_isptr(sty, idx0)) {
             Value *r = maybe_decay_untracked(boxed(ctx, rhs)); // don't need a temporary gcroot since it'll be rooted by strct
@@ -2306,7 +2324,7 @@ static void emit_setfield(jl_codectx_t &ctx,
                 return;
             Value *tindex = compute_tindex_unboxed(ctx, rhs_union, jfty);
             tindex = ctx.builder.CreateNUWSub(tindex, ConstantInt::get(T_int8, 1));
-            Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, addr, T_pint8), ConstantInt::get(T_size, fsz - 1));
+            Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, decay_derived(addr), T_pint8), ConstantInt::get(T_size, fsz - 1));
             ctx.builder.CreateStore(tindex, ptindex);
             // copy data
             if (!rhs.isghost) {
@@ -2315,8 +2333,9 @@ static void emit_setfield(jl_codectx_t &ctx,
         }
         else {
             unsigned align = jl_field_align(sty, idx0);
-            typed_store(ctx, addr, ConstantInt::get(T_size, 0), rhs, jfty,
-                strct.tbaa, data_pointer(ctx, strct, T_pjlvalue), align);
+            typed_store(ctx, addr, NULL, rhs, jfty,
+                strct.tbaa, maybe_bitcast(ctx,
+                data_pointer(ctx, strct), T_pjlvalue), align);
         }
     }
     else {
@@ -2416,12 +2435,13 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
             Value *strct = emit_allocobj(ctx, jl_datatype_size(sty),
                                          literal_pointer_val(ctx, (jl_value_t*)ty));
             jl_cgval_t strctinfo = mark_julia_type(ctx, strct, true, ty);
+            strct = decay_derived(strct);
             for (size_t i = 0; i < nf; i++) {
                 if (jl_field_isptr(sty, i)) {
                     tbaa_decorate(strctinfo.tbaa, ctx.builder.CreateStore(
                             ConstantPointerNull::get(cast<PointerType>(T_prjlvalue)),
                             emit_bitcast(ctx,
-                                ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(strct), T_pint8),
+                                ctx.builder.CreateGEP(emit_bitcast(ctx, strct, T_pint8),
                                     ConstantInt::get(T_size, jl_field_offset(sty, i))),
                                 T_pprjlvalue)));
                 }
diff --git a/src/codegen.cpp b/src/codegen.cpp
index e1a3f99e146d31..7f13e04fcec9dc 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -2135,16 +2135,16 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, const jl_cgval_t &arg1, const
         if (sz > 512 && !((jl_datatype_t*)arg1.typ)->layout->haspadding) {
             Value *answer = ctx.builder.CreateCall(prepare_call(memcmp_derived_func),
                         {
-                            data_pointer(ctx, arg1, T_pint8),
-                            data_pointer(ctx, arg2, T_pint8),
+                            maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), T_pint8),
+                            maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), T_pint8),
                             ConstantInt::get(T_size, sz)
                         });
             return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(T_int32, 0));
         }
         else {
             Type *atp = at->getPointerTo();
-            Value *varg1 = data_pointer(ctx, arg1, atp);
-            Value *varg2 = data_pointer(ctx, arg2, atp);
+            Value *varg1 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), atp);
+            Value *varg2 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), atp);
             jl_svec_t *types = ((jl_datatype_t*)arg1.typ)->types;
             Value *answer = ConstantInt::get(T_int1, 1);
             for (size_t i = 0, l = jl_svec_len(types); i < l; i++) {
@@ -2645,7 +2645,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                         emit_datatype_nfields(ctx, emit_typeof_boxed(ctx, obj)),
                         jl_true);
                 }
-                Value *ptr = data_pointer(ctx, obj);
+                Value *ptr = decay_derived(data_pointer(ctx, obj));
                 *ret = typed_load(ctx, ptr, vidx, jt, obj.tbaa, false);
                 return true;
             }
@@ -2836,7 +2836,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
             }
             else {
                 size_t offs = jl_field_offset(stt, fieldidx);
-                Value *ptr = data_pointer(ctx, obj, T_pint8);
+                Value *ptr = emit_bitcast(ctx, decay_derived(data_pointer(ctx, obj)), T_pint8);
                 Value *llvm_idx = ConstantInt::get(T_size, offs);
                 Value *addr = ctx.builder.CreateGEP(ptr, llvm_idx);
                 // emit this using the same type as emit_getfield_knownidx
@@ -2926,7 +2926,8 @@ static jl_cgval_t emit_call_function_object(jl_method_instance_t *li, jl_llvm_fu
             // can lazy load on demand, no copy needed
             assert(at == PointerType::get(et, AddressSpace::Derived));
             assert(arg.ispointer());
-            argvals[idx] = decay_derived(data_pointer(ctx, arg, at));
+            argvals[idx] = decay_derived(maybe_bitcast(ctx,
+                data_pointer(ctx, arg), at));
         }
         else {
             assert(at == et);
@@ -3433,9 +3434,15 @@ static void emit_vi_assignment_unboxed(jl_codectx_t &ctx, jl_varinfo_t &vi, Valu
                     tbaa = NULL;
                 if (vi.pTIndex == NULL) {
                     assert(jl_is_leaf_type(vi.value.typ));
-                    Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ));
-                    emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes,
-                                jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa);
+                    // Sometimes we can get into situations where the LHS and RHS
+                    // are the same slot. We're not allowed to memcpy in that case
+                    // under penalty of undefined behavior. This check should catch
+                    // the relevant situations.
+                    if (vi.value.V != rval_info.V) {
+                        Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ));
+                        emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes,
+                                    jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa);
+                    }
                 }
                 else {
                     emit_unionmove(ctx, vi.value.V, rval_info, isboxed, vi.isVolatile, tbaa);
@@ -4297,7 +4304,8 @@ static Function *gen_cfun_wrapper(jl_function_t *ff, jl_value_t *jlrettype, jl_t
                 }
                 else if (T->isAggregateType()) {
                     // aggregate types are passed by pointer
-                    arg = data_pointer(ctx, inputarg, T->getPointerTo());
+                    arg = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, inputarg)),
+                        T->getPointerTo());
                 }
                 else {
                     arg = emit_unbox(ctx, T, inputarg, spect);
@@ -6571,7 +6579,7 @@ static void init_julia_llvm_env(Module *m)
                                          "llvm.julia.gc_preserve_end");
     add_named_global(gc_preserve_end_func, (void*)NULL, /*dllimport*/false);
 
-    pointer_from_objref_func = Function::Create(FunctionType::get(T_size,
+    pointer_from_objref_func = Function::Create(FunctionType::get(T_pjlvalue,
                                          ArrayRef<Type*>(PointerType::get(T_jlvalue, AddressSpace::Derived)), false),
                                          Function::ExternalLinkage,
                                          "julia.pointer_from_objref");
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 425941888d77b2..0dc7c5319738d0 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -269,6 +269,37 @@ static Constant *julia_const_to_llvm(jl_value_t *e)
 
 static jl_cgval_t ghostValue(jl_value_t *ty);
 
+static Value *emit_unboxed_coercion(jl_codectx_t &ctx, Type *to, Value *unboxed)
+{
+    Type *ty = unboxed->getType();
+    assert(ty != T_void);
+    bool frompointer = ty->isPointerTy();
+    bool topointer = to->isPointerTy();
+    if (frompointer && topointer) {
+        unboxed = emit_bitcast(ctx, unboxed, to);
+    }
+    else if (frompointer) {
+        Type *INTT_to = INTT(to);
+        unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to);
+        if (INTT_to != to)
+            unboxed = ctx.builder.CreateBitCast(unboxed, to);
+    }
+    else if (topointer) {
+        Type *INTT_to = INTT(to);
+        if (to != INTT_to)
+            unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to);
+        unboxed = ctx.builder.CreateIntToPtr(unboxed, to);
+    }
+    else if (ty == T_int1 && to == T_int8) {
+        // bools may be stored internally as int8
+        unboxed = ctx.builder.CreateZExt(unboxed, T_int8);
+    }
+    else if (ty != to) {
+        unboxed = ctx.builder.CreateBitCast(unboxed, to);
+    }
+    return unboxed;
+}
+
 // emit code to unpack a raw value from a box into registers or a stack slot
 static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_value_t *jt, Value *dest, bool volatile_store)
 {
@@ -287,33 +318,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
     Constant *c = x.constant ? julia_const_to_llvm(x.constant) : NULL;
     if (!x.ispointer() || c) { // already unboxed, but sometimes need conversion
-        Value *unboxed = c ? c : x.V;
-        Type *ty = unboxed->getType();
-        assert(ty != T_void);
-        bool frompointer = ty->isPointerTy();
-        bool topointer = to->isPointerTy();
-        if (frompointer && topointer) {
-            unboxed = emit_bitcast(ctx, unboxed, to);
-        }
-        else if (frompointer) {
-            Type *INTT_to = INTT(to);
-            unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to);
-            if (INTT_to != to)
-                unboxed = ctx.builder.CreateBitCast(unboxed, to);
-        }
-        else if (topointer) {
-            Type *INTT_to = INTT(to);
-            if (to != INTT_to)
-                unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to);
-            unboxed = ctx.builder.CreateIntToPtr(unboxed, to);
-        }
-        else if (ty == T_int1 && to == T_int8) {
-            // bools may be stored internally as int8
-            unboxed = ctx.builder.CreateZExt(unboxed, T_int8);
-        }
-        else if (ty != to) {
-            unboxed = ctx.builder.CreateBitCast(unboxed, to);
-        }
+        Value *unboxed = emit_unboxed_coercion(ctx, to, c ? c : x.V);
         if (!dest)
             return unboxed;
         Type *dest_ty = unboxed->getType()->getPointerTo();
@@ -326,14 +331,12 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
     // bools stored as int8, so an extra Trunc is needed to get an int1
     Value *p = x.constant ? literal_pointer_val(ctx, x.constant) : x.V;
     Type *ptype = (to == T_int1 ? T_pint8 : to->getPointerTo());
-    if (p->getType() != ptype)
-        p = emit_bitcast(ctx, p, ptype);
 
     Value *unboxed = NULL;
     if (to == T_int1)
-        unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1);
+        unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1);
     else if (jt == (jl_value_t*)jl_bool_type)
-        unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1), to);
+        unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1), to);
     if (unboxed) {
         if (!dest)
             return unboxed;
@@ -354,6 +357,27 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
         return NULL;
     }
     else {
+        if (p->getType() != ptype && isa<AllocaInst>(p)) {
+            // LLVM's mem2reg can't handle coercion if the load/store type does
+            // not match the type of the alloca. As such, it is better to
+            // perform the load using the alloca's type and then perform the
+            // appropriate coercion manually.
+            AllocaInst *AI = cast<AllocaInst>(p);
+            Type *AllocType = AI->getAllocatedType();
+#if JL_LLVM_VERSION >= 40000
+            const DataLayout &DL = jl_data_layout;
+#else
+            const DataLayout &DL = jl_ExecutionEngine->getDataLayout();
+#endif
+            if (!AI->isArrayAllocation() &&
+                (AllocType->isFloatingPointTy() || AllocType->isIntegerTy() || AllocType->isPointerTy()) &&
+                (to->isFloatingPointTy() || to->isIntegerTy() || to->isPointerTy()) &&
+                DL.getTypeSizeInBits(AllocType) == DL.getTypeSizeInBits(to)) {
+                Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment);
+                return emit_unboxed_coercion(ctx, to, tbaa_decorate(x.tbaa, load));
+            }
+        }
+        p = maybe_bitcast(ctx, p, ptype);
         Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment);
         return tbaa_decorate(x.tbaa, load);
     }
@@ -439,7 +463,8 @@ static jl_cgval_t generic_bitcast(jl_codectx_t &ctx, const jl_cgval_t *argv)
         if (isboxed)
             vxt = llvmt;
         vx = tbaa_decorate(v.tbaa, ctx.builder.CreateLoad(
-            data_pointer(ctx, v, vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
+            emit_bitcast(ctx, data_pointer(ctx, v),
+                vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
     }
 
     vxt = vx->getType();
@@ -899,6 +924,26 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg
     case srem_int: return ctx.builder.CreateSRem(x, y);
     case urem_int: return ctx.builder.CreateURem(x, y);
 
+    // LLVM will not fold ptrtoint+arithmetic+inttoptr to GEP. The reason for this
+    // has to do with alias analysis. When adding two integers, either one of them
+    // could be the pointer base. With getelementptr, it is clear which of the
+    // operands is the pointer base. We also have this information at the julia
+    // level. Thus, to not lose information, we need to have a separate intrinsic
+    // for pointer arithmetic which lowers to getelementptr.
+    case add_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), y), t);
+
+    }
+
+    case sub_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), ctx.builder.CreateNeg(y)), t);
+
+    }
+
     // Implements IEEE negate. See issue #7868
     case neg_float: return math_builder(ctx)().CreateFSub(ConstantFP::get(t, -0.0), x);
     case neg_float_fast: return math_builder(ctx, true)().CreateFNeg(x);
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 80491639ac6b8a..0f04fe418c4e67 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -12,6 +12,8 @@
     ADD_I(udiv_int, 2) \
     ADD_I(srem_int, 2) \
     ADD_I(urem_int, 2) \
+    ADD_I(add_ptr, 2) \
+    ADD_I(sub_ptr, 2) \
     ADD_I(neg_float, 1) \
     ADD_I(add_float, 2) \
     ADD_I(sub_float, 2) \
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 73a270141417d2..7218e53cb180c1 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -210,6 +210,9 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level)
     PM->add(createSimpleLoopUnrollPass());     // Unroll small loops
     //PM->add(createLoopStrengthReducePass()); // (jwb added)
 
+    // Re-run SROA after loop unrolling (useful for small loops that operate
+    // over the structure of an aggregate)
+    PM->add(createSROAPass());                 // Break up aggregate allocas
     PM->add(createInstructionCombiningPass()); // Clean up after the unroller
     PM->add(createGVNPass());                  // Remove redundancies
     PM->add(createMemCpyOptPass());            // Remove memcpy / form memset
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 4a587616da6cf0..9940934e9ef0cd 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -761,6 +761,9 @@ JL_DLLEXPORT jl_value_t *jl_udiv_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_srem_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_urem_int(jl_value_t *a, jl_value_t *b);
 
+JL_DLLEXPORT jl_value_t *jl_add_ptr(jl_value_t *a, jl_value_t *b);
+JL_DLLEXPORT jl_value_t *jl_sub_ptr(jl_value_t *a, jl_value_t *b);
+
 JL_DLLEXPORT jl_value_t *jl_neg_float(jl_value_t *a);
 JL_DLLEXPORT jl_value_t *jl_add_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_sub_float(jl_value_t *a, jl_value_t *b);
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index 4a67c39f841aad..216a37eb64bdd9 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -592,7 +592,6 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst,
         }
         else if (auto call = dyn_cast<CallInst>(user)) {
             if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) {
-                new_i = new PtrToIntInst(new_i, T_size, "", call);
                 call->replaceAllUsesWith(new_i);
                 call->eraseFromParent();
                 return;
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 8acf06496db0f7..ccb660d966a43c 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -1207,9 +1207,8 @@ bool LateLowerGCFrame::CleanupIR(Function &F) {
         } else if (pointer_from_objref_func != nullptr && callee == pointer_from_objref_func) {
             auto *obj = CI->getOperand(0);
             auto *ASCI = new AddrSpaceCastInst(obj, T_pjlvalue, "", CI);
-            auto *ptr = new PtrToIntInst(ASCI, CI->getType(), "", CI);
-            ptr->takeName(CI);
-            CI->replaceAllUsesWith(ptr);
+            ASCI->takeName(CI);
+            CI->replaceAllUsesWith(ASCI);
         } else if (alloc_obj_func && callee == alloc_obj_func) {
             assert(CI->getNumArgOperands() == 3);
             auto sz = (size_t)cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index a768b9dec3501d..fecff170ba07a6 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -703,8 +703,10 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c)
 un_iintrinsic_fast(LLVMNeg, neg, neg_int, u)
 #define add(a,b) a + b
 bi_iintrinsic_fast(LLVMAdd, add, add_int, u)
+bi_iintrinsic_fast(LLVMAdd, add, add_ptr, u)
 #define sub(a,b) a - b
 bi_iintrinsic_fast(LLVMSub, sub, sub_int, u)
+bi_iintrinsic_fast(LLVMSub, sub, sub_ptr, u)
 #define mul(a,b) a * b
 bi_iintrinsic_fast(LLVMMul, mul, mul_int, u)
 #define div(a,b) a / b
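
For illustration of the add_ptr lowering in emit_untyped_intrinsic above
(a sketch, assuming a 64-bit target where T_size is i64): a Julia-level
ptr + offset now reaches LLVM as

  %1 = inttoptr i64 %ptr to i8*
  %2 = getelementptr i8, i8* %1, i64 %offset
  %3 = ptrtoint i8* %2 to i64

rather than a plain integer add, so alias analysis can identify %ptr as
the pointer base; sub_ptr is identical except that the offset is negated.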