diff --git a/base/inference.jl b/base/inference.jl
index e4dce78d1af54f..775c3baa9bd52e 100644
--- a/base/inference.jl
+++ b/base/inference.jl
@@ -504,6 +504,8 @@ add_tfunc(sdiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(udiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(srem_int, 2, 2, math_tfunc, 30)
 add_tfunc(urem_int, 2, 2, math_tfunc, 30)
+add_tfunc(add_ptr, 2, 2, math_tfunc, 1)
+add_tfunc(sub_ptr, 2, 2, math_tfunc, 1)
 add_tfunc(neg_float, 1, 1, math_tfunc, 1)
 add_tfunc(add_float, 2, 2, math_tfunc, 1)
 add_tfunc(sub_float, 2, 2, math_tfunc, 1)
diff --git a/base/pointer.jl b/base/pointer.jl
index b2197d21db8c08..2daa2e4a4408a8 100644
--- a/base/pointer.jl
+++ b/base/pointer.jl
@@ -147,8 +147,8 @@ eltype(::Type{Ptr{T}}) where {T} = T
 isless(x::Ptr, y::Ptr) = isless(UInt(x), UInt(y))
 -(x::Ptr, y::Ptr) = UInt(x) - UInt(y)
-+(x::Ptr, y::Integer) = oftype(x, (UInt(x) + (y % UInt) % UInt))
--(x::Ptr, y::Integer) = oftype(x, (UInt(x) - (y % UInt) % UInt))
++(x::Ptr, y::Integer) = oftype(x, Intrinsics.add_ptr(UInt(x), (y % UInt) % UInt))
+-(x::Ptr, y::Integer) = oftype(x, Intrinsics.sub_ptr(UInt(x), (y % UInt) % UInt))
 +(x::Integer, y::Ptr) = y + x
 
 """
diff --git a/deps/llvm.mk b/deps/llvm.mk
index 7f44868aba359e..07b77b15298e16 100644
--- a/deps/llvm.mk
+++ b/deps/llvm.mk
@@ -460,6 +460,7 @@ $(eval $(call LLVM_PATCH,llvm-D32593))
$(eval $(call LLVM_PATCH,llvm-D33179))
$(eval $(call LLVM_PATCH,llvm-PR29010-i386-xmm)) # Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-3.9.0-D37576-NVPTX-sm_70)) # NVPTX, Remove for 6.0
+$(eval $(call LLVM_PATCH,llvm-D37939-Mem2Reg-Also-handle-memcpy))
 else ifeq ($(LLVM_VER_SHORT),4.0)
 # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
 $(eval $(call LLVM_PATCH,llvm-4.0.0_threads))
diff --git a/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch
new file mode 100644
index 00000000000000..b8753b0439ba0b
--- /dev/null
+++ b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch
@@ -0,0 +1,365 @@
+From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001
+From: Keno Fischer
+Date: Fri, 15 Sep 2017 18:30:59 -0400
+Subject: [PATCH] [Mem2Reg] Also handle memcpy
+
+Summary:
+In julia, when we know we're moving data between two memory locations,
+we always emit that as a memcpy rather than a load/store pair. However,
+this can give worse optimization results in certain cases because some
+optimizations that can handle load/store pairs cannot handle memcpys.
+Mem2reg is one of these optimizations. This patch adds rudimentary
+support for mem2reg for recognizing memcpys that cover the whole alloca
+we're promoting. While several more sophisticated passes (SROA, GVN)
+can get similar optimizations, it is preferable to have these kinds
+of cases caught early to expose optimization opportunities before
+getting to these later passes. The approach taken here is to split
+the memcpy into a load/store pair early (after legality analysis)
+and retain the rest of the analysis only on loads/stores. It would
+be possible of course to leave the memcpy as is and generate the
+leftover load or store only on demand. However, that would entail
+a significantly larger patch for unclear benefit.
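+
+As a minimal sketch of the transformation (mirroring the test_cpy_same
+case in the new test file below): given
+
+  %a = alloca i64
+  %b = alloca i64
+  store i64 %0, i64 *%a
+  call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
+  %loaded = load i64, i64 *%b
+
+the memcpy is split into the equivalent load/store pair
+
+  %val = load i64, i64 *%a
+  store i64 %val, i64 *%b
+
+after which the usual mem2reg analysis promotes both allocas and the
+function simply returns %0.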
+
+Reviewers: chandlerc, dberlin
+
+Subscribers: llvm-commits
+
+Differential Revision: https://reviews.llvm.org/D37939
+---
+ lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++---
+ test/Transforms/Mem2Reg/memcpy.ll                | 101 ++++++++++++++
+ 2 files changed, 251 insertions(+), 16 deletions(-)
+ create mode 100644 test/Transforms/Mem2Reg/memcpy.ll
+
+diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+index ac28f59..b08a0a1 100644
+--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
++++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
+ STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
+ STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+ 
++static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) {
++  // Punt if this alloca is an array allocation
++  if (AI->isArrayAllocation())
++    return false;
++  if (MCI->isVolatile())
++    return false;
++  Value *Length = MCI->getLength();
++  if (!isa<ConstantInt>(Length))
++    return false;
++  // Anything less than the full alloca, we leave for SROA
++  const DataLayout &DL = AI->getModule()->getDataLayout();
++  size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType());
++  if (cast<ConstantInt>(Length)->getZExtValue() != AIElSize)
++    return false;
++  // If the other argument is also an alloca, we need to be sure that either
++  // the types are bitcastable, or the other alloca is not eligible for
++  // promotion (e.g. because the memcpy is for less than the whole size of
++  // that alloca), otherwise we risk turning an allocatable alloca into a
++  // non-allocatable one when splitting the memcpy.
++  AllocaInst *OtherAI = dyn_cast<AllocaInst>(
++      AI == MCI->getSource() ? MCI->getDest() : MCI->getSource());
++  if (OtherAI) {
++    if (!CastInst::isBitCastable(AI->getAllocatedType(),
++                                 OtherAI->getAllocatedType()) &&
++        DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize)
++      return false;
++  }
++  return true;
++}
++
++/// Look at the result of a bitcast and see if it's only used by lifetime
++/// intrinsics or splittable memcpys. This is needed, because IRBuilder
++/// will always insert a bitcast to i8* for these intrinsics.
++static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) {
++  for (const User *U : V->users()) {
++    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
++    if (!II)
++      return false;
++
++    if (isa<MemCpyInst>(II)) {
++      if (!isSplittableMemCpy(cast<MemCpyInst>(II), AI))
++        return false;
++      continue;
++    }
++
++    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
++        II->getIntrinsicID() != Intrinsic::lifetime_end)
++      return false;
++  }
++  return true;
++}
++
+ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+   // FIXME: If the memory unit is of pointer or integer type, we can permit
+   // assignments to subsections of the memory unit.
+@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+       // not have any meaning for a local alloca.
+       if (SI->isVolatile())
+         return false;
++    } else if (const MemCpyInst *MCI = dyn_cast<MemCpyInst>(U)) {
++      if (!isSplittableMemCpy(MCI, AI))
++        return false;
+     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+       if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+           II->getIntrinsicID() != Intrinsic::lifetime_end)
+@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+         return false;
+-      if (!onlyUsedByLifetimeMarkers(BCI))
++      if (!onlyHasCanonicalizableUsers(AI, BCI))
+         return false;
+     } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+       if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+@@ -181,7 +235,13 @@ public:
+   /// This code only looks at accesses to allocas.
+   static bool isInterestingInstruction(const Instruction *I) {
++    if (isa<MemCpyInst>(I)) {
++      const MemCpyInst *MCI = cast<MemCpyInst>(I);
++      return isa<AllocaInst>(MCI->getSource()) ||
++             isa<AllocaInst>(MCI->getDest());
++    } else {
+     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+            (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
++    }
+   }
+ 
+   /// Get or calculate the index of the specified instruction.
+@@ -208,6 +264,25 @@ public:
+     return It->second;
+   }
+ 
++  // When we split a memcpy intrinsic, we need to update the numbering in this
++  // struct. To make sure the relative ordering remains the same, we give both
++  // the LI and the SI the number that the MCI used to have (if they are both
++  // interesting). This means that they will have equal numbers, which usually
++  // can't happen. However, since they can never reference the same alloca
++  // (since memcpy operands may not overlap), this is fine, because we will
++  // never compare instruction indices for instructions that operate on
++  // distinct allocas.
++  void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
++    DenseMap<const Instruction *, unsigned>::iterator It =
++        InstNumbers.find(MCI);
++    if (It == InstNumbers.end())
++      return;
++    unsigned MemCpyNumber = It->second;
++    InstNumbers[LI] = MemCpyNumber;
++    InstNumbers[SI] = MemCpyNumber;
++    deleteValue(MCI);
++  }
++
+   void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+ 
+   void clear() { InstNumbers.clear(); }
+@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+   AC->registerAssumption(CI);
+ }
+ 
+-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+-  // Knowing that this alloca is promotable, we know that it's safe to kill all
+-  // instructions except for load and store.
++/// Split a memcpy instruction into the corresponding load/store. It is a little
++/// more complicated than one might imagine, because we need to deal with the
++/// fact that the side of the copy we're not currently processing might also
++/// be a promotable alloca. We need to be careful to not break the promotable
++/// predicate for that other alloca (if any).
++static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI,
++                          AllocaInst *AI) {
++  AAMDNodes AA;
++  MCI->getAAMetadata(AA);
++  Value *MCISrc = MCI->getSource();
++  Type *LoadType = AI->getAllocatedType();
++  AllocaInst *SrcAI = dyn_cast<AllocaInst>(MCISrc);
++  if (SrcAI && SrcAI->getType() != AI->getType()) {
++    if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType))
++      LoadType = SrcAI->getAllocatedType();
++  }
++  if (cast<PointerType>(MCISrc->getType())->getElementType() != LoadType)
++    MCISrc = CastInst::Create(
++        Instruction::BitCast, MCISrc,
++        LoadType->getPointerTo(
++            cast<PointerType>(MCISrc->getType())->getAddressSpace()),
++        "", MCI);
++  // This might add to the end of the use list, but that's fine. At worst,
++  // we'd not visit the instructions we insert here, but we don't care
++  // about them in this loop anyway.
++  LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(),
++                              MCI->getAlignment(), MCI);
++  Value *Val = LI;
++  Value *MCIDest = MCI->getDest();
++  AllocaInst *DestAI = dyn_cast<AllocaInst>(MCIDest);
++  Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType();
++  if (LI->getType() != DestElTy &&
++      CastInst::isBitCastable(LI->getType(), DestElTy))
++    Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI);
++  if (cast<PointerType>(MCIDest->getType())->getElementType() != Val->getType())
++    MCIDest = CastInst::Create(
++        Instruction::BitCast, MCIDest,
++        Val->getType()->getPointerTo(
++            cast<PointerType>(MCIDest->getType())->getAddressSpace()),
++        "", MCI);
++  StoreInst *SI =
++      new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI);
++  LI->setAAMetadata(AA);
++  SI->setAAMetadata(AA);
++  LBI.splitMemCpy(MCI, LI, SI);
++  MCI->eraseFromParent();
++}
++
++static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
++  // Knowing that this alloca is promotable, we know that it's safe to split
++  // MTIs into load/store and to kill all other instructions except for
++  // load and store.
+ 
+   for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
+     Instruction *I = cast<Instruction>(*UI);
+@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+     if (isa<LoadInst>(I) || isa<StoreInst>(I))
+       continue;
+ 
++    if (isa<MemCpyInst>(I)) {
++      MemCpyInst *MCI = cast<MemCpyInst>(I);
++      doMemCpySplit(LBI, MCI, AI);
++      continue;
++    }
++
+     if (!I->getType()->isVoidTy()) {
+-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+-      // Follow the use/def chain to erase them now instead of leaving it for
+-      // dead code elimination later.
++      // The only users of this bitcast/GEP instruction are lifetime/memcpy
++      // intrinsics. Split memcpys and delete lifetime intrinsics.
+       for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
+         Instruction *Inst = cast<Instruction>(*UUI);
+         ++UUI;
+-        Inst->eraseFromParent();
++        if (isa<MemCpyInst>(Inst)) {
++          doMemCpySplit(LBI, cast<MemCpyInst>(Inst), AI);
++        } else {
++          // Must be a lifetime intrinsic
++          Inst->eraseFromParent();
++        }
+       }
+     }
+     I->eraseFromParent();
+@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() {
+     assert(AI->getParent()->getParent() == &F &&
+            "All allocas should be in the same function, which is same as DF!");
+ 
+-    removeLifetimeIntrinsicUsers(AI);
++    canonicalizeUsers(LBI, AI);
+ 
+     if (AI->use_empty()) {
+       // If there are no uses of the alloca, just delete it now.
+diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll
+new file mode 100644
+index 0000000..fbc4096
+--- /dev/null
++++ b/test/Transforms/Mem2Reg/memcpy.ll
+@@ -0,0 +1,101 @@
++; RUN: opt < %s -mem2reg -S | FileCheck %s
++
++target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
++
++declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1)
++
++define i128 @test_cpy_different(i64) {
++; CHECK-LABEL: @test_cpy_different
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++  %a = alloca i64
++  %b = alloca i128
++  store i128 0, i128 *%b
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load i128, i128 *%b
++  ret i128 %loaded
++}
++
++define i64 @test_cpy_same(i64) {
++; CHECK-LABEL: @test_cpy_same
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++  %a = alloca i64
++  %b = alloca i64
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%b
++  ret i64 %loaded
++}
++
++define double @test_cpy_different_type(i64) {
++; CHECK-LABEL: @test_cpy_different_type
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++  %a = alloca i64
++  %b = alloca double
++  store i64 %0, i64 *%a
++  call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0)
++  %loaded = load double, double *%b
++  ret double %loaded
++}
++
++define i128 @test_cpy_differenti8(i64) {
++; CHECK-LABEL: @test_cpy_differenti8
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++  %a = alloca i64
++  %b = alloca i128
++  store i128 0, i128 *%b
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i128* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load i128, i128 *%b
++  ret i128 %loaded
++}
++
++define i64 @test_cpy_samei8(i64) {
++; CHECK-LABEL: @test_cpy_samei8
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++  %a = alloca i64
++  %b = alloca i64
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i64* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%b
++  ret i64 %loaded
++}
++
++define double @test_cpy_different_typei8(i64) {
++; CHECK-LABEL: @test_cpy_different_typei8
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++  %a = alloca i64
++  %b = alloca double
++  store i64 %0, i64 *%a
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast double* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++  %loaded = load double, double *%b
++  ret double %loaded
++}
++
++define i64 @test_cpy_differenti8_reverse(i128) {
++; CHECK-LABEL: @test_cpy_differenti8_reverse
++; CHECK-NOT: alloca i64
++  %a = alloca i64
++  %b = alloca i128
++  store i128 %0, i128 *%b
++  %acast = bitcast i64* %a to i8*
++  %bcast = bitcast i128* %b to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0)
++  %loaded = load i64, i64 *%a
++  ret i64 %loaded
++}
+-- 
+2.9.3
+
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index af9f99826528a0..c6f2006974043f 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -235,7 +235,7 @@ static Value *emit_pointer_from_objref(jl_codectx_t &ctx, Value *V)
 #else
     Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
 #endif
-    return Call;
+    return ctx.builder.CreatePtrToInt(Call, T_size);
 }
 
 // --- emitting pointers directly into code ---
@@ -368,6 +368,12 @@ static Value *emit_bitcast(jl_codectx_t &ctx, Value *v, Type *jl_value)
     }
 }
 
+static Value *maybe_bitcast(jl_codectx_t &ctx, Value *V, Type *to) {
+    if (to != V->getType())
+        return emit_bitcast(ctx, V, to);
+    return V;
+}
+
 static Value *julia_binding_gv(jl_codectx_t &ctx, Value *bv)
 {
     Value *offset = ConstantInt::get(T_size, offsetof(jl_binding_t, value) / sizeof(size_t));
@@ -1250,8 +1256,8 @@ static void typed_store(jl_codectx_t &ctx,
     }
     else {
         data = ptr;
     }
-    Instruction *store = ctx.builder.CreateAlignedStore(r, ctx.builder.CreateGEP(data,
-        idx_0based), isboxed ? alignment : julia_alignment(jltype, alignment));
+    Instruction *store = ctx.builder.CreateAlignedStore(r, idx_0based ? ctx.builder.CreateGEP(data,
+        idx_0based) : data, isboxed ? alignment : julia_alignment(jltype, alignment));
     if (tbaa)
         tbaa_decorate(tbaa, store);
 }
@@ -1267,7 +1273,7 @@ static Value *julia_bool(jl_codectx_t &ctx, Value *cond)
 // --- accessing the representations of built-in data types ---
 
 static Constant *julia_const_to_llvm(jl_value_t *e);
-static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype = T_ppjlvalue)
+static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x)
 {
     Value *data = x.V;
     if (x.constant) {
@@ -1279,9 +1285,7 @@ static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype
             data = boxed(ctx, x);
         }
     }
-    if (astype && data->getType() != astype)
-        data = emit_bitcast(ctx, data, astype);
-    return decay_derived(data);
+    return data;
 }
 
 static void emit_memcpy_llvm(jl_codectx_t &ctx, Value *dst, Value *src,
@@ -1342,7 +1346,7 @@ static Value *get_value_ptr(jl_codectx_t&, Value *ptr)
 
 static Value *get_value_ptr(jl_codectx_t &ctx, const jl_cgval_t &v)
 {
-    return data_pointer(ctx, v, nullptr);
+    return data_pointer(ctx, v);
 }
 
 template<typename T1>
@@ -1372,7 +1376,9 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
             Value *fld = tbaa_decorate(strct.tbaa,
                 maybe_mark_load_dereferenceable(
                     ctx.builder.CreateLoad(
-                        ctx.builder.CreateBitCast(ctx.builder.CreateGEP(decay_derived(data_pointer(ctx, strct)), idx),
+                        ctx.builder.CreateBitCast(
+                            ctx.builder.CreateGEP(decay_derived(
+                                emit_bitcast(ctx, data_pointer(ctx, strct), T_pprjlvalue)), idx),
                             PointerType::get(T_prjlvalue, AddressSpace::Derived))),
                     maybe_null, minimum_field_size));
             if (maybe_null)
@@ -1384,11 +1390,11 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
         assert(nfields > 0); // nf == 0 trapped by all_pointers case
         jl_value_t *jt = jl_field_type(stt, 0);
         idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds);
-        Value *ptr = data_pointer(ctx, strct);
+        Value *ptr = decay_derived(data_pointer(ctx, strct));
         if (!stt->mutabl) {
             // just compute the pointer and let user load it when necessary
             Type *fty = julia_type_to_llvm(jt);
-            Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(ptr), PointerType::get(fty,0)), idx);
+            Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, ptr, PointerType::get(fty,0)), idx);
             *ret = mark_julia_slot(addr, jt, NULL, strct.tbaa);
             ret->isimmutable = strct.isimmutable;
             return true;
@@ -1441,28 +1447,34 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
         return ghostValue(jfty);
     Value *fldv = NULL;
     if (strct.ispointer()) {
-        Value *addr;
+        Value *addr = decay_derived(data_pointer(ctx, strct));
         bool isboxed;
         Type *lt = julia_type_to_llvm((jl_value_t*)jt, &isboxed);
         if (isboxed) {
-            Value *ptr = decay_derived(data_pointer(ctx, strct, T_pint8));
-            Value *llvm_idx = ConstantInt::get(T_size, jl_field_offset(jt, idx));
-            addr = ctx.builder.CreateGEP(ptr, llvm_idx);
+            size_t byte_offset = jl_field_offset(jt, idx);
+            // byte_offset == 0 is an important special case here, e.g.
+            // for single field wrapper types. Introducing the bitcast
+            // can pessimize mem2reg
+            if (byte_offset > 0) {
+                addr = ctx.builder.CreateGEP(
+                    emit_bitcast(ctx, addr, T_pint8),
+                    ConstantInt::get(T_size, byte_offset));
+            }
         }
         else {
             if (VectorType *vlt = dyn_cast<VectorType>(lt)) {
                 // doesn't have the struct wrapper, so this must have been a VecElement
                 // cast to the element type so that it can be addressed with GEP
                 lt = vlt->getElementType();
-                Value *ptr = data_pointer(ctx, strct, lt->getPointerTo());
+                Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo());
                 Value *llvm_idx = ConstantInt::get(T_size, idx);
                 addr = ctx.builder.CreateGEP(lt, ptr, llvm_idx);
             }
             else if (lt->isSingleValueType()) {
-                addr = data_pointer(ctx, strct, lt->getPointerTo());
+                addr = emit_bitcast(ctx, addr, lt->getPointerTo());
             }
             else {
-                Value *ptr = data_pointer(ctx, strct, lt->getPointerTo());
+                Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo());
                 addr = ctx.builder.CreateStructGEP(lt, ptr, idx);
             }
         }
@@ -1503,7 +1515,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
             fieldval.isimmutable = strct.isimmutable;
             return fieldval;
         }
-        return typed_load(ctx, addr, ConstantInt::get(T_size, 0), jfty, strct.tbaa, true, align);
+        return typed_load(ctx, addr, NULL, jfty, strct.tbaa, true, align);
     }
     else if (isa<UndefValue>(strct.V)) {
         return jl_cgval_t();
@@ -2152,13 +2164,15 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src
                 emit_unbox(ctx, store_ty, src, typ, dest, isVolatile);
             }
             else {
-                Value *src_ptr = data_pointer(ctx, src, T_pint8);
-                if (dest->getType() != T_pint8)
-                    dest = emit_bitcast(ctx, dest, T_pint8);
-                if (skip) // copy dest -> dest to simulate an undef value / conditional copy
-                    src_ptr = ctx.builder.CreateSelect(skip, dest, src_ptr);
+                Value *src_ptr = data_pointer(ctx, src);
+                unsigned nb = jl_datatype_size(typ);
                 unsigned alignment = julia_alignment(typ, 0);
-                emit_memcpy(ctx, dest, src_ptr, jl_datatype_size(typ), alignment, isVolatile, tbaa);
+                Value *nbytes = ConstantInt::get(T_size, nb);
+                if (skip) // copy dest -> dest to simulate an undef value / conditional copy
+                    nbytes = ctx.builder.CreateSelect(skip,
+                            ConstantInt::get(T_size, 0),
+                            nbytes);
+                emit_memcpy(ctx, dest, src_ptr, nbytes, alignment, isVolatile, tbaa);
             }
         }
     }
@@ -2166,9 +2180,8 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src
         Value *tindex = ctx.builder.CreateAnd(src.TIndex, ConstantInt::get(T_int8, 0x7f));
         if (skip)
             tindex = ctx.builder.CreateSelect(skip, ConstantInt::get(T_int8, 0), tindex);
-        Value *src_ptr = data_pointer(ctx, src, T_pint8);
-        if (dest->getType() != T_pint8)
-            dest = emit_bitcast(ctx, dest, T_pint8);
+        Value *src_ptr = maybe_bitcast(ctx, data_pointer(ctx, src), T_pint8);
+        dest = maybe_bitcast(ctx, dest, T_pint8);
         BasicBlock *defaultBB = BasicBlock::Create(jl_LLVMContext, "union_move_skip", ctx.f);
         SwitchInst *switchInst = ctx.builder.CreateSwitch(tindex, defaultBB);
         BasicBlock *postBB = BasicBlock::Create(jl_LLVMContext, "post_union_move", ctx.f);
@@ -2288,8 +2301,13 @@ static void emit_setfield(jl_codectx_t &ctx,
 {
     if (sty->mutabl || !checked) {
         assert(strct.ispointer());
-        Value *addr = ctx.builder.CreateGEP(data_pointer(ctx, strct, T_pint8),
-            ConstantInt::get(T_size, jl_field_offset(sty, idx0)));
+        size_t byte_offset = jl_field_offset(sty, idx0);
+        Value *addr = data_pointer(ctx, strct);
+        if (byte_offset > 0) {
+            addr = ctx.builder.CreateGEP(
+                emit_bitcast(ctx, decay_derived(addr), T_pint8),
+                ConstantInt::get(T_size, byte_offset));
+        }
         jl_value_t *jfty = jl_svecref(sty->types, idx0);
         if (jl_field_isptr(sty, idx0)) {
             Value *r = maybe_decay_untracked(boxed(ctx, rhs)); // don't need a temporary gcroot since it'll be rooted by strct
@@ -2306,7 +2324,7 @@ static void emit_setfield(jl_codectx_t &ctx,
                 return;
             Value *tindex = compute_tindex_unboxed(ctx, rhs_union, jfty);
             tindex = ctx.builder.CreateNUWSub(tindex, ConstantInt::get(T_int8, 1));
-            Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, addr, T_pint8), ConstantInt::get(T_size, fsz - 1));
+            Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, decay_derived(addr), T_pint8), ConstantInt::get(T_size, fsz - 1));
             ctx.builder.CreateStore(tindex, ptindex);
             // copy data
             if (!rhs.isghost) {
@@ -2315,8 +2333,9 @@ static void emit_setfield(jl_codectx_t &ctx,
         }
         else {
             unsigned align = jl_field_align(sty, idx0);
-            typed_store(ctx, addr, ConstantInt::get(T_size, 0), rhs, jfty,
-                strct.tbaa, data_pointer(ctx, strct, T_pjlvalue), align);
+            typed_store(ctx, addr, NULL, rhs, jfty,
+                strct.tbaa, maybe_bitcast(ctx,
+                data_pointer(ctx, strct), T_pjlvalue), align);
         }
     }
     else {
@@ -2416,12 +2435,13 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
             Value *strct = emit_allocobj(ctx, jl_datatype_size(sty),
                                          literal_pointer_val(ctx, (jl_value_t*)ty));
             jl_cgval_t strctinfo = mark_julia_type(ctx, strct, true, ty);
+            strct = decay_derived(strct);
             for (size_t i = 0; i < nf; i++) {
                 if (jl_field_isptr(sty, i)) {
                     tbaa_decorate(strctinfo.tbaa, ctx.builder.CreateStore(
                             ConstantPointerNull::get(cast<PointerType>(T_prjlvalue)),
                             emit_bitcast(ctx,
-                                ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(strct), T_pint8),
+                                ctx.builder.CreateGEP(emit_bitcast(ctx, strct, T_pint8),
                                     ConstantInt::get(T_size, jl_field_offset(sty, i))),
                                 T_pprjlvalue)));
                 }
diff --git a/src/codegen.cpp b/src/codegen.cpp
index e1a3f99e146d31..7f13e04fcec9dc 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -2135,16 +2135,16 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, const jl_cgval_t &arg1, const
         if (sz > 512 && !((jl_datatype_t*)arg1.typ)->layout->haspadding) {
             Value *answer = ctx.builder.CreateCall(prepare_call(memcmp_derived_func),
                         {
-                            data_pointer(ctx, arg1, T_pint8),
-                            data_pointer(ctx, arg2, T_pint8),
+                            maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), T_pint8),
+                            maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), T_pint8),
                             ConstantInt::get(T_size, sz)
                         });
             return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(T_int32, 0));
         }
         else {
             Type *atp = at->getPointerTo();
-            Value *varg1 = data_pointer(ctx, arg1, atp);
-            Value *varg2 = data_pointer(ctx, arg2, atp);
+            Value *varg1 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), atp);
+            Value *varg2 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), atp);
             jl_svec_t *types = ((jl_datatype_t*)arg1.typ)->types;
             Value *answer = ConstantInt::get(T_int1, 1);
             for (size_t i = 0, l = jl_svec_len(types); i < l; i++) {
@@ -2645,7 +2645,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                         emit_datatype_nfields(ctx, emit_typeof_boxed(ctx, obj)),
                         jl_true);
                 }
-                Value *ptr = data_pointer(ctx, obj);
+                Value *ptr = decay_derived(data_pointer(ctx, obj));
                 *ret = typed_load(ctx, ptr, vidx, jt, obj.tbaa, false);
                 return true;
             }
@@ -2836,7 +2836,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
             }
             else {
                 size_t offs = jl_field_offset(stt, fieldidx);
-                Value *ptr = data_pointer(ctx, obj, T_pint8);
+                Value *ptr = emit_bitcast(ctx, decay_derived(data_pointer(ctx, obj)), T_pint8);
                 Value *llvm_idx = ConstantInt::get(T_size, offs);
                 Value *addr = ctx.builder.CreateGEP(ptr, llvm_idx);
                 // emit this using the same type as emit_getfield_knownidx
@@ -2926,7 +2926,8 @@ static jl_cgval_t emit_call_function_object(jl_method_instance_t *li, jl_llvm_fu
             // can lazy load on demand, no copy needed
             assert(at == PointerType::get(et, AddressSpace::Derived));
             assert(arg.ispointer());
-            argvals[idx] = decay_derived(data_pointer(ctx, arg, at));
+            argvals[idx] = decay_derived(maybe_bitcast(ctx,
+                data_pointer(ctx, arg), at));
         }
         else {
             assert(at == et);
@@ -3433,9 +3434,15 @@ static void emit_vi_assignment_unboxed(jl_codectx_t &ctx, jl_varinfo_t &vi, Valu
                     tbaa = NULL;
                 if (vi.pTIndex == NULL) {
                     assert(jl_is_leaf_type(vi.value.typ));
-                    Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ));
-                    emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes,
-                                jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa);
+                    // Sometimes we can get into situations where the LHS and RHS
+                    // are the same slot. We're not allowed to memcpy in that case
+                    // under penalty of undefined behavior. This check should catch
+                    // the relevant situations.
+                    if (vi.value.V != rval_info.V) {
+                        Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ));
+                        emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes,
+                                    jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa);
+                    }
                 }
                 else {
                     emit_unionmove(ctx, vi.value.V, rval_info, isboxed, vi.isVolatile, tbaa);
@@ -4297,7 +4304,8 @@ static Function *gen_cfun_wrapper(jl_function_t *ff, jl_value_t *jlrettype, jl_t
                 }
                 else if (T->isAggregateType()) {
                     // aggregate types are passed by pointer
-                    arg = data_pointer(ctx, inputarg, T->getPointerTo());
+                    arg = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, inputarg)),
+                        T->getPointerTo());
                 }
                 else {
                     arg = emit_unbox(ctx, T, inputarg, spect);
@@ -6571,7 +6579,7 @@ static void init_julia_llvm_env(Module *m)
                                          "llvm.julia.gc_preserve_end");
     add_named_global(gc_preserve_end_func, (void*)NULL, /*dllimport*/false);
 
-    pointer_from_objref_func = Function::Create(FunctionType::get(T_size,
+    pointer_from_objref_func = Function::Create(FunctionType::get(T_pjlvalue,
                                          ArrayRef<Type*>(PointerType::get(T_jlvalue, AddressSpace::Derived)), false),
                                          Function::ExternalLinkage,
                                          "julia.pointer_from_objref");
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 425941888d77b2..0dc7c5319738d0 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -269,6 +269,37 @@ static Constant *julia_const_to_llvm(jl_value_t *e)
 
 static jl_cgval_t ghostValue(jl_value_t *ty);
 
+static Value *emit_unboxed_coercion(jl_codectx_t &ctx, Type *to, Value *unboxed)
+{
+    Type *ty = unboxed->getType();
+    assert(ty != T_void);
+    bool frompointer = ty->isPointerTy();
+    bool topointer = to->isPointerTy();
+    if (frompointer && topointer) {
+        unboxed = emit_bitcast(ctx, unboxed, to);
+    }
+    else if (frompointer) {
+        Type *INTT_to = INTT(to);
+        unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to);
+        if (INTT_to != to)
+            unboxed = ctx.builder.CreateBitCast(unboxed, to);
+    }
+    else if (topointer) {
+        Type *INTT_to = INTT(to);
+        if (to != INTT_to)
+            unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to);
+        unboxed = ctx.builder.CreateIntToPtr(unboxed, to);
+    }
+    else if (ty == T_int1 && to == T_int8) {
+        // bools may be stored internally as int8
+        unboxed = ctx.builder.CreateZExt(unboxed, T_int8);
+    }
+    else if (ty != to) {
+        unboxed = ctx.builder.CreateBitCast(unboxed, to);
+    }
+    return unboxed;
+}
+
 // emit code to unpack a raw value from a box into registers or a stack slot
 static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_value_t *jt, Value *dest, bool volatile_store)
 {
@@ -287,33 +318,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
     Constant *c = x.constant ? julia_const_to_llvm(x.constant) : NULL;
     if (!x.ispointer() || c) { // already unboxed, but sometimes need conversion
-        Value *unboxed = c ? c : x.V;
-        Type *ty = unboxed->getType();
-        assert(ty != T_void);
-        bool frompointer = ty->isPointerTy();
-        bool topointer = to->isPointerTy();
-        if (frompointer && topointer) {
-            unboxed = emit_bitcast(ctx, unboxed, to);
-        }
-        else if (frompointer) {
-            Type *INTT_to = INTT(to);
-            unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to);
-            if (INTT_to != to)
-                unboxed = ctx.builder.CreateBitCast(unboxed, to);
-        }
-        else if (topointer) {
-            Type *INTT_to = INTT(to);
-            if (to != INTT_to)
-                unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to);
-            unboxed = ctx.builder.CreateIntToPtr(unboxed, to);
-        }
-        else if (ty == T_int1 && to == T_int8) {
-            // bools may be stored internally as int8
-            unboxed = ctx.builder.CreateZExt(unboxed, T_int8);
-        }
-        else if (ty != to) {
-            unboxed = ctx.builder.CreateBitCast(unboxed, to);
-        }
+        Value *unboxed = emit_unboxed_coercion(ctx, to, c ? c : x.V);
         if (!dest)
             return unboxed;
         Type *dest_ty = unboxed->getType()->getPointerTo();
@@ -326,14 +331,12 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
     // bools stored as int8, so an extra Trunc is needed to get an int1
     Value *p = x.constant ? literal_pointer_val(ctx, x.constant) : x.V;
     Type *ptype = (to == T_int1 ? T_pint8 : to->getPointerTo());
-    if (p->getType() != ptype)
-        p = emit_bitcast(ctx, p, ptype);
 
     Value *unboxed = NULL;
     if (to == T_int1)
-        unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1);
+        unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1);
     else if (jt == (jl_value_t*)jl_bool_type)
-        unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1), to);
+        unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1), to);
     if (unboxed) {
         if (!dest)
             return unboxed;
@@ -354,6 +357,27 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
         return NULL;
     }
     else {
+        if (p->getType() != ptype && isa<AllocaInst>(p)) {
+            // LLVM's mem2reg can't handle coercion if the load/store type does
+            // not match the type of the alloca. As such, it is better to
+            // perform the load using the alloca's type and then perform the
+            // appropriate coercion manually.
+            AllocaInst *AI = cast<AllocaInst>(p);
+            Type *AllocType = AI->getAllocatedType();
+#if JL_LLVM_VERSION >= 40000
+            const DataLayout &DL = jl_data_layout;
+#else
+            const DataLayout &DL = jl_ExecutionEngine->getDataLayout();
+#endif
+            if (!AI->isArrayAllocation() &&
+                (AllocType->isFloatingPointTy() || AllocType->isIntegerTy() || AllocType->isPointerTy()) &&
+                (to->isFloatingPointTy() || to->isIntegerTy() || to->isPointerTy()) &&
+                DL.getTypeSizeInBits(AllocType) == DL.getTypeSizeInBits(to)) {
+                Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment);
+                return emit_unboxed_coercion(ctx, to, tbaa_decorate(x.tbaa, load));
+            }
+        }
+        p = maybe_bitcast(ctx, p, ptype);
         Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment);
         return tbaa_decorate(x.tbaa, load);
     }
@@ -439,7 +463,8 @@ static jl_cgval_t generic_bitcast(jl_codectx_t &ctx, const jl_cgval_t *argv)
         if (isboxed)
             vxt = llvmt;
         vx = tbaa_decorate(v.tbaa, ctx.builder.CreateLoad(
-            data_pointer(ctx, v, vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
+            emit_bitcast(ctx, data_pointer(ctx, v),
+                vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
     }
 
     vxt = vx->getType();
@@ -899,6 +924,26 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg
     case srem_int: return ctx.builder.CreateSRem(x, y);
     case urem_int: return ctx.builder.CreateURem(x, y);
 
+    // LLVM will not fold ptrtoint+arithmetic+inttoptr to GEP. The reason for this
+    // has to do with alias analysis. When adding two integers, either one of them
+    // could be the pointer base. With getelementptr, it is clear which of the
+    // operands is the pointer base. We also have this information at the julia
+    // level. Thus, to not lose information, we need to have a separate intrinsic
+    // for pointer arithmetic which lowers to getelementptr.
+    case add_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), y), t);
+
+    }
+
+    case sub_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), ctx.builder.CreateNeg(y)), t);
+
+    }
+
     // Implements IEEE negate. See issue #7868
     case neg_float: return math_builder(ctx)().CreateFSub(ConstantFP::get(t, -0.0), x);
     case neg_float_fast: return math_builder(ctx, true)().CreateFNeg(x);
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 80491639ac6b8a..0f04fe418c4e67 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -12,6 +12,8 @@
     ADD_I(udiv_int, 2) \
     ADD_I(srem_int, 2) \
     ADD_I(urem_int, 2) \
+    ADD_I(add_ptr, 2) \
+    ADD_I(sub_ptr, 2) \
     ADD_I(neg_float, 1) \
     ADD_I(add_float, 2) \
     ADD_I(sub_float, 2) \
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 73a270141417d2..7218e53cb180c1 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -210,6 +210,9 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level)
     PM->add(createSimpleLoopUnrollPass());     // Unroll small loops
     //PM->add(createLoopStrengthReducePass()); // (jwb added)
 
+    // Re-run SROA after loop unrolling (useful for small loops that operate
+    // over the structure of an aggregate)
+    PM->add(createSROAPass());                 // Break up aggregate allocas
     PM->add(createInstructionCombiningPass()); // Clean up after the unroller
     PM->add(createGVNPass());                  // Remove redundancies
     PM->add(createMemCpyOptPass());            // Remove memcpy / form memset
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 4a587616da6cf0..9940934e9ef0cd 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -761,6 +761,9 @@ JL_DLLEXPORT jl_value_t *jl_udiv_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_srem_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_urem_int(jl_value_t *a, jl_value_t *b);
 
+JL_DLLEXPORT jl_value_t *jl_add_ptr(jl_value_t *a, jl_value_t *b);
+JL_DLLEXPORT jl_value_t *jl_sub_ptr(jl_value_t *a, jl_value_t *b);
+
 JL_DLLEXPORT jl_value_t *jl_neg_float(jl_value_t *a);
 JL_DLLEXPORT jl_value_t *jl_add_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_sub_float(jl_value_t *a, jl_value_t *b);
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index 4a67c39f841aad..216a37eb64bdd9 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -592,7 +592,6 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst,
         }
         else if (auto call = dyn_cast<CallInst>(user)) {
             if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) {
-                new_i = new PtrToIntInst(new_i, T_size, "", call);
                 call->replaceAllUsesWith(new_i);
                 call->eraseFromParent();
                 return;
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 8acf06496db0f7..ccb660d966a43c 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -1207,9 +1207,8 @@ bool LateLowerGCFrame::CleanupIR(Function &F) {
         } else if (pointer_from_objref_func != nullptr && callee == pointer_from_objref_func) {
             auto *obj = CI->getOperand(0);
             auto *ASCI = new AddrSpaceCastInst(obj, T_pjlvalue, "", CI);
-            auto *ptr = new PtrToIntInst(ASCI, CI->getType(), "", CI);
-            ptr->takeName(CI);
-            CI->replaceAllUsesWith(ptr);
+            ASCI->takeName(CI);
+            CI->replaceAllUsesWith(ASCI);
         } else if (alloc_obj_func && callee == alloc_obj_func) {
             assert(CI->getNumArgOperands() == 3);
             auto sz = (size_t)cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index a768b9dec3501d..fecff170ba07a6 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -703,8 +703,10 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c)
 un_iintrinsic_fast(LLVMNeg, neg, neg_int, u)
 #define add(a,b) a + b
 bi_iintrinsic_fast(LLVMAdd, add, add_int, u)
+bi_iintrinsic_fast(LLVMAdd, add, add_ptr, u)
 #define sub(a,b) a - b
 bi_iintrinsic_fast(LLVMSub, sub, sub_int, u)
+bi_iintrinsic_fast(LLVMSub, sub, sub_ptr, u)
 #define mul(a,b) a * b
 bi_iintrinsic_fast(LLVMMul, mul, mul_int, u)
 #define div(a,b) a / b
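
For illustration of the add_ptr lowering in emit_untyped_intrinsic above
(a sketch, assuming a 64-bit target where T_size is i64): a Julia-level
ptr + offset now reaches LLVM as

  %1 = inttoptr i64 %ptr to i8*
  %2 = getelementptr i8, i8* %1, i64 %offset
  %3 = ptrtoint i8* %2 to i64

rather than a plain integer add, so alias analysis can identify %ptr as
the pointer base; sub_ptr is identical except that the offset is negated.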