forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add LLVM patch D14260 to improve SIMD code
Arch Robison proposed the patch <http://reviews.llvm.org/D14260> "Optimize store of "bitcast" from vector to aggregate" for LLVM. This patch applies cleanly to LLVM 3.7.1. It seems to be the last missing puzzle piece on the LLVM side to allow generating efficient SIMD instructions via `llvm_call` in Julia. For an example package, see e.g. <https://github.com/eschnett/SIMD.jl>. Some discussion relevant to this PR are in JuliaLang#5355. @ArchRobison, please comment. Julia stores tuples as LLVM arrays, whereas LLVM SIMD instructions require LLVM vectors. The respective conversions are unfortunately not always optimized out unless the patch above is applied, leading to a cumbersome sequence of instructions to disassemble and reassemble a SIMD vector. An example is given here <eschnett/SIMD.jl#1 (comment)>. Without this patch, the loop kernel looks like (x86-64, AVX2 instructions): ``` vunpcklpd %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0] vunpcklpd %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0] vinsertf128 $1, %xmm3, %ymm1, %ymm1 vmovupd 8(%rcx), %xmm2 vinsertf128 $1, 24(%rcx), %ymm2, %ymm2 vaddpd %ymm2, %ymm1, %ymm1 vpermilpd $1, %xmm1, %xmm2 # xmm2 = xmm1[1,0] vextractf128 $1, %ymm1, %xmm3 vpermilpd $1, %xmm3, %xmm4 # xmm4 = xmm3[1,0] Source line: 62 vaddsd (%rcx), %xmm0, %xmm0 ``` Note that the SIMD vector is kept in register `%ymm1`, but is unnecessarily scalarized into registers `%xmm{0,1,2,3}` at the end of the kernel, and re-assembled in the beginning. With this patch, the loop kernel looks like: ``` L192: vaddpd (%rdx), %ymm1, %ymm1 Source line: 62 addq %rsi, %rdx addq %rcx, %rdi jne L192 ``` which is perfect.
- Loading branch information
Showing
2 changed files
with
145 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | ||
=================================================================== | ||
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | ||
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | ||
@@ -879,6 +879,61 @@ | ||
return nullptr; | ||
} | ||
|
||
+/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast. | ||
+/// | ||
+/// \returns underlying value that was "cast", or nullptr otherwise. | ||
+/// | ||
+/// For example, if we have: | ||
+/// | ||
+/// %E0 = extractelement <2 x double> %U, i32 0 | ||
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0 | ||
+/// %E1 = extractelement <2 x double> %U, i32 1 | ||
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1 | ||
+/// | ||
+/// and the layout of a <2 x double> is isomorphic to a [2 x double], | ||
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U. | ||
+/// Note that %U may contain non-undef values where %V1 has undef. | ||
+static Value* likeBitCastFromVector(InstCombiner &IC, Value* V) { | ||
+ Value *U = nullptr; | ||
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) { | ||
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand()); | ||
+ if (!E) | ||
+ return nullptr; | ||
+ auto *W = E->getVectorOperand(); | ||
+ if (!U) | ||
+ U = W; | ||
+ else if (U != W) | ||
+ return nullptr; | ||
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand()); | ||
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin()) | ||
+ return nullptr; | ||
+ V = IV->getAggregateOperand(); | ||
+ } | ||
+ if (!isa<UndefValue>(V) ||!U) | ||
+ return nullptr; | ||
+ | ||
+ VectorType *UT = cast<VectorType>(U->getType()); | ||
+ Type *VT = V->getType(); | ||
+ // Check that types UT and VT are bitwise isomorphic. | ||
+ const DataLayout &DL = IC.getDataLayout(); | ||
+ if (DL.getTypeSizeInBits(UT) != DL.getTypeSizeInBits(VT)) { | ||
+ return nullptr; | ||
+ } | ||
+ if (ArrayType *AT = dyn_cast<ArrayType>(VT)) { | ||
+ if (AT->getNumElements() != UT->getNumElements()) | ||
+ return nullptr; | ||
+ } else { | ||
+ StructType *ST = cast<StructType>(VT); | ||
+ if (ST->getNumElements() != UT->getNumElements()) | ||
+ return nullptr; | ||
+ for (const Type *EltT : ST->elements()) { | ||
+ if (EltT != UT->getElementType()) | ||
+ return nullptr; | ||
+ } | ||
+ } | ||
+ return U; | ||
+} | ||
+ | ||
/// \brief Combine stores to match the type of value being stored. | ||
/// | ||
/// The core idea here is that the memory does not have any intrinsic type and | ||
@@ -914,6 +969,11 @@ | ||
return true; | ||
} | ||
|
||
+ if (Value *U = likeBitCastFromVector(IC, V)) { | ||
+ combineStoreToNewValue(IC, SI, U); | ||
+ return true; | ||
+ } | ||
+ | ||
// FIXME: We should also canonicalize loads of vectors when their elements are | ||
// cast to other types. | ||
return false; | ||
Index: test/Transforms/InstCombine/insert-val-extract-elem.ll | ||
=================================================================== | ||
--- test/Transforms/InstCombine/insert-val-extract-elem.ll | ||
+++ test/Transforms/InstCombine/insert-val-extract-elem.ll | ||
@@ -0,0 +1,53 @@ | ||
+; RUN: opt -S -instcombine %s | FileCheck %s | ||
+ | ||
+; CHECK-NOT: insertvalue | ||
+; CHECK-NOT: extractelement | ||
+; CHECK: store <2 x double> | ||
+define void @julia_2xdouble([2 x double]* sret, <2 x double>*) { | ||
+top: | ||
+ %x = load <2 x double>, <2 x double>* %1 | ||
+ %x0 = extractelement <2 x double> %x, i32 0 | ||
+ %i0 = insertvalue [2 x double] undef, double %x0, 0 | ||
+ %x1 = extractelement <2 x double> %x, i32 1 | ||
+ %i1 = insertvalue [2 x double] %i0, double %x1, 1 | ||
+ store [2 x double] %i1, [2 x double]* %0, align 4 | ||
+ ret void | ||
+} | ||
+ | ||
+; CHECK-NOT: insertvalue | ||
+; CHECK-NOT: extractelement | ||
+; CHECK: store <4 x float> | ||
+define void @julia_4xfloat([4 x float]* sret, <4 x float>*) { | ||
+top: | ||
+ %x = load <4 x float>, <4 x float>* %1 | ||
+ %x0 = extractelement <4 x float> %x, i32 0 | ||
+ %i0 = insertvalue [4 x float] undef, float %x0, 0 | ||
+ %x1 = extractelement <4 x float> %x, i32 1 | ||
+ %i1 = insertvalue [4 x float] %i0, float %x1, 1 | ||
+ %x2 = extractelement <4 x float> %x, i32 2 | ||
+ %i2 = insertvalue [4 x float] %i1, float %x2, 2 | ||
+ %x3 = extractelement <4 x float> %x, i32 3 | ||
+ %i3 = insertvalue [4 x float] %i2, float %x3, 3 | ||
+ store [4 x float] %i3, [4 x float]* %0, align 4 | ||
+ ret void | ||
+} | ||
+ | ||
+%pseudovec = type { float, float, float, float } | ||
+ | ||
+; CHECK-NOT: insertvalue | ||
+; CHECK-NOT: extractelement | ||
+; CHECK: store <4 x float> | ||
+define void @julia_pseudovec(%pseudovec* sret, <4 x float>*) { | ||
+top: | ||
+ %x = load <4 x float>, <4 x float>* %1 | ||
+ %x0 = extractelement <4 x float> %x, i32 0 | ||
+ %i0 = insertvalue %pseudovec undef, float %x0, 0 | ||
+ %x1 = extractelement <4 x float> %x, i32 1 | ||
+ %i1 = insertvalue %pseudovec %i0, float %x1, 1 | ||
+ %x2 = extractelement <4 x float> %x, i32 2 | ||
+ %i2 = insertvalue %pseudovec %i1, float %x2, 2 | ||
+ %x3 = extractelement <4 x float> %x, i32 3 | ||
+ %i3 = insertvalue %pseudovec %i2, float %x3, 3 | ||
+ store %pseudovec %i3, %pseudovec* %0, align 4 | ||
+ ret void | ||
+} |