From 9c484617807a339e8dc09d5da0f3add30d9b7dd6 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 31 Oct 2018 15:40:15 +0000 Subject: [PATCH] Revert "Merge pull request #404 from stuartarchibald/wip/llvm7" This reverts commit ebc4c596ae74d5667790948c56fea41fa1d11885, reversing changes made to a52ca0d05add8677d1b5be7eb1ba7c1e5bab5cc3. --- README.rst | 3 +- .../incremental/setup_conda_environment.cmd | 2 +- .../incremental/setup_conda_environment.sh | 2 +- ...issing-header-for-InstructionCombini.patch | 31 + conda-recipes/D47188-svml-VF.patch | 1936 ----------------- conda-recipes/D47188-svml.patch | 821 +++++++ conda-recipes/llvmdev/meta.yaml | 27 +- conda-recipes/llvmdev_manylinux1/meta.yaml | 13 +- conda-recipes/llvmlite/meta.yaml | 2 +- docs/source/admin-guide/install.rst | 6 +- docs/source/conf.py | 6 +- docs/source/user-guide/ir/index.rst | 2 +- ffi/build.py | 4 +- llvmlite/tests/test_binding.py | 28 +- 14 files changed, 895 insertions(+), 1988 deletions(-) create mode 100644 conda-recipes/0001-Transforms-Add-missing-header-for-InstructionCombini.patch delete mode 100644 conda-recipes/D47188-svml-VF.patch create mode 100644 conda-recipes/D47188-svml.patch diff --git a/README.rst b/README.rst index 4ebf9b731..88afc4513 100644 --- a/README.rst +++ b/README.rst @@ -70,8 +70,7 @@ Historical compatibility table: ================= ======================== llvmlite versions compatible LLVM versions ================= ======================== -0.26.0 - ... 7.0.x -0.23.0 - 0.25.0 6.0.x +0.23.0 - ... 6.0.x 0.21.0 - 0.22.0 5.0.x 0.17.0 - 0.20.0 4.0.x 0.16.0 - 0.17.0 3.9.x diff --git a/buildscripts/incremental/setup_conda_environment.cmd b/buildscripts/incremental/setup_conda_environment.cmd index 9fac4c949..0a62a63b1 100644 --- a/buildscripts/incremental/setup_conda_environment.cmd +++ b/buildscripts/incremental/setup_conda_environment.cmd @@ -17,6 +17,6 @@ conda create -n %CONDA_ENV% -q -y python=%PYTHON% cmake call activate %CONDA_ENV% @rem Install llvmdev -%CONDA_INSTALL% -c numba llvmdev="7.0*" +%CONDA_INSTALL% -c numba llvmdev="6.0*" @rem Install enum34 for Python < 3.4 if %PYTHON% LSS 3.4 (%CONDA_INSTALL% enum34) diff --git a/buildscripts/incremental/setup_conda_environment.sh b/buildscripts/incremental/setup_conda_environment.sh index d933b479c..ddfca7ed8 100755 --- a/buildscripts/incremental/setup_conda_environment.sh +++ b/buildscripts/incremental/setup_conda_environment.sh @@ -25,7 +25,7 @@ source activate $CONDA_ENV set -v # Install llvmdev (separate channel, for now) -$CONDA_INSTALL -c numba llvmdev="7.0*" +$CONDA_INSTALL -c numba llvmdev="6.0*" # Install the compiler toolchain, for osx, bootstrapping needed # which happens in build.sh diff --git a/conda-recipes/0001-Transforms-Add-missing-header-for-InstructionCombini.patch b/conda-recipes/0001-Transforms-Add-missing-header-for-InstructionCombini.patch new file mode 100644 index 000000000..ec5a20983 --- /dev/null +++ b/conda-recipes/0001-Transforms-Add-missing-header-for-InstructionCombini.patch @@ -0,0 +1,31 @@ +From 7c9054610e354340f9474dcd13a927f929912d1d Mon Sep 17 00:00:00 2001 +From: Eugene Zelenko +Date: Tue, 6 Mar 2018 23:06:13 +0000 +Subject: [PATCH] [Transforms] Add missing header for InstructionCombining.cpp, + in order to export LLVMInitializeInstCombine as extern "C". Fixes PR35947. + +Patch by Brenton Bostick. 
+ +Differential revision: https://reviews.llvm.org/D44140 + + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326843 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Transforms/InstCombine/InstructionCombining.cpp | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp +index a3b2fe9..7ec7343 100644 +--- a/lib/Transforms/InstCombine/InstructionCombining.cpp ++++ b/lib/Transforms/InstCombine/InstructionCombining.cpp +@@ -34,6 +34,7 @@ + //===----------------------------------------------------------------------===// + + #include "InstCombineInternal.h" ++#include "llvm-c/Initialization.h" + #include "llvm/ADT/APInt.h" + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/DenseMap.h" +-- +1.8.3.1 + diff --git a/conda-recipes/D47188-svml-VF.patch b/conda-recipes/D47188-svml-VF.patch deleted file mode 100644 index 04f337865..000000000 --- a/conda-recipes/D47188-svml-VF.patch +++ /dev/null @@ -1,1936 +0,0 @@ -From bcfc1167bf4dafc776f18296b06b2048372d3cb1 Mon Sep 17 00:00:00 2001 -From: Anton Malakhov -Date: Tue, 25 Sep 2018 11:24:55 -0500 -Subject: [PATCH] Fixes vectorizer and extends SVML support - -This patch is created on top of LLVM 7.0.0 collecting several fixes: - -1. https://reviews.llvm.org/D47188 patch fixes the problem with improper calls -to SVML library as it has non-standard calling conventions. So accordingly it -has SVML calling conventions definitions and code to set CC to the vectorized -calls. As SVML provides several implementations for the math functions we also -took into consideration fast attribute and select more fast implementation in -such case. This work is based on original Matt Masten's work. -Author: Denis Nagorny - -2. implements support to legalize SVML calls by breaking down the illegal -vector call instruction into multiple legal vector call instructions during -code generation. Currently the vectorizer does not check legality of the -generated SVML (or any VECLIB) call instructions, and this -can lead to potential problems even during vector type -legalization. This patch addresses this issue by adding -a legality check during code generation and replaces the -illegal SVML call with corresponding legalized instructions. -(RFC: http://lists.llvm.org/pipermail/llvm-dev/2018-June/124357.html) -Author: Karthik Senthil - -3. Functional merge of the patches above which fixes calling convention - - -diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h -index a3fe8340..2b93099e 100644 ---- a/include/llvm/Analysis/TargetLibraryInfo.h -+++ b/include/llvm/Analysis/TargetLibraryInfo.h -@@ -38,6 +38,12 @@ struct VecDesc { - NumLibFuncs - }; - -+enum SVMLAccuracy { -+ SVML_DEFAULT, -+ SVML_HA, -+ SVML_EP -+}; -+ - /// Implementation of the target library information. - /// - /// This class constructs tables that hold the target library information and -@@ -150,7 +156,8 @@ public: - /// Return true if the function F has a vector equivalent with vectorization - /// factor VF. - bool isFunctionVectorizable(StringRef F, unsigned VF) const { -- return !getVectorizedFunction(F, VF).empty(); -+ bool Ignored; -+ return !getVectorizedFunction(F, VF, Ignored, false).empty(); - } - - /// Return true if the function F has a vector equivalent with any -@@ -159,7 +166,8 @@ public: - - /// Return the name of the equivalent of F, vectorized with factor VF. If no - /// such mapping exists, return the empty string. 
-- StringRef getVectorizedFunction(StringRef F, unsigned VF) const; -+ std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML, -+ bool IsFast) const; - - /// Return true if the function F has a scalar equivalent, and set VF to be - /// the vectorization factor. -@@ -253,8 +261,9 @@ public: - bool isFunctionVectorizable(StringRef F) const { - return Impl->isFunctionVectorizable(F); - } -- StringRef getVectorizedFunction(StringRef F, unsigned VF) const { -- return Impl->getVectorizedFunction(F, VF); -+ std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML, -+ bool IsFast) const { -+ return Impl->getVectorizedFunction(F, VF, FromSVML, IsFast); - } - - /// Tests if the function is both available and a candidate for optimized code -diff --git a/include/llvm/IR/CMakeLists.txt b/include/llvm/IR/CMakeLists.txt -index 830f3750..dfe25b6d 100644 ---- a/include/llvm/IR/CMakeLists.txt -+++ b/include/llvm/IR/CMakeLists.txt -@@ -5,3 +5,7 @@ set(LLVM_TARGET_DEFINITIONS Intrinsics.td) - tablegen(LLVM IntrinsicEnums.inc -gen-intrinsic-enums) - tablegen(LLVM IntrinsicImpl.inc -gen-intrinsic-impl) - add_public_tablegen_target(intrinsics_gen) -+ -+set(LLVM_TARGET_DEFINITIONS SVML.td) -+tablegen(LLVM SVML.inc -gen-svml) -+add_public_tablegen_target(svml_gen) -diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h -index b9c02d7e..1ec5c9b6 100644 ---- a/include/llvm/IR/CallingConv.h -+++ b/include/llvm/IR/CallingConv.h -@@ -220,6 +220,9 @@ namespace CallingConv { - /// shader if tessellation is in use, or otherwise the vertex shader. - AMDGPU_ES = 96, - -+ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library -+ Intel_SVML = 97, -+ - /// The highest possible calling convention ID. Must be some 2^k - 1. - MaxID = 1023 - }; -diff --git a/include/llvm/IR/SVML.td b/include/llvm/IR/SVML.td -new file mode 100644 -index 00000000..5af71040 ---- /dev/null -+++ b/include/llvm/IR/SVML.td -@@ -0,0 +1,62 @@ -+//===-- Intel_SVML.td - Defines SVML call variants ---------*- tablegen -*-===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+// This file is used by TableGen to define the different typs of SVML function -+// variants used with -fveclib=SVML. -+// -+//===----------------------------------------------------------------------===// -+ -+class SvmlVariant; -+ -+def sin : SvmlVariant; -+def cos : SvmlVariant; -+def pow : SvmlVariant; -+def exp : SvmlVariant; -+def log : SvmlVariant; -+def acos : SvmlVariant; -+def acosh : SvmlVariant; -+def asin : SvmlVariant; -+def asinh : SvmlVariant; -+def atan2 : SvmlVariant; -+def atan : SvmlVariant; -+def atanh : SvmlVariant; -+def cbrt : SvmlVariant; -+def cdfnorm : SvmlVariant; -+def cdfnorminv : SvmlVariant; -+def cosd : SvmlVariant; -+def cosh : SvmlVariant; -+def erf : SvmlVariant; -+def erfc : SvmlVariant; -+def erfcinv : SvmlVariant; -+def erfinv : SvmlVariant; -+def exp10 : SvmlVariant; -+def exp2 : SvmlVariant; -+def expm1 : SvmlVariant; -+def hypot : SvmlVariant; -+def invsqrt : SvmlVariant; -+def log10 : SvmlVariant; -+def log1p : SvmlVariant; -+def log2 : SvmlVariant; -+def sind : SvmlVariant; -+def sinh : SvmlVariant; -+def sqrt : SvmlVariant; -+def tan : SvmlVariant; -+def tanh : SvmlVariant; -+ -+// TODO: SVML does not currently provide _ha and _ep variants of these fucnctions. 
-+// We should call the default variant of these functions in all cases instead. -+ -+// def nearbyint : SvmlVariant; -+// def logb : SvmlVariant; -+// def floor : SvmlVariant; -+// def fmod : SvmlVariant; -+// def ceil : SvmlVariant; -+// def trunc : SvmlVariant; -+// def rint : SvmlVariant; -+// def round : SvmlVariant; -diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt -index 8e8535ab..5a434bc9 100644 ---- a/lib/Analysis/CMakeLists.txt -+++ b/lib/Analysis/CMakeLists.txt -@@ -93,4 +93,5 @@ add_llvm_library(LLVMAnalysis - - DEPENDS - intrinsics_gen -+ svml_gen - ) -diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp -index 102135fb..aec30e35 100644 ---- a/lib/Analysis/TargetLibraryInfo.cpp -+++ b/lib/Analysis/TargetLibraryInfo.cpp -@@ -50,6 +50,11 @@ static bool hasSinCosPiStret(const Triple &T) { - return true; - } - -+std::string svmlMangle(StringRef FnName, const bool IsFast) { -+ std::string FullName = FnName; -+ return IsFast ? FullName : FullName + "_ha"; -+} -+ - /// Initialize the set of available library functions based on the specified - /// target triple. This should be carefully written so that a missing target - /// triple gets a sane set of defaults. -@@ -1452,109 +1457,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( - } - case SVML: { - const VecDesc VecFuncs[] = { -- {"sin", "__svml_sin2", 2}, -- {"sin", "__svml_sin4", 4}, -- {"sin", "__svml_sin8", 8}, -- -- {"sinf", "__svml_sinf4", 4}, -- {"sinf", "__svml_sinf8", 8}, -- {"sinf", "__svml_sinf16", 16}, -- -- {"llvm.sin.f64", "__svml_sin2", 2}, -- {"llvm.sin.f64", "__svml_sin4", 4}, -- {"llvm.sin.f64", "__svml_sin8", 8}, -- -- {"llvm.sin.f32", "__svml_sinf4", 4}, -- {"llvm.sin.f32", "__svml_sinf8", 8}, -- {"llvm.sin.f32", "__svml_sinf16", 16}, -- -- {"cos", "__svml_cos2", 2}, -- {"cos", "__svml_cos4", 4}, -- {"cos", "__svml_cos8", 8}, -- -- {"cosf", "__svml_cosf4", 4}, -- {"cosf", "__svml_cosf8", 8}, -- {"cosf", "__svml_cosf16", 16}, -- -- {"llvm.cos.f64", "__svml_cos2", 2}, -- {"llvm.cos.f64", "__svml_cos4", 4}, -- {"llvm.cos.f64", "__svml_cos8", 8}, -- -- {"llvm.cos.f32", "__svml_cosf4", 4}, -- {"llvm.cos.f32", "__svml_cosf8", 8}, -- {"llvm.cos.f32", "__svml_cosf16", 16}, -- -- {"pow", "__svml_pow2", 2}, -- {"pow", "__svml_pow4", 4}, -- {"pow", "__svml_pow8", 8}, -- -- {"powf", "__svml_powf4", 4}, -- {"powf", "__svml_powf8", 8}, -- {"powf", "__svml_powf16", 16}, -- -- { "__pow_finite", "__svml_pow2", 2 }, -- { "__pow_finite", "__svml_pow4", 4 }, -- { "__pow_finite", "__svml_pow8", 8 }, -- -- { "__powf_finite", "__svml_powf4", 4 }, -- { "__powf_finite", "__svml_powf8", 8 }, -- { "__powf_finite", "__svml_powf16", 16 }, -- -- {"llvm.pow.f64", "__svml_pow2", 2}, -- {"llvm.pow.f64", "__svml_pow4", 4}, -- {"llvm.pow.f64", "__svml_pow8", 8}, -- -- {"llvm.pow.f32", "__svml_powf4", 4}, -- {"llvm.pow.f32", "__svml_powf8", 8}, -- {"llvm.pow.f32", "__svml_powf16", 16}, -- -- {"exp", "__svml_exp2", 2}, -- {"exp", "__svml_exp4", 4}, -- {"exp", "__svml_exp8", 8}, -- -- {"expf", "__svml_expf4", 4}, -- {"expf", "__svml_expf8", 8}, -- {"expf", "__svml_expf16", 16}, -- -- { "__exp_finite", "__svml_exp2", 2 }, -- { "__exp_finite", "__svml_exp4", 4 }, -- { "__exp_finite", "__svml_exp8", 8 }, -- -- { "__expf_finite", "__svml_expf4", 4 }, -- { "__expf_finite", "__svml_expf8", 8 }, -- { "__expf_finite", "__svml_expf16", 16 }, -- -- {"llvm.exp.f64", "__svml_exp2", 2}, -- {"llvm.exp.f64", "__svml_exp4", 4}, -- {"llvm.exp.f64", "__svml_exp8", 8}, -- -- {"llvm.exp.f32", 
"__svml_expf4", 4}, -- {"llvm.exp.f32", "__svml_expf8", 8}, -- {"llvm.exp.f32", "__svml_expf16", 16}, -- -- {"log", "__svml_log2", 2}, -- {"log", "__svml_log4", 4}, -- {"log", "__svml_log8", 8}, -- -- {"logf", "__svml_logf4", 4}, -- {"logf", "__svml_logf8", 8}, -- {"logf", "__svml_logf16", 16}, -- -- { "__log_finite", "__svml_log2", 2 }, -- { "__log_finite", "__svml_log4", 4 }, -- { "__log_finite", "__svml_log8", 8 }, -- -- { "__logf_finite", "__svml_logf4", 4 }, -- { "__logf_finite", "__svml_logf8", 8 }, -- { "__logf_finite", "__svml_logf16", 16 }, -- -- {"llvm.log.f64", "__svml_log2", 2}, -- {"llvm.log.f64", "__svml_log4", 4}, -- {"llvm.log.f64", "__svml_log8", 8}, -- -- {"llvm.log.f32", "__svml_logf4", 4}, -- {"llvm.log.f32", "__svml_logf8", 8}, -- {"llvm.log.f32", "__svml_logf16", 16}, -+#define GET_SVML_VARIANTS -+#include "llvm/IR/SVML.inc" -+#undef GET_SVML_VARIANTS - }; - addVectorizableFunctions(VecFuncs); - break; -@@ -1575,19 +1480,26 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { - return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName; - } - --StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, -- unsigned VF) const { -+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, -+ unsigned VF, -+ bool &FromSVML, -+ bool IsFast) const { -+ FromSVML = ClVectorLibrary == SVML; - F = sanitizeFunctionName(F); - if (F.empty()) - return F; - std::vector::const_iterator I = std::lower_bound( - VectorDescs.begin(), VectorDescs.end(), F, compareWithScalarFnName); - while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) { -- if (I->VectorizationFactor == VF) -+ if (I->VectorizationFactor == VF) { -+ if (FromSVML) { -+ return svmlMangle(I->VectorFnName, IsFast); -+ } - return I->VectorFnName; -+ } - ++I; - } -- return StringRef(); -+ return std::string(); - } - - StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F, -diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp -index da9855ff..c7182754 100644 ---- a/lib/AsmParser/LLLexer.cpp -+++ b/lib/AsmParser/LLLexer.cpp -@@ -600,6 +600,7 @@ lltok::Kind LLLexer::LexIdentifier() { - KEYWORD(spir_kernel); - KEYWORD(spir_func); - KEYWORD(intel_ocl_bicc); -+ KEYWORD(intel_svmlcc); - KEYWORD(x86_64_sysvcc); - KEYWORD(win64cc); - KEYWORD(x86_regcallcc); -diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp -index 7cf74dd1..0b539e86 100644 ---- a/lib/AsmParser/LLParser.cpp -+++ b/lib/AsmParser/LLParser.cpp -@@ -1843,6 +1843,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { - /// ::= 'ccc' - /// ::= 'fastcc' - /// ::= 'intel_ocl_bicc' -+/// ::= 'intel_svmlcc' - /// ::= 'coldcc' - /// ::= 'x86_stdcallcc' - /// ::= 'x86_fastcallcc' -@@ -1902,6 +1903,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { - case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; - case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; - case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; -+ case lltok::kw_intel_svmlcc: CC = CallingConv::Intel_SVML; break; - case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; - case lltok::kw_win64cc: CC = CallingConv::Win64; break; - case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; -diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h -index 8d8c7e99..3c89b78a 100644 ---- a/lib/AsmParser/LLToken.h -+++ b/lib/AsmParser/LLToken.h -@@ -131,6 +131,7 @@ enum Kind { - kw_fastcc, - kw_coldcc, - kw_intel_ocl_bicc, -+ 
kw_intel_svmlcc, - kw_x86_stdcallcc, - kw_x86_fastcallcc, - kw_x86_thiscallcc, -diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp -index 99a25a72..0b6e6787 100644 ---- a/lib/IR/AsmWriter.cpp -+++ b/lib/IR/AsmWriter.cpp -@@ -360,6 +360,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { - case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break; - case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break; - case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; -+ case CallingConv::Intel_SVML: Out << "intel_svmlcc"; break; - case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; - case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; - case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; -diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp -index e5231bb7..58b1f0a4 100644 ---- a/lib/IR/Verifier.cpp -+++ b/lib/IR/Verifier.cpp -@@ -2114,6 +2114,7 @@ void Verifier::visitFunction(const Function &F) { - case CallingConv::Fast: - case CallingConv::Cold: - case CallingConv::Intel_OCL_BI: -+ case CallingConv::Intel_SVML: - case CallingConv::PTX_Kernel: - case CallingConv::PTX_Device: - Assert(!F.isVarArg(), "Calling convention does not support varargs or " -diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td -index fcc9a296..d36c3a0f 100644 ---- a/lib/Target/X86/X86CallingConv.td -+++ b/lib/Target/X86/X86CallingConv.td -@@ -476,12 +476,29 @@ def RetCC_X86_64 : CallingConv<[ - CCDelegateTo - ]>; - -+// Intel_SVML return-value convention. -+def RetCC_Intel_SVML : CallingConv<[ -+ // Vector types are returned in XMM0,XMM1 -+ CCIfType<[v4f32, v2f64], -+ CCAssignToReg<[XMM0,XMM1]>>, -+ -+ // 256-bit FP vectors -+ CCIfType<[v8f32, v4f64], -+ CCAssignToReg<[YMM0,YMM1]>>, -+ -+ // 512-bit FP vectors -+ CCIfType<[v16f32, v8f64], -+ CCAssignToReg<[ZMM0,ZMM1]>> -+]>; -+ - // This is the return-value convention used for the entire X86 backend. - def RetCC_X86 : CallingConv<[ - - // Check if this is the Intel OpenCL built-ins calling convention - CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, - -+ CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, -+ - CCIfSubtarget<"is64Bit()", CCDelegateTo>, - CCDelegateTo - ]>; -@@ -983,6 +1000,22 @@ def CC_Intel_OCL_BI : CallingConv<[ - CCDelegateTo - ]>; - -+// X86-64 Intel Short Vector Math Library calling convention. -+def CC_Intel_SVML : CallingConv<[ -+ -+ // The SSE vector arguments are passed in XMM registers. -+ CCIfType<[v4f32, v2f64], -+ CCAssignToReg<[XMM0, XMM1, XMM2]>>, -+ -+ // The 256-bit vector arguments are passed in YMM registers. -+ CCIfType<[v8f32, v4f64], -+ CCAssignToReg<[YMM0, YMM1, YMM2]>>, -+ -+ // The 512-bit vector arguments are passed in ZMM registers. -+ CCIfType<[v16f32, v8f64], -+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>> -+]>; -+ - def CC_X86_32_Intr : CallingConv<[ - CCAssignToStack<4, 4> - ]>; -@@ -1039,6 +1072,7 @@ def CC_X86_64 : CallingConv<[ - // This is the argument convention used for the entire X86 backend. 
- def CC_X86 : CallingConv<[ - CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, -+ CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, - CCIfSubtarget<"is64Bit()", CCDelegateTo>, - CCDelegateTo - ]>; -@@ -1147,4 +1181,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, - (sequence "R%u", 12, 15))>; - def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, - (sequence "XMM%u", 8, 15))>; -- -+ -+// SVML calling convention -+def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>; -+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML, -+ K4, K5, K6, K7)>; -+ -+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>; -+ -+def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "XMM%u", 8, 15))>; -+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "XMM%u", 6, 15))>; -+ -+def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "YMM%u", 8, 15))>; -+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "YMM%u", 6, 15))>; -+ -+def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "ZMM%u", 16, 31), -+ K4, K5, K6, K7)>; -+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, -+ (sequence "ZMM%u", 6, 21), -+ K4, K5, K6, K7)>; -diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp -index 2820004c..817be44a 100644 ---- a/lib/Target/X86/X86ISelLowering.cpp -+++ b/lib/Target/X86/X86ISelLowering.cpp -@@ -3279,7 +3279,8 @@ SDValue X86TargetLowering::LowerFormalArguments( - // FIXME: Only some x86_32 calling conventions support AVX512. - if (Subtarget.hasAVX512() && - (Is64Bit || (CallConv == CallingConv::X86_VectorCall || -- CallConv == CallingConv::Intel_OCL_BI))) -+ CallConv == CallingConv::Intel_OCL_BI || -+ CallConv == CallingConv::Intel_SVML))) - VecVT = MVT::v16f32; - else if (Subtarget.hasAVX()) - VecVT = MVT::v8f32; -diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp -index 55842a4a..28877c90 100644 ---- a/lib/Target/X86/X86RegisterInfo.cpp -+++ b/lib/Target/X86/X86RegisterInfo.cpp -@@ -311,6 +311,23 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return CSR_64_Intel_OCL_BI_SaveList; - break; - } -+ case CallingConv::Intel_SVML: { -+ if (Is64Bit) { -+ if (HasAVX512) -+ return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_SaveList : -+ CSR_64_Intel_SVML_AVX512_SaveList; -+ if (HasAVX) -+ return IsWin64 ? CSR_Win64_Intel_SVML_AVX_SaveList : -+ CSR_64_Intel_SVML_AVX_SaveList; -+ -+ return IsWin64 ? CSR_Win64_Intel_SVML_SaveList : -+ CSR_64_Intel_SVML_SaveList; -+ } else { // Is32Bit -+ if (HasAVX512) -+ return CSR_32_Intel_SVML_AVX512_SaveList; -+ return CSR_32_Intel_SVML_SaveList; -+ } -+ } - case CallingConv::HHVM: - return CSR_64_HHVM_SaveList; - case CallingConv::X86_RegCall: -@@ -425,6 +442,23 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, - return CSR_64_Intel_OCL_BI_RegMask; - break; - } -+ case CallingConv::Intel_SVML: { -+ if (Is64Bit) { -+ if (HasAVX512) -+ return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_RegMask : -+ CSR_64_Intel_SVML_AVX512_RegMask; -+ if (HasAVX) -+ return IsWin64 ? CSR_Win64_Intel_SVML_AVX_RegMask : -+ CSR_64_Intel_SVML_AVX_RegMask; -+ -+ return IsWin64 ? 
CSR_Win64_Intel_SVML_RegMask : -+ CSR_64_Intel_SVML_RegMask; -+ } else { // Is32Bit -+ if (HasAVX512) -+ return CSR_32_Intel_SVML_AVX512_RegMask; -+ return CSR_32_Intel_SVML_RegMask; -+ } -+ } - case CallingConv::HHVM: - return CSR_64_HHVM_RegMask; - case CallingConv::X86_RegCall: -diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h -index 85e8256a..3208a93d 100644 ---- a/lib/Target/X86/X86Subtarget.h -+++ b/lib/Target/X86/X86Subtarget.h -@@ -769,6 +769,7 @@ public: - case CallingConv::X86_ThisCall: - case CallingConv::X86_VectorCall: - case CallingConv::Intel_OCL_BI: -+ case CallingConv::Intel_SVML: - return isTargetWin64(); - // This convention allows using the Win64 convention on other targets. - case CallingConv::Win64: -diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp -index 1c7d0a63..299b161d 100644 ---- a/lib/Transforms/Vectorize/LoopVectorize.cpp -+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp -@@ -602,6 +602,27 @@ protected: - /// vector of instructions. - void addMetadata(ArrayRef To, Instruction *From); - -+ /// Check legality of given SVML call instruction \p VecCall generated for -+ /// scalar call \p Call. If illegal then the appropriate legal instruction -+ /// is returned. -+ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call); -+ -+ /// Returns the legal VF for a call instruction \p CI using TTI information -+ /// and vector type. -+ unsigned getLegalVFForCall(CallInst *CI); -+ -+ /// Partially vectorize a given call \p Call by breaking it down into multiple -+ /// calls of \p LegalCall, decided by the variant VF \p LegalVF. -+ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall, -+ unsigned LegalVF); -+ -+ /// Generate shufflevector instruction for a vector value \p V based on the -+ /// current \p Part and a smaller VF \p LegalVF. -+ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part); -+ -+ /// Combine partially vectorized calls stored in \p CallResults. -+ Value *combinePartialVecCalls(SmallVectorImpl &CallResults); -+ - /// The original loop. - Loop *OrigLoop; - -@@ -4105,6 +4126,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { - } - - Function *VectorF; -+ bool FromSVML = false; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; -@@ -4113,7 +4135,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. 
-- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); -+ bool IsFast = CI->getFastMathFlags().isFast(); -+ std::string VFnName = TLI->getVectorizedFunction(FnName, VF, FromSVML, IsFast); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { -@@ -4132,9 +4155,22 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { - - if (isa(V)) - V->copyFastMathFlags(CI); -- -- VectorLoopValueMap.setVectorValue(&I, Part, V); -- addMetadata(V, &I); -+ if (FromSVML) -+ V->setCallingConv(CallingConv::Intel_SVML); -+ // Perform legalization of SVML call instruction only if original call -+ // was not Intrinsic -+ if (!isa(CI) && FromSVML) { -+ assert((V->getCalledFunction()->getName()).startswith("__svml")); -+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump()); -+ auto *LegalV = cast(legalizeSVMLCall(V, CI)); -+ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: "; -+ LegalV->dump()); -+ VectorLoopValueMap.setVectorValue(&I, Part, LegalV); -+ addMetadata(LegalV, &I); -+ } else { -+ VectorLoopValueMap.setVectorValue(&I, Part, V); -+ addMetadata(V, &I); -+ } - } - - break; -@@ -4163,6 +4199,244 @@ void InnerLoopVectorizer::updateAnalysis() { - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); - } - -+//===----------------------------------------------------------------------===// -+// Implementation of functions for SVML vector call legalization. -+//===----------------------------------------------------------------------===// -+// -+// Unlike other VECLIBs, SVML needs to be used with target-legal -+// vector types. Otherwise, link failures and/or runtime failures -+// will occur. A motivating example could be - -+// -+// double *a; -+// float *b; -+// #pragma clang loop vectorize_width(8) -+// for(i = 0; i < N; ++i) { -+// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX -+// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM -+// } -+// -+// Current implementation of vector code generation in LV is -+// driven based on a single VF (in InnerLoopVectorizer::VF). This -+// inhibits the flexibility of adjusting/choosing different VF -+// for different instructions. -+// -+// Due to this limitation it is much more straightforward to -+// first generate the illegal sin8 (svml_sin8 for SVML vector -+// library) call and then legalize it than trying to avoid -+// generating illegal code from the beginning. -+// -+// A solution for this problem is to check legality of the -+// call instruction right after generating it in vectorizer and -+// if it is illegal we split the call arguments and issue multiple -+// calls to match the legal VF. This is demonstrated currently for -+// the SVML vector library calls (non-intrinsic version only). -+// -+// Future directions and extensions: -+// 1) This legalization example shows us that a good direction -+// for the VPlan framework would be to model the vector call -+// instructions in a way that legal VF for each call is chosen -+// correctly within vectorizer and illegal code generation is -+// avoided. -+// 2) This logic can also be extended to general vector functions -+// i.e. legalization OpenMP decalre simd functions. The -+// requirements needed for this will be documented soon. 
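A minimal sketch, in LLVM IR, of the legalization this comment block describes (SSA value names here are illustrative, not taken from the patch). Assuming the sin loop above is vectorized with VF = 8 for doubles on an AVX target, where the widest legal double vector is <4 x double>: the vectorizer first emits the illegal __svml_sin8 call, legalizeSVMLCall then extracts each half of the operand with a shufflevector and issues two legal __svml_sin4_ha calls (the _ha high-accuracy suffix is applied because the scalar call is not marked fast), and combinePartialVecCalls merges the partial results back into an <8 x double> value, as the svml-legal-calls.ll test added below checks:

  ; Illegal call initially generated for VF = 8:
  ;   %r = call intel_svmlcc <8 x double> @__svml_sin8_ha(<8 x double> %v)
  ; After legalization with LegalVF = 4, i.e. NumParts = VF / LegalVF = 2:
  %part0 = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %part1 = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %res0 = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> %part0)
  %res1 = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> %part1)
  %r = shufflevector <4 x double> %res0, <4 x double> %res1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>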
-+ -+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall, -+ CallInst *Call) { -+ unsigned LegalVF = getLegalVFForCall(VecCall); -+ -+ assert(LegalVF > 1 && -+ "Legal VF for SVML call must be greater than 1 to vectorize"); -+ -+ if (LegalVF == VF) -+ return VecCall; -+ else if (LegalVF > VF) -+ // TODO: handle case when we are underfilling vectors -+ return VecCall; -+ -+ // Legal VF for this SVML call is smaller than chosen VF, break it down into -+ // smaller call instructions -+ -+ // Convert args, types and return type to match legal VF -+ SmallVector NewTys; -+ SmallVector NewArgs; -+ Type *NewRetTy = ToVectorTy(Call->getType(), LegalVF); -+ -+ for (Value *ArgOperand : Call->arg_operands()) { -+ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF); -+ NewTys.push_back(Ty); -+ NewArgs.push_back(UndefValue::get(Ty)); -+ } -+ -+ // Construct legal vector function -+ Function *F = Call->getCalledFunction(); -+ StringRef FnName = F->getName(); -+ Module *M = Call->getModule(); -+ bool unused = false; -+ std::string LegalVFnName = TLI->getVectorizedFunction(FnName, LegalVF, unused, Call->getFastMathFlags().isFast()); -+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVFnName: " << LegalVFnName << " FnName: " << FnName << "\n"); -+ assert(!LegalVFnName.empty() && (LegalVFnName != FnName) && -+ "Could not find legal vector function in TLI."); -+ -+ // Since this is targeting SVML calls specifically, we know the module -+ // will not have a vector version of the call -+ assert(!M->getFunction(LegalVFnName) && -+ "Module has vector version for legal SVML call."); -+ FunctionType *LegalFTy = FunctionType::get(NewRetTy, NewTys, false); -+ Function *LegalVectorF = -+ Function::Create(LegalFTy, Function::ExternalLinkage, LegalVFnName, M); -+ assert(LegalVectorF && "Can't create legal SVML vector function."); -+ LegalVectorF->copyAttributesFrom(F); -+ -+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump()); -+ -+ SmallVector OpBundles; -+ Call->getOperandBundlesAsDefs(OpBundles); -+ CallInst *LegalV = CallInst::Create(LegalVectorF, NewArgs, OpBundles); -+ -+ if (isa(LegalV)) -+ LegalV->copyFastMathFlags(Call); -+ -+ // Set SVML calling conventions -+ LegalV->setCallingConv(CallingConv::Intel_SVML); -+ -+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump()); -+ -+ Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV, LegalVF); -+ -+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump()); -+ -+ // Remove the illegal call from Builder -+ VecCall->eraseFromParent(); -+ -+ if (LegalV) -+ delete LegalV; -+ -+ return LegalizedCall; -+} -+ -+unsigned InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) { -+ const DataLayout DL = CI->getModule()->getDataLayout(); -+ FunctionType *CallFT = CI->getFunctionType(); -+ // All functions that need legalization should have a vector return type. -+ // This is true for all SVML functions that are currently supported. 
-+ assert(isa(CallFT->getReturnType()) && -+ "Return type of call that needs legalization is not a vector."); -+ auto *VecCallRetType = cast(CallFT->getReturnType()); -+ Type *ElemType = VecCallRetType->getElementType(); -+ -+ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType); -+ unsigned VectorBitWidth = TTI->getRegisterBitWidth(true); -+ unsigned LegalVF = VectorBitWidth / TypeBitWidth; -+ -+ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n"); -+ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n"); -+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth -+ << "\n"); -+ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n"); -+ -+ return LegalVF; -+} -+ -+// Partial vectorization of a call instruction is achieved by making clones of -+// \p LegalCall and overwriting its argument operands with shufflevector -+// equivalent decided based on \p LegalVF and current Part being filled. -+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call, -+ CallInst *LegalCall, -+ unsigned LegalVF) { -+ unsigned NumParts = VF / LegalVF; -+ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n"); -+ SmallVector CallResults; -+ -+ for (unsigned Part = 0; Part < NumParts; ++Part) { -+ auto *ClonedCall = cast(LegalCall->clone()); -+ -+ // Update the arg operand of cloned call to shufflevector -+ for (unsigned i = 0, ie = Call->getNumArgOperands(); i != ie; ++i) { -+ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part); -+ ClonedCall->setArgOperand(i, NewOp); -+ } -+ -+ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump()); -+ -+ auto *PartialVecCall = Builder.Insert(ClonedCall); -+ CallResults.push_back(PartialVecCall); -+ } -+ -+ return combinePartialVecCalls(CallResults); -+} -+ -+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF, -+ unsigned Part) { -+ // Example: -+ // Consider the following vector code - -+ // %1 = sitofp <4 x i32> %0 to <4 x double> -+ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1) -+ // -+ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking -+ // generateShuffleValue on the operand %1 -+ // If Part = 1, output value is - -+ // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> -+ // and if Part = 2, output is - -+ // %shuffle7 =shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> -+ -+ assert(isa(V->getType()) && -+ "Cannot generate shuffles for non-vector values."); -+ SmallVector ShuffleMask; -+ Value *Undef = UndefValue::get(V->getType()); -+ -+ unsigned ElemIdx = Part * LegalVF; -+ -+ for (unsigned K = 0; K < LegalVF; K++) -+ ShuffleMask.push_back(ElemIdx + K); -+ -+ auto *ShuffleInst = -+ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle"); -+ -+ return ShuffleInst; -+} -+ -+// Results of the calls executed by smaller legal call instructions must be -+// combined to match the original VF for later use. This is done by constructing -+// shufflevector instructions in a cumulative fashion. 
-+Value *InnerLoopVectorizer::combinePartialVecCalls( -+ SmallVectorImpl &CallResults) { -+ assert(isa(CallResults[0]->getType()) && -+ "Cannot combine calls with non-vector results."); -+ auto *CallType = cast(CallResults[0]->getType()); -+ -+ Value *CombinedShuffle; -+ unsigned NumElems = CallType->getNumElements() * 2; -+ unsigned NumRegs = CallResults.size(); -+ -+ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) && -+ "Number of partial vector calls to combine must be a power of 2 " -+ "(atleast 2^1)"); -+ -+ while (NumRegs > 1) { -+ for (unsigned I = 0; I < NumRegs; I += 2) { -+ SmallVector ShuffleMask; -+ for (unsigned J = 0; J < NumElems; J++) -+ ShuffleMask.push_back(J); -+ -+ CombinedShuffle = Builder.CreateShuffleVector( -+ CallResults[I], CallResults[I + 1], ShuffleMask, "combined"); -+ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:"; -+ CombinedShuffle->dump()); -+ CallResults.push_back(CombinedShuffle); -+ } -+ -+ SmallVector::iterator Start = CallResults.begin(); -+ SmallVector::iterator End = Start + NumRegs; -+ CallResults.erase(Start, End); -+ -+ NumElems *= 2; -+ NumRegs /= 2; -+ } -+ -+ return CombinedShuffle; -+} -+ - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { - // We should not collect Scalars more than once per VF. Right now, this - // function is called from collectUniformsAndScalars(), which already does -diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll -old mode 100755 -new mode 100644 -diff --git a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll -index 5a4bfe5e..4da2e48a 100644 ---- a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll -+++ b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll -@@ -39,7 +39,8 @@ for.end: ; preds = %for.body - declare double @__exp_finite(double) #0 - - ; CHECK-LABEL: @exp_f64 --; CHECK: <4 x double> @__svml_exp4 -+; CHECK: <2 x double> @__svml_exp2 -+; CHECK: <2 x double> @__svml_exp2 - ; CHECK: ret - define void @exp_f64(double* nocapture %varray) { - entry: -@@ -99,7 +100,8 @@ for.end: ; preds = %for.body - declare double @__log_finite(double) #0 - - ; CHECK-LABEL: @log_f64 --; CHECK: <4 x double> @__svml_log4 -+; CHECK: <2 x double> @__svml_log2 -+; CHECK: <2 x double> @__svml_log2 - ; CHECK: ret - define void @log_f64(double* nocapture %varray) { - entry: -@@ -159,7 +161,8 @@ for.end: ; preds = %for.body - declare double @__pow_finite(double, double) #0 - - ; CHECK-LABEL: @pow_f64 --; CHECK: <4 x double> @__svml_pow4 -+; CHECK: <2 x double> @__svml_pow2 -+; CHECK: <2 x double> @__svml_pow2 - ; CHECK: ret - define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { - entry: -diff --git a/test/Transforms/LoopVectorize/X86/svml-calls.ll b/test/Transforms/LoopVectorize/X86/svml-calls.ll -index 8ff62f17..4d48d981 100644 ---- a/test/Transforms/LoopVectorize/X86/svml-calls.ll -+++ b/test/Transforms/LoopVectorize/X86/svml-calls.ll -@@ -31,7 +31,7 @@ declare float @llvm.log.f32(float) #0 - - define void @sin_f64(double* nocapture %varray) { - ; CHECK-LABEL: @sin_f64( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -54,7 +54,7 @@ for.end: - - define void @sin_f32(float* nocapture %varray) { - ; CHECK-LABEL: @sin_f32( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> 
[[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -77,7 +77,7 @@ for.end: - - define void @sin_f64_intrinsic(double* nocapture %varray) { - ; CHECK-LABEL: @sin_f64_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -100,7 +100,7 @@ for.end: - - define void @sin_f32_intrinsic(float* nocapture %varray) { - ; CHECK-LABEL: @sin_f32_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -123,7 +123,7 @@ for.end: - - define void @cos_f64(double* nocapture %varray) { - ; CHECK-LABEL: @cos_f64( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -146,7 +146,7 @@ for.end: - - define void @cos_f32(float* nocapture %varray) { - ; CHECK-LABEL: @cos_f32( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -169,7 +169,7 @@ for.end: - - define void @cos_f64_intrinsic(double* nocapture %varray) { - ; CHECK-LABEL: @cos_f64_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -192,7 +192,7 @@ for.end: - - define void @cos_f32_intrinsic(float* nocapture %varray) { - ; CHECK-LABEL: @cos_f32_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -215,7 +215,7 @@ for.end: - - define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { - ; CHECK-LABEL: @pow_f64( --; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) -+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -240,7 +240,7 @@ for.end: - - define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { - ; CHECK-LABEL: @pow_f64_intrinsic( --; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) -+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -265,7 +265,7 @@ for.end: - - define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { - ; CHECK-LABEL: @pow_f32( --; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) -+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -290,7 +290,7 @@ for.end: - - define void @pow_f32_intrinsic(float* nocapture 
%varray, float* nocapture readonly %exp) { - ; CHECK-LABEL: @pow_f32_intrinsic( --; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) -+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -315,7 +315,7 @@ for.end: - - define void @exp_f64(double* nocapture %varray) { - ; CHECK-LABEL: @exp_f64( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -338,7 +338,7 @@ for.end: - - define void @exp_f32(float* nocapture %varray) { - ; CHECK-LABEL: @exp_f32( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -361,7 +361,7 @@ for.end: - - define void @exp_f64_intrinsic(double* nocapture %varray) { - ; CHECK-LABEL: @exp_f64_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -384,7 +384,7 @@ for.end: - - define void @exp_f32_intrinsic(float* nocapture %varray) { - ; CHECK-LABEL: @exp_f32_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -407,7 +407,7 @@ for.end: - - define void @log_f64(double* nocapture %varray) { - ; CHECK-LABEL: @log_f64( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -430,7 +430,7 @@ for.end: - - define void @log_f32(float* nocapture %varray) { - ; CHECK-LABEL: @log_f32( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -453,7 +453,7 @@ for.end: - - define void @log_f64_intrinsic(double* nocapture %varray) { - ; CHECK-LABEL: @log_f64_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -476,7 +476,7 @@ for.end: - - define void @log_f32_intrinsic(float* nocapture %varray) { - ; CHECK-LABEL: @log_f32_intrinsic( --; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) -+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) - ; CHECK: ret void - ; - entry: -@@ -497,5 +497,44 @@ for.end: - ret void - } - --attributes #0 = { nounwind readnone } -+; CHECK-LABEL: @atan2_finite -+; CHECK: intel_svmlcc <4 x double> @__svml_atan24 -+; CHECK: intel_svmlcc <4 x double> @__svml_atan24 -+; CHECK: ret -+ -+declare double @__atan2_finite(double, double) local_unnamed_addr #0 - -+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 { -+entry: -+ br label %for.cond1.preheader -+ -+for.cond1.preheader: ; preds = %for.inc7, %entry -+ 
%indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ] -+ %0 = trunc i64 %indvars.iv19 to i32 -+ %conv = sitofp i32 %0 to double -+ br label %for.body3 -+ -+for.body3: ; preds = %for.body3, %for.cond1.preheader -+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] -+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -+ %1 = trunc i64 %indvars.iv.next to i32 -+ %conv4 = sitofp i32 %1 to double -+ %call = tail call fast double @__atan2_finite(double %conv, double %conv4) -+ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv -+ store double %call, double* %arrayidx6, align 8 -+ %exitcond = icmp eq i64 %indvars.iv.next, 100 -+ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5 -+ -+for.inc7: ; preds = %for.body3 -+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 -+ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100 -+ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader -+ -+for.end9: ; preds = %for.inc7 -+ ret void -+} -+ -+attributes #0 = { nounwind readnone } -+!5 = distinct !{!5, !6, !7} -+!6 = !{!"llvm.loop.vectorize.width", i32 8} -+!7 = !{!"llvm.loop.vectorize.enable", i1 true} -diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll -new file mode 100644 -index 00000000..93676abb ---- /dev/null -+++ b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll -@@ -0,0 +1,508 @@ -+; Check legalization of SVML calls. Also checks that intrinsic calls are not legalizedby vectorizer. -+ -+; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s -+ -+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -+target triple = "x86_64-unknown-linux-gnu" -+ -+declare double @sin(double) #0 -+declare float @sinf(float) #0 -+declare double @llvm.sin.f64(double) #0 -+declare float @llvm.sin.f32(float) #0 -+ -+declare double @cos(double) #0 -+declare float @cosf(float) #0 -+declare double @llvm.cos.f64(double) #0 -+declare float @llvm.cos.f32(float) #0 -+ -+declare double @pow(double, double) #0 -+declare float @powf(float, float) #0 -+declare double @llvm.pow.f64(double, double) #0 -+declare float @llvm.pow.f32(float, float) #0 -+ -+declare double @exp(double) #0 -+declare float @expf(float) #0 -+declare double @llvm.exp.f64(double) #0 -+declare float @llvm.exp.f32(float) #0 -+ -+declare double @log(double) #0 -+declare float @logf(float) #0 -+declare double @llvm.log.f64(double) #0 -+declare float @llvm.log.f32(float) #0 -+ -+ -+define void @sin_f64(double* nocapture %varray) { -+; CHECK-LABEL: @sin_f64( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) -+; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @sin(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @sin_f32(float* nocapture %varray) { -+; CHECK-LABEL: @sin_f32( -+; CHECK: [[TMP1:%.*]] = 
call intel_svmlcc <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @sinf(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @sin_f64_intrinsic(double* nocapture %varray) { -+; CHECK-LABEL: @sin_f64_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x double> @__svml_sin8_ha(<8 x double> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @llvm.sin.f64(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @sin_f32_intrinsic(float* nocapture %varray) { -+; CHECK-LABEL: @sin_f32_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @llvm.sin.f32(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @cos_f64(double* nocapture %varray) { -+; CHECK-LABEL: @cos_f64( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) -+; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @cos(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @cos_f32(float* nocapture %varray) { -+; CHECK-LABEL: @cos_f32( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @cosf(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ 
-+define void @cos_f64_intrinsic(double* nocapture %varray) { -+; CHECK-LABEL: @cos_f64_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x double> @__svml_cos8_ha(<8 x double> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @llvm.cos.f64(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @cos_f32_intrinsic(float* nocapture %varray) { -+; CHECK-LABEL: @cos_f32_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @llvm.cos.f32(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { -+; CHECK-LABEL: @pow_f64( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) -+; CHECK: [[TMP4:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv -+ %tmp1 = load double, double* %arrayidx, align 4 -+ %tmp2 = tail call double @pow(double %conv, double %tmp1) -+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %tmp2, double* %arrayidx2, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { -+; CHECK-LABEL: @pow_f64_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x double> @__svml_pow8_ha(<8 x double> [[TMP2:%.*]], <8 x double> [[TMP3:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv -+ %tmp1 = load double, double* %arrayidx, align 4 -+ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) -+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %tmp2, double* %arrayidx2, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { -+; CHECK-LABEL: @pow_f32( -+; CHECK: [[TMP1:%.*]] = 
call intel_svmlcc <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv -+ %tmp1 = load float, float* %arrayidx, align 4 -+ %tmp2 = tail call float @powf(float %conv, float %tmp1) -+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %tmp2, float* %arrayidx2, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { -+; CHECK-LABEL: @pow_f32_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv -+ %tmp1 = load float, float* %arrayidx, align 4 -+ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) -+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %tmp2, float* %arrayidx2, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @exp_f64(double* nocapture %varray) { -+; CHECK-LABEL: @exp_f64( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) -+; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @exp(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @exp_f32(float* nocapture %varray) { -+; CHECK-LABEL: @exp_f32( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @expf(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @exp_f64_intrinsic(double* nocapture %varray) { -+; CHECK-LABEL: @exp_f64_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x double> @__svml_exp8_ha(<8 x double> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to 
double -+ %call = tail call double @llvm.exp.f64(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @exp_f32_intrinsic(float* nocapture %varray) { -+; CHECK-LABEL: @exp_f32_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @llvm.exp.f32(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @log_f64(double* nocapture %varray) { -+; CHECK-LABEL: @log_f64( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) -+; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @log(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @log_f32(float* nocapture %varray) { -+; CHECK-LABEL: @log_f32( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to float -+ %call = tail call float @logf(float %conv) -+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv -+ store float %call, float* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @log_f64_intrinsic(double* nocapture %varray) { -+; CHECK-LABEL: @log_f64_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x double> @__svml_log8_ha(<8 x double> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label %for.body -+ -+for.body: -+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -+ %tmp = trunc i64 %iv to i32 -+ %conv = sitofp i32 %tmp to double -+ %call = tail call double @llvm.log.f64(double %conv) -+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv -+ store double %call, double* %arrayidx, align 4 -+ %iv.next = add nuw nsw i64 %iv, 1 -+ %exitcond = icmp eq i64 %iv.next, 1000 -+ br i1 %exitcond, label %for.end, label %for.body -+ -+for.end: -+ ret void -+} -+ -+define void @log_f32_intrinsic(float* nocapture %varray) { -+; CHECK-LABEL: @log_f32_intrinsic( -+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) -+; CHECK: ret void -+; -+entry: -+ br label 
%for.body
-+
-+for.body:
-+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-+  %tmp = trunc i64 %iv to i32
-+  %conv = sitofp i32 %tmp to float
-+  %call = tail call float @llvm.log.f32(float %conv)
-+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
-+  store float %call, float* %arrayidx, align 4
-+  %iv.next = add nuw nsw i64 %iv, 1
-+  %exitcond = icmp eq i64 %iv.next, 1000
-+  br i1 %exitcond, label %for.end, label %for.body
-+
-+for.end:
-+  ret void
-+}
-+
-+attributes #0 = { nounwind readnone }
-+
-diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
-new file mode 100644
-index 00000000..007eea7a
---- /dev/null
-+++ b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
-@@ -0,0 +1,61 @@
-+; Check that vector codegen splits illegal sin8 call to two sin4 calls on AVX for double datatype.
-+; The C code used to generate this test:
-+
-+; #include <math.h>
-+;
-+; void foo(double *a, int N){
-+;   int i;
-+; #pragma clang loop vectorize_width(8)
-+;   for (i=0;i<N;i++){
-+;     a[i] = sin(i);
-+;   }
-+; }
-+
-+; RUN: opt -vector-library=SVML -mattr=avx -loop-vectorize -S < %s | FileCheck %s
-+
-+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
-+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc <4 x double> @__svml_sin4(<4 x double> [[S1]])
-+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc <4 x double> @__svml_sin4(<4 x double> [[S2]])
-+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
-+
-+
-+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-+target triple = "x86_64-unknown-linux-gnu"
-+
-+; Function Attrs: nounwind uwtable
-+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
-+entry:
-+  %cmp5 = icmp sgt i32 %N, 0
-+  br i1 %cmp5, label %for.body.preheader, label %for.end
-+
-+for.body.preheader: ; preds = %entry
-+  %wide.trip.count = zext i32 %N to i64
-+  br label %for.body
-+
-+for.body: ; preds = %for.body, %for.body.preheader
-+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-+  %0 = trunc i64 %indvars.iv to i32
-+  %conv = sitofp i32 %0 to double
-+  %call = tail call fast double @sin(double %conv) #2
-+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
-+  store double %call, double* %arrayidx, align 8, !tbaa !2
-+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
-+
-+for.end: ; preds = %for.body, %entry
-+  ret void
-+}
-+
-+; Function Attrs: nounwind
-+declare dso_local double @sin(double) local_unnamed_addr #1
-+
-+!2 = !{!3, !3, i64 0}
-+!3 = !{!"double", !4, i64 0}
-+!4 = !{!"omnipotent char", !5, i64 0}
-+!5 = !{!"Simple C/C++ TBAA"}
-+!6 = distinct !{!6, !7}
-+!7 = !{!"llvm.loop.vectorize.width", i32 8}
-diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
-index 0428249f..206ddcd0 100644
---- a/utils/TableGen/CMakeLists.txt
-+++ b/utils/TableGen/CMakeLists.txt
-@@ -38,6 +38,7 @@ add_tablegen(llvm-tblgen LLVM
-   SearchableTableEmitter.cpp
-   SubtargetEmitter.cpp
-   SubtargetFeatureInfo.cpp
-+  SVMLEmitter.cpp
-   TableGen.cpp
-   Types.cpp
-   X86DisassemblerTables.cpp
-diff --git a/utils/TableGen/SVMLEmitter.cpp b/utils/TableGen/SVMLEmitter.cpp
-new file mode 100644
-index 00000000..8800ca82
---- /dev/null -+++ b/utils/TableGen/SVMLEmitter.cpp -@@ -0,0 +1,110 @@ -+//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+// This tablegen backend emits the scalar to svml function map for TLI. -+// -+//===----------------------------------------------------------------------===// -+ -+#include "CodeGenTarget.h" -+#include "llvm/Support/Format.h" -+#include "llvm/TableGen/Error.h" -+#include "llvm/TableGen/Record.h" -+#include "llvm/TableGen/TableGenBackend.h" -+#include -+#include -+ -+using namespace llvm; -+ -+#define DEBUG_TYPE "SVMLVariants" -+#include "llvm/Support/Debug.h" -+ -+namespace { -+ -+class SVMLVariantsEmitter { -+ -+ RecordKeeper &Records; -+ -+private: -+ void emitSVMLVariants(raw_ostream &OS); -+ -+public: -+ SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {} -+ -+ void run(raw_ostream &OS); -+}; -+} // End anonymous namespace -+ -+/// \brief Emit the set of SVML variant function names. -+// The default is to emit the high accuracy SVML variants until a mechanism is -+// introduced to allow a selection of different variants through precision -+// requirements specified by the user. This code generates mappings to svml -+// that are in the scalar form of llvm intrinsics, math library calls, or the -+// finite variants of math library calls. -+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) { -+ -+ const unsigned MinSinglePrecVL = 4; -+ const unsigned MaxSinglePrecVL = 16; -+ const unsigned MinDoublePrecVL = 2; -+ const unsigned MaxDoublePrecVL = 8; -+ -+ OS << "#ifdef GET_SVML_VARIANTS\n"; -+ -+ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) { -+ StringRef SvmlVariantNameStr = D->getName(); -+ // Single Precision SVML -+ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) { -+ // Emit the scalar math library function to svml function entry. -+ OS << "{\"" << SvmlVariantNameStr << "f" << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " -+ << VL << "},\n"; -+ -+ // Emit the scalar intrinsic to svml function entry. -+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " -+ << VL << "},\n"; -+ -+ // Emit the finite math library function to svml function entry. -+ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " -+ << VL << "},\n"; -+ } -+ -+ // Double Precision SVML -+ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) { -+ // Emit the scalar math library function to svml function entry. -+ OS << "{\"" << SvmlVariantNameStr << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL -+ << "},\n"; -+ -+ // Emit the scalar intrinsic to svml function entry. -+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL -+ << "},\n"; -+ -+ // Emit the finite math library function to svml function entry. 
-+ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", "; -+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " -+ << VL << "},\n"; -+ } -+ } -+ -+ OS << "#endif // GET_SVML_VARIANTS\n\n"; -+} -+ -+void SVMLVariantsEmitter::run(raw_ostream &OS) { -+ emitSVMLVariants(OS); -+} -+ -+namespace llvm { -+ -+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) { -+ SVMLVariantsEmitter(RK).run(OS); -+} -+ -+} // End llvm namespace -diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp -index b7826062..bbb164ea 100644 ---- a/utils/TableGen/TableGen.cpp -+++ b/utils/TableGen/TableGen.cpp -@@ -53,6 +53,7 @@ enum ActionType { - GenX86EVEX2VEXTables, - GenX86FoldTables, - GenRegisterBank, -+ GenSVMLVariants, - }; - - namespace { -@@ -117,7 +118,9 @@ namespace { - clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", - "Generate X86 fold tables"), - clEnumValN(GenRegisterBank, "gen-register-bank", -- "Generate registers bank descriptions"))); -+ "Generate registers bank descriptions"), -+ clEnumValN(GenSVMLVariants, "gen-svml", -+ "Generate SVML variant function names"))); - - cl::OptionCategory PrintEnumsCat("Options for -print-enums"); - cl::opt -@@ -231,6 +234,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { - case GenX86FoldTables: - EmitX86FoldTables(Records, OS); - break; -+ case GenSVMLVariants: -+ EmitSVMLVariants(Records, OS); -+ break; - } - - return false; -diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h -index 1329a6d8..8b808de0 100644 ---- a/utils/TableGen/TableGenBackends.h -+++ b/utils/TableGen/TableGenBackends.h -@@ -89,6 +89,7 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); - void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); - void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); - void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); -+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS); - - } // End llvm namespace - -diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim -index d58ffb21..a9b60f12 100644 ---- a/utils/vim/syntax/llvm.vim -+++ b/utils/vim/syntax/llvm.vim -@@ -94,6 +94,7 @@ syn keyword llvmKeyword - \ inreg - \ inteldialect - \ intel_ocl_bicc -+ \ intel_svmlcc - \ internal - \ linkonce - \ linkonce_odr diff --git a/conda-recipes/D47188-svml.patch b/conda-recipes/D47188-svml.patch new file mode 100644 index 000000000..9d90ae87c --- /dev/null +++ b/conda-recipes/D47188-svml.patch @@ -0,0 +1,821 @@ +From https://reviews.llvm.org/D47188 rebased on top of LLVM 6.0.0 +With additional hot-fix in LoopVectorize.cpp for numba/numba#3016 + +diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h +index a3fe834..124b81d 100644 +--- a/include/llvm/Analysis/TargetLibraryInfo.h ++++ b/include/llvm/Analysis/TargetLibraryInfo.h +@@ -38,6 +38,12 @@ struct VecDesc { + NumLibFuncs + }; + ++enum SVMLAccuracy { ++ SVML_DEFAULT, ++ SVML_HA, ++ SVML_EP ++}; ++ + /// Implementation of the target library information. + /// + /// This class constructs tables that hold the target library information and +@@ -150,7 +156,8 @@ public: + /// Return true if the function F has a vector equivalent with vectorization + /// factor VF. 
+ bool isFunctionVectorizable(StringRef F, unsigned VF) const {
+- return !getVectorizedFunction(F, VF).empty();
++ bool IgnoreMeThere;
++ return !getVectorizedFunction(F, VF, IgnoreMeThere, false).empty();
+ }
+
+ /// Return true if the function F has a vector equivalent with any
+@@ -159,7 +166,8 @@ public:
+
+ /// Return the name of the equivalent of F, vectorized with factor VF. If no
+ /// such mapping exists, return the empty string.
+- StringRef getVectorizedFunction(StringRef F, unsigned VF) const;
++ std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML,
++ bool IsFast) const;
+
+ /// Return true if the function F has a scalar equivalent, and set VF to be
+ /// the vectorization factor.
+@@ -253,8 +261,9 @@ public:
+ bool isFunctionVectorizable(StringRef F) const {
+ return Impl->isFunctionVectorizable(F);
+ }
+- StringRef getVectorizedFunction(StringRef F, unsigned VF) const {
+- return Impl->getVectorizedFunction(F, VF);
++ std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML,
++ bool IsFast) const {
++ return Impl->getVectorizedFunction(F, VF, FromSVML, IsFast);
+ }
+
+ /// Tests if the function is both available and a candidate for optimized code
+diff --git a/include/llvm/IR/CMakeLists.txt b/include/llvm/IR/CMakeLists.txt
+index cf75d58..374fd65 100644
+--- a/include/llvm/IR/CMakeLists.txt
++++ b/include/llvm/IR/CMakeLists.txt
+@@ -4,3 +4,7 @@ tablegen(LLVM Attributes.gen -gen-attrs)
+ set(LLVM_TARGET_DEFINITIONS Intrinsics.td)
+ tablegen(LLVM Intrinsics.gen -gen-intrinsic)
+ add_public_tablegen_target(intrinsics_gen)
++
++set(LLVM_TARGET_DEFINITIONS SVML.td)
++tablegen(LLVM SVML.gen -gen-svml)
++add_public_tablegen_target(svml_gen)
+diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
+index 84fe836..46700f0 100644
+--- a/include/llvm/IR/CallingConv.h
++++ b/include/llvm/IR/CallingConv.h
+@@ -220,6 +220,9 @@ namespace CallingConv {
+ /// shader if tessellation is in use, or otherwise the vertex shader.
+ AMDGPU_ES = 96,
+
++ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library
++ Intel_SVML = 97,
++ //
+ /// The highest possible calling convention ID. Must be some 2^k - 1.
+ MaxID = 1023
+ };
+diff --git a/include/llvm/IR/SVML.td b/include/llvm/IR/SVML.td
+new file mode 100644
+index 0000000..90f2902
+--- /dev/null
++++ b/include/llvm/IR/SVML.td
+@@ -0,0 +1,62 @@
++//===-- Intel_SVML.td - Defines SVML call variants ---------*- tablegen -*-===//
++//
++// The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++//===----------------------------------------------------------------------===//
++//
++// This file is used by TableGen to define the different types of SVML function
++// variants used with -fveclib=SVML.
++// ++//===----------------------------------------------------------------------===// ++ ++class SvmlVariant; ++ ++def sin : SvmlVariant; ++def cos : SvmlVariant; ++def pow : SvmlVariant; ++def exp : SvmlVariant; ++def log : SvmlVariant; ++def acos : SvmlVariant; ++def acosh : SvmlVariant; ++def asin : SvmlVariant; ++def asinh : SvmlVariant; ++def atan2 : SvmlVariant; ++def atan : SvmlVariant; ++def atanh : SvmlVariant; ++def cbrt : SvmlVariant; ++def cdfnorm : SvmlVariant; ++def cdfnorminv : SvmlVariant; ++def cosd : SvmlVariant; ++def cosh : SvmlVariant; ++def erf : SvmlVariant; ++def erfc : SvmlVariant; ++def erfcinv : SvmlVariant; ++def erfinv : SvmlVariant; ++def exp10 : SvmlVariant; ++def exp2 : SvmlVariant; ++def expm1 : SvmlVariant; ++def hypot : SvmlVariant; ++def invsqrt : SvmlVariant; ++def log10 : SvmlVariant; ++def log1p : SvmlVariant; ++def log2 : SvmlVariant; ++def sind : SvmlVariant; ++def sinh : SvmlVariant; ++def sqrt : SvmlVariant; ++def tan : SvmlVariant; ++def tanh : SvmlVariant; ++ ++// While SVML doesn't provide _ha versions of ++// the following symbols let's disable their vectorization. ++ ++// def nearbyint : SvmlVariant; ++// def logb : SvmlVariant; ++// def floor : SvmlVariant; ++// def fmod : SvmlVariant; ++// def ceil : SvmlVariant; ++// def trunc : SvmlVariant; ++// def rint : SvmlVariant; ++// def round : SvmlVariant; +diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt +index af2e30d..fa8aaac 100644 +--- a/lib/Analysis/CMakeLists.txt ++++ b/lib/Analysis/CMakeLists.txt +@@ -90,4 +90,5 @@ add_llvm_library(LLVMAnalysis + + DEPENDS + intrinsics_gen ++ svml_gen + ) +diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp +index d18246a..3d108d8 100644 +--- a/lib/Analysis/TargetLibraryInfo.cpp ++++ b/lib/Analysis/TargetLibraryInfo.cpp +@@ -50,6 +50,11 @@ static bool hasSinCosPiStret(const Triple &T) { + return true; + } + ++std::string svmlMangle(StringRef FnName, const bool IsFast) { ++ std::string FullName = FnName; ++ return IsFast ? FullName : FullName + "_ha"; ++} ++ + /// Initialize the set of available library functions based on the specified + /// target triple. This should be carefully written so that a missing target + /// triple gets a sane set of defaults. 
+@@ -1379,93 +1384,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( + } + case SVML: { + const VecDesc VecFuncs[] = { +- {"sin", "__svml_sin2", 2}, +- {"sin", "__svml_sin4", 4}, +- {"sin", "__svml_sin8", 8}, +- +- {"sinf", "__svml_sinf4", 4}, +- {"sinf", "__svml_sinf8", 8}, +- {"sinf", "__svml_sinf16", 16}, +- +- {"cos", "__svml_cos2", 2}, +- {"cos", "__svml_cos4", 4}, +- {"cos", "__svml_cos8", 8}, +- +- {"cosf", "__svml_cosf4", 4}, +- {"cosf", "__svml_cosf8", 8}, +- {"cosf", "__svml_cosf16", 16}, +- +- {"pow", "__svml_pow2", 2}, +- {"pow", "__svml_pow4", 4}, +- {"pow", "__svml_pow8", 8}, +- +- {"powf", "__svml_powf4", 4}, +- {"powf", "__svml_powf8", 8}, +- {"powf", "__svml_powf16", 16}, +- +- { "__pow_finite", "__svml_pow2", 2 }, +- { "__pow_finite", "__svml_pow4", 4 }, +- { "__pow_finite", "__svml_pow8", 8 }, +- +- { "__powf_finite", "__svml_powf4", 4 }, +- { "__powf_finite", "__svml_powf8", 8 }, +- { "__powf_finite", "__svml_powf16", 16 }, +- +- {"llvm.pow.f64", "__svml_pow2", 2}, +- {"llvm.pow.f64", "__svml_pow4", 4}, +- {"llvm.pow.f64", "__svml_pow8", 8}, +- +- {"llvm.pow.f32", "__svml_powf4", 4}, +- {"llvm.pow.f32", "__svml_powf8", 8}, +- {"llvm.pow.f32", "__svml_powf16", 16}, +- +- {"exp", "__svml_exp2", 2}, +- {"exp", "__svml_exp4", 4}, +- {"exp", "__svml_exp8", 8}, +- +- {"expf", "__svml_expf4", 4}, +- {"expf", "__svml_expf8", 8}, +- {"expf", "__svml_expf16", 16}, +- +- { "__exp_finite", "__svml_exp2", 2 }, +- { "__exp_finite", "__svml_exp4", 4 }, +- { "__exp_finite", "__svml_exp8", 8 }, +- +- { "__expf_finite", "__svml_expf4", 4 }, +- { "__expf_finite", "__svml_expf8", 8 }, +- { "__expf_finite", "__svml_expf16", 16 }, +- +- {"llvm.exp.f64", "__svml_exp2", 2}, +- {"llvm.exp.f64", "__svml_exp4", 4}, +- {"llvm.exp.f64", "__svml_exp8", 8}, +- +- {"llvm.exp.f32", "__svml_expf4", 4}, +- {"llvm.exp.f32", "__svml_expf8", 8}, +- {"llvm.exp.f32", "__svml_expf16", 16}, +- +- {"log", "__svml_log2", 2}, +- {"log", "__svml_log4", 4}, +- {"log", "__svml_log8", 8}, +- +- {"logf", "__svml_logf4", 4}, +- {"logf", "__svml_logf8", 8}, +- {"logf", "__svml_logf16", 16}, +- +- { "__log_finite", "__svml_log2", 2 }, +- { "__log_finite", "__svml_log4", 4 }, +- { "__log_finite", "__svml_log8", 8 }, +- +- { "__logf_finite", "__svml_logf4", 4 }, +- { "__logf_finite", "__svml_logf8", 8 }, +- { "__logf_finite", "__svml_logf16", 16 }, +- +- {"llvm.log.f64", "__svml_log2", 2}, +- {"llvm.log.f64", "__svml_log4", 4}, +- {"llvm.log.f64", "__svml_log8", 8}, +- +- {"llvm.log.f32", "__svml_logf4", 4}, +- {"llvm.log.f32", "__svml_logf8", 8}, +- {"llvm.log.f32", "__svml_logf16", 16}, ++#define GET_SVML_VARIANTS ++#include "llvm/IR/SVML.gen" ++#undef GET_SVML_VARIANTS + }; + addVectorizableFunctions(VecFuncs); + break; +@@ -1486,16 +1407,21 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { + return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName; + } + +-StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, +- unsigned VF) const { ++std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, ++ unsigned VF, bool &FromSVML, bool IsFast) const { ++ FromSVML = ClVectorLibrary == SVML; + F = sanitizeFunctionName(F); + if (F.empty()) + return F; + std::vector::const_iterator I = std::lower_bound( + VectorDescs.begin(), VectorDescs.end(), F, compareWithScalarFnName); + while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) { +- if (I->VectorizationFactor == VF) ++ if (I->VectorizationFactor == VF) { ++ if (FromSVML) { ++ 
return svmlMangle(I->VectorFnName, IsFast); ++ } + return I->VectorFnName; ++ } + ++I; + } + return StringRef(); +diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp +index d8be4ad..945d5f6 100644 +--- a/lib/AsmParser/LLLexer.cpp ++++ b/lib/AsmParser/LLLexer.cpp +@@ -592,6 +592,7 @@ lltok::Kind LLLexer::LexIdentifier() { + KEYWORD(spir_kernel); + KEYWORD(spir_func); + KEYWORD(intel_ocl_bicc); ++ KEYWORD(intel_svmlcc); + KEYWORD(x86_64_sysvcc); + KEYWORD(win64cc); + KEYWORD(x86_regcallcc); +diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp +index c3ab955..c1d9fa0 100644 +--- a/lib/AsmParser/LLParser.cpp ++++ b/lib/AsmParser/LLParser.cpp +@@ -1711,6 +1711,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { + /// ::= 'ccc' + /// ::= 'fastcc' + /// ::= 'intel_ocl_bicc' ++/// ::= 'intel_svmlcc' + /// ::= 'coldcc' + /// ::= 'x86_stdcallcc' + /// ::= 'x86_fastcallcc' +@@ -1770,6 +1771,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { + case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; + case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; + case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; ++ case lltok::kw_intel_svmlcc: CC = CallingConv::Intel_SVML; break; + case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; + case lltok::kw_win64cc: CC = CallingConv::Win64; break; + case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; +diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h +index ad826cc..08170f0 100644 +--- a/lib/AsmParser/LLToken.h ++++ b/lib/AsmParser/LLToken.h +@@ -130,6 +130,7 @@ enum Kind { + kw_fastcc, + kw_coldcc, + kw_intel_ocl_bicc, ++ kw_intel_svmlcc, + kw_x86_stdcallcc, + kw_x86_fastcallcc, + kw_x86_thiscallcc, +diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp +index 0fafe82..086aabc 100644 +--- a/lib/IR/AsmWriter.cpp ++++ b/lib/IR/AsmWriter.cpp +@@ -356,6 +356,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { + case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break; + case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break; + case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; ++ case CallingConv::Intel_SVML: Out << "intel_svmlcc"; break; + case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; + case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; + case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; +diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp +index 1754f7d..77fbe7e 100644 +--- a/lib/IR/Verifier.cpp ++++ b/lib/IR/Verifier.cpp +@@ -2025,6 +2025,7 @@ void Verifier::visitFunction(const Function &F) { + case CallingConv::Fast: + case CallingConv::Cold: + case CallingConv::Intel_OCL_BI: ++ case CallingConv::Intel_SVML: + case CallingConv::PTX_Kernel: + case CallingConv::PTX_Device: + Assert(!F.isVarArg(), "Calling convention does not support varargs or " +diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td +index 5d806fe..5db30d9 100644 +--- a/lib/Target/X86/X86CallingConv.td ++++ b/lib/Target/X86/X86CallingConv.td +@@ -469,12 +469,29 @@ def RetCC_X86_64 : CallingConv<[ + CCDelegateTo + ]>; + ++// Intel_SVML return-value convention. 
++def RetCC_Intel_SVML : CallingConv<[ ++ // Vector types are returned in XMM0,XMM1 ++ CCIfType<[v4f32, v2f64], ++ CCAssignToReg<[XMM0,XMM1]>>, ++ ++ // 256-bit FP vectors ++ CCIfType<[v8f32, v4f64], ++ CCAssignToReg<[YMM0,YMM1]>>, ++ ++ // 512-bit FP vectors ++ CCIfType<[v16f32, v8f64], ++ CCAssignToReg<[ZMM0,ZMM1]>> ++]>; ++ + // This is the return-value convention used for the entire X86 backend. + def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, + ++ CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, ++ + CCIfSubtarget<"is64Bit()", CCDelegateTo>, + CCDelegateTo + ]>; +@@ -971,6 +988,22 @@ def CC_Intel_OCL_BI : CallingConv<[ + CCDelegateTo + ]>; + ++// X86-64 Intel Short Vector Math Library calling convention. ++def CC_Intel_SVML : CallingConv<[ ++ ++ // The SSE vector arguments are passed in XMM registers. ++ CCIfType<[v4f32, v2f64], ++ CCAssignToReg<[XMM0, XMM1, XMM2]>>, ++ ++ // The 256-bit vector arguments are passed in YMM registers. ++ CCIfType<[v8f32, v4f64], ++ CCAssignToReg<[YMM0, YMM1, YMM2]>>, ++ ++ // The 512-bit vector arguments are passed in ZMM registers. ++ CCIfType<[v16f32, v8f64], ++ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>> ++]>; ++ + def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> + ]>; +@@ -1027,6 +1060,7 @@ def CC_X86_64 : CallingConv<[ + // This is the argument convention used for the entire X86 backend. + def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, + CCIfSubtarget<"is64Bit()", CCDelegateTo>, + CCDelegateTo + ]>; +@@ -1135,4 +1169,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 12, 15))>; + def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; +- ++ ++// SVML calling convention ++def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>; ++def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML, ++ K4, K5, K6, K7)>; ++ ++def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>; ++ ++def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "XMM%u", 8, 15))>; ++def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "XMM%u", 6, 15))>; ++ ++def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "YMM%u", 8, 15))>; ++def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "YMM%u", 6, 15))>; ++ ++def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "ZMM%u", 16, 31), ++ K4, K5, K6, K7)>; ++def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "ZMM%u", 6, 21), ++ K4, K5, K6, K7)>; +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp +index 10e19f9..5af236a 100644 +--- a/lib/Target/X86/X86ISelLowering.cpp ++++ b/lib/Target/X86/X86ISelLowering.cpp +@@ -3203,7 +3203,8 @@ SDValue X86TargetLowering::LowerFormalArguments( + // FIXME: Only some x86_32 calling conventions support AVX512. 
+ if (Subtarget.hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || +- CallConv == CallingConv::Intel_OCL_BI))) ++ CallConv == CallingConv::Intel_OCL_BI || ++ CallConv == CallingConv::Intel_SVML))) + VecVT = MVT::v16f32; + else if (Subtarget.hasAVX()) + VecVT = MVT::v8f32; +diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp +index bc31e95..a8b1fa6 100644 +--- a/lib/Target/X86/X86RegisterInfo.cpp ++++ b/lib/Target/X86/X86RegisterInfo.cpp +@@ -311,6 +311,23 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return CSR_64_Intel_OCL_BI_SaveList; + break; + } ++ case CallingConv::Intel_SVML: { ++ if (Is64Bit) { ++ if (HasAVX512) ++ return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_SaveList : ++ CSR_64_Intel_SVML_AVX512_SaveList; ++ if (HasAVX) ++ return IsWin64 ? CSR_Win64_Intel_SVML_AVX_SaveList : ++ CSR_64_Intel_SVML_AVX_SaveList; ++ ++ return IsWin64 ? CSR_Win64_Intel_SVML_SaveList : ++ CSR_64_Intel_SVML_SaveList; ++ } else { // Is32Bit ++ if (HasAVX512) ++ return CSR_32_Intel_SVML_AVX512_SaveList; ++ return CSR_32_Intel_SVML_SaveList; ++ } ++ } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; + case CallingConv::X86_RegCall: +@@ -425,6 +442,23 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + return CSR_64_Intel_OCL_BI_RegMask; + break; + } ++ case CallingConv::Intel_SVML: { ++ if (Is64Bit) { ++ if (HasAVX512) ++ return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_RegMask : ++ CSR_64_Intel_SVML_AVX512_RegMask; ++ if (HasAVX) ++ return IsWin64 ? CSR_Win64_Intel_SVML_AVX_RegMask : ++ CSR_64_Intel_SVML_AVX_RegMask; ++ ++ return IsWin64 ? CSR_Win64_Intel_SVML_RegMask : ++ CSR_64_Intel_SVML_RegMask; ++ } else { // Is32Bit ++ if (HasAVX512) ++ return CSR_32_Intel_SVML_AVX512_RegMask; ++ return CSR_32_Intel_SVML_RegMask; ++ } ++ } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; + case CallingConv::X86_RegCall: +diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h +index 37ffac1..8ad2131 100644 +--- a/lib/Target/X86/X86Subtarget.h ++++ b/lib/Target/X86/X86Subtarget.h +@@ -673,6 +673,7 @@ public: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case CallingConv::Intel_OCL_BI: ++ case CallingConv::Intel_SVML: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::Win64: +diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp +index 5bcf0c0..cef0009 100644 +--- a/lib/Transforms/Vectorize/LoopVectorize.cpp ++++ b/lib/Transforms/Vectorize/LoopVectorize.cpp +@@ -3974,6 +3974,17 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, + if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) + return Cost; + ++ // this goes against LLVM coding philosophy, but it'll stop bleeding ++ bool IgnoreMe; ++ StringRef FuncName = TLI->getVectorizedFunction(FnName, VF, IgnoreMe, true); ++#if LLVM_ON_WIN32 ++ StringRef SvmlPrefix("\0_svml", 6); // nobody knows why symbols are like this ++#else ++ StringRef SvmlPrefix("__svml"); ++#endif ++ if (FuncName.startswith(SvmlPrefix) && !TTI.isTypeLegal(RetTy)) ++ return Cost; ++ + // If the corresponding vector cost is cheaper, return its cost. 
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); + if (VectorCallCost < Cost) { +@@ -4917,6 +4923,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { + } + + Function *VectorF; ++ bool FromSVML = false; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; +@@ -4925,7 +4932,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. +- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); ++ bool IsFast = CI->getFastMathFlags().isFast(); ++ std::string VFnName = TLI->getVectorizedFunction(FnName, VF, FromSVML, IsFast); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { +@@ -4944,7 +4952,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { + + if (isa(V)) + V->copyFastMathFlags(CI); +- ++ if (FromSVML) V->setCallingConv(CallingConv::Intel_SVML); + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, &I); + } +diff --git a/test/Transforms/LoopVectorize/X86/svml-calls.ll b/test/Transforms/LoopVectorize/X86/svml-calls.ll +index 6342a9d..39797c6 100644 +--- a/test/Transforms/LoopVectorize/X86/svml-calls.ll ++++ b/test/Transforms/LoopVectorize/X86/svml-calls.ll +@@ -182,4 +182,44 @@ for.end: ; preds = %for.body + ret void + } + ++; CHECK-LABEL: @atan2_finite ++; CHECK: <8 x double> @__svml_atan28 ++; CHECK: ret ++ ++declare double @__atan2_finite(double, double) local_unnamed_addr #0 ++ ++define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 { ++entry: ++ br label %for.cond1.preheader ++ ++for.cond1.preheader: ; preds = %for.inc7, %entry ++ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ] ++ %0 = trunc i64 %indvars.iv19 to i32 ++ %conv = sitofp i32 %0 to double ++ br label %for.body3 ++ ++for.body3: ; preds = %for.body3, %for.cond1.preheader ++ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %1 = trunc i64 %indvars.iv.next to i32 ++ %conv4 = sitofp i32 %1 to double ++ %call = tail call fast double @__atan2_finite(double %conv, double %conv4) ++ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv ++ store double %call, double* %arrayidx6, align 8 ++ %exitcond = icmp eq i64 %indvars.iv.next, 100 ++ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5 ++ ++for.inc7: ; preds = %for.body3 ++ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 ++ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100 ++ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader ++ ++for.end9: ; preds = %for.inc7 ++ ret void ++} ++ + attributes #0 = { nounwind readnone } ++ ++!5 = distinct !{!5, !6, !7} ++!6 = !{!"llvm.loop.vectorize.width", i32 8} ++!7 = !{!"llvm.loop.vectorize.enable", i1 true} +diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt +index 0944d54..7b68420 100644 +--- a/utils/TableGen/CMakeLists.txt ++++ b/utils/TableGen/CMakeLists.txt +@@ -36,6 +36,7 @@ add_tablegen(llvm-tblgen LLVM + SearchableTableEmitter.cpp + SubtargetEmitter.cpp + SubtargetFeatureInfo.cpp ++ SVMLEmitter.cpp + TableGen.cpp + Types.cpp + X86DisassemblerTables.cpp +diff --git a/utils/TableGen/SVMLEmitter.cpp b/utils/TableGen/SVMLEmitter.cpp +new file mode 100644 +index 0000000..c80f055 +--- 
/dev/null ++++ b/utils/TableGen/SVMLEmitter.cpp +@@ -0,0 +1,114 @@ ++//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This tablegen backend emits the scalar to svml function map for TLI. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "CodeGenTarget.h" ++#include "llvm/Support/Format.h" ++#include "llvm/TableGen/Error.h" ++#include "llvm/TableGen/Record.h" ++#include "llvm/TableGen/TableGenBackend.h" ++#include ++#include ++ ++using namespace llvm; ++ ++#define DEBUG_TYPE "SVMLVariants" ++#include "llvm/Support/Debug.h" ++ ++namespace { ++ ++class SVMLVariantsEmitter { ++ ++ RecordKeeper &Records; ++ ++private: ++ void emitSVMLVariants(raw_ostream &OS); ++ ++public: ++ SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {} ++ ++ void run(raw_ostream &OS); ++}; ++} // End anonymous namespace ++ ++/// \brief Emit the set of SVML variant function names. ++// The default is to emit the high accuracy SVML variants until a mechanism is ++// introduced to allow a selection of different variants through precision ++// requirements specified by the user. This code generates mappings to svml ++// that are in the scalar form of llvm intrinsics, math library calls, or the ++// finite variants of math library calls. ++void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) { ++ ++ unsigned MinSinglePrecVL = 4; ++ unsigned MaxSinglePrecVL = 16; ++ unsigned MinDoublePrecVL = 2; ++ unsigned MaxDoublePrecVL = 8; ++ ++ Record *SvmlVariantsClass = Records.getClass("SvmlVariant"); ++ assert(SvmlVariantsClass && ++ "SvmlVariant class not found in target description file!"); ++ ++ OS << "#ifdef GET_SVML_VARIANTS\n"; ++ ++ for (const auto &D : Records.getDefs()) { ++ std::string SvmlVariantNameStr = D.first; ++ // Single Precision SVML ++ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) { ++ // Emit the scalar math library function to svml function entry. ++ OS << "{\"" << SvmlVariantNameStr << "f" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << VL << "},\n"; ++ ++ // Emit the scalar intrinsic to svml function entry. ++ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << VL << "},\n"; ++ ++ // Emit the finite math library function to svml function entry. ++ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << VL << "},\n"; ++ } ++ ++ // Double Precision SVML ++ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) { ++ // Emit the scalar math library function to svml function entry. ++ OS << "{\"" << SvmlVariantNameStr << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL ++ << "},\n"; ++ ++ // Emit the scalar intrinsic to svml function entry. ++ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL ++ << "},\n"; ++ ++ // Emit the finite math library function to svml function entry. 
++ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " ++ << VL << "},\n"; ++ } ++ } ++ ++ OS << "#endif // GET_SVML_VARIANTS\n\n"; ++} ++ ++void SVMLVariantsEmitter::run(raw_ostream &OS) { ++ emitSVMLVariants(OS); ++} ++ ++namespace llvm { ++ ++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) { ++ SVMLVariantsEmitter(RK).run(OS); ++} ++ ++} // End llvm namespace +diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp +index b0e0385..3e8cd88 100644 +--- a/utils/TableGen/TableGen.cpp ++++ b/utils/TableGen/TableGen.cpp +@@ -49,6 +49,7 @@ enum ActionType { + GenX86EVEX2VEXTables, + GenX86FoldTables, + GenRegisterBank, ++ GenSVMLVariants, + }; + + namespace { +@@ -105,7 +106,9 @@ namespace { + clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", + "Generate X86 fold tables"), + clEnumValN(GenRegisterBank, "gen-register-bank", +- "Generate registers bank descriptions"))); ++ "Generate registers bank descriptions"), ++ clEnumValN(GenSVMLVariants, "gen-svml", ++ "Generate SVML variant function names"))); + + cl::OptionCategory PrintEnumsCat("Options for -print-enums"); + cl::opt +@@ -207,6 +210,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { + case GenX86FoldTables: + EmitX86FoldTables(Records, OS); + break; ++ case GenSVMLVariants: ++ EmitSVMLVariants(Records, OS); ++ break; + } + + return false; +diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h +index 914cd5a..bdf8b4d 100644 +--- a/utils/TableGen/TableGenBackends.h ++++ b/utils/TableGen/TableGenBackends.h +@@ -85,6 +85,7 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); + void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); + void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); + void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); ++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS); + + } // End llvm namespace + +diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim +index 42a4cf3..9198a6f 100644 +--- a/utils/vim/syntax/llvm.vim ++++ b/utils/vim/syntax/llvm.vim +@@ -92,6 +92,7 @@ syn keyword llvmKeyword + \ inreg + \ inteldialect + \ intel_ocl_bicc ++ \ intel_svmlcc + \ internal + \ linkonce + \ linkonce_odr diff --git a/conda-recipes/llvmdev/meta.yaml b/conda-recipes/llvmdev/meta.yaml index 0789a009d..08f39df37 100644 --- a/conda-recipes/llvmdev/meta.yaml +++ b/conda-recipes/llvmdev/meta.yaml @@ -1,7 +1,18 @@ -{% set shortversion = "7.0" %} -{% set version = "7.0.0" %} -{% set sha256 = "8bc1f844e6cbde1b652c19c1edebc1864456fd9c78b8c1bea038e51b363fe222" %} -{% set build_number = "0" %} +{% set shortversion = "6.0" %} + +{% if ppc64le %} + +{% set version = "6.0.1" %} +{% set sha256 = "b6d6c324f9c71494c0ccaf3dac1f16236d970002b42bb24a6c9e1634f7d0f4e2" %} +{% set build_number = "1" %} + +{% else %} + +{% set version = "6.0.0" %} +{% set sha256 = "1ff53c915b4e761ef400b803f07261ade637b0c269d99569f18040f3dcee4408" %} +{% set build_number = "5" %} + +{% endif %} package: name: llvmdev @@ -15,11 +26,9 @@ source: # http://lists.llvm.org/pipermail/llvm-dev/2016-January/094520.html - ../llvm-lto-static.patch # [win] # Intel SVML optimizations - # The second part of this patch was published as: - # https://reviews.llvm.org/D53035 - # (the first, as mentioned in the patch itself, was: - # https://reviews.llvm.org/D47188) - - ../D47188-svml-VF.patch + - ../D47188-svml.patch + # https://reviews.llvm.org/D44140 Fix LLVM-C symbol export, backport to 6.0.0 from 
upstream + - ../0001-Transforms-Add-missing-header-for-InstructionCombini.patch # [not ppc64le] # undefined behavior bug due to Twine usage - ../twine_cfg_undefined_behavior.patch diff --git a/conda-recipes/llvmdev_manylinux1/meta.yaml b/conda-recipes/llvmdev_manylinux1/meta.yaml index bb4e10dd5..03b6e98c8 100644 --- a/conda-recipes/llvmdev_manylinux1/meta.yaml +++ b/conda-recipes/llvmdev_manylinux1/meta.yaml @@ -1,8 +1,7 @@ -{% set shortversion = "7.0" %} -{% set version = "7.0.0" %} -{% set sha256 = "8bc1f844e6cbde1b652c19c1edebc1864456fd9c78b8c1bea038e51b363fe222" %} -{% set build_number = "0" %} - +{% set shortversion = "6.0" %} +{% set version = "6.0.0" %} +{% set sha256 = "1ff53c915b4e761ef400b803f07261ade637b0c269d99569f18040f3dcee4408" %} +{% set build_number = "2" %} package: name: llvmdev @@ -16,7 +15,9 @@ source: # http://lists.llvm.org/pipermail/llvm-dev/2016-January/094520.html - ../llvm-lto-static.patch # [win] # Intel SVML optimizations - - ../D47188-svml-VF.patch + - ../D47188-svml.patch + # https://reviews.llvm.org/D44140 Fix LLVM-C symbol export + - ../0001-Transforms-Add-missing-header-for-InstructionCombini.patch # undefined behavior bug due to Twine usage - ../twine_cfg_undefined_behavior.patch diff --git a/conda-recipes/llvmlite/meta.yaml b/conda-recipes/llvmlite/meta.yaml index 4e41a91ed..7d61a99bb 100644 --- a/conda-recipes/llvmlite/meta.yaml +++ b/conda-recipes/llvmlite/meta.yaml @@ -25,7 +25,7 @@ requirements: host: - python # On channel https://anaconda.org/numba/ - - llvmdev 7.0* + - llvmdev 6.0* - vs2015_runtime # [win] - enum34 # [py27] # llvmdev is built with libz compression support diff --git a/docs/source/admin-guide/install.rst b/docs/source/admin-guide/install.rst index a08d47dad..e0beb41f0 100644 --- a/docs/source/admin-guide/install.rst +++ b/docs/source/admin-guide/install.rst @@ -73,14 +73,16 @@ The LLVM build process is fully scripted by conda-build_, and the `llvmdev recip The manual instructions below describe the main steps, but refer to the recipe for details: -#. Download the `LLVM 7.0.0 source code `_. +#. Download the `LLVM 6.0.0 source code `_. +(Note that PPC64LE requires LLVM 6.0.1 for specific bug fixes.) #. Download or git checkout the `llvmlite source code `_. #. Decompress the LLVM tar file and apply the following patches from the ``llvmlite/conda-recipes/`` directory: #. ``llvm-lto-static.patch``: Fix issue with LTO shared library on Windows - #. ``D47188-svml-VF.patch``: Add support for vectorized math functions via Intel SVML + #. ``D47188-svml.patch``: Add support for vectorized math functions via Intel SVML + #. ``0001-Transforms-Add-missing-header-for-InstructionCombini.patch``: Fix release bug with LLVM 6.0.0, skip on LLVM 6.0.1. #. ``twine_cfg_undefined_behavior.patch``: Fix obscure memory corruption bug in LLVM that hasn't been fixed in master yet #. For Linux/macOS: diff --git a/docs/source/conf.py b/docs/source/conf.py index 0f70f4b49..787d70b50 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -58,9 +58,9 @@ # built documents. # # The short X.Y version. -version = '0.25.0' +version = '0.24.0' # The full version, including alpha/beta/rc tags. -release = '0.25.0' +release = '0.24.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -301,5 +301,5 @@ # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), - 'llvm': ('http://llvm.org/releases/7.0.0/docs', None), + 'llvm': ('http://llvm.org/releases/6.0.0/docs', None), } diff --git a/docs/source/user-guide/ir/index.rst b/docs/source/user-guide/ir/index.rst index 3eebc5e69..26ed9d053 100644 --- a/docs/source/user-guide/ir/index.rst +++ b/docs/source/user-guide/ir/index.rst @@ -18,7 +18,7 @@ construct a pure Python representation of the IR. To use this module, you should be familiar with the concepts in the `LLVM Language Reference -`_. +`_. .. toctree:: :maxdepth: 1 diff --git a/ffi/build.py b/ffi/build.py index 5933fc3c4..643ff07f2 100755 --- a/ffi/build.py +++ b/ffi/build.py @@ -109,9 +109,9 @@ def main_posix(kind, library_ext): out = out.decode('latin1') print(out) - if not out.startswith('7.0.'): + if not out.startswith('6.0.'): msg = ( - "Building llvmlite requires LLVM 7.0.x. Be sure to " + "Building llvmlite requires LLVM 6.0.x. Be sure to " "set LLVM_CONFIG to the right executable path.\n" "Read the documentation at http://llvmlite.pydata.org/ for more " "information about building llvmlite.\n" diff --git a/llvmlite/tests/test_binding.py b/llvmlite/tests/test_binding.py index 0b50ca603..0f3468d88 100644 --- a/llvmlite/tests/test_binding.py +++ b/llvmlite/tests/test_binding.py @@ -333,7 +333,7 @@ def test_set_option(self): def test_version(self): major, minor, patch = llvm.llvm_version_info - self.assertEqual((major, minor), (7, 0)) + self.assertEqual((major, minor), (6, 0)) self.assertIn(patch, range(10)) def test_check_jit_execution(self): @@ -1041,29 +1041,9 @@ def test_run(self): orig_asm = str(mod) pm.run(mod) opt_asm = str(mod) - # Quick check that optimizations were run, should get: - # define i32 @sum(i32 %.1, i32 %.2) local_unnamed_addr #0 { - # %.X = add i32 %.2, %.1 - # ret i32 %.X - # } - # where X in %.X is 3 or 4 - opt_asm_split = opt_asm.splitlines() - for idx, l in enumerate(opt_asm_split): - if l.strip().startswith('ret i32'): - toks = {'%.3', '%.4'} - for t in toks: - if t in l: - break - else: - raise RuntimeError("expected tokens not found") - add_line = opt_asm_split[idx] - othertoken = (toks ^ {t}).pop() - - self.assertIn("%.3", orig_asm) - self.assertNotIn(othertoken, opt_asm) - break - else: - raise RuntimeError("expected IR not found") + # Quick check that optimizations were run + self.assertIn("%.3", orig_asm) + self.assertNotIn("%.3", opt_asm) class TestFunctionPassManager(BaseTest, PassManagerTestMixin):
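
The simplified check above relies on the named temporary %.3 (the stack slot
present in the unoptimized IR) disappearing once the module pass manager has
run. A minimal standalone sketch of that pattern through the llvmlite binding
layer (assuming a build against LLVM 6.0.x; the @sum IR below is a
hypothetical stand-in for the module the test suite generates):

    import llvmlite.binding as llvm

    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()

    ir = r'''
    define i32 @sum(i32 %.1, i32 %.2) {
      %.3 = alloca i32
      %.4 = add i32 %.1, %.2
      store i32 %.4, i32* %.3
      %.5 = load i32, i32* %.3
      ret i32 %.5
    }
    '''

    mod = llvm.parse_assembly(ir)

    # Build an -O2 module pass pipeline.
    pmb = llvm.create_pass_manager_builder()
    pmb.opt_level = 2
    pm = llvm.create_module_pass_manager()
    pmb.populate(pm)

    orig_asm = str(mod)
    pm.run(mod)
    opt_asm = str(mod)

    assert "%.3" in orig_asm      # the alloca exists before optimization
    assert "%.3" not in opt_asm   # promotion passes fold the stack slot away

This is the same assertion shape as the rewritten test_run: presence of the
temporary before pm.run(), absence afterwards, without pinning the exact form
of the optimized body.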