diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index 178faf6c3b28e..019975cc3ba05 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -7,6 +7,7 @@ #include "llvm-version.h" #include "support/dtypes.h" +#include #include #include @@ -99,23 +100,41 @@ void LowerPTLS::runOnFunction(LLVMContext &ctx, Module &M, Function *F, ptlsStates->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); } - else if (jl_tls_offset != -1) { #ifdef LLVM37 + else if (jl_tls_offset != -1) { auto T_int8 = Type::getInt8Ty(ctx); auto T_pint8 = PointerType::get(T_int8, 0); - auto T_size = (sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) : - Type::getInt32Ty(ctx)); // Replace the function call with inline assembly if we know // how to generate it. - const char *asm_str = nullptr; +# if defined(_CPU_X86_64_) || defined(_CPU_X86_) + // Workaround LLVM bug by hiding the offset computation + // (and therefore the optimization opportunity) from LLVM. + static const std::string asm_str = [&] () { + std::stringstream stm; # if defined(_CPU_X86_64_) - asm_str = "movq %fs:0, $0"; -# elif defined(_CPU_X86_) - asm_str = "movl %gs:0, $0"; -# elif defined(_CPU_AARCH64_) - asm_str = "mrs $0, tpidr_el0"; + stm << "movq %fs:0, $0;\naddq $$" << jl_tls_offset << ", $0"; +# else + stm << "movl %gs:0, $0;\naddl $$" << jl_tls_offset << ", $0"; # endif - assert(asm_str && "Cannot emit thread pointer for this architecture."); + return stm.str(); + }(); + // The add instruction clobbers flags + auto tp = InlineAsm::get(FunctionType::get(T_pint8, false), + asm_str.c_str(), + "=r,~{dirflag},~{fpsr},~{flags}", false); + Value *tls = CallInst::Create(tp, "ptls_i8", ptlsStates); + tls = new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0), + "ptls", ptlsStates); +# elif defined(_CPU_AARCH64_) + // AArch64 doesn't seem to have this issue. + // (Possibly because there are many more registers and the offset is + // positive and small) + // It's also harder to emit the offset in a generic way on AArch64 + // (need to generate one or two `add` with shift) so let llvm emit + // the add for now. + auto T_size = (sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) : + Type::getInt32Ty(ctx)); + const char *asm_str = "mrs $0, tpidr_el0"; auto offset = ConstantInt::getSigned(T_size, jl_tls_offset); auto tp = InlineAsm::get(FunctionType::get(T_pint8, false), asm_str, "=r", false); @@ -124,10 +143,14 @@ void LowerPTLS::runOnFunction(LLVMContext &ctx, Module &M, Function *F, "ptls_i8", ptlsStates); tls = new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0), "ptls", ptlsStates); +# else + Value *tls = nullptr; + assert(0 && "Cannot emit thread pointer for this architecture."); +# endif ptlsStates->replaceAllUsesWith(tls); ptlsStates->eraseFromParent(); -#endif } +#endif else { ptlsStates->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); diff --git a/src/threading.c b/src/threading.c index b4de746abe526..6b1d373c7f1f6 100644 --- a/src/threading.c +++ b/src/threading.c @@ -147,7 +147,7 @@ jl_get_ptls_states_func jl_get_ptls_states_getter(void) #if defined(__GLIBC__) && (defined(_CPU_X86_64_) || defined(_CPU_X86_) || \ ((defined(_CPU_AARCH64_) || defined(_CPU_ARM_) || \ defined(_CPU_PPC64_) || defined(_CPU_PPC_)) && \ - __GNUC__ >= 5)) + __GNUC__ >= 6)) // Only enable this on architectures that are tested. // For example, GCC doesn't seem to support the `ifunc` attribute on power yet. # if __GLIBC_PREREQ(2, 12)