From a8d7b68072d15d0ec1a19e27a8064253bea912c5 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 19 Sep 2024 12:45:52 -0400 Subject: [PATCH 1/2] Improvements to JITLink Seeing what this will look like, since it has a number of features (delayed compilation, concurrent compilation) that are starting to become important, so it would be nice to switch to only supporting one common implementation of memory management. Refs #50248 I am expecting https://github.com/llvm/llvm-project/issues/63236 may cause some problems, since we reconfigured some CI machines to minimize that issue, but it is still likely relevant. --- src/codegen.cpp | 14 +-- src/jitlayers.cpp | 289 +++++++++++++++++++++++++--------------------- src/jitlayers.h | 26 ++--- 3 files changed, 174 insertions(+), 155 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index a7a985284c87b..cee88a3d20f3d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -10396,7 +10396,7 @@ static void init_jit_functions(void) } #ifdef JL_USE_INTEL_JITEVENTS -char jl_using_intel_jitevents; // Non-zero if running under Intel VTune Amplifier +char jl_using_intel_jitevents = 0; // Non-zero if running under Intel VTune Amplifier #endif #ifdef JL_USE_OPROFILE_JITEVENTS @@ -10510,9 +10510,6 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_INTEL_JITEVENTS) || \ defined(JL_USE_OPROFILE_JITEVENTS) || \ defined(JL_USE_PERF_JITEVENTS) -#ifdef JL_USE_JITLINK -#pragma message("JIT profiling support (JL_USE_*_JITEVENTS) not yet available on platforms that use JITLink") -#else const char *jit_profiling = getenv("ENABLE_JITPROFILING"); #if defined(JL_USE_INTEL_JITEVENTS) @@ -10529,24 +10526,23 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_PERF_JITEVENTS) if (jit_profiling && atoi(jit_profiling)) { - jl_using_perf_jitevents= 1; + jl_using_perf_jitevents = 1; } #endif #ifdef JL_USE_INTEL_JITEVENTS if (jl_using_intel_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createIntelJITEventListener()); + jl_ExecutionEngine->enableIntelJITEventListener(); #endif #ifdef JL_USE_OPROFILE_JITEVENTS if (jl_using_oprofile_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createOProfileJITEventListener()); + jl_ExecutionEngine->enableOProfileJITEventListener(); #endif #ifdef JL_USE_PERF_JITEVENTS if (jl_using_perf_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); -#endif + jl_ExecutionEngine->enablePerfJITEventListener(); #endif #endif diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 442103c91be0f..7489b086105e6 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -14,6 +14,15 @@ #include #include #include +#if JL_LLVM_VERSION >= 180000 +#include +#include +#include +#endif +#if JL_LLVM_VERSION >= 190000 +#include +#include +#endif #include #include #include @@ -142,13 +151,14 @@ void jl_dump_llvm_opt_impl(void *s) **jl_ExecutionEngine->get_dump_llvm_opt_stream() = (ios_t*)s; } +#ifndef JL_USE_JITLINK static int jl_add_to_ee( orc::ThreadSafeModule &M, const StringMap &NewExports, DenseMap &Queued, SmallVectorImpl &Stack) JL_NOTSAFEPOINT; +#endif static void jl_decorate_module(Module &M) JL_NOTSAFEPOINT; -static uint64_t getAddressForFunction(StringRef fname) JL_NOTSAFEPOINT; void jl_link_global(GlobalVariable *GV, void *addr) JL_NOTSAFEPOINT { @@ -177,23 +187,6 @@ void jl_jit_globals(std::map &globals) JL_NOTSAFEPOINT } } -// used for image_codegen, where we keep all the gvs external -// so we can't jit them directly into each module -static orc::ThreadSafeModule jl_get_globals_module(orc::ThreadSafeContext &ctx, const DataLayout &DL, const Triple &T, std::map &globals) JL_NOTSAFEPOINT -{ - auto lock = ctx.getLock(); - auto GTSM = jl_create_ts_module("globals", ctx, DL, T); - auto GM = GTSM.getModuleUnlocked(); - for (auto &global : globals) { - auto GV = global.second; - auto GV2 = new GlobalVariable(*GM, GV->getValueType(), GV->isConstant(), GlobalValue::ExternalLinkage, literal_static_pointer_val(global.first, GV->getValueType()), GV->getName(), nullptr, GV->getThreadLocalMode(), GV->getAddressSpace(), false); - GV2->copyAttributesFrom(GV); - GV2->setDSOLocal(true); - GV2->setAlignment(GV->getAlign()); - } - return GTSM; -} - // this generates llvm code for the lambda info // and adds the result to the jitlayers // (and the shadow module), @@ -238,8 +231,21 @@ static jl_callptr_t _jl_compile_codeinst( // to ensure that the globals are defined when they are compiled. if (params.imaging_mode) { // Won't contain any PLT/dlsym calls, so no need to optimize those - jl_ExecutionEngine->addModule(jl_get_globals_module(params.tsctx, params.DL, params.TargetTriple, params.global_targets)); - } else { + if (!params.global_targets.empty()) { + void **globalslots = new void*[params.global_targets.size()]; + void **slot = globalslots; + for (auto &global : params.global_targets) { + auto GV = global.second; + *slot = global.first; + jl_ExecutionEngine->addGlobalMapping(GV->getName(), (uintptr_t)slot); + slot++; + } +#ifdef __clang_analyzer__ + static void **leaker = globalslots; // for the purpose of the analyzer, we need to expressly leak this variable or it thinks we forgot to free it +#endif + } + } + else { StringMap NewGlobals; for (auto &global : params.global_targets) { NewGlobals[global.second->getName()] = global.first; @@ -255,6 +261,7 @@ static jl_callptr_t _jl_compile_codeinst( } } +#ifndef JL_USE_JITLINK // Collect the exported functions from the params.compiled_functions modules, // which form dependencies on which functions need to be // compiled first. Cycles of functions are compiled together. @@ -281,18 +288,40 @@ static jl_callptr_t _jl_compile_codeinst( jl_add_to_ee(M, NewExports, Queued, Stack); assert(Queued.empty() && Stack.empty() && !M); } +#else + for (auto &def : params.compiled_functions) { + // Add the results to the execution engine now + orc::ThreadSafeModule &M = std::get<0>(def.second); + if (M) + jl_ExecutionEngine->addModule(std::move(M)); + } +#endif ++CompiledCodeinsts; MaxWorkqueueSize.updateMax(params.compiled_functions.size()); IndirectCodeinsts += params.compiled_functions.size() - 1; } + // batch compile job for all new functions + SmallVector NewDefs; + for (auto &def : params.compiled_functions) { + jl_llvm_functions_t &decls = std::get<1>(def.second); + if (decls.functionObject != "jl_fptr_args" && + decls.functionObject != "jl_fptr_sparam" && + decls.functionObject != "jl_f_opaque_closure_call") + NewDefs.push_back(decls.functionObject); + if (!decls.specFunctionObject.empty()) + NewDefs.push_back(decls.specFunctionObject); + } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + size_t i = 0; + size_t nextaddr = 0; for (auto &def : params.compiled_functions) { jl_code_instance_t *this_code = def.first; if (i < jl_timing_print_limit) jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_DEFAULT_BLOCK); - jl_llvm_functions_t decls = std::get<1>(def.second); + jl_llvm_functions_t &decls = std::get<1>(def.second); jl_callptr_t addr; bool isspecsig = false; if (decls.functionObject == "jl_fptr_args") { @@ -305,12 +334,16 @@ static jl_callptr_t _jl_compile_codeinst( addr = jl_f_opaque_closure_call_addr; } else { - addr = (jl_callptr_t)getAddressForFunction(decls.functionObject); + assert(NewDefs[nextaddr] == decls.functionObject); + addr = (jl_callptr_t)Addrs[nextaddr++]; + assert(addr); isspecsig = true; } if (!decls.specFunctionObject.empty()) { void *prev_specptr = NULL; - auto spec = (void*)getAddressForFunction(decls.specFunctionObject); + assert(NewDefs[nextaddr] == decls.specFunctionObject); + void *spec = (void*)Addrs[nextaddr++]; + assert(spec); if (jl_atomic_cmpswap_acqrel(&this_code->specptr.fptr, &prev_specptr, spec)) { // only set specsig and invoke if we were the first to set specptr jl_atomic_store_relaxed(&this_code->specsigflags, (uint8_t) isspecsig); @@ -601,48 +634,6 @@ static auto countBasicBlocks(const Function &F) JL_NOTSAFEPOINT static constexpr size_t N_optlevels = 4; -static Expected validateExternRelocations(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { -#if !defined(JL_NDEBUG) && !defined(JL_USE_JITLINK) - auto isIntrinsicFunction = [](GlobalObject &GO) JL_NOTSAFEPOINT { - auto F = dyn_cast(&GO); - if (!F) - return false; - return F->isIntrinsic() || F->getName().starts_with("julia."); - }; - // validate the relocations for M (only for RuntimeDyld, JITLink performs its own symbol validation) - auto Err = TSM.withModuleDo([isIntrinsicFunction](Module &M) JL_NOTSAFEPOINT { - Error Err = Error::success(); - for (auto &GO : make_early_inc_range(M.global_objects())) { - if (!GO.isDeclarationForLinker()) - continue; - if (GO.use_empty()) { - GO.eraseFromParent(); - continue; - } - if (isIntrinsicFunction(GO)) - continue; - auto sym = jl_ExecutionEngine->findUnmangledSymbol(GO.getName()); - if (sym) - continue; - // TODO have we ever run into this check? It's been guaranteed to not - // fire in an assert build, since previously LLVM would abort due to - // not handling the error if we didn't find the unmangled symbol - if (SectionMemoryManager::getSymbolAddressInProcess( - jl_ExecutionEngine->getMangledName(GO.getName()))) { - consumeError(sym.takeError()); - continue; - } - Err = joinErrors(std::move(Err), sym.takeError()); - } - return Err; - }); - if (Err) { - return std::move(Err); - } -#endif - return std::move(TSM); -} - static Expected selectOptLevel(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) { TSM.withModuleDo([](Module &M) { size_t opt_level = std::max(static_cast(jl_options.opt_level), 0); @@ -673,18 +664,6 @@ static Expected selectOptLevel(orc::ThreadSafeModule TSM, return std::move(TSM); } -static void recordDebugTSM(orc::MaterializationResponsibility &, orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { - auto ptr = TSM.withModuleDo([](Module &M) JL_NOTSAFEPOINT { - auto md = M.getModuleFlag("julia.__jit_debug_tsm_addr"); - if (!md) - return static_cast(nullptr); - return reinterpret_cast(cast(cast(md)->getValue())->getZExtValue()); - }); - if (ptr) { - *ptr = std::move(TSM); - } -} - void jl_register_jit_object(const object::ObjectFile &debugObj, std::function getLoadAddress, std::function lookupWriteAddress); @@ -1584,6 +1563,7 @@ JuliaOJIT::JuliaOJIT() #ifdef JL_USE_JITLINK MemMgr(createJITLinkMemoryManager()), ObjectLayer(ES, *MemMgr), + CompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), #else MemMgr(createRTDyldMemoryManager()), ObjectLayer( @@ -1593,15 +1573,12 @@ JuliaOJIT::JuliaOJIT() return result; } ), -#endif LockLayer(ObjectLayer), CompileLayer(ES, LockLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), +#endif JITPointersLayer(ES, CompileLayer, orc::IRTransformLayer::TransformFunction(JITPointersT(SharedBytes, RLST_mutex))), OptimizeLayer(ES, JITPointersLayer, orc::IRTransformLayer::TransformFunction(OptimizerT(*TM, PrintLLVMTimers, llvm_printing_mutex))), - OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)), - DepsVerifyLayer(ES, OptSelLayer, orc::IRTransformLayer::TransformFunction(validateExternRelocations)), - ExternalCompileLayer(ES, LockLayer, - std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)) + OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)) { JL_MUTEX_INIT(&this->jitlock, "JuliaOJIT"); #ifdef JL_USE_JITLINK @@ -1625,7 +1602,6 @@ JuliaOJIT::JuliaOJIT() registerRTDyldJITObject(Object, LO, MemMgr); }); #endif - CompileLayer.setNotifyCompiled(recordDebugTSM); std::string ErrorStr; @@ -1786,8 +1762,8 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { JL_TIMING(LLVM_JIT, JIT_Total); ++ModulesAdded; +#ifndef JL_USE_JITLINK orc::SymbolLookupSet NewExports; - orc::ThreadSafeModule CurrentlyCompiling; TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { for (auto &F : M.global_values()) { if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { @@ -1796,42 +1772,24 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) } } assert(!verifyLLVMIR(M)); - auto jit_debug_tsm_addr = ConstantInt::get(Type::getIntNTy(M.getContext(), sizeof(void*) * CHAR_BIT), (uintptr_t) &CurrentlyCompiling); - M.addModuleFlag(Module::Error, "julia.__jit_debug_tsm_addr", jit_debug_tsm_addr); }); +#endif - // TODO: what is the performance characteristics of this? - auto Err = DepsVerifyLayer.add(JD, std::move(TSM)); + auto Err = OptSelLayer.add(JD, std::move(TSM)); if (Err) { ES.reportError(std::move(Err)); errs() << "Failed to add module to JIT!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } abort(); } +#ifndef JL_USE_JITLINK // force eager compilation (for now), due to memory management specifics // (can't handle compilation recursion) auto Lookups = ES.lookup({{&JD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly}}, NewExports); if (!Lookups) { ES.reportError(Lookups.takeError()); errs() << "Failed to lookup symbols in module!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } - } - for (auto &Sym : *Lookups) { - #if JL_LLVM_VERSION >= 170000 - assert(Sym.second.getAddress()); - #else - assert(Sym.second); - #endif - (void) Sym; } +#endif } Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, bool ShouldOptimize) @@ -1850,12 +1808,33 @@ Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, return Error::success(); })) return Err; - return ExternalCompileLayer.add(JD.getDefaultResourceTracker(), std::move(TSM)); + return CompileLayer.add(JD.getDefaultResourceTracker(), std::move(TSM)); } Error JuliaOJIT::addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) { assert(Obj && "Can not add null object"); +#ifdef JL_USE_JITLINK + return ObjectLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); +#else return LockLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); +#endif +} + +SmallVector JuliaOJIT::findSymbols(ArrayRef Names) +{ + DenseMap Unmangled; + orc::SymbolLookupSet Exports; + for (StringRef Name : Names) { + auto Mangled = ES.intern(getMangledName(Name)); + Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); + Exports.add(std::move(Mangled)); + } + SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SmallVector Addrs(Names.size()); + for (auto it : Syms) { + Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); + } + return Addrs; } #if JL_LLVM_VERSION >= 170000 @@ -1887,7 +1866,7 @@ uint64_t JuliaOJIT::getGlobalValueAddress(StringRef Name) consumeError(addr.takeError()); return 0; } - return cantFail(std::move(addr)).getAddress().getValue(); + return addr->getAddress().getValue(); } uint64_t JuliaOJIT::getFunctionAddress(StringRef Name) @@ -1897,7 +1876,7 @@ uint64_t JuliaOJIT::getFunctionAddress(StringRef Name) consumeError(addr.takeError()); return 0; } - return cantFail(std::move(addr)).getAddress().getValue(); + return addr->getAddress().getValue(); } #else JL_JITSymbol JuliaOJIT::findSymbol(StringRef Name, bool ExportedSymbolsOnly) @@ -1973,41 +1952,92 @@ StringRef JuliaOJIT::getFunctionAtAddress(uint64_t Addr, jl_callptr_t invoke, jl return *fname; } - #ifdef JL_USE_JITLINK -extern "C" orc::shared::CWrapperFunctionResult -llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size); +#if JL_LLVM_VERSION >= 170000 +#define addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = {ExecutorAddr::fromPtr(&name), JITSymbolFlags::Exported | JITSymbolFlags::Callable}, orc::ExecutorAddr::fromPtr(&name)) +#else +#define addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = JITEvaluatedSymbol::fromPointer(&name, JITSymbolFlags::Exported | JITSymbolFlags::Callable), orc::ExecutorAddr::fromPtr(&name)) +#endif void JuliaOJIT::enableJITDebuggingSupport() { orc::SymbolMap GDBFunctions; - #if JL_LLVM_VERSION >= 170000 - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBAllocAction), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - #else - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBAllocAction, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBWrapper, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - #endif + addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBAllocAction); + auto registerJITLoaderGDBWrapper = addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBWrapper); cantFail(JD.define(orc::absoluteSymbols(GDBFunctions))); if (TM->getTargetTriple().isOSBinFormatMachO()) ObjectLayer.addPlugin(cantFail(orc::GDBJITDebugInfoRegistrationPlugin::Create(ES, JD, TM->getTargetTriple()))); #ifndef _COMPILER_ASAN_ENABLED_ // TODO: Fix duplicated sections spam #51794 else if (TM->getTargetTriple().isOSBinFormatELF()) //EPCDebugObjectRegistrar doesn't take a JITDylib, so we have to directly provide the call address - ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, orc::ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper)))); + ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, registerJITLoaderGDBWrapper))); +#endif +} + +void JuliaOJIT::enableIntelJITEventListener() +{ +#if JL_LLVM_VERSION >= 190000 + if (TT.isOSBinFormatELF()) { + orc::SymbolMap VTuneFunctions; + auto RegisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_registerVTuneImpl); + auto UnregisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_unregisterVTuneImpl); + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(VTuneSupportPlugin::Create(ES.getExecutorProcessControl(), + // JD, /*EmitDebugInfo=*/true, + // /*TestMode=*/false))); + bool EmitDebugInfo = true; + ObjectLayer.addPlugin(std::make_unique( + ES.getExecutorProcessControl(), RegisterImplAddr, UnregisterImplAddr, EmitDebugInfo)); + } +#endif +} + +void JuliaOJIT::enableOProfileJITEventListener() +{ + // implement when available in LLVM +} + +void JuliaOJIT::enablePerfJITEventListener() +{ +#if JL_LLVM_VERSION >= 180000 + orc::SymbolMap PerfFunctions; + auto StartAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfStart); + auto EndAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfEnd); + auto ImplAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfImpl); + cantFail(JD.define(orc::absoluteSymbols(PerfFunctions))); + if (TM->getTargetTriple().isOSBinFormatELF()) { + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(PerfSupportPlugin::Create( + // ES.getExecutorProcessControl(), *JD, true, true))); + bool EmitDebugInfo = true, EmitUnwindInfo = true; + ObjectLayer.addPlugin(std::make_unique( + ES.getExecutorProcessControl(), StartAddr, EndAddr, ImplAddr, EmitDebugInfo, EmitUnwindInfo)); + } #endif } #else +void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) +{ + if (L) + ObjectLayer.registerJITEventListener(*L); +} void JuliaOJIT::enableJITDebuggingSupport() { RegisterJITEventListener(JITEventListener::createGDBRegistrationListener()); } - -void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) +void JuliaOJIT::enableIntelJITEventListener() { - if (!L) - return; - this->ObjectLayer.registerJITEventListener(*L); + RegisterJITEventListener(JITEventListener::createIntelJITEventListener()); +} +void JuliaOJIT::enableOProfileJITEventListener() +{ + RegisterJITEventListener(JITEventListener::createOProfileJITEventListener()); +} +void JuliaOJIT::enablePerfJITEventListener() +{ + RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); } #endif @@ -2251,6 +2281,7 @@ static void jl_decorate_module(Module &M) { } } +#ifndef JL_USE_JITLINK // Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable static int jl_add_to_ee( orc::ThreadSafeModule &M, @@ -2316,13 +2347,7 @@ static int jl_add_to_ee( jl_ExecutionEngine->addModule(std::move(M)); return 0; } - -static uint64_t getAddressForFunction(StringRef fname) -{ - auto addr = jl_ExecutionEngine->getFunctionAddress(fname); - assert(addr); - return addr; -} +#endif // helper function for adding a DLLImport (dlsym) address to the execution engine void add_named_global(StringRef name, void *addr) diff --git a/src/jitlayers.h b/src/jitlayers.h index 93669c2351d88..3353a4093bd27 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -47,7 +47,7 @@ // and feature support (e.g. Windows, JITEventListeners for various profilers, // etc.). Thus, we currently only use JITLink where absolutely required, that is, // for Mac/aarch64 and Linux/aarch64. -// #define JL_FORCE_JITLINK +//#define JL_FORCE_JITLINK #if defined(_COMPILER_ASAN_ENABLED_) || defined(_COMPILER_MSAN_ENABLED_) || defined(_COMPILER_TSAN_ENABLED_) # define HAS_SANITIZER @@ -363,7 +363,6 @@ class JuliaOJIT { typedef orc::ObjectLinkingLayer ObjLayerT; #else typedef orc::RTDyldObjectLinkingLayer ObjLayerT; -#endif struct LockLayerT : public orc::ObjectLayer { LockLayerT(orc::ObjectLayer &BaseLayer) JL_NOTSAFEPOINT : orc::ObjectLayer(BaseLayer.getExecutionSession()), BaseLayer(BaseLayer) {} @@ -381,11 +380,11 @@ class JuliaOJIT { orc::ObjectLayer &BaseLayer; std::mutex EmissionMutex; }; +#endif typedef orc::IRCompileLayer CompileLayerT; typedef orc::IRTransformLayer JITPointersLayerT; typedef orc::IRTransformLayer OptimizeLayerT; typedef orc::IRTransformLayer OptSelLayerT; - typedef orc::IRTransformLayer DepsVerifyLayerT; typedef object::OwningBinary OwningObj; template - void registerObject(const ObjT &Obj, const LoadResult &LO); +#ifndef JL_USE_JITLINK + void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; +#endif public: @@ -511,10 +509,9 @@ class JuliaOJIT { ~JuliaOJIT() JL_NOTSAFEPOINT; void enableJITDebuggingSupport() JL_NOTSAFEPOINT; -#ifndef JL_USE_JITLINK - // JITLink doesn't support old JITEventListeners (yet). - void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; -#endif + void enableIntelJITEventListener() JL_NOTSAFEPOINT; + void enableOProfileJITEventListener() JL_NOTSAFEPOINT; + void enablePerfJITEventListener() JL_NOTSAFEPOINT; orc::SymbolStringPtr mangle(StringRef Name) JL_NOTSAFEPOINT; void addGlobalMapping(StringRef Name, uint64_t Addr) JL_NOTSAFEPOINT; @@ -525,7 +522,7 @@ class JuliaOJIT { bool ShouldOptimize = false) JL_NOTSAFEPOINT; Error addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) JL_NOTSAFEPOINT; - orc::IRCompileLayer &getIRCompileLayer() JL_NOTSAFEPOINT { return ExternalCompileLayer; }; + orc::IRCompileLayer &getIRCompileLayer() JL_NOTSAFEPOINT { return CompileLayer; }; orc::ExecutionSession &getExecutionSession() JL_NOTSAFEPOINT { return ES; } orc::JITDylib &getExternalJITDylib() JL_NOTSAFEPOINT { return ExternalJD; } @@ -533,6 +530,7 @@ class JuliaOJIT { Expected findSymbol(StringRef Name, bool ExportedSymbolsOnly) JL_NOTSAFEPOINT; Expected findUnmangledSymbol(StringRef Name) JL_NOTSAFEPOINT; Expected findExternalJDSymbol(StringRef Name, bool ExternalJDOnly) JL_NOTSAFEPOINT; + SmallVector findSymbols(ArrayRef Names) JL_NOTSAFEPOINT; #else JITEvaluatedSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) JL_NOTSAFEPOINT; JITEvaluatedSymbol findUnmangledSymbol(StringRef Name) JL_NOTSAFEPOINT; @@ -616,13 +614,13 @@ class JuliaOJIT { const std::unique_ptr MemMgr; #endif ObjLayerT ObjectLayer; +#ifndef JL_USE_JITLINK LockLayerT LockLayer; +#endif CompileLayerT CompileLayer; JITPointersLayerT JITPointersLayer; OptimizeLayerT OptimizeLayer; OptSelLayerT OptSelLayer; - DepsVerifyLayerT DepsVerifyLayer; - CompileLayerT ExternalCompileLayer; }; extern JuliaOJIT *jl_ExecutionEngine; std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT; From 218edeaa80c7d70609e2029f81287137d7b201dd Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Fri, 4 Oct 2024 14:38:47 +0000 Subject: [PATCH 2/2] rewrite catchjmp asm to use normal relocations instead of manual editing --- src/cgmemmgr.cpp | 29 -------------- src/codegen.cpp | 1 + src/debug-registry.h | 3 +- src/debuginfo.cpp | 47 ++++++++-------------- src/jitlayers.cpp | 95 +++++++++++++++++++++++++++----------------- 5 files changed, 78 insertions(+), 97 deletions(-) diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index c78e6092ca5db..8557698a4e513 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -833,28 +833,6 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { mapAddresses(Dyld, ro_alloc); mapAddresses(Dyld, exe_alloc); } -#ifdef _OS_WINDOWS_ - template - void *lookupWriteAddressFor(void *rt_addr, Alloc &&allocator) - { - for (auto &alloc: allocator->allocations) { - if (alloc.rt_addr == rt_addr) { - return alloc.wr_addr; - } - } - return nullptr; - } - void *lookupWriteAddressFor(void *rt_addr) - { - if (!ro_alloc) - return rt_addr; - if (void *ptr = lookupWriteAddressFor(rt_addr, ro_alloc)) - return ptr; - if (void *ptr = lookupWriteAddressFor(rt_addr, exe_alloc)) - return ptr; - return rt_addr; - } -#endif // _OS_WINDOWS_ }; uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, @@ -947,13 +925,6 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } -#ifdef _OS_WINDOWS_ -void *lookupWriteAddressFor(RTDyldMemoryManager *memmgr, void *rt_addr) -{ - return ((RTDyldMemoryManagerJL*)memmgr)->lookupWriteAddressFor(rt_addr); -} -#endif - RTDyldMemoryManager* createRTDyldMemoryManager() { return new RTDyldMemoryManagerJL(); diff --git a/src/codegen.cpp b/src/codegen.cpp index cee88a3d20f3d..bcda527416676 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -10371,6 +10371,7 @@ static void init_jit_functions(void) #ifdef _OS_WINDOWS_ #if defined(_CPU_X86_64_) + add_named_global("__julia_personality", &__julia_personality); #if defined(_COMPILER_GCC_) add_named_global("___chkstk_ms", &___chkstk_ms); #else diff --git a/src/debug-registry.h b/src/debug-registry.h index 85a94245ce6aa..4c9e13d8cd72d 100644 --- a/src/debug-registry.h +++ b/src/debug-registry.h @@ -145,8 +145,7 @@ class JITDebugInfoRegistry void add_code_in_flight(llvm::StringRef name, jl_code_instance_t *codeinst, const llvm::DataLayout &DL) JL_NOTSAFEPOINT; jl_method_instance_t *lookupLinfo(size_t pointer) JL_NOTSAFEPOINT; void registerJITObject(const llvm::object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress); + std::function getLoadAddress); objectmap_t& getObjectMap() JL_NOTSAFEPOINT; void add_image_info(image_info_t info) JL_NOTSAFEPOINT; bool get_image_info(uint64_t base, image_info_t *info) const JL_NOTSAFEPOINT; diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp index 84550811072fe..cfaf8d4c70ee9 100644 --- a/src/debuginfo.cpp +++ b/src/debuginfo.cpp @@ -223,11 +223,21 @@ static void create_PRUNTIME_FUNCTION(uint8_t *Code, size_t Size, StringRef fnnam #endif void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress) + std::function getLoadAddress) { object::section_iterator EndSection = Object.section_end(); + bool anyfunctions = false; + for (const object::SymbolRef &sym_iter : Object.symbols()) { + object::SymbolRef::Type SymbolType = cantFail(sym_iter.getType()); + if (SymbolType != object::SymbolRef::ST_Function) + continue; + anyfunctions = true; + break; + } + if (!anyfunctions) + return; + #ifdef _CPU_ARM_ // ARM does not have/use .eh_frame uint64_t arm_exidx_addr = 0; @@ -281,14 +291,13 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, #if defined(_OS_WINDOWS_) uint64_t SectionAddrCheck = 0; uint64_t SectionLoadCheck = 0; (void)SectionLoadCheck; - uint64_t SectionWriteCheck = 0; (void)SectionWriteCheck; uint8_t *UnwindData = NULL; #if defined(_CPU_X86_64_) uint8_t *catchjmp = NULL; for (const object::SymbolRef &sym_iter : Object.symbols()) { StringRef sName = cantFail(sym_iter.getName()); if (sName.equals("__UnwindData") || sName.equals("__catchjmp")) { - uint64_t Addr = cantFail(sym_iter.getAddress()); + uint64_t Addr = cantFail(sym_iter.getAddress()); // offset into object (including section offset) auto Section = cantFail(sym_iter.getSection()); assert(Section != EndSection && Section->isText()); uint64_t SectionAddr = Section->getAddress(); @@ -300,10 +309,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, SectionLoadCheck == SectionLoadAddr); SectionAddrCheck = SectionAddr; SectionLoadCheck = SectionLoadAddr; - SectionWriteCheck = SectionLoadAddr; - if (lookupWriteAddress) - SectionWriteCheck = (uintptr_t)lookupWriteAddress((void*)SectionLoadAddr); - Addr += SectionWriteCheck - SectionLoadCheck; + Addr += SectionLoadAddr - SectionAddr; if (sName.equals("__UnwindData")) { UnwindData = (uint8_t*)Addr; } @@ -314,25 +320,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, } assert(catchjmp); assert(UnwindData); - assert(SectionAddrCheck); assert(SectionLoadCheck); - assert(!memcmp(catchjmp, "\0\0\0\0\0\0\0\0\0\0\0\0", 12) && - !memcmp(UnwindData, "\0\0\0\0\0\0\0\0\0\0\0\0", 12)); - catchjmp[0] = 0x48; - catchjmp[1] = 0xb8; // mov RAX, QWORD PTR [&__julia_personality] - *(uint64_t*)(&catchjmp[2]) = (uint64_t)&__julia_personality; - catchjmp[10] = 0xff; - catchjmp[11] = 0xe0; // jmp RAX - UnwindData[0] = 0x09; // version info, UNW_FLAG_EHANDLER - UnwindData[1] = 4; // size of prolog (bytes) - UnwindData[2] = 2; // count of unwind codes (slots) - UnwindData[3] = 0x05; // frame register (rbp) = rsp - UnwindData[4] = 4; // second instruction - UnwindData[5] = 0x03; // mov RBP, RSP - UnwindData[6] = 1; // first instruction - UnwindData[7] = 0x50; // push RBP - *(DWORD*)&UnwindData[8] = (DWORD)(catchjmp - (uint8_t*)SectionWriteCheck); // relative location of catchjmp - UnwindData -= SectionWriteCheck - SectionLoadCheck; #endif // defined(_OS_X86_64_) #endif // defined(_OS_WINDOWS_) @@ -353,7 +341,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, uint64_t SectionAddr = Section->getAddress(); StringRef secName = cantFail(Section->getName()); uint64_t SectionLoadAddr = getLoadAddress(secName); - Addr -= SectionAddr - SectionLoadAddr; + Addr += SectionLoadAddr - SectionAddr; StringRef sName = cantFail(sym_iter.getName()); uint64_t SectionSize = Section->getSize(); size_t Size = sym_size.second; @@ -404,10 +392,9 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, } void jl_register_jit_object(const object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress) + std::function getLoadAddress) { - getJITDebugRegistry().registerJITObject(Object, getLoadAddress, lookupWriteAddress); + getJITDebugRegistry().registerJITObject(Object, getLoadAddress); } // TODO: convert the safe names from aotcomile.cpp:makeSafeName back into symbols diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 7489b086105e6..d1757cadee05c 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -665,8 +665,7 @@ static Expected selectOptLevel(orc::ThreadSafeModule TSM, } void jl_register_jit_object(const object::ObjectFile &debugObj, - std::function getLoadAddress, - std::function lookupWriteAddress); + std::function getLoadAddress); namespace { @@ -726,7 +725,7 @@ class JLDebuginfoPlugin : public ObjectLinkingLayer::Plugin { return result->second; }; - jl_register_jit_object(*NewInfo->Object, getLoadAddress, nullptr); + jl_register_jit_object(*NewInfo->Object, getLoadAddress); PendingObjs.erase(&MR); } @@ -957,10 +956,6 @@ class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { }; -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) -void *lookupWriteAddressFor(RTDyldMemoryManager *MemMgr, void *rt_addr); -#endif - void registerRTDyldJITObject(const object::ObjectFile &Object, const RuntimeDyld::LoadedObjectInfo &L, const std::shared_ptr &MemMgr) @@ -983,14 +978,7 @@ void registerRTDyldJITObject(const object::ObjectFile &Object, }; auto DebugObject = L.getObjectForDebug(Object); // ELF requires us to make a copy to mutate the header with the section load addresses. On other platforms this is a no-op. - jl_register_jit_object(DebugObject.getBinary() ? *DebugObject.getBinary() : Object, - getLoadAddress, -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) - [MemMgr](void *p) { return lookupWriteAddressFor(MemMgr.get(), p); } -#else - nullptr -#endif - ); + jl_register_jit_object(DebugObject.getBinary() ? *DebugObject.getBinary() : Object, getLoadAddress); } namespace { static std::unique_ptr createTargetMachine() JL_NOTSAFEPOINT { @@ -1281,8 +1269,9 @@ namespace { } // Windows needs some inline asm to help - // build unwind tables - jl_decorate_module(M); + // build unwind tables, if they have any functions to decorate + if (!M.functions().empty()) + jl_decorate_module(M); }); return std::move(TSM); } @@ -2255,30 +2244,64 @@ static void jl_decorate_module(Module &M) { if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) { // Add special values used by debuginfo to build the UnwindData table registration for Win64 // This used to be GV, but with https://reviews.llvm.org/D100944 we no longer can emit GV into `.text` - // TODO: The data is set in debuginfo.cpp but it should be okay to actually emit it here. - std::string inline_asm = "\ - .section "; - inline_asm += + // and with JITLink it became difficult to change the content afterwards, but we + // would prefer that this simple content wasn't recompiled in every single module, + // so we emit the necessary PLT trampoline as inline assembly. + // This is somewhat duplicated with the .pdata section, but we haven't been able to + // use that yet due to relocation issues. +#define ASM_USES_ELF // use ELF or COFF syntax based on FORCE_ELF + StringRef inline_asm( + ".section" #if JL_LLVM_VERSION >= 180000 - ".ltext,\"ax\",@progbits"; + " .ltext,\"ax\",@progbits\n" #else - ".text"; + " .text\n" #endif - inline_asm += "\n\ - .type __UnwindData,@object \n\ - .p2align 2, 0x90 \n\ - __UnwindData: \n\ - .zero 12 \n\ - .size __UnwindData, 12 \n\ - \n\ - .type __catchjmp,@object \n\ - .p2align 2, 0x90 \n\ - __catchjmp: \n\ - .zero 12 \n\ - .size __catchjmp, 12"; - + ".globl __julia_personality\n" + "\n" +#ifdef ASM_USES_ELF + ".type __UnwindData,@object\n" +#else + ".def __UnwindData\n" + ".scl 2\n" + ".type 0\n" + ".endef\n" +#endif + ".p2align 2, 0x90\n" + "__UnwindData:\n" + " .byte 0x09;\n" // version info, UNW_FLAG_EHANDLER + " .byte 4;\n" // size of prolog (bytes) + " .byte 2;\n" // count of unwind codes (slots) + " .byte 0x05;\n" // frame register (rbp) = rsp + " .byte 4;\n" // second instruction + " .byte 0x03;\n" // mov RBP, RSP + " .byte 1;\n" // first instruction + " .byte 0x50;\n" // push RBP + " .int __catchjmp - " +#if JL_LLVM_VERSION >= 180000 + ".ltext;\n" // Section-relative offset (if using COFF and JITLink, this can be relative to __ImageBase instead, though then we could possibly use pdata/xdata directly then) +#else + ".text;\n" +#endif + ".size __UnwindData, 12\n" + "\n" +#ifdef ASM_USES_ELF + ".type __catchjmp,@function\n" +#else + ".def __catchjmp\n" + ".scl 2\n" + ".type 32\n" + ".endef\n" +#endif + ".p2align 2, 0x90\n" + "__catchjmp:\n" + " movabsq $__julia_personality, %rax\n" + " jmpq *%rax\n" + ".size __catchjmp, . - __catchjmp\n" + "\n"); M.appendModuleInlineAsm(inline_asm); } +#undef ASM_USES_ELF } #ifndef JL_USE_JITLINK