From ad407a6d2198c999f8f7b48a85d190694e392eb5 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 11 Jul 2024 10:06:25 +0200 Subject: [PATCH] Actually setup jit targets when compiling packageimages instead of targeting only one (#54471) --- src/codegen.cpp | 5 ++- src/llvm-multiversioning.cpp | 1 + src/processor_arm.cpp | 52 +++++++++++++++++++++-- src/processor_fallback.cpp | 23 +++++++++-- src/processor_x86.cpp | 80 ++++++++++++++++++++++++++++++++++-- test/precompile.jl | 13 ++++++ 6 files changed, 162 insertions(+), 12 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index b886d201e118c..3bfce7592cad7 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -7438,8 +7438,11 @@ static Function* gen_cfun_wrapper( ctx.builder.ClearInsertionPoint(); if (aliasname) { - GlobalAlias::create(cw->getValueType(), cw->getType()->getAddressSpace(), + auto alias = GlobalAlias::create(cw->getValueType(), cw->getType()->getAddressSpace(), GlobalValue::ExternalLinkage, aliasname, cw, M); + if(ctx.emission_context.TargetTriple.isOSBinFormatCOFF()) { + alias->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); + } } if (nest) { diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 65fbcd3e0861c..08600e24490b1 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -677,6 +677,7 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F) trampoline->removeFnAttr("julia.mv.reloc"); trampoline->removeFnAttr("julia.mv.clones"); trampoline->addFnAttr("julia.mv.alias"); + trampoline->setDLLStorageClass(alias->getDLLStorageClass()); alias->eraseFromParent(); uint32_t id; diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp index 1852188c718a9..0d9ed43a23a58 100644 --- a/src/processor_arm.cpp +++ b/src/processor_arm.cpp @@ -1890,12 +1890,56 @@ const std::pair &jl_get_llvm_disasm_target(void) return res; } +#ifndef __clang_gcanalyzer__ llvm::SmallVector jl_get_llvm_clone_targets(void) { - 
if (jit_targets.empty()) - jl_error("JIT targets not initialized"); + + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + auto ntargets = image_targets.size(); + if (image_targets.empty()) + jl_error("No targets specified"); llvm::SmallVector res; - for (auto &target: jit_targets) { + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + auto &features0 = image_targets[t.base].en.features; + // Always clone when code checks CPU features + t.en.flags |= JL_TARGET_CLONE_CPU; + static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16}; + for (auto fe: clone_fp16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_FLOAT16; + break; + } + } + // The most useful one in general... 
+ t.en.flags |= JL_TARGET_CLONE_LOOP; +#ifdef _CPU_ARM_ + static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + static constexpr uint32_t clone_simd[] = {Feature::neon}; + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } +#endif + } + for (auto &target: image_targets) { auto features_en = target.en.features; auto features_dis = target.dis.features; for (auto &fename: feature_names) { @@ -1916,6 +1960,8 @@ llvm::SmallVector jl_get_llvm_clone_targets(void) return res; } +#endif + extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) { if (feature >= 32 * feature_sz) diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp index 87d72d5ba7958..f8d9eb9fd9e73 100644 --- a/src/processor_fallback.cpp +++ b/src/processor_fallback.cpp @@ -144,13 +144,27 @@ const std::pair &jl_get_llvm_disasm_target(void) jl_get_cpu_features_llvm(), {{}, 0}, {{}, 0}, 0}); return res; } - +#ifndef __clang_gcanalyzer__ llvm::SmallVector jl_get_llvm_clone_targets(void) { - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); + + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + auto ntargets = image_targets.size(); + // Now decide the clone condition. 
+ for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + t.en.flags |= JL_TARGET_CLONE_ALL; + } + if (image_targets.empty()) + jl_error("No image targets found"); llvm::SmallVector res; - for (auto &target: jit_targets) { + for (auto &target: image_targets) { jl_target_spec_t ele; std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); ele.data = serialize_target_data(target.name, target.en.features, @@ -161,6 +175,7 @@ llvm::SmallVector jl_get_llvm_clone_targets(void) } return res; } +#endif JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits) { diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 7f173440c8b9b..db954680289ea 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -910,6 +910,8 @@ static uint32_t pkgimg_init_cb(const void *id, jl_value_t **rejection_reason) return match.best_idx; } +// This function serves as a fallback during bootstrapping: at that point we don't have a sysimage with native code, +// so sysimg_init_cb won't be called. Otherwise this function shouldn't do anything. static void ensure_jit_target(bool imaging) { auto &cmdline = get_cmdline_targets(); @@ -1102,13 +1104,82 @@ const std::pair &jl_get_llvm_disasm_target(void) {feature_masks, 0}, {{}, 0}, 0}); return res; } - +// Parses the -C command line to determine which targets to multiversion for. +#ifndef __clang_gcanalyzer__ llvm::SmallVector jl_get_llvm_clone_targets(void) { - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + + auto ntargets = image_targets.size(); + // Now decide the clone condition. 
+ for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + // Always clone when code checks CPU features + t.en.flags |= JL_TARGET_CLONE_CPU; + // The most useful one in general... + t.en.flags |= JL_TARGET_CLONE_LOOP; + auto &features0 = image_targets[t.base].en.features; + // Special case for KNL/KNM since they're so different + if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { + if ((t.name == "knl" || t.name == "knm") && + image_targets[t.base].name != "knl" && image_targets[t.base].name != "knm") { + t.en.flags |= JL_TARGET_CLONE_ALL; + continue; // skip the per-feature checks for this target only; later targets still need their flags + } + } + static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4}; + static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, + Feature::sse41, Feature::sse42, + Feature::avx, Feature::avx2, + Feature::vaes, Feature::vpclmulqdq, + Feature::sse4a, Feature::avx512f, + Feature::avx512dq, Feature::avx512ifma, + Feature::avx512pf, Feature::avx512er, + Feature::avx512cd, Feature::avx512bw, + Feature::avx512vl, Feature::avx512vbmi, + Feature::avx512vpopcntdq, Feature::avxvnni, + Feature::avx512vbmi2, Feature::avx512vnni, + Feature::avx512bitalg, Feature::avx512bf16, + Feature::avx512vp2intersect, Feature::avx512fp16}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } + static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16}; + for (auto fe: clone_fp16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_FLOAT16; + break; + } + } + static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16}; + for (auto fe: clone_bf16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_BFLOAT16; + 
break; + } + } + } + if (image_targets.empty()) + jl_error("No targets specified"); llvm::SmallVector res; - for (auto &target: jit_targets) { + for (auto &target: image_targets) { auto features_en = target.en.features; auto features_dis = target.dis.features; for (auto &fename: feature_names) { @@ -1128,6 +1199,7 @@ llvm::SmallVector jl_get_llvm_clone_targets(void) } return res; } +#endif extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) { diff --git a/test/precompile.jl b/test/precompile.jl index 21a17e0778496..3241ee8b25a35 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -1963,6 +1963,19 @@ precompile_test_harness("Test flags") do load_path @test !Base.isprecompiled(id, ;flags=current_flags) end +if Base.get_bool_env("CI", false) && (Sys.ARCH === :x86_64 || Sys.ARCH === :aarch64) + @testset "Multiversioning" begin # This test isn't the most robust because it relies on being in CI, + pkg = Base.identify_package("Test") # but we need better target reflection to make a better one. + cachefiles = Base.find_all_in_cache_path(pkg) + pkgpath = Base.locate_package(pkg) + idx = findfirst(cachefiles) do cf + Base.stale_cachefile(pkgpath, cf) !== true + end + targets = Base.parse_image_targets(Base.parse_cache_header(cachefiles[idx])[7]) + @test length(targets) > 1 + end +end + precompile_test_harness("No backedge precompile") do load_path # Test that the system doesn't accidentally forget to revalidate a method without backedges write(joinpath(load_path, "NoBackEdges.jl"),