From e40e9bf52b65558221fd58b2965ba9152ec37a82 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Tue, 25 Jun 2024 11:52:44 +0200 Subject: [PATCH 1/6] Add LLVM::Module.parse(memory_buffer, context) --- src/llvm/lib_llvm/bit_reader.cr | 5 +++++ src/llvm/module.cr | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 src/llvm/lib_llvm/bit_reader.cr diff --git a/src/llvm/lib_llvm/bit_reader.cr b/src/llvm/lib_llvm/bit_reader.cr new file mode 100644 index 000000000000..9bfd271cbbe2 --- /dev/null +++ b/src/llvm/lib_llvm/bit_reader.cr @@ -0,0 +1,5 @@ +require "./types" + +lib LibLLVM + fun parse_bitcode_in_context2 = LLVMParseBitcodeInContext2(c : ContextRef, mb : MemoryBufferRef, m : ModuleRef*) : Int +end diff --git a/src/llvm/module.cr b/src/llvm/module.cr index f216d485055c..32b025bffee7 100644 --- a/src/llvm/module.cr +++ b/src/llvm/module.cr @@ -6,6 +6,12 @@ class LLVM::Module getter context : Context + def self.parse(memory_buffer : MemoryBuffer, context : Context) : self + LibLLVM.parse_bitcode_in_context2(context, memory_buffer, out module_ref) + raise "BUG: failed to parse LLVM bitcode from memory buffer" unless module_ref + new(module_ref, context) + end + def initialize(@unwrap : LibLLVM::ModuleRef, @context : Context) @owned = false end From e46a572fbb607ca6a1259e108e8de6f774f81505 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Tue, 2 Jul 2024 13:23:18 +0200 Subject: [PATCH 2/6] Add Crystal::Compiler#mt_codegen When compiled with -Dpreview_mt the compiler will take advantage of the MT environment to codegen the compilation units in parallel, avoiding fork (that's not supported with MT) and allowing parallel codegen on Windows. --- src/compiler/crystal/compiler.cr | 150 +++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 45 deletions(-) diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index 38880ee9ed64..008faa9ff7c6 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -5,6 +5,9 @@ require "crystal/digest/md5" {% if flag?(:msvc) %} require "./loader" {% end %} +{% if flag?(:preview_mt) %} + require "wait_group" +{% end %} module Crystal @[Flags] @@ -80,7 +83,13 @@ module Crystal property? no_codegen = false # Maximum number of LLVM modules that are compiled in parallel - property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %} + property n_threads : Int32 = {% if flag?(:preview_mt) %} + ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4 + {% elsif flag?(:win32) %} + 1 + {% else %} + 8 + {% end %} # Default prelude file to use. This ends up adding a # `require "prelude"` (or whatever name is set here) to @@ -391,7 +400,7 @@ module Crystal llvm_mod = unit.llvm_mod @progress_tracker.stage("Codegen (bc+obj)") do - optimize llvm_mod unless @optimization_mode.o0? + optimize llvm_mod, target_machine unless @optimization_mode.o0? unit.emit(@emit_targets, emit_base_filename || output_filename) @@ -512,7 +521,8 @@ module Crystal private def parallel_codegen(units, n_threads) {% if flag?(:preview_mt) %} - raise "Cannot fork compiler in multithread mode." + raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded? + mt_codegen(units, n_threads) {% elsif LibC.has_method?("fork") %} fork_codegen(units, n_threads) {% else %} @@ -520,6 +530,42 @@ module Crystal {% end %} end + private def mt_codegen(units, n_threads) + channel = Channel(CompilationUnit).new(n_threads * 2) + wg = WaitGroup.new(n_threads) + mutex = Mutex.new + + n_threads.times do + spawn do + while unit = channel.receive? + unit.compile(isolate_context: true) + next unless wants_stats_or_progress? + mutex.synchronize { @progress_tracker.stage_progress += 1 } + end + ensure + wg.done + end + end + + units.each do |unit| + # We generate the bitcode in the main thread because LLVM contexts + # must be unique per compilation unit, but we share different contexts + # across many modules (or rely on the global context); trying to + # codegen in parallel would segfault! + # + # Luckily generating the bitcode is quick and once the bitcode is + # generated we don't need the global LLVM contexts anymore but can + # parse the bitcode in an isolated context and we can parallelize the + # slowest part: the optimization pass & compiling the object file. + unit.generate_bitcode + + channel.send(unit) + end + channel.close + + wg.wait + end + private def fork_codegen(units, n_threads) workers = fork_workers(n_threads) do |input, output| while i = input.gets(chomp: true).presence @@ -743,7 +789,7 @@ module Crystal end {% end %} - protected def optimize(llvm_mod) + protected def optimize(llvm_mod, target_machine) {% if LibLLVM::IS_LT_130 %} optimize_with_pass_manager(llvm_mod) {% else %} @@ -819,6 +865,9 @@ module Crystal getter llvm_mod property? reused_previous_compilation = false getter object_extension : String + @memory_buffer : LLVM::MemoryBuffer? + @object_name : String? + @bc_name : String? def initialize(@compiler : Compiler, program : Program, @name : String, @llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool) @@ -848,40 +897,44 @@ module Crystal @object_extension = compiler.codegen_target.object_extension end - def compile - compile_to_object + def generate_bitcode + @memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer end - private def compile_to_object - bc_name = self.bc_name - object_name = self.object_name - temporary_object_name = self.temporary_object_name + # To compile a file we first generate a `.bc` file and then create an + # object file from it. These `.bc` files are stored in the cache + # directory. + # + # On a next compilation of the same project, and if the compile flags + # didn't change (a combination of the target triple, mcpu and link flags, + # amongst others), we check if the new `.bc` file is exactly the same as + # the old one. In that case the `.o` file will also be the same, so we + # simply reuse the old one. Generating an `.o` file is what takes most + # time. + # + # However, instead of directly generating the final `.o` file from the + # `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we + # rename that file to `.o`. We do this because the compiler could be + # interrupted while the `.o` file is being generated, leading to a + # corrupted file that later would cause compilation issues. Moving a file + # is an atomic operation so no corrupted `.o` file should be generated. + def compile(isolate_context = false) + if must_compile? + isolate_module_context if isolate_context + update_bitcode_cache + compile_to_object + else + @reused_previous_compilation = true + end + dump_llvm_ir + end + + private def must_compile? + memory_buffer = generate_bitcode - # To compile a file we first generate a `.bc` file and then - # create an object file from it. These `.bc` files are stored - # in the cache directory. - # - # On a next compilation of the same project, and if the compile - # flags didn't change (a combination of the target triple, mcpu - # and link flags, amongst others), we check if the new - # `.bc` file is exactly the same as the old one. In that case - # the `.o` file will also be the same, so we simply reuse the - # old one. Generating an `.o` file is what takes most time. - # - # However, instead of directly generating the final `.o` file - # from the `.bc` file, we generate it to a temporary name (`.o.tmp`) - # and then we rename that file to `.o`. We do this because the compiler - # could be interrupted while the `.o` file is being generated, leading - # to a corrupted file that later would cause compilation issues. - # Moving a file is an atomic operation so no corrupted `.o` file should - # be generated. - - must_compile = true can_reuse_previous_compilation = compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name) - memory_buffer = llvm_mod.write_bitcode_to_memory_buffer - if can_reuse_previous_compilation memory_io = IO::Memory.new(memory_buffer.to_slice) changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) } @@ -889,32 +942,39 @@ module Crystal # If the user cancelled a previous compilation # it might be that the .o file is empty if !changed && File.size(object_name) > 0 - must_compile = false memory_buffer.dispose - memory_buffer = nil + return false else # We need to compile, so we'll write the memory buffer to file end end - # If there's a memory buffer, it means we must create a .o from it - if memory_buffer + true + end + + # Parse the previously generated bitcode into the LLVM module using a + # dedicated context, so we can safely optimize & compile the module in + # multiple threads (llvm contexts can't be shared across threads). + private def isolate_module_context + @llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new) + end + + private def update_bitcode_cache + if memory_buffer = @memory_buffer # Delete existing .o file. It cannot be used anymore. File.delete?(object_name) # Create the .bc file (for next compilations) File.write(bc_name, memory_buffer.to_slice) memory_buffer.dispose end + end - if must_compile - compiler.optimize llvm_mod unless compiler.optimization_mode.o0? - compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name - File.rename(temporary_object_name, object_name) - else - @reused_previous_compilation = true - end - - dump_llvm_ir + private def compile_to_object + temporary_object_name = self.temporary_object_name + target_machine = compiler.create_target_machine + compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0? + target_machine.emit_obj_to_file llvm_mod, temporary_object_name + File.rename(temporary_object_name, object_name) end private def dump_llvm_ir From 83b102d5ebf79b69b5a260f3a87450a9719d6652 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Tue, 2 Jul 2024 14:53:04 +0200 Subject: [PATCH 3/6] fixup --- src/compiler/crystal/compiler.cr | 1 - 1 file changed, 1 deletion(-) diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index 008faa9ff7c6..ccdd994e39f3 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -539,7 +539,6 @@ module Crystal spawn do while unit = channel.receive? unit.compile(isolate_context: true) - next unless wants_stats_or_progress? mutex.synchronize { @progress_tracker.stage_progress += 1 } end ensure From 8e94f9cc7a83e080fb3f380c0a89c94cb086e1cc Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Tue, 2 Jul 2024 18:31:14 +0200 Subject: [PATCH 4/6] Review suggestions from Sija --- src/compiler/crystal/compiler.cr | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index ccdd994e39f3..38774368b3cd 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -705,9 +705,10 @@ module Crystal puts puts "Codegen (bc+obj):" - if units.size == reused + case reused + when units.size puts " - all previous .o files were reused" - elsif reused == 0 + when .zero? puts " - no previous .o files were reused" else puts " - #{reused}/#{units.size} .o files were reused" @@ -959,13 +960,13 @@ module Crystal end private def update_bitcode_cache - if memory_buffer = @memory_buffer - # Delete existing .o file. It cannot be used anymore. - File.delete?(object_name) - # Create the .bc file (for next compilations) - File.write(bc_name, memory_buffer.to_slice) - memory_buffer.dispose - end + return unless memory_buffer = @memory_buffer + + # Delete existing .o file. It cannot be used anymore. + File.delete?(object_name) + # Create the .bc file (for next compilations) + File.write(bc_name, memory_buffer.to_slice) + memory_buffer.dispose end private def compile_to_object From 96b6f77423fe970a8cee5b05a612d5e0628d10e6 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Tue, 2 Jul 2024 22:31:22 +0200 Subject: [PATCH 5/6] Fix: thread safety of LLVM legacy pass manager --- src/compiler/crystal/compiler.cr | 89 +++++++++++++++----------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index 38774368b3cd..28adf6eb9fbf 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -337,6 +337,12 @@ module Crystal CompilationUnit.new(self, program, type_name, llvm_mod, output_dir, bc_flags_changed) end + {% if LibLLVM::IS_LT_170 %} + # initialize the legacy pass manager once in the main thread/process + # before we start codegen in threads (MT) or processes (fork) + init_llvm_legacy_pass_manager unless optimization_mode.o0? + {% end %} + if @cross_compile cross_compile program, units, output_filename else @@ -735,57 +741,48 @@ module Crystal end {% if LibLLVM::IS_LT_170 %} + property! pass_manager_builder : LLVM::PassManagerBuilder + + private def init_llvm_legacy_pass_manager + registry = LLVM::PassRegistry.instance + registry.initialize_all + + builder = LLVM::PassManagerBuilder.new + builder.size_level = 0 + + case optimization_mode + in .o3? + builder.opt_level = 3 + builder.use_inliner_with_threshold = 275 + in .o2? + builder.opt_level = 2 + builder.use_inliner_with_threshold = 275 + in .o1? + builder.opt_level = 1 + builder.use_inliner_with_threshold = 150 + in .o0? + # default behaviour, no optimizations + in .os? + builder.opt_level = 2 + builder.size_level = 1 + builder.use_inliner_with_threshold = 50 + in .oz? + builder.opt_level = 2 + builder.size_level = 2 + builder.use_inliner_with_threshold = 5 + end + + @pass_manager_builder = builder + end + private def optimize_with_pass_manager(llvm_mod) fun_pass_manager = llvm_mod.new_function_pass_manager pass_manager_builder.populate fun_pass_manager fun_pass_manager.run llvm_mod - module_pass_manager.run llvm_mod - end - @module_pass_manager : LLVM::ModulePassManager? - - private def module_pass_manager - @module_pass_manager ||= begin - mod_pass_manager = LLVM::ModulePassManager.new - pass_manager_builder.populate mod_pass_manager - mod_pass_manager - end - end - - @pass_manager_builder : LLVM::PassManagerBuilder? - - private def pass_manager_builder - @pass_manager_builder ||= begin - registry = LLVM::PassRegistry.instance - registry.initialize_all - - builder = LLVM::PassManagerBuilder.new - builder.size_level = 0 - - case optimization_mode - in .o3? - builder.opt_level = 3 - builder.use_inliner_with_threshold = 275 - in .o2? - builder.opt_level = 2 - builder.use_inliner_with_threshold = 275 - in .o1? - builder.opt_level = 1 - builder.use_inliner_with_threshold = 150 - in .o0? - # default behaviour, no optimizations - in .os? - builder.opt_level = 2 - builder.size_level = 1 - builder.use_inliner_with_threshold = 50 - in .oz? - builder.opt_level = 2 - builder.size_level = 2 - builder.use_inliner_with_threshold = 5 - end - - builder - end + module_pass_manager = LLVM::ModulePassManager.new + pass_manager_builder.populate module_pass_manager + module_pass_manager.run llvm_mod end {% end %} From 62226c2bb41bff21d608b503323f5628f19b5e16 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Thu, 5 Sep 2024 10:56:57 +0200 Subject: [PATCH 6/6] Leverage WaitGroup#spawn --- src/compiler/crystal/compiler.cr | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index 28adf6eb9fbf..ea1697743c1c 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -538,17 +538,15 @@ module Crystal private def mt_codegen(units, n_threads) channel = Channel(CompilationUnit).new(n_threads * 2) - wg = WaitGroup.new(n_threads) + wg = WaitGroup.new mutex = Mutex.new n_threads.times do - spawn do + wg.spawn do while unit = channel.receive? unit.compile(isolate_context: true) mutex.synchronize { @progress_tracker.stage_progress += 1 } end - ensure - wg.done end end