diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr index 38880ee9ed64..008faa9ff7c6 100644 --- a/src/compiler/crystal/compiler.cr +++ b/src/compiler/crystal/compiler.cr @@ -5,6 +5,9 @@ require "crystal/digest/md5" {% if flag?(:msvc) %} require "./loader" {% end %} +{% if flag?(:preview_mt) %} + require "wait_group" +{% end %} module Crystal @[Flags] @@ -80,7 +83,13 @@ module Crystal property? no_codegen = false # Maximum number of LLVM modules that are compiled in parallel - property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %} + property n_threads : Int32 = {% if flag?(:preview_mt) %} + ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4 + {% elsif flag?(:win32) %} + 1 + {% else %} + 8 + {% end %} # Default prelude file to use. This ends up adding a # `require "prelude"` (or whatever name is set here) to @@ -391,7 +400,7 @@ module Crystal llvm_mod = unit.llvm_mod @progress_tracker.stage("Codegen (bc+obj)") do - optimize llvm_mod unless @optimization_mode.o0? + optimize llvm_mod, target_machine unless @optimization_mode.o0? unit.emit(@emit_targets, emit_base_filename || output_filename) @@ -512,7 +521,8 @@ module Crystal private def parallel_codegen(units, n_threads) {% if flag?(:preview_mt) %} - raise "Cannot fork compiler in multithread mode." + raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded? + mt_codegen(units, n_threads) {% elsif LibC.has_method?("fork") %} fork_codegen(units, n_threads) {% else %} @@ -520,6 +530,42 @@ module Crystal {% end %} end + private def mt_codegen(units, n_threads) + channel = Channel(CompilationUnit).new(n_threads * 2) + wg = WaitGroup.new(n_threads) + mutex = Mutex.new + + n_threads.times do + spawn do + while unit = channel.receive? + unit.compile(isolate_context: true) + next unless wants_stats_or_progress? + mutex.synchronize { @progress_tracker.stage_progress += 1 } + end + ensure + wg.done + end + end + + units.each do |unit| + # We generate the bitcode in the main thread because LLVM contexts + # must be unique per compilation unit, but we share different contexts + # across many modules (or rely on the global context); trying to + # codegen in parallel would segfault! + # + # Luckily generating the bitcode is quick and once the bitcode is + # generated we don't need the global LLVM contexts anymore but can + # parse the bitcode in an isolated context and we can parallelize the + # slowest part: the optimization pass & compiling the object file. + unit.generate_bitcode + + channel.send(unit) + end + channel.close + + wg.wait + end + private def fork_codegen(units, n_threads) workers = fork_workers(n_threads) do |input, output| while i = input.gets(chomp: true).presence @@ -743,7 +789,7 @@ module Crystal end {% end %} - protected def optimize(llvm_mod) + protected def optimize(llvm_mod, target_machine) {% if LibLLVM::IS_LT_130 %} optimize_with_pass_manager(llvm_mod) {% else %} @@ -819,6 +865,9 @@ module Crystal getter llvm_mod property? reused_previous_compilation = false getter object_extension : String + @memory_buffer : LLVM::MemoryBuffer? + @object_name : String? + @bc_name : String? def initialize(@compiler : Compiler, program : Program, @name : String, @llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool) @@ -848,40 +897,44 @@ module Crystal @object_extension = compiler.codegen_target.object_extension end - def compile - compile_to_object + def generate_bitcode + @memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer end - private def compile_to_object - bc_name = self.bc_name - object_name = self.object_name - temporary_object_name = self.temporary_object_name + # To compile a file we first generate a `.bc` file and then create an + # object file from it. These `.bc` files are stored in the cache + # directory. + # + # On a next compilation of the same project, and if the compile flags + # didn't change (a combination of the target triple, mcpu and link flags, + # amongst others), we check if the new `.bc` file is exactly the same as + # the old one. In that case the `.o` file will also be the same, so we + # simply reuse the old one. Generating an `.o` file is what takes most + # time. + # + # However, instead of directly generating the final `.o` file from the + # `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we + # rename that file to `.o`. We do this because the compiler could be + # interrupted while the `.o` file is being generated, leading to a + # corrupted file that later would cause compilation issues. Moving a file + # is an atomic operation so no corrupted `.o` file should be generated. + def compile(isolate_context = false) + if must_compile? + isolate_module_context if isolate_context + update_bitcode_cache + compile_to_object + else + @reused_previous_compilation = true + end + dump_llvm_ir + end + + private def must_compile? + memory_buffer = generate_bitcode - # To compile a file we first generate a `.bc` file and then - # create an object file from it. These `.bc` files are stored - # in the cache directory. - # - # On a next compilation of the same project, and if the compile - # flags didn't change (a combination of the target triple, mcpu - # and link flags, amongst others), we check if the new - # `.bc` file is exactly the same as the old one. In that case - # the `.o` file will also be the same, so we simply reuse the - # old one. Generating an `.o` file is what takes most time. - # - # However, instead of directly generating the final `.o` file - # from the `.bc` file, we generate it to a temporary name (`.o.tmp`) - # and then we rename that file to `.o`. We do this because the compiler - # could be interrupted while the `.o` file is being generated, leading - # to a corrupted file that later would cause compilation issues. - # Moving a file is an atomic operation so no corrupted `.o` file should - # be generated. - - must_compile = true can_reuse_previous_compilation = compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name) - memory_buffer = llvm_mod.write_bitcode_to_memory_buffer - if can_reuse_previous_compilation memory_io = IO::Memory.new(memory_buffer.to_slice) changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) } @@ -889,32 +942,39 @@ module Crystal # If the user cancelled a previous compilation # it might be that the .o file is empty if !changed && File.size(object_name) > 0 - must_compile = false memory_buffer.dispose - memory_buffer = nil + return false else # We need to compile, so we'll write the memory buffer to file end end - # If there's a memory buffer, it means we must create a .o from it - if memory_buffer + true + end + + # Parse the previously generated bitcode into the LLVM module using a + # dedicated context, so we can safely optimize & compile the module in + # multiple threads (llvm contexts can't be shared across threads). + private def isolate_module_context + @llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new) + end + + private def update_bitcode_cache + if memory_buffer = @memory_buffer # Delete existing .o file. It cannot be used anymore. File.delete?(object_name) # Create the .bc file (for next compilations) File.write(bc_name, memory_buffer.to_slice) memory_buffer.dispose end + end - if must_compile - compiler.optimize llvm_mod unless compiler.optimization_mode.o0? - compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name - File.rename(temporary_object_name, object_name) - else - @reused_previous_compilation = true - end - - dump_llvm_ir + private def compile_to_object + temporary_object_name = self.temporary_object_name + target_machine = compiler.create_target_machine + compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0? + target_machine.emit_obj_to_file llvm_mod, temporary_object_name + File.rename(temporary_object_name, object_name) end private def dump_llvm_ir