Compiler: enable parallel codegen with MT (#14748)

Implements parallel codegen of object files when MT is enabled in the compiler (`-Dpreview_mt`). It only impacts codegen for compilations with more than one compilation unit (module), that is when neither of `--single-module`, `--release` or `--cross-compile` is specified. This behavior is identical to the fork based codegen. **Advantages:** - allows parallel codegen on Windows (untested); - no need to fork many processes; - no repeated GC collections in each forked processes; - a simple Channel to distribute work efficiently (no need for IPC). The main points are increased portability and simpler logic, despite having to take care of LLVM thread safety quirks (see comments). Co-authored-by: Johannes Müller <[email protected]>
crystal-lang · Sep 21, 2024 · c74f6bc · c74f6bc
1 parent 5f018fc
commit c74f6bc
Show file tree

Hide file tree

Showing 3 changed files with 164 additions and 98 deletions.
diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr
@@ -5,6 +5,9 @@ require "crystal/digest/md5"
 {% if flag?(:msvc) %}
   require "./loader"
 {% end %}
+{% if flag?(:preview_mt) %}
+  require "wait_group"
+{% end %}
 
 module Crystal
   @[Flags]
@@ -80,7 +83,13 @@ module Crystal
     property? no_codegen = false
 
     # Maximum number of LLVM modules that are compiled in parallel
-    property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %}
+    property n_threads : Int32 = {% if flag?(:preview_mt) %}
+      ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4
+    {% elsif flag?(:win32) %}
+      1
+    {% else %}
+      8
+    {% end %}
 
     # Default prelude file to use. This ends up adding a
     # `require "prelude"` (or whatever name is set here) to
@@ -328,6 +337,12 @@ module Crystal
         CompilationUnit.new(self, program, type_name, llvm_mod, output_dir, bc_flags_changed)
       end
 
+      {% if LibLLVM::IS_LT_170 %}
+        # initialize the legacy pass manager once in the main thread/process
+        # before we start codegen in threads (MT) or processes (fork)
+        init_llvm_legacy_pass_manager unless optimization_mode.o0?
+      {% end %}
+
       if @cross_compile
         cross_compile program, units, output_filename
       else
@@ -391,7 +406,7 @@ module Crystal
       llvm_mod = unit.llvm_mod
 
       @progress_tracker.stage("Codegen (bc+obj)") do
-        optimize llvm_mod unless @optimization_mode.o0?
+        optimize llvm_mod, target_machine unless @optimization_mode.o0?
 
         unit.emit(@emit_targets, emit_base_filename || output_filename)
 
@@ -529,14 +544,48 @@ module Crystal
 
     private def parallel_codegen(units, n_threads)
       {% if flag?(:preview_mt) %}
-        raise "Cannot fork compiler in multithread mode."
+        raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded?
+        mt_codegen(units, n_threads)
       {% elsif LibC.has_method?("fork") %}
         fork_codegen(units, n_threads)
       {% else %}
         raise "Cannot fork compiler. `Crystal::System::Process.fork` is not implemented on this system."
       {% end %}
     end
 
+    private def mt_codegen(units, n_threads)
+      channel = Channel(CompilationUnit).new(n_threads * 2)
+      wg = WaitGroup.new
+      mutex = Mutex.new
+
+      n_threads.times do
+        wg.spawn do
+          while unit = channel.receive?
+            unit.compile(isolate_context: true)
+            mutex.synchronize { @progress_tracker.stage_progress += 1 }
+          end
+        end
+      end
+
+      units.each do |unit|
+        # We generate the bitcode in the main thread because LLVM contexts
+        # must be unique per compilation unit, but we share different contexts
+        # across many modules (or rely on the global context); trying to
+        # codegen in parallel would segfault!
+        #
+        # Luckily generating the bitcode is quick and once the bitcode is
+        # generated we don't need the global LLVM contexts anymore but can
+        # parse the bitcode in an isolated context and we can parallelize the
+        # slowest part: the optimization pass & compiling the object file.
+        unit.generate_bitcode
+
+        channel.send(unit)
+      end
+      channel.close
+
+      wg.wait
+    end
+
     private def fork_codegen(units, n_threads)
       workers = fork_workers(n_threads) do |input, output|
         while i = input.gets(chomp: true).presence
@@ -677,9 +726,10 @@ module Crystal
 
       puts
       puts "Codegen (bc+obj):"
-      if units.size == reused
+      case reused
+      when units.size
         puts " - all previous .o files were reused"
-      elsif reused == 0
+      when .zero?
         puts " - no previous .o files were reused"
       else
         puts " - #{reused}/#{units.size} .o files were reused"
@@ -706,61 +756,52 @@ module Crystal
     end
 
     {% if LibLLVM::IS_LT_170 %}
+      property! pass_manager_builder : LLVM::PassManagerBuilder
+
+      private def init_llvm_legacy_pass_manager
+        registry = LLVM::PassRegistry.instance
+        registry.initialize_all
+
+        builder = LLVM::PassManagerBuilder.new
+        builder.size_level = 0
+
+        case optimization_mode
+        in .o3?
+          builder.opt_level = 3
+          builder.use_inliner_with_threshold = 275
+        in .o2?
+          builder.opt_level = 2
+          builder.use_inliner_with_threshold = 275
+        in .o1?
+          builder.opt_level = 1
+          builder.use_inliner_with_threshold = 150
+        in .o0?
+          # default behaviour, no optimizations
+        in .os?
+          builder.opt_level = 2
+          builder.size_level = 1
+          builder.use_inliner_with_threshold = 50
+        in .oz?
+          builder.opt_level = 2
+          builder.size_level = 2
+          builder.use_inliner_with_threshold = 5
+        end
+
+        @pass_manager_builder = builder
+      end
+
       private def optimize_with_pass_manager(llvm_mod)
         fun_pass_manager = llvm_mod.new_function_pass_manager
         pass_manager_builder.populate fun_pass_manager
         fun_pass_manager.run llvm_mod
-        module_pass_manager.run llvm_mod
-      end
-
-      @module_pass_manager : LLVM::ModulePassManager?
-
-      private def module_pass_manager
-        @module_pass_manager ||= begin
-          mod_pass_manager = LLVM::ModulePassManager.new
-          pass_manager_builder.populate mod_pass_manager
-          mod_pass_manager
-        end
-      end
 
-      @pass_manager_builder : LLVM::PassManagerBuilder?
-
-      private def pass_manager_builder
-        @pass_manager_builder ||= begin
-          registry = LLVM::PassRegistry.instance
-          registry.initialize_all
-
-          builder = LLVM::PassManagerBuilder.new
-          builder.size_level = 0
-
-          case optimization_mode
-          in .o3?
-            builder.opt_level = 3
-            builder.use_inliner_with_threshold = 275
-          in .o2?
-            builder.opt_level = 2
-            builder.use_inliner_with_threshold = 275
-          in .o1?
-            builder.opt_level = 1
-            builder.use_inliner_with_threshold = 150
-          in .o0?
-            # default behaviour, no optimizations
-          in .os?
-            builder.opt_level = 2
-            builder.size_level = 1
-            builder.use_inliner_with_threshold = 50
-          in .oz?
-            builder.opt_level = 2
-            builder.size_level = 2
-            builder.use_inliner_with_threshold = 5
-          end
-
-          builder
-        end
+        module_pass_manager = LLVM::ModulePassManager.new
+        pass_manager_builder.populate module_pass_manager
+        module_pass_manager.run llvm_mod
       end
     {% end %}
 
-    protected def optimize(llvm_mod)
+    protected def optimize(llvm_mod, target_machine)
       {% if LibLLVM::IS_LT_130 %}
         optimize_with_pass_manager(llvm_mod)
       {% else %}
@@ -836,6 +877,9 @@ module Crystal
       getter llvm_mod
       property? reused_previous_compilation = false
       getter object_extension : String
+      @memory_buffer : LLVM::MemoryBuffer?
+      @object_name : String?
+      @bc_name : String?
 
       def initialize(@compiler : Compiler, program : Program, @name : String,
                      @llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool)
@@ -865,73 +909,84 @@ module Crystal
         @object_extension = compiler.codegen_target.object_extension
       end
 
-      def compile
-        compile_to_object
+      def generate_bitcode
+        @memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer
       end
 
-      private def compile_to_object
-        bc_name = self.bc_name
-        object_name = self.object_name
-        temporary_object_name = self.temporary_object_name
+      # To compile a file we first generate a `.bc` file and then create an
+      # object file from it. These `.bc` files are stored in the cache
+      # directory.
+      #
+      # On a next compilation of the same project, and if the compile flags
+      # didn't change (a combination of the target triple, mcpu and link flags,
+      # amongst others), we check if the new `.bc` file is exactly the same as
+      # the old one. In that case the `.o` file will also be the same, so we
+      # simply reuse the old one. Generating an `.o` file is what takes most
+      # time.
+      #
+      # However, instead of directly generating the final `.o` file from the
+      # `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we
+      # rename that file to `.o`. We do this because the compiler could be
+      # interrupted while the `.o` file is being generated, leading to a
+      # corrupted file that later would cause compilation issues. Moving a file
+      # is an atomic operation so no corrupted `.o` file should be generated.
+      def compile(isolate_context = false)
+        if must_compile?
+          isolate_module_context if isolate_context
+          update_bitcode_cache
+          compile_to_object
+        else
+          @reused_previous_compilation = true
+        end
+        dump_llvm_ir
+      end
+
+      private def must_compile?
+        memory_buffer = generate_bitcode
 
-        # To compile a file we first generate a `.bc` file and then
-        # create an object file from it. These `.bc` files are stored
-        # in the cache directory.
-        #
-        # On a next compilation of the same project, and if the compile
-        # flags didn't change (a combination of the target triple, mcpu
-        # and link flags, amongst others), we check if the new
-        # `.bc` file is exactly the same as the old one. In that case
-        # the `.o` file will also be the same, so we simply reuse the
-        # old one. Generating an `.o` file is what takes most time.
-        #
-        # However, instead of directly generating the final `.o` file
-        # from the `.bc` file, we generate it to a temporary name (`.o.tmp`)
-        # and then we rename that file to `.o`. We do this because the compiler
-        # could be interrupted while the `.o` file is being generated, leading
-        # to a corrupted file that later would cause compilation issues.
-        # Moving a file is an atomic operation so no corrupted `.o` file should
-        # be generated.
-
-        must_compile = true
         can_reuse_previous_compilation =
           compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name)
 
-        memory_buffer = llvm_mod.write_bitcode_to_memory_buffer
-
         if can_reuse_previous_compilation
           memory_io = IO::Memory.new(memory_buffer.to_slice)
           changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) }
 
           # If the user cancelled a previous compilation
           # it might be that the .o file is empty
           if !changed && File.size(object_name) > 0
-            must_compile = false
             memory_buffer.dispose
-            memory_buffer = nil
+            return false
           else
             # We need to compile, so we'll write the memory buffer to file
           end
         end
 
-        # If there's a memory buffer, it means we must create a .o from it
-        if memory_buffer
-          # Delete existing .o file. It cannot be used anymore.
-          File.delete?(object_name)
-          # Create the .bc file (for next compilations)
-          File.write(bc_name, memory_buffer.to_slice)
-          memory_buffer.dispose
-        end
+        true
+      end
 
-        if must_compile
-          compiler.optimize llvm_mod unless compiler.optimization_mode.o0?
-          compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name
-          File.rename(temporary_object_name, object_name)
-        else
-          @reused_previous_compilation = true
-        end
+      # Parse the previously generated bitcode into the LLVM module using a
+      # dedicated context, so we can safely optimize & compile the module in
+      # multiple threads (llvm contexts can't be shared across threads).
+      private def isolate_module_context
+        @llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new)
+      end
 
-        dump_llvm_ir
+      private def update_bitcode_cache
+        return unless memory_buffer = @memory_buffer
+
+        # Delete existing .o file. It cannot be used anymore.
+        File.delete?(object_name)
+        # Create the .bc file (for next compilations)
+        File.write(bc_name, memory_buffer.to_slice)
+        memory_buffer.dispose
+      end
+
+      private def compile_to_object
+        temporary_object_name = self.temporary_object_name
+        target_machine = compiler.create_target_machine
+        compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0?
+        target_machine.emit_obj_to_file llvm_mod, temporary_object_name
+        File.rename(temporary_object_name, object_name)
       end
 
       private def dump_llvm_ir

diff --git a/src/llvm/lib_llvm/bit_reader.cr b/src/llvm/lib_llvm/bit_reader.cr
@@ -0,0 +1,5 @@
+require "./types"
+
+lib LibLLVM
+  fun parse_bitcode_in_context2 = LLVMParseBitcodeInContext2(c : ContextRef, mb : MemoryBufferRef, m : ModuleRef*) : Int
+end
diff --git a/src/llvm/module.cr b/src/llvm/module.cr
@@ -6,6 +6,12 @@ class LLVM::Module
 
   getter context : Context
 
+  def self.parse(memory_buffer : MemoryBuffer, context : Context) : self
+    LibLLVM.parse_bitcode_in_context2(context, memory_buffer, out module_ref)
+    raise "BUG: failed to parse LLVM bitcode from memory buffer" unless module_ref
+    new(module_ref, context)
+  end
+
   def initialize(@unwrap : LibLLVM::ModuleRef, @context : Context)
     @owned = false
   end