Add Crystal::Compiler#mt_codegen

When compiled with `-Dpreview_mt`, the compiler takes advantage of the multithreaded (MT) environment to codegen the compilation units in parallel. This avoids `fork` (which isn't supported with MT) and enables parallel codegen on Windows.
crystal-lang · Jul 2, 2024 · ac91f7c · ac91f7c
1 parent 95a604a
commit ac91f7c
Showing 1 changed file with 105 additions and 45 deletions.
diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr
@@ -5,6 +5,9 @@ require "crystal/digest/md5"
 {% if flag?(:msvc) %}
   require "./loader"
 {% end %}
+{% if flag?(:preview_mt) %}
+  require "wait_group"
+{% end %}
 
 module Crystal
   @[Flags]
@@ -80,7 +83,13 @@ module Crystal
     property? no_codegen = false
 
     # Maximum number of LLVM modules that are compiled in parallel
-    property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %}
+    #
+    # With `-Dpreview_mt` the default is read from the `CRYSTAL_WORKERS`
+    # environment variable and falls back to 4 when the variable is unset or
+    # isn't a valid integer (no range check is applied to the value here).
+    # Without `-Dpreview_mt` the default is 1 on win32 and 8 everywhere else.
+    property n_threads : Int32 = {% if flag?(:preview_mt) %}
+      ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4
+    {% elsif flag?(:win32) %}
+      1
+    {% else %}
+      8
+    {% end %}
 
     # Default prelude file to use. This ends up adding a
     # `require "prelude"` (or whatever name is set here) to
@@ -391,7 +400,7 @@ module Crystal
       llvm_mod = unit.llvm_mod
 
       @progress_tracker.stage("Codegen (bc+obj)") do
-        optimize llvm_mod unless @optimization_mode.o0?
+        optimize llvm_mod, target_machine unless @optimization_mode.o0?
 
         unit.emit(@emit_targets, emit_base_filename || output_filename)
 
@@ -512,14 +521,51 @@ module Crystal
 
+    # Compiles *units* in parallel. The strategy is selected at compile time:
+    #
+    # * with `-Dpreview_mt`: delegates to `mt_codegen`, raising first if
+    #   `LLVM.multithreaded?` is false;
+    # * otherwise, when `LibC` has a `fork` method: delegates to `fork_codegen`;
+    # * otherwise: raises, since neither strategy is available.
     private def parallel_codegen(units, n_threads)
       {% if flag?(:preview_mt) %}
-        raise "Cannot fork compiler in multithread mode."
+        raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded?
+        mt_codegen(units, n_threads)
       {% elsif LibC.has_method?("fork") %}
         fork_codegen(units, n_threads)
       {% else %}
         raise "Cannot fork compiler. `Crystal::System::Process.fork` is not implemented on this system."
       {% end %}
     end
 
+    # Compiles *units* concurrently using *n_threads* spawned workers.
+    #
+    # The caller generates the bitcode of each unit, one at a time, and hands
+    # the unit over to the workers through a buffered channel. Each worker
+    # compiles the units it receives with `isolate_context: true`. Returns once
+    # the channel has been closed and every worker has signalled completion.
+    #
+    # NOTE(review): an exception raised inside a worker only ends that worker;
+    # nothing in this method reports it back to the caller. If every worker
+    # stopped, `channel.send` below could block once the buffer is full —
+    # confirm this is acceptable or handled elsewhere.
+    private def mt_codegen(units, n_threads)
+      # Buffered so the producer loop below doesn't have to wait for a worker
+      # on every single unit.
+      channel = Channel(CompilationUnit).new(n_threads * 2)
+      # One `done` is expected per spawned worker.
+      wg = WaitGroup.new(n_threads)
+      # Serializes updates to the progress tracker shared by all workers.
+      mutex = Mutex.new
+
+      n_threads.times do
+        spawn do
+          # `receive?` returns nil once the channel is closed and drained,
+          # which ends the loop.
+          while unit = channel.receive?
+            unit.compile(isolate_context: true)
+            next unless wants_stats_or_progress?
+            mutex.synchronize { @progress_tracker.stage_progress += 1 }
+          end
+        ensure
+          # Always signal completion, even when compiling a unit raised, so
+          # `wg.wait` below doesn't wait on this worker forever.
+          wg.done
+        end
+      end
+
+      units.each do |unit|
+        # We generate the bitcode in the main thread because LLVM contexts
+        # must be unique per compilation unit, but we share different contexts
+        # across many modules (or rely on the global context); trying to
+        # codegen in parallel would segfault!
+        #
+        # Luckily generating the bitcode is quick and once the bitcode is
+        # generated we don't need the global LLVM contexts anymore but can
+        # parse the bitcode in an isolated context and we can parallelize the
+        # slowest part: the optimization pass & compiling the object file.
+        unit.generate_bitcode
+
+        channel.send(unit)
+      end
+      # Closing the channel lets the workers exit their receive loop.
+      channel.close
+
+      wg.wait
+    end
+
     private def fork_codegen(units, n_threads)
       workers = fork_workers(n_threads) do |input, output|
         while i = input.gets(chomp: true).presence
@@ -743,7 +789,7 @@ module Crystal
       end
     {% end %}
 
-    protected def optimize(llvm_mod)
+    protected def optimize(llvm_mod, target_machine)
       {% if LibLLVM::IS_LT_130 %}
         optimize_with_pass_manager(llvm_mod)
       {% else %}
@@ -819,6 +865,9 @@ module Crystal
       getter llvm_mod
       property? reused_previous_compilation = false
       getter object_extension : String
+      @memory_buffer : LLVM::MemoryBuffer?
+      @object_name : String?
+      @bc_name : String?
 
       def initialize(@compiler : Compiler, program : Program, @name : String,
                      @llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool)
@@ -848,73 +897,84 @@ module Crystal
         @object_extension = compiler.codegen_target.object_extension
       end
 
-      def compile
-        compile_to_object
+      # Writes the LLVM module's bitcode to a memory buffer and returns it.
+      #
+      # The buffer is memoized in `@memory_buffer`: the bitcode is only written
+      # on the first call and later calls return the same buffer.
+      def generate_bitcode
+        @memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer
       end
 
-      private def compile_to_object
-        bc_name = self.bc_name
-        object_name = self.object_name
-        temporary_object_name = self.temporary_object_name
+      # To compile a file we first generate a `.bc` file and then create an
+      # object file from it. These `.bc` files are stored in the cache
+      # directory.
+      #
+      # On a next compilation of the same project, and if the compile flags
+      # didn't change (a combination of the target triple, mcpu and link flags,
+      # amongst others), we check if the new `.bc` file is exactly the same as
+      # the old one. In that case the `.o` file will also be the same, so we
+      # simply reuse the old one. Generating an `.o` file is what takes most
+      # time.
+      #
+      # However, instead of directly generating the final `.o` file from the
+      # `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we
+      # rename that file to `.o`. We do this because the compiler could be
+      # interrupted while the `.o` file is being generated, leading to a
+      # corrupted file that later would cause compilation issues. Moving a file
+      # is an atomic operation so no corrupted `.o` file should be generated.
+      #
+      # When *isolate_context* is true and the unit must be compiled, the
+      # module is first re-parsed from its bitcode into a dedicated LLVM
+      # context (see `isolate_module_context`). When nothing needs to be
+      # compiled, the unit is flagged as reusing the previous compilation
+      # instead. The LLVM IR dump step runs in both cases.
+      def compile(isolate_context = false)
+        if must_compile?
+          isolate_module_context if isolate_context
+          update_bitcode_cache
+          compile_to_object
+        else
+          @reused_previous_compilation = true
+        end
+        dump_llvm_ir
+      end
+
+      # Returns whether this unit has to be (re)compiled.
+      #
+      # Returns false only when all of the following hold: no emit targets are
+      # requested, the bitcode flags didn't change, both the cached `.bc` file
+      # and the object file exist, the cached `.bc` file has the same content
+      # as the freshly generated bitcode, and the object file isn't empty. In
+      # that case the in-memory bitcode buffer is disposed before returning.
+      # Returns true in every other case.
+      private def must_compile?
+        memory_buffer = generate_bitcode
 
-        # To compile a file we first generate a `.bc` file and then
-        # create an object file from it. These `.bc` files are stored
-        # in the cache directory.
-        #
-        # On a next compilation of the same project, and if the compile
-        # flags didn't change (a combination of the target triple, mcpu
-        # and link flags, amongst others), we check if the new
-        # `.bc` file is exactly the same as the old one. In that case
-        # the `.o` file will also be the same, so we simply reuse the
-        # old one. Generating an `.o` file is what takes most time.
-        #
-        # However, instead of directly generating the final `.o` file
-        # from the `.bc` file, we generate it to a temporary name (`.o.tmp`)
-        # and then we rename that file to `.o`. We do this because the compiler
-        # could be interrupted while the `.o` file is being generated, leading
-        # to a corrupted file that later would cause compilation issues.
-        # Moving a file is an atomic operation so no corrupted `.o` file should
-        # be generated.
-
-        must_compile = true
         can_reuse_previous_compilation =
           compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name)
 
-        memory_buffer = llvm_mod.write_bitcode_to_memory_buffer
-
         if can_reuse_previous_compilation
           memory_io = IO::Memory.new(memory_buffer.to_slice)
           changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) }
 
           # If the user cancelled a previous compilation
           # it might be that the .o file is empty
           if !changed && File.size(object_name) > 0
-            must_compile = false
+            # NOTE(review): `@memory_buffer` keeps referencing this buffer
+            # after it is disposed, so a later `generate_bitcode` call would
+            # return the disposed buffer — confirm nothing calls it again.
             memory_buffer.dispose
-            memory_buffer = nil
+            return false
           else
             # We need to compile, so we'll write the memory buffer to file
           end
         end
 
-        # If there's a memory buffer, it means we must create a .o from it
-        if memory_buffer
+        true
+      end
+
+      # Parse the previously generated bitcode into the LLVM module using a
+      # dedicated context, so we can safely optimize & compile the module in
+      # multiple threads (llvm contexts can't be shared across threads).
+      #
+      # Replaces `@llvm_mod` with the re-parsed module. Relies on
+      # `@memory_buffer` having been set beforehand (`not_nil!` raises
+      # otherwise).
+      private def isolate_module_context
+        @llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new)
+      end
+
+      # Refreshes the on-disk cache from the in-memory bitcode: deletes the
+      # existing object file (if any), writes the new `.bc` file, then disposes
+      # the buffer. Does nothing when `@memory_buffer` is nil.
+      private def update_bitcode_cache
+        if memory_buffer = @memory_buffer
           # Delete existing .o file. It cannot be used anymore.
           File.delete?(object_name)
           # Create the .bc file (for next compilations)
           File.write(bc_name, memory_buffer.to_slice)
           memory_buffer.dispose
         end
+      end
 
-        if must_compile
-          compiler.optimize llvm_mod unless compiler.optimization_mode.o0?
-          compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name
-          File.rename(temporary_object_name, object_name)
-        else
-          @reused_previous_compilation = true
-        end
-
-        dump_llvm_ir
+      # Optimizes the module (skipped when the optimization mode is O0) and
+      # emits its object file, using a target machine obtained from
+      # `compiler.create_target_machine` for both steps. The object is written
+      # to the temporary object name first and then renamed to the final name.
+      private def compile_to_object
+        temporary_object_name = self.temporary_object_name
+        target_machine = compiler.create_target_machine
+        compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0?
+        target_machine.emit_obj_to_file llvm_mod, temporary_object_name
+        File.rename(temporary_object_name, object_name)
       end
 
       private def dump_llvm_ir