diff --git a/src/compiler/crystal/compiler.cr b/src/compiler/crystal/compiler.cr
index 38880ee9ed64..008faa9ff7c6 100644
--- a/src/compiler/crystal/compiler.cr
+++ b/src/compiler/crystal/compiler.cr
@@ -5,6 +5,9 @@ require "crystal/digest/md5"
 {% if flag?(:msvc) %}
   require "./loader"
 {% end %}
+{% if flag?(:preview_mt) %}
+  require "wait_group"
+{% end %}
 
 module Crystal
   @[Flags]
@@ -80,7 +83,13 @@ module Crystal
     property? no_codegen = false
 
     # Maximum number of LLVM modules that are compiled in parallel
-    property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %}
+    property n_threads : Int32 = {% if flag?(:preview_mt) %}
+      ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4
+    {% elsif flag?(:win32) %}
+      1
+    {% else %}
+      8
+    {% end %}
 
     # Default prelude file to use. This ends up adding a
     # `require "prelude"` (or whatever name is set here) to
@@ -391,7 +400,7 @@ module Crystal
       llvm_mod = unit.llvm_mod
 
       @progress_tracker.stage("Codegen (bc+obj)") do
-        optimize llvm_mod unless @optimization_mode.o0?
+        optimize llvm_mod, target_machine unless @optimization_mode.o0?
 
         unit.emit(@emit_targets, emit_base_filename || output_filename)
 
@@ -512,7 +521,8 @@ module Crystal
 
     private def parallel_codegen(units, n_threads)
       {% if flag?(:preview_mt) %}
-        raise "Cannot fork compiler in multithread mode."
+        raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded?
+        mt_codegen(units, n_threads)
       {% elsif LibC.has_method?("fork") %}
         fork_codegen(units, n_threads)
       {% else %}
@@ -520,6 +530,42 @@ module Crystal
       {% end %}
     end
 
+    private def mt_codegen(units, n_threads)
+      channel = Channel(CompilationUnit).new(n_threads * 2)
+      wg = WaitGroup.new(n_threads)
+      mutex = Mutex.new
+
+      n_threads.times do
+        spawn do
+          while unit = channel.receive?
+            unit.compile(isolate_context: true)
+            next unless wants_stats_or_progress?
+            mutex.synchronize { @progress_tracker.stage_progress += 1 }
+          end
+        ensure
+          wg.done
+        end
+      end
+
+      units.each do |unit|
+        # We generate the bitcode in the main thread: LLVM contexts can't be
+        # shared across threads, yet compilation units don't each have their
+        # own context (modules share contexts or rely on the global one), so
+        # trying to codegen in parallel would segfault!
+        #
+        # Luckily, generating the bitcode is quick. Once it is generated we no
+        # longer need the global LLVM contexts: each worker parses the bitcode
+        # into an isolated context, which lets us parallelize the slowest
+        # part: the optimization pass & compiling the object file.
+        unit.generate_bitcode
+
+        channel.send(unit)
+      end
+      channel.close
+
+      wg.wait
+    end
+
     private def fork_codegen(units, n_threads)
       workers = fork_workers(n_threads) do |input, output|
         while i = input.gets(chomp: true).presence
@@ -743,7 +789,7 @@ module Crystal
       end
     {% end %}
 
-    protected def optimize(llvm_mod)
+    protected def optimize(llvm_mod, target_machine)
       {% if LibLLVM::IS_LT_130 %}
         optimize_with_pass_manager(llvm_mod)
       {% else %}
@@ -819,6 +865,9 @@ module Crystal
       getter llvm_mod
       property? reused_previous_compilation = false
       getter object_extension : String
+      @memory_buffer : LLVM::MemoryBuffer?
+      @object_name : String?
+      @bc_name : String?
 
       def initialize(@compiler : Compiler, program : Program, @name : String,
                      @llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool)
@@ -848,40 +897,44 @@ module Crystal
         @object_extension = compiler.codegen_target.object_extension
       end
 
-      def compile
-        compile_to_object
+      def generate_bitcode
+        @memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer
       end
 
-      private def compile_to_object
-        bc_name = self.bc_name
-        object_name = self.object_name
-        temporary_object_name = self.temporary_object_name
+      # To compile a file we first generate a `.bc` file and then create an
+      # object file from it. These `.bc` files are stored in the cache
+      # directory.
+      #
+      # On the next compilation of the same project, and if the compile flags
+      # didn't change (a combination of the target triple, mcpu and link flags,
+      # amongst others), we check if the new `.bc` file is exactly the same as
+      # the old one. In that case the `.o` file will also be the same, so we
+      # simply reuse the old one. Generating an `.o` file is what takes most
+      # time.
+      #
+      # However, instead of directly generating the final `.o` file from the
+      # `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we
+      # rename that file to `.o`. We do this because the compiler could be
+      # interrupted while the `.o` file is being generated, leading to a
+      # corrupted file that later would cause compilation issues. Moving a file
+      # is an atomic operation so no corrupted `.o` file should be generated.
+      def compile(isolate_context = false)
+        if must_compile?
+          isolate_module_context if isolate_context
+          update_bitcode_cache
+          compile_to_object
+        else
+          @reused_previous_compilation = true
+        end
+        dump_llvm_ir
+      end
+
+      private def must_compile?
+        memory_buffer = generate_bitcode
 
-        # To compile a file we first generate a `.bc` file and then
-        # create an object file from it. These `.bc` files are stored
-        # in the cache directory.
-        #
-        # On a next compilation of the same project, and if the compile
-        # flags didn't change (a combination of the target triple, mcpu
-        # and link flags, amongst others), we check if the new
-        # `.bc` file is exactly the same as the old one. In that case
-        # the `.o` file will also be the same, so we simply reuse the
-        # old one. Generating an `.o` file is what takes most time.
-        #
-        # However, instead of directly generating the final `.o` file
-        # from the `.bc` file, we generate it to a temporary name (`.o.tmp`)
-        # and then we rename that file to `.o`. We do this because the compiler
-        # could be interrupted while the `.o` file is being generated, leading
-        # to a corrupted file that later would cause compilation issues.
-        # Moving a file is an atomic operation so no corrupted `.o` file should
-        # be generated.
-
-        must_compile = true
         can_reuse_previous_compilation =
           compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name)
 
-        memory_buffer = llvm_mod.write_bitcode_to_memory_buffer
-
         if can_reuse_previous_compilation
           memory_io = IO::Memory.new(memory_buffer.to_slice)
           changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) }
@@ -889,32 +942,39 @@ module Crystal
           # If the user cancelled a previous compilation
           # it might be that the .o file is empty
           if !changed && File.size(object_name) > 0
-            must_compile = false
             memory_buffer.dispose
-            memory_buffer = nil
+            return false
           else
             # We need to compile, so we'll write the memory buffer to file
           end
         end
 
-        # If there's a memory buffer, it means we must create a .o from it
-        if memory_buffer
+        true
+      end
+
+      # Parse the previously generated bitcode into the LLVM module using a
+      # dedicated context, so we can safely optimize & compile the module in
+      # multiple threads (LLVM contexts can't be shared across threads).
+      private def isolate_module_context
+        @llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new)
+      end
+
+      private def update_bitcode_cache
+        if memory_buffer = @memory_buffer
           # Delete existing .o file. It cannot be used anymore.
           File.delete?(object_name)
           # Create the .bc file (for next compilations)
           File.write(bc_name, memory_buffer.to_slice)
           memory_buffer.dispose
         end
+      end
 
-        if must_compile
-          compiler.optimize llvm_mod unless compiler.optimization_mode.o0?
-          compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name
-          File.rename(temporary_object_name, object_name)
-        else
-          @reused_previous_compilation = true
-        end
-
-        dump_llvm_ir
+      private def compile_to_object
+        temporary_object_name = self.temporary_object_name
+        target_machine = compiler.create_target_machine
+        compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0?
+        target_machine.emit_obj_to_file llvm_mod, temporary_object_name
+        File.rename(temporary_object_name, object_name)
       end
 
       private def dump_llvm_ir