Skip to content

Commit

Permalink
Compiler: enable parallel codegen with MT (#14748)
Browse files Browse the repository at this point in the history
Implements parallel codegen of object files when MT is enabled in the compiler (`-Dpreview_mt`).

It only impacts codegen for compilations with more than one compilation unit (module), that is when neither of `--single-module`, `--release` or `--cross-compile` is specified. This behavior is identical to the fork based codegen.

**Advantages:**

- allows parallel codegen on Windows (untested);
- no need to fork many processes;
- no repeated GC collections in each forked processes;
- a simple Channel to distribute work efficiently (no need for IPC).

The main points are increased portability and simpler logic, despite having to take care of LLVM thread safety quirks (see comments).

Co-authored-by: Johannes Müller <[email protected]>
  • Loading branch information
ysbaddaden and straight-shoota authored Sep 21, 2024
1 parent 5f018fc commit c74f6bc
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 98 deletions.
251 changes: 153 additions & 98 deletions src/compiler/crystal/compiler.cr
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ require "crystal/digest/md5"
{% if flag?(:msvc) %}
require "./loader"
{% end %}
{% if flag?(:preview_mt) %}
require "wait_group"
{% end %}

module Crystal
@[Flags]
Expand Down Expand Up @@ -80,7 +83,13 @@ module Crystal
property? no_codegen = false

# Maximum number of LLVM modules that are compiled in parallel
property n_threads : Int32 = {% if flag?(:preview_mt) || flag?(:win32) %} 1 {% else %} 8 {% end %}
property n_threads : Int32 = {% if flag?(:preview_mt) %}
ENV["CRYSTAL_WORKERS"]?.try(&.to_i?) || 4
{% elsif flag?(:win32) %}
1
{% else %}
8
{% end %}

# Default prelude file to use. This ends up adding a
# `require "prelude"` (or whatever name is set here) to
Expand Down Expand Up @@ -328,6 +337,12 @@ module Crystal
CompilationUnit.new(self, program, type_name, llvm_mod, output_dir, bc_flags_changed)
end

{% if LibLLVM::IS_LT_170 %}
# initialize the legacy pass manager once in the main thread/process
# before we start codegen in threads (MT) or processes (fork)
init_llvm_legacy_pass_manager unless optimization_mode.o0?
{% end %}

if @cross_compile
cross_compile program, units, output_filename
else
Expand Down Expand Up @@ -391,7 +406,7 @@ module Crystal
llvm_mod = unit.llvm_mod

@progress_tracker.stage("Codegen (bc+obj)") do
optimize llvm_mod unless @optimization_mode.o0?
optimize llvm_mod, target_machine unless @optimization_mode.o0?

unit.emit(@emit_targets, emit_base_filename || output_filename)

Expand Down Expand Up @@ -529,14 +544,48 @@ module Crystal

private def parallel_codegen(units, n_threads)
{% if flag?(:preview_mt) %}
raise "Cannot fork compiler in multithread mode."
raise "LLVM isn't multithreaded and cannot fork compiler in multithread mode." unless LLVM.multithreaded?
mt_codegen(units, n_threads)
{% elsif LibC.has_method?("fork") %}
fork_codegen(units, n_threads)
{% else %}
raise "Cannot fork compiler. `Crystal::System::Process.fork` is not implemented on this system."
{% end %}
end

private def mt_codegen(units, n_threads)
channel = Channel(CompilationUnit).new(n_threads * 2)
wg = WaitGroup.new
mutex = Mutex.new

n_threads.times do
wg.spawn do
while unit = channel.receive?
unit.compile(isolate_context: true)
mutex.synchronize { @progress_tracker.stage_progress += 1 }
end
end
end

units.each do |unit|
# We generate the bitcode in the main thread because LLVM contexts
# must be unique per compilation unit, but we share different contexts
# across many modules (or rely on the global context); trying to
# codegen in parallel would segfault!
#
# Luckily generating the bitcode is quick and once the bitcode is
# generated we don't need the global LLVM contexts anymore but can
# parse the bitcode in an isolated context and we can parallelize the
# slowest part: the optimization pass & compiling the object file.
unit.generate_bitcode

channel.send(unit)
end
channel.close

wg.wait
end

private def fork_codegen(units, n_threads)
workers = fork_workers(n_threads) do |input, output|
while i = input.gets(chomp: true).presence
Expand Down Expand Up @@ -677,9 +726,10 @@ module Crystal

puts
puts "Codegen (bc+obj):"
if units.size == reused
case reused
when units.size
puts " - all previous .o files were reused"
elsif reused == 0
when .zero?
puts " - no previous .o files were reused"
else
puts " - #{reused}/#{units.size} .o files were reused"
Expand All @@ -706,61 +756,52 @@ module Crystal
end

{% if LibLLVM::IS_LT_170 %}
property! pass_manager_builder : LLVM::PassManagerBuilder

private def init_llvm_legacy_pass_manager
registry = LLVM::PassRegistry.instance
registry.initialize_all

builder = LLVM::PassManagerBuilder.new
builder.size_level = 0

case optimization_mode
in .o3?
builder.opt_level = 3
builder.use_inliner_with_threshold = 275
in .o2?
builder.opt_level = 2
builder.use_inliner_with_threshold = 275
in .o1?
builder.opt_level = 1
builder.use_inliner_with_threshold = 150
in .o0?
# default behaviour, no optimizations
in .os?
builder.opt_level = 2
builder.size_level = 1
builder.use_inliner_with_threshold = 50
in .oz?
builder.opt_level = 2
builder.size_level = 2
builder.use_inliner_with_threshold = 5
end

@pass_manager_builder = builder
end

private def optimize_with_pass_manager(llvm_mod)
fun_pass_manager = llvm_mod.new_function_pass_manager
pass_manager_builder.populate fun_pass_manager
fun_pass_manager.run llvm_mod
module_pass_manager.run llvm_mod
end

@module_pass_manager : LLVM::ModulePassManager?

private def module_pass_manager
@module_pass_manager ||= begin
mod_pass_manager = LLVM::ModulePassManager.new
pass_manager_builder.populate mod_pass_manager
mod_pass_manager
end
end

@pass_manager_builder : LLVM::PassManagerBuilder?

private def pass_manager_builder
@pass_manager_builder ||= begin
registry = LLVM::PassRegistry.instance
registry.initialize_all

builder = LLVM::PassManagerBuilder.new
builder.size_level = 0

case optimization_mode
in .o3?
builder.opt_level = 3
builder.use_inliner_with_threshold = 275
in .o2?
builder.opt_level = 2
builder.use_inliner_with_threshold = 275
in .o1?
builder.opt_level = 1
builder.use_inliner_with_threshold = 150
in .o0?
# default behaviour, no optimizations
in .os?
builder.opt_level = 2
builder.size_level = 1
builder.use_inliner_with_threshold = 50
in .oz?
builder.opt_level = 2
builder.size_level = 2
builder.use_inliner_with_threshold = 5
end

builder
end
module_pass_manager = LLVM::ModulePassManager.new
pass_manager_builder.populate module_pass_manager
module_pass_manager.run llvm_mod
end
{% end %}

protected def optimize(llvm_mod)
protected def optimize(llvm_mod, target_machine)
{% if LibLLVM::IS_LT_130 %}
optimize_with_pass_manager(llvm_mod)
{% else %}
Expand Down Expand Up @@ -836,6 +877,9 @@ module Crystal
getter llvm_mod
property? reused_previous_compilation = false
getter object_extension : String
@memory_buffer : LLVM::MemoryBuffer?
@object_name : String?
@bc_name : String?

def initialize(@compiler : Compiler, program : Program, @name : String,
@llvm_mod : LLVM::Module, @output_dir : String, @bc_flags_changed : Bool)
Expand Down Expand Up @@ -865,73 +909,84 @@ module Crystal
@object_extension = compiler.codegen_target.object_extension
end

def compile
compile_to_object
def generate_bitcode
@memory_buffer ||= llvm_mod.write_bitcode_to_memory_buffer
end

private def compile_to_object
bc_name = self.bc_name
object_name = self.object_name
temporary_object_name = self.temporary_object_name
# To compile a file we first generate a `.bc` file and then create an
# object file from it. These `.bc` files are stored in the cache
# directory.
#
# On a next compilation of the same project, and if the compile flags
# didn't change (a combination of the target triple, mcpu and link flags,
# amongst others), we check if the new `.bc` file is exactly the same as
# the old one. In that case the `.o` file will also be the same, so we
# simply reuse the old one. Generating an `.o` file is what takes most
# time.
#
# However, instead of directly generating the final `.o` file from the
# `.bc` file, we generate it to a temporary name (`.o.tmp`) and then we
# rename that file to `.o`. We do this because the compiler could be
# interrupted while the `.o` file is being generated, leading to a
# corrupted file that later would cause compilation issues. Moving a file
# is an atomic operation so no corrupted `.o` file should be generated.
def compile(isolate_context = false)
if must_compile?
isolate_module_context if isolate_context
update_bitcode_cache
compile_to_object
else
@reused_previous_compilation = true
end
dump_llvm_ir
end

private def must_compile?
memory_buffer = generate_bitcode

# To compile a file we first generate a `.bc` file and then
# create an object file from it. These `.bc` files are stored
# in the cache directory.
#
# On a next compilation of the same project, and if the compile
# flags didn't change (a combination of the target triple, mcpu
# and link flags, amongst others), we check if the new
# `.bc` file is exactly the same as the old one. In that case
# the `.o` file will also be the same, so we simply reuse the
# old one. Generating an `.o` file is what takes most time.
#
# However, instead of directly generating the final `.o` file
# from the `.bc` file, we generate it to a temporary name (`.o.tmp`)
# and then we rename that file to `.o`. We do this because the compiler
# could be interrupted while the `.o` file is being generated, leading
# to a corrupted file that later would cause compilation issues.
# Moving a file is an atomic operation so no corrupted `.o` file should
# be generated.

must_compile = true
can_reuse_previous_compilation =
compiler.emit_targets.none? && !@bc_flags_changed && File.exists?(bc_name) && File.exists?(object_name)

memory_buffer = llvm_mod.write_bitcode_to_memory_buffer

if can_reuse_previous_compilation
memory_io = IO::Memory.new(memory_buffer.to_slice)
changed = File.open(bc_name) { |bc_file| !IO.same_content?(bc_file, memory_io) }

# If the user cancelled a previous compilation
# it might be that the .o file is empty
if !changed && File.size(object_name) > 0
must_compile = false
memory_buffer.dispose
memory_buffer = nil
return false
else
# We need to compile, so we'll write the memory buffer to file
end
end

# If there's a memory buffer, it means we must create a .o from it
if memory_buffer
# Delete existing .o file. It cannot be used anymore.
File.delete?(object_name)
# Create the .bc file (for next compilations)
File.write(bc_name, memory_buffer.to_slice)
memory_buffer.dispose
end
true
end

if must_compile
compiler.optimize llvm_mod unless compiler.optimization_mode.o0?
compiler.target_machine.emit_obj_to_file llvm_mod, temporary_object_name
File.rename(temporary_object_name, object_name)
else
@reused_previous_compilation = true
end
# Parse the previously generated bitcode into the LLVM module using a
# dedicated context, so we can safely optimize & compile the module in
# multiple threads (llvm contexts can't be shared across threads).
private def isolate_module_context
@llvm_mod = LLVM::Module.parse(@memory_buffer.not_nil!, LLVM::Context.new)
end

dump_llvm_ir
private def update_bitcode_cache
return unless memory_buffer = @memory_buffer

# Delete existing .o file. It cannot be used anymore.
File.delete?(object_name)
# Create the .bc file (for next compilations)
File.write(bc_name, memory_buffer.to_slice)
memory_buffer.dispose
end

private def compile_to_object
temporary_object_name = self.temporary_object_name
target_machine = compiler.create_target_machine
compiler.optimize llvm_mod, target_machine unless compiler.optimization_mode.o0?
target_machine.emit_obj_to_file llvm_mod, temporary_object_name
File.rename(temporary_object_name, object_name)
end

private def dump_llvm_ir
Expand Down
5 changes: 5 additions & 0 deletions src/llvm/lib_llvm/bit_reader.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
require "./types"

lib LibLLVM
fun parse_bitcode_in_context2 = LLVMParseBitcodeInContext2(c : ContextRef, mb : MemoryBufferRef, m : ModuleRef*) : Int
end
6 changes: 6 additions & 0 deletions src/llvm/module.cr
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ class LLVM::Module

getter context : Context

def self.parse(memory_buffer : MemoryBuffer, context : Context) : self
LibLLVM.parse_bitcode_in_context2(context, memory_buffer, out module_ref)
raise "BUG: failed to parse LLVM bitcode from memory buffer" unless module_ref
new(module_ref, context)
end

def initialize(@unwrap : LibLLVM::ModuleRef, @context : Context)
@owned = false
end
Expand Down

0 comments on commit c74f6bc

Please sign in to comment.