Skip to content

Commit

Permalink
Experiment with a slightly adjusted pipeline (#52850)
Browse files Browse the repository at this point in the history
Needs #57380 to merge first
  • Loading branch information
gbaraldi authored Feb 21, 2025
1 parent fb0a283 commit 58ce713
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 73 deletions.
155 changes: 96 additions & 59 deletions src/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,22 @@
#include <llvm/Passes/PassPlugin.h>

// NewPM needs to manually include all the pass headers
#include <llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/Annotation2Metadata.h>
#include <llvm/Transforms/IPO/ConstantMerge.h>
#include <llvm/Transforms/IPO/ForceFunctionAttrs.h>
#include <llvm/Transforms/IPO/GlobalDCE.h>
#include <llvm/Transforms/IPO/GlobalOpt.h>
#include <llvm/Transforms/IPO/StripDeadPrototypes.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation/AddressSanitizer.h>
#include <llvm/Transforms/Instrumentation/MemorySanitizer.h>
#include <llvm/Transforms/Instrumentation/ThreadSanitizer.h>
#include <llvm/Transforms/Scalar/ADCE.h>
#include <llvm/Transforms/Scalar/AnnotationRemarks.h>
#include <llvm/Transforms/Scalar/BDCE.h>
#include "llvm/Transforms/Scalar/ConstraintElimination.h"
#include <llvm/Transforms/Scalar/CorrelatedValuePropagation.h>
#include <llvm/Transforms/Scalar/DCE.h>
#include <llvm/Transforms/Scalar/DeadStoreElimination.h>
Expand All @@ -59,13 +63,17 @@
#include <llvm/Transforms/Scalar/LowerConstantIntrinsics.h>
#include <llvm/Transforms/Scalar/LowerExpectIntrinsic.h>
#include <llvm/Transforms/Scalar/MemCpyOptimizer.h>
#include <llvm/Transforms/Scalar/MergedLoadStoreMotion.h>
#include <llvm/Transforms/Scalar/Reassociate.h>
#include <llvm/Transforms/Scalar/SCCP.h>
#include <llvm/Transforms/Scalar/SROA.h>
#include <llvm/Transforms/Scalar/SimpleLoopUnswitch.h>
#include <llvm/Transforms/Scalar/SimplifyCFG.h>
#include <llvm/Transforms/Scalar/WarnMissedTransforms.h>
#include <llvm/Transforms/Utils/LibCallsShrinkWrap.h>
#include <llvm/Transforms/Utils/InjectTLIMappings.h>
#include <llvm/Transforms/Utils/Mem2Reg.h>
#include <llvm/Transforms/Utils/RelLookupTableConverter.h>
#include <llvm/Transforms/Utils/ModuleUtils.h>
#include <llvm/Transforms/Utils/SimplifyCFGOptions.h>
#include <llvm/Transforms/Vectorize/LoopVectorize.h>
Expand Down Expand Up @@ -196,10 +204,9 @@ namespace {
.convertSwitchRangeToICmp(true)
.convertSwitchToLookupTable(true)
.forwardSwitchCondToPhi(true)
//These mess with loop rotation, so only do them after that
.needCanonicalLoops(false)
.hoistCommonInsts(true)
// Causes an SRET assertion error in late-gc-lowering
// .sinkCommonInsts(true)
.sinkCommonInsts(true)
;
}

Expand Down Expand Up @@ -341,10 +348,16 @@ static void buildEarlySimplificationPipeline(ModulePassManager &MPM, PassBuilder
FPM.addPass(DCEPass());
FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions()));
if (O.getSpeedupLevel() >= 1) {
// TODO check the LLVM 15 default.
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
FPM.addPass(EarlyCSEPass());
}
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (O.getSpeedupLevel() >= 1) {
FunctionPassManager GlobalFPM;
MPM.addPass(GlobalOptPass());
GlobalFPM.addPass(PromotePass());
GlobalFPM.addPass(InstCombinePass());
}
}
invokeEarlySimplificationCallbacks(MPM, PB, O);
}
Expand Down Expand Up @@ -379,22 +392,24 @@ static void buildEarlyOptimizerPipeline(ModulePassManager &MPM, PassBuilder *PB,
if (O.getSpeedupLevel() >= 1) {
FunctionPassManager FPM;
if (O.getSpeedupLevel() >= 2) {
// TODO check the LLVM 15 default.
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
// SROA can duplicate PHI nodes which can block LowerSIMD
FPM.addPass(InstCombinePass());
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(ReassociatePass());
FPM.addPass(EarlyCSEPass());
JULIA_PASS(FPM.addPass(AllocOptPass()));
} else { // if (O.getSpeedupLevel() >= 1) (exactly)
FPM.addPass(InstCombinePass());
FPM.addPass(EarlyCSEPass());
}
invokePeepholeEPCallbacks(FPM, PB, O);
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
FPM.addPass(EarlyCSEPass(true));
FPM.addPass(InstCombinePass());
FPM.addPass(AggressiveInstCombinePass());
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(LibCallsShrinkWrapPass());
FPM.addPass(ReassociatePass());
FPM.addPass(ConstraintEliminationPass());
JULIA_PASS(FPM.addPass(AllocOptPass()));
} else { // if (O.getSpeedupLevel() >= 1) (exactly)
FPM.addPass(EarlyCSEPass());
FPM.addPass(InstCombinePass());
}
invokePeepholeEPCallbacks(FPM, PB, O);
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), /*UseMemorySSA = */true));
}
MPM.addPass(GlobalOptPass());
MPM.addPass(GlobalDCEPass());
}
MPM.addPass(AfterEarlyOptimizationMarkerPass());
Expand All @@ -407,41 +422,41 @@ static void buildLoopOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *PB
LoopPassManager LPM;
LPM.addPass(LowerSIMDLoopPass());
if (O.getSpeedupLevel() >= 2) {
LPM.addPass(LoopRotatePass());
LPM.addPass(LoopInstSimplifyPass());
LPM.addPass(LoopSimplifyCFGPass());
LPM.addPass(BeforeLICMMarkerPass());
auto opts = LICMOptions();
opts.AllowSpeculation = false;
LPM.addPass(LICMPass(opts));
LPM.addPass(JuliaLICMPass());
LPM.addPass(LoopRotatePass(true, false));
LPM.addPass(LICMPass(LICMOptions()));
LPM.addPass(JuliaLICMPass());
LPM.addPass(AfterLICMMarkerPass());
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
}
invokeLateLoopOptimizationCallbacks(LPM, PB, O);
//We don't know if the loop callbacks support MSSA
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false));
}
if (O.getSpeedupLevel() >= 2) {
LoopPassManager LPM;
LPM.addPass(BeforeLICMMarkerPass());
LPM.addPass(LICMPass(LICMOptions()));
LPM.addPass(JuliaLICMPass());
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
LPM.addPass(LICMPass(LICMOptions()));
LPM.addPass(JuliaLICMPass());
LPM.addPass(AfterLICMMarkerPass());
//LICM needs MemorySSA now, so we must use it
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */true));
}
if (O.getSpeedupLevel() >= 2) {
if (O.getSpeedupLevel() >= 2)
FPM.addPass(IRCEPass());
}
{
LoopPassManager LPM;
LPM.addPass(BeforeLoopSimplificationMarkerPass());
if (O.getSpeedupLevel() >= 2) {
LPM.addPass(LoopInstSimplifyPass());
LPM.addPass(LoopIdiomRecognizePass());
LPM.addPass(IndVarSimplifyPass());
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
LPM.addPass(LoopDeletionPass());
// This unroll will only unroll loops when the trip count is known and small,
// so that no loop remains
LPM.addPass(LoopFullUnrollPass());
}
invokeLoopOptimizerEndCallbacks(LPM, PB, O);
LPM.addPass(AfterLoopSimplificationMarkerPass());
FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions()));
FPM.addPass(InstCombinePass());
//We don't know if the loop end callbacks support MSSA
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false));
}
Expand All @@ -454,17 +469,28 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *
if (options.enable_scalar_optimizations) {
if (O.getSpeedupLevel() >= 2) {
JULIA_PASS(FPM.addPass(AllocOptPass()));
// TODO check the LLVM 15 default.
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
FPM.addPass(InstSimplifyPass());
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
FPM.addPass(MergedLoadStoreMotionPass());
FPM.addPass(GVNPass());
FPM.addPass(MemCpyOptPass());
FPM.addPass(SCCPPass());
FPM.addPass(BDCEPass());
FPM.addPass(InstCombinePass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(DCEPass());
FPM.addPass(ADCEPass());
FPM.addPass(MemCpyOptPass());
FPM.addPass(DSEPass());
FPM.addPass(IRCEPass());
FPM.addPass(InstCombinePass());
FPM.addPass(JumpThreadingPass());
FPM.addPass(ConstraintEliminationPass());
} else if (O.getSpeedupLevel() >= 1) {
JULIA_PASS(FPM.addPass(AllocOptPass()));
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
FPM.addPass(MemCpyOptPass());
FPM.addPass(SCCPPass());
FPM.addPass(BDCEPass());
FPM.addPass(InstCombinePass());
FPM.addPass(ADCEPass());
}
if (O.getSpeedupLevel() >= 3) {
FPM.addPass(GVNPass());
Expand All @@ -476,12 +502,15 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *
JULIA_PASS(FPM.addPass(AllocOptPass()));
{
LoopPassManager LPM;
LPM.addPass(LoopDeletionPass());
LPM.addPass(LoopInstSimplifyPass());
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
LPM.addPass(LICMPass(LICMOptions()));
LPM.addPass(JuliaLICMPass());
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */true));
}
FPM.addPass(LoopDistributePass());
}
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
FPM.addPass(InstCombinePass());
} else if (O.getSpeedupLevel() >= 1)
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));

invokeScalarOptimizerCallbacks(FPM, PB, O);
}
FPM.addPass(AfterScalarOptimizationMarkerPass());
Expand All @@ -491,19 +520,27 @@ static void buildVectorPipeline(FunctionPassManager &FPM, PassBuilder *PB, Optim
FPM.addPass(BeforeVectorizationMarkerPass());
if (options.enable_vector_pipeline) {
//TODO look into loop vectorize options
// Rerotate loops that might have been unrotated in the simplification
LoopPassManager LPM;
LPM.addPass(LoopRotatePass());
LPM.addPass(LoopDeletionPass());
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
FPM.addPass(LoopDistributePass());
FPM.addPass(InjectTLIMappings());
FPM.addPass(LoopVectorizePass());
FPM.addPass(LoopLoadEliminationPass());
FPM.addPass(InstCombinePass());
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
FPM.addPass(EarlyCSEPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(InstCombinePass());
FPM.addPass(SLPVectorizerPass());
invokeVectorizerCallbacks(FPM, PB, O);
FPM.addPass(VectorCombinePass());
FPM.addPass(ADCEPass());
//TODO add BDCEPass here?
// This unroll will unroll vectorized loops
// as well as loops that we tried but failed to vectorize
invokeVectorizerCallbacks(FPM, PB, O);
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(O.getSpeedupLevel(), /*OnlyWhenForced = */ false, /*ForgetSCEV = */false)));
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
FPM.addPass(InstSimplifyPass());
FPM.addPass(AfterVectorizationMarkerPass());
}
FPM.addPass(AfterVectorizationMarkerPass());
}
Expand All @@ -525,18 +562,18 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
FunctionPassManager FPM;
JULIA_PASS(FPM.addPass(LateLowerGCPass()));
JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
if (O.getSpeedupLevel() >= 2) {
FPM.addPass(DSEPass());
FPM.addPass(GVNPass());
FPM.addPass(SCCPPass());
FPM.addPass(DCEPass());
}
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
}
JULIA_PASS(MPM.addPass(LowerPTLSPass(options.dump_native)));
MPM.addPass(RemoveJuliaAddrspacesPass()); //TODO: Make this conditional on arches (GlobalISel doesn't like our addrsspaces)
if (O.getSpeedupLevel() >= 1) {
FunctionPassManager FPM;
if (O.getSpeedupLevel() >= 2) {
FPM.addPass(DSEPass());
FPM.addPass(GVNPass());
FPM.addPass(SCCPPass());
FPM.addPass(DCEPass());
}
FPM.addPass(InstCombinePass());
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
Expand Down
4 changes: 2 additions & 2 deletions test/boundscheck_exec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ end |> only === Type{Int}

if bc_opt == bc_default
# Array/Memory escape analysis
function no_allocate(T::Type{<:Union{Memory, Vector}})
function no_allocate(T::Type{<:Union{Memory}})
v = T(undef, 2)
v[1] = 2
v[2] = 3
Expand All @@ -308,7 +308,7 @@ if bc_opt == bc_default
# Assert that a call to `no_allocate(T)` allocates nothing, i.e. that
# `@allocated` reports 0 for it.
# `broken` (default `false`) is passed straight through as `@test`'s `broken` keyword.
function test_alloc(::Type{T}; broken=false) where T
@test (@allocated no_allocate(T)) == 0 broken=broken
end
for T in [Memory, Vector]
for T in [Memory] # This requires changing the pointer_from_objref to something llvm sees through
for ET in [Int, Float32, Union{Int, Float64}]
no_allocate(T{ET}) #compile
# allocations aren't removed for Union eltypes which they theoretically could be eventually
Expand Down
3 changes: 1 addition & 2 deletions test/llvmpasses/image-codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
# CHECK-NOT: private global
# CHECK: jl_global
# COM: we emit both declarations and definitions, so we may see either style in the IR
# CHECK-SAME: = {{(external )?}}global
# CHECK: julia_f_
# CHECK-SAME: = {{(external )?}}
# CHECK-NOT: internal global
# CHECK-NOT: private global

Expand Down
1 change: 1 addition & 0 deletions test/llvmpasses/late-lower-gc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,4 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) {
; CHECK-NEXT: !10 = distinct !{!10}
; CHECK-NEXT: !11 = !{!12, !12, i64 0}
; CHECK-NEXT: !12 = !{!"jtbaa_const", !3}

15 changes: 15 additions & 0 deletions test/llvmpasses/pipeline-o2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,18 @@ function loopedlength(arr)
end
len
end
# COM: Vector
# ALL-LABEL: @julia_memset_like
# ALL: vector.body

# COM: Memory
# ALL-LABEL: @julia_memset_like
# ALL: vector.body
# COM: Stores 1.0 into every index of `mem` produced by `eachindex(mem)`.
# COM: Keep this as a plain indexed store loop: the checks above expect the
# COM: code emitted for this function to contain a `vector.body` block.
function memset_like(mem)
for idx in eachindex(mem)
mem[idx] = 1.0
end
end

emit(iterate_read, Vector{Int64})
emit(iterate_write, Vector{Int64}, Vector{Int64})
Expand All @@ -150,3 +162,6 @@ emit(sumloop, Int64)
emit(simd_sumloop, Float32)

emit(loopedlength, Vector{Int64})

emit(memset_like, Vector{Float64})
emit(memset_like, Memory{Float64})
13 changes: 3 additions & 10 deletions test/llvmpasses/pipeline-prints.ll
Original file line number Diff line number Diff line change
Expand Up @@ -285,25 +285,18 @@ attributes #2 = { inaccessiblemem_or_argmemonly }

; COM: InstSimplify/InstCombine should kill this zext-trunc pair
; AFTEREARLYSIMPLIFICATION: [[ZEXT:%.*]] = zext i1 {{%.*}} to i8
; AFTEREARLYSIMPLIFICATION-NEXT: trunc i8 [[ZEXT]] to i1

; BEFOREEARLYOPTIMIZATION: [[ZEXT:%.*]] = zext i1 {{%.*}} to i8
; BEFOREEARLYOPTIMIZATION-NEXT: trunc i8 [[ZEXT]] to i1

; AFTEREARLYOPTIMIZATION-NOT: zext i1 {{%.*}} to i8
; AFTEREARLYOPTIMIZATION-NOT: trunc i8 {{%.*}} to i1

; BEFORELOOPOPTIMIZATION-NOT: zext i1 {{%.*}} to i8
; BEFORELOOPOPTIMIZATION-NOT: trunc i8 {{%.*}} to i1

; COM: Loop simplification makes the exit condition obvious
; AFTERLOOPSIMPLIFICATION: L35.lr.ph:
; AFTERLOOPSIMPLIFICATION: add nuw nsw

; COM: Scalar optimization removes the previous add from the preheader
; AFTERSCALAROPTIMIZATION: L35.lr.ph:
; AFTERSCALAROPTIMIZATION-NOT: add nuw nsw
; AFTERSCALAROPTIMIZATION: br label %L35
; COM: Scalar optimization removes the preheader
; AFTERSCALAROPTIMIZATION: L17:
; AFTERSCALAROPTIMIZATION: icmp eq i64 {{%.*}}, 1,

; COM: Vectorization does stuff
; AFTERVECTORIZATION: vector.body
Expand Down

0 comments on commit 58ce713

Please sign in to comment.