Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve nop-match simplification #68481

Closed
wants to merge 8 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 160 additions & 33 deletions src/librustc_mir/transform/simplify_try.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::transform::{simplify, MirPass, MirSource};
use itertools::Itertools as _;
use rustc::mir::*;
use rustc::ty::{Ty, TyCtxt};
use rustc_index::vec::IndexVec;
use rustc_target::abi::VariantIdx;

/// Simplifies arms of form `Variant(x) => Variant(x)` to just a move.
Expand All @@ -25,6 +26,19 @@ use rustc_target::abi::VariantIdx;
/// discriminant(_LOCAL_0) = VAR_IDX;
/// ```
///
/// or
///
/// ```rust
/// StorageLive(_LOCAL_TMP);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate on why we should be doing this optimization in debug mode (and why we shouldn't instead run the storage and temporary elimination)? Will this make debug mode compile faster in all cases?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late reply!

AFAIK mir-opt-level is always 1 unless explicitly specified, regardless of release or debug build. mir-opt-level is independent from optimization level, and cargo does not set it. (-Zmir-optlevel is an unstable flag and setting it to 2 or larger is prone to ICE/miscompile anyway.)
You can confirm this by building MIR of this playground in release / debug build. Temporaries are not eliminated even in release build, and SimplifyArmIdentity is not working.

That said, the purpose of this PR is to improve the codegen of identical match in relase build.

diff of cargo rustc --release -- --emit=llvm-ir, using the same code as the playground above, before/after this PR
--- old.ll	2020-01-27 07:04:08.034979599 +0900
+++ new.ll	2020-01-27 07:04:21.682132255 +0900
@@ -1,5 +1,5 @@
-; ModuleID = 'foo.1lzogcxs-cgu.0'
-source_filename = "foo.1lzogcxs-cgu.0"
+; ModuleID = 'foo.2my2volq-cgu.0'
+source_filename = "foo.2my2volq-cgu.0"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -9,80 +9,35 @@
 %"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }
 
 ; foo::try_identity
-; Function Attrs: nofree norecurse nounwind nonlazybind uwtable
-define void @_ZN3foo12try_identity17hf7a3f301f9e3a548E(%"core::result::Result<u64, i32>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i32>"* noalias nocapture readonly dereferenceable(16) %x) unnamed_addr #0 {
+; Function Attrs: nounwind nonlazybind uwtable
+define void @_ZN3foo12try_identity17h273ad7d1ef012f0eE(%"core::result::Result<u64, i32>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i32>"* noalias nocapture readonly dereferenceable(16) %x) unnamed_addr #0 {
 start:
-  %1 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %x, i64 0, i32 0, i64 0
-  %2 = load i32, i32* %1, align 8, !range !2
-  %switch = icmp eq i32 %2, 1
-  br i1 %switch, label %bb3, label %bb2
-
-bb2:                                              ; preds = %start
-  %3 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %x, i64 0, i32 2, i64 1
-  %4 = bitcast i32* %3 to i64*
-  %x2 = load i64, i64* %4, align 8
-  %5 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %0, i64 0, i32 2, i64 1
-  %6 = bitcast i32* %5 to i64*
-  store i64 %x2, i64* %6, align 8
-  br label %bb4
-
-bb3:                                              ; preds = %start
-  %7 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %x, i64 0, i32 2, i64 0
-  %x1 = load i32, i32* %7, align 4
-  %8 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %0, i64 0, i32 2, i64 0
-  store i32 %x1, i32* %8, align 4
-  br label %bb4
-
-bb4:                                              ; preds = %bb2, %bb3
-  %.sink = phi i32 [ 0, %bb2 ], [ 1, %bb3 ]
-  %9 = getelementptr inbounds %"core::result::Result<u64, i32>", %"core::result::Result<u64, i32>"* %0, i64 0, i32 0, i64 0
-  store i32 %.sink, i32* %9, align 8
+  %1 = bitcast %"core::result::Result<u64, i32>"* %0 to i8*
+  %2 = bitcast %"core::result::Result<u64, i32>"* %x to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %1, i8* nonnull align 8 %2, i64 16, i1 false)
   ret void
 }
 
 ; foo::try_identity_drop
 ; Function Attrs: nounwind nonlazybind uwtable
-define void @_ZN3foo17try_identity_drop17h5d9ea5a8e7421c58E(%"core::result::Result<alloc::string::String, i32>"* noalias nocapture sret dereferenceable(32), %"core::result::Result<alloc::string::String, i32>"* noalias nocapture readonly dereferenceable(32) %x) unnamed_addr #1 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
+define void @_ZN3foo17try_identity_drop17h7d0574702d5fadd5E(%"core::result::Result<alloc::string::String, i32>"* noalias nocapture sret dereferenceable(32), %"core::result::Result<alloc::string::String, i32>"* noalias nocapture readonly dereferenceable(32) %x) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
 start:
-  %1 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %x, i64 0, i32 0, i64 0
-  %2 = load i32, i32* %1, align 8, !range !2
-  %switch = icmp eq i32 %2, 1
-  br i1 %switch, label %bb8, label %bb5
-
-bb4:                                              ; preds = %bb5, %bb8
-  %.sink = phi i32 [ 0, %bb5 ], [ 1, %bb8 ]
-  %3 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %0, i64 0, i32 0, i64 0
-  store i32 %.sink, i32* %3, align 8
+  %1 = bitcast %"core::result::Result<alloc::string::String, i32>"* %0 to i8*
+  %2 = bitcast %"core::result::Result<alloc::string::String, i32>"* %x to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %1, i8* nonnull align 8 %2, i64 32, i1 false)
   ret void
-
-bb5:                                              ; preds = %start
-  %x1.sroa.0.0..sroa_idx3 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %x, i64 0, i32 2, i64 1
-  %x1.sroa.0.0..sroa_cast = bitcast i32* %x1.sroa.0.0..sroa_idx3 to i8*
-  %_4.sroa.0.0..sroa_idx10 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %0, i64 0, i32 2, i64 1
-  %_4.sroa.0.0..sroa_cast = bitcast i32* %_4.sroa.0.0..sroa_idx10 to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %_4.sroa.0.0..sroa_cast, i8* nonnull align 8 %x1.sroa.0.0..sroa_cast, i64 24, i1 false)
-  br label %bb4
-
-bb8:                                              ; preds = %start
-  %4 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %x, i64 0, i32 2, i64 0
-  %x2 = load i32, i32* %4, align 4
-  %5 = getelementptr inbounds %"core::result::Result<alloc::string::String, i32>", %"core::result::Result<alloc::string::String, i32>"* %0, i64 0, i32 2, i64 0
-  store i32 %x2, i32* %5, align 4
-  br label %bb4
 }
 
 ; Function Attrs: nounwind nonlazybind uwtable
-declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #1
+declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #0
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #1
 
-attributes #0 = { nofree norecurse nounwind nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
-attributes #1 = { nounwind nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
-attributes #2 = { argmemonly nounwind }
+attributes #0 = { nounwind nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
+attributes #1 = { argmemonly nounwind }
 
 !llvm.module.flags = !{!0, !1}
 
 !0 = !{i32 7, !"PIC Level", i32 2}
 !1 = !{i32 2, !"RtLibUseGOT", i32 1}
-!2 = !{i32 0, i32 2}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the compiler performance: I locally run rustc-perf, and it looks perf-neutral. The 2nd point in #66234 (comment) is still required for these optimizations to be useful in real cases.

Improved SimplifyArmIdentity

new1

Improved SimplifyArmIdentity + SinkCommonCodeFromPredecessors

new2

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So there's two things I'm worried about here:

  1. we're slowly adding more optimizations to debug mode, which by it's very definition should be easy to debug with a debugger. Such optimizations can easily end up messing up the source. This one doesn't do it yet, but still.

  2. You essentially had to implement an optimization twice, once for mir-opt-level 1 and once for mir-opt-level 2. I feel like that needlessly complicates our optimizations.

Instead of implementing this optimization twice, how about moving the ICEing optimizations to mir-opt-level 3 and making mir-opt-level 2 the standard for --release?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we're slowly adding more optimizations to debug mode, which by it's very definition should be easy to debug with a debugger. Such optimizations can easily end up messing up the source.

I realized this PR is already debuginfo-breaking. If the debugger was stepping on match, it would point to whichever arm the optimizer reduced all other arms into.

Instead of implementing this optimization twice, how about moving the ICEing optimizations to mir-opt-level 3 and making mir-opt-level 2 the standard for --release?

Your plan sounds right. Efforts should be concentrated on fixing/improving advanced MIR optimizations rather than adding ad-hoc optimization passes.

I'll close this PR. Thank you for your time, @oli-obk!

/// _LOCAL_TMP = ((_LOCAL_1 as Variant ).FIELD: TY );
/// StorageLive(_LOCAL_TMP_2);
/// _LOCAL_TMP_2 = _LOCAL_TMP
/// ((_LOCAL_0 as Variant).FIELD: TY) = move _LOCAL_TMP_2;
/// discriminant(_LOCAL_0) = VAR_IDX;
/// StorageDead(_LOCAL_TMP_2);
/// StorageDead(_LOCAL_TMP);
/// ```
///
/// into:
///
/// ```rust
Expand All @@ -36,44 +50,157 @@ impl<'tcx> MirPass<'tcx> for SimplifyArmIdentity {
fn run_pass(&self, _: TyCtxt<'tcx>, _: MirSource<'tcx>, body: &mut BodyAndCache<'tcx>) {
let (basic_blocks, local_decls) = body.basic_blocks_and_local_decls_mut();
for bb in basic_blocks {
// Need 3 statements:
let (s0, s1, s2) = match &mut *bb.statements {
[s0, s1, s2] => (s0, s1, s2),
_ => continue,
};

// Pattern match on the form we want:
let (local_tmp_s0, local_1, vf_s0) = match match_get_variant_field(s0) {
None => continue,
Some(x) => x,
};
let (local_tmp_s1, local_0, vf_s1) = match match_set_variant_field(s1) {
None => continue,
Some(x) => x,
};
if local_tmp_s0 != local_tmp_s1
// The field-and-variant information match up.
|| vf_s0 != vf_s1
// Source and target locals have the same type.
// FIXME(Centril | oli-obk): possibly relax to same layout?
|| local_decls[local_0].ty != local_decls[local_1].ty
// We're setting the discriminant of `local_0` to this variant.
|| Some((local_0, vf_s0.var_idx)) != match_set_discr(s2)
{
continue;
match &mut *bb.statements {
[s0, s1, s2] => match_copypropd_arm([s0, s1, s2], local_decls),
other => match_arm(other, local_decls),
}
}
}
}

/// Match on:
/// ```rust
/// _LOCAL_TMP = ((_LOCAL_1 as Variant ).FIELD: TY );
/// ((_LOCAL_0 as Variant).FIELD: TY) = move _LOCAL_TMP;
/// discriminant(_LOCAL_0) = VAR_IDX;
/// ```
fn match_copypropd_arm(
[s0, s1, s2]: [&mut Statement<'_>; 3],
local_decls: &mut IndexVec<Local, LocalDecl<'_>>,
) {
// Pattern match on the form we want:
let (local_tmp_s0, local_1, vf_s0) = match match_get_variant_field(s0) {
None => return,
Some(x) => x,
};
let (local_tmp_s1, local_0, vf_s1) = match match_set_variant_field(s1) {
None => return,
Some(x) => x,
};
if local_tmp_s0 != local_tmp_s1
// The field-and-variant information match up.
|| vf_s0 != vf_s1
// Source and target locals have the same type.
// FIXME(Centril | oli-obk): possibly relax to same layout?
|| local_decls[local_0].ty != local_decls[local_1].ty
// We're setting the discriminant of `local_0` to this variant.
|| Some((local_0, vf_s0.var_idx)) != match_set_discr(s2)
{
return;
}

// Right shape; transform!
match &mut s0.kind {
StatementKind::Assign(box (place, rvalue)) => {
*place = local_0.into();
*rvalue = Rvalue::Use(Operand::Move(local_1.into()));
}
_ => unreachable!(),
}
s1.make_nop();
s2.make_nop();
}

// Right shape; transform!
match &mut s0.kind {
StatementKind::Assign(box (place, rvalue)) => {
*place = local_0.into();
*rvalue = Rvalue::Use(Operand::Move(local_1.into()));
/// Match on:
/// ```rust
/// StorageLive(_LOCAL_TMP);
/// _LOCAL_TMP = ((_LOCAL_1 as Variant ).FIELD: TY );
/// StorageLive(_LOCAL_TMP_2);
/// _LOCAL_TMP_2 = _LOCAL_TMP
/// ((_LOCAL_0 as Variant).FIELD: TY) = move _LOCAL_TMP_2;
/// discriminant(_LOCAL_0) = VAR_IDX;
/// StorageDead(_LOCAL_TMP_2);
/// StorageDead(_LOCAL_TMP);
/// ```
fn match_arm(stmts: &mut [Statement<'_>], local_decls: &mut IndexVec<Local, LocalDecl<'_>>) {
if stmts.len() != 8 {
return;
}

// StorageLive(_LOCAL_TMP);
let local_tmp_live = if let StatementKind::StorageLive(local_tmp_live) = &stmts[0].kind {
*local_tmp_live
} else {
return;
};

// _LOCAL_TMP = ((_LOCAL_1 as Variant ).FIELD: TY );
let (local_tmp, local_1, vf_1) = match match_get_variant_field(&stmts[1]) {
None => return,
Some(x) => x,
};

if local_tmp_live != local_tmp {
return;
}

// StorageLive(_LOCAL_TMP_2);
let local_tmp_2_live = if let StatementKind::StorageLive(local_tmp_2_live) = &stmts[2].kind {
*local_tmp_2_live
} else {
return;
};

// _LOCAL_TMP_2 = _LOCAL_TMP
if let StatementKind::Assign(box (lhs, Rvalue::Use(rhs))) = &stmts[3].kind {
let lhs = lhs.as_local();
if lhs != Some(local_tmp_2_live) {
return;
}
match rhs {
Operand::Copy(rhs) | Operand::Move(rhs) => {
if rhs.as_local() != Some(local_tmp) {
return;
}
_ => unreachable!(),
}
s1.make_nop();
s2.make_nop();
_ => return,
}
} else {
return;
}

// ((_LOCAL_0 as Variant).FIELD: TY) = move _LOCAL_TMP_2;
let (local_tmp_2, local_0, vf_0) = match match_set_variant_field(&stmts[4]) {
None => return,
Some(x) => x,
};

if local_tmp_2 != local_tmp_2_live {
return;
}

if vf_1 != vf_0 // The field-and-variant information match up.
// Source and target locals have the same type.
// FIXME(Centril | oli-obk): possibly relax to same layout?
|| local_decls[local_0].ty != local_decls[local_1].ty
// We're setting the discriminant of `local_0` to this variant.
|| Some((local_0, vf_1.var_idx)) != match_set_discr(&stmts[5])
{
return;
}

// StorageDead(_LOCAL_TMP_2);
// StorageDead(_LOCAL_TMP);
match (&stmts[6].kind, &stmts[7].kind) {
(
StatementKind::StorageDead(local_tmp_2_dead),
StatementKind::StorageDead(local_tmp_dead),
) => {
if *local_tmp_2_dead != local_tmp_2 || *local_tmp_dead != local_tmp {
return;
}
}
_ => return,
}

// Right shape; transform!
stmts[0].kind = StatementKind::Assign(Box::new((
local_0.into(),
Rvalue::Use(Operand::Move(local_1.into())),
)));

for s in &mut stmts[1..] {
s.make_nop();
}
}

Expand Down