-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Experiment] Aggregate layout optimization #91507
Conversation
@bors try @rust-timer queue |
Awaiting bors try build completion. @rustbot label: +S-waiting-on-perf |
⌛ Trying commit 9410686 with merge d27f2514c2c2ab7f14562f3318cf9129f3315fbf... |
☀️ Try build successful - checks-actions |
Queued d27f2514c2c2ab7f14562f3318cf9129f3315fbf with parent 532d2b1, future comparison URL. |
Finished benchmarking commit (d27f2514c2c2ab7f14562f3318cf9129f3315fbf): comparison url. Summary: This change led to very large relevant mixed results 🤷 in compiler performance.
If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf. Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf. Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @bors rollup=never |
…_eq" This reverts commit 644b292.
All the regressions appear to be in […]. I've pushed a revert of my partial revert of the […]. The question is now whether this might be acceptable. @jyn514, an opinion?
Rust:
pub fn sum_f32_big_8(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
let mut c = [0.0; 8];
for i in 0..8 {
c[i] = a[i] + b[i];
}
c
}
pub fn sum_f32_big_16(a: [f32; 16], b: [f32; 16]) -> [f32; 16] {
let mut c = [0.0; 16];
for i in 0..16 {
c[i] = a[i] + b[i];
}
c
}
This PR:
sum_f32_big_8:
mov rax, rdi
movups xmm0, xmmword ptr [rsi]
movups xmm1, xmmword ptr [rdx]
addps xmm1, xmm0
movups xmmword ptr [rdi], xmm1
movups xmm0, xmmword ptr [rsi + 16]
movups xmm1, xmmword ptr [rdx + 16]
addps xmm1, xmm0
movups xmmword ptr [rdi + 16], xmm1
ret
sum_f32_big_16:
mov rax, rdi
movups xmm0, xmmword ptr [rsi]
movups xmm1, xmmword ptr [rdx]
addps xmm1, xmm0
movups xmmword ptr [rdi], xmm1
movups xmm0, xmmword ptr [rsi + 16]
movups xmm1, xmmword ptr [rdx + 16]
addps xmm1, xmm0
movups xmmword ptr [rdi + 16], xmm1
movups xmm0, xmmword ptr [rsi + 32]
movups xmm1, xmmword ptr [rdx + 32]
addps xmm1, xmm0
movups xmmword ptr [rdi + 32], xmm1
movups xmm0, xmmword ptr [rsi + 48]
movups xmm1, xmmword ptr [rdx + 48]
addps xmm1, xmm0
movups xmmword ptr [rdi + 48], xmm1
ret
Nightly:
example::sum_f32_big_8:
mov rax, rdi
movss xmm0, dword ptr [rsi]
addss xmm0, dword ptr [rdx]
movss dword ptr [rdi], xmm0
movss xmm0, dword ptr [rsi + 4]
addss xmm0, dword ptr [rdx + 4]
movss dword ptr [rdi + 4], xmm0
movss xmm0, dword ptr [rsi + 8]
addss xmm0, dword ptr [rdx + 8]
movss dword ptr [rdi + 8], xmm0
movss xmm0, dword ptr [rsi + 12]
addss xmm0, dword ptr [rdx + 12]
movss dword ptr [rdi + 12], xmm0
movss xmm0, dword ptr [rsi + 16]
addss xmm0, dword ptr [rdx + 16]
movss dword ptr [rdi + 16], xmm0
movss xmm0, dword ptr [rsi + 20]
addss xmm0, dword ptr [rdx + 20]
movss dword ptr [rdi + 20], xmm0
movss xmm0, dword ptr [rsi + 24]
addss xmm0, dword ptr [rdx + 24]
movss dword ptr [rdi + 24], xmm0
movss xmm0, dword ptr [rsi + 28]
addss xmm0, dword ptr [rdx + 28]
movss dword ptr [rdi + 28], xmm0
ret
example::sum_f32_big_16:
mov rax, rdi
movss xmm0, dword ptr [rsi]
addss xmm0, dword ptr [rdx]
movss dword ptr [rdi], xmm0
movss xmm0, dword ptr [rsi + 4]
addss xmm0, dword ptr [rdx + 4]
movss dword ptr [rdi + 4], xmm0
movss xmm0, dword ptr [rsi + 8]
addss xmm0, dword ptr [rdx + 8]
movss dword ptr [rdi + 8], xmm0
movss xmm0, dword ptr [rsi + 12]
addss xmm0, dword ptr [rdx + 12]
movss dword ptr [rdi + 12], xmm0
movss xmm0, dword ptr [rsi + 16]
addss xmm0, dword ptr [rdx + 16]
movss dword ptr [rdi + 16], xmm0
movss xmm0, dword ptr [rsi + 20]
addss xmm0, dword ptr [rdx + 20]
movss dword ptr [rdi + 20], xmm0
movss xmm0, dword ptr [rsi + 24]
addss xmm0, dword ptr [rdx + 24]
movss dword ptr [rdi + 24], xmm0
movss xmm0, dword ptr [rsi + 28]
addss xmm0, dword ptr [rdx + 28]
movss dword ptr [rdi + 28], xmm0
movss xmm0, dword ptr [rsi + 32]
addss xmm0, dword ptr [rdx + 32]
movss dword ptr [rdi + 32], xmm0
movss xmm0, dword ptr [rsi + 36]
addss xmm0, dword ptr [rdx + 36]
movss dword ptr [rdi + 36], xmm0
movss xmm0, dword ptr [rsi + 40]
addss xmm0, dword ptr [rdx + 40]
movss dword ptr [rdi + 40], xmm0
movss xmm0, dword ptr [rsi + 44]
addss xmm0, dword ptr [rdx + 44]
movss dword ptr [rdi + 44], xmm0
movss xmm0, dword ptr [rsi + 48]
addss xmm0, dword ptr [rdx + 48]
movss dword ptr [rdi + 48], xmm0
movss xmm0, dword ptr [rsi + 52]
addss xmm0, dword ptr [rdx + 52]
movss dword ptr [rdi + 52], xmm0
movss xmm0, dword ptr [rsi + 56]
addss xmm0, dword ptr [rdx + 56]
movss dword ptr [rdi + 56], xmm0
movss xmm0, dword ptr [rsi + 60]
addss xmm0, dword ptr [rdx + 60]
movss dword ptr [rdi + 60], xmm0
ret |
@bors try @rust-timer queue I don't know much about codegen, sorry. Maybe ask someone on T-compiler. |
Awaiting bors try build completion. @rustbot label: +S-waiting-on-perf |
⌛ Trying commit fea8369 with merge 0adc4d3d46c3f04f1372220679a65f2f1c74297b... |
@bors try @rust-timer queue |
Awaiting bors try build completion. @rustbot label: +S-waiting-on-perf |
⌛ Trying commit fea8369 with merge 065bdb7fe5f757cfb1fbb650c3949635b438bb41... |
☀️ Try build successful - checks-actions |
Queued 065bdb7fe5f757cfb1fbb650c3949635b438bb41 with parent efec545, future comparison URL. |
Finished benchmarking commit (065bdb7fe5f757cfb1fbb650c3949635b438bb41): comparison url. Summary: This change led to very large relevant mixed results 🤷 in compiler performance.
If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf. Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf. Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @bors rollup=never |
The perf didn't improve and it seems clear that the cost is way too high to be acceptable. |
@@ -1,6 +1,7 @@ | |||
// compile-flags: -O | |||
// only-x86_64 | |||
// ignore-debug: the debug assertions get in the way | |||
// ignore-test for now (this is just to get CI happy) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You probably figured this out already, but as the person who added a bunch of these codegen tests, the `i48` part is usually incidental, just there as the easiest way to accurately check what it wanted to confirm. If a different encoding of the rust types into llvm types is better, I'm all for it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I did see that. What this PR did was to always use the same type (no more transformations like `[f32; 4]` -> `u128`) so that using them in/with a function didn't require extra steps, which works well for generating better assembly, but at the cost of way too much perf regression.
If you have an idea to reduce them, I'm all in.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I must also say that the transformation was only done to small types (max 128 bits in x86_64), this PR wouldn't have changed anything for them.
Try to lower the threshold for indirect aggregate and remove cast to integer in order to have better code generation.
This pull request is a DRAFT and is my (dummy? hopefully not) attempt to fix #91447.
Could someone start a perf run so that I can find out whether this needs more thinking or not?
Rust code:
Nightly:
This PR:
r? @ghost