-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
MSVC optimizations for count_digits #1890
Conversation
Changed the clz implementations to use xor instead of subtraction so that when count_digits "undoes" the BSR -> CLZ translation, the optimizer is more willing to recognize the equivalence. Changed the data array in bsr2log10 to static since otherwise MSVC generates code to build the array every time the function is called.
@@ -901,7 +901,7 @@ template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data { | |||
// Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)). | |||
// This is a function instead of an array to workaround a bug in GCC10 (#1810). | |||
FMT_INLINE uint16_t bsr2log10(int bsr) { | |||
constexpr uint16_t data[] = { | |||
static constexpr uint16_t data[] = { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm worried that this is going to cause GCC to trip up in the same way that #1810 identified. Note that MSVC isn't the only compiler to do this silliness with the data array, GCC does, too: https://gcc.godbolt.org/z/nbffbs. clang is unaffected (only a difference in the mangling). So if it does break GCC, can this at least be static everywhere except for when FMT_GCC_VERSION == 0
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm worried that this is going to cause GCC to trip up in the same way that #1810 identified.
Could you check it on godbolt?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry. I thought I was unable to reproduce the original issue (obviously isolated since godbolt's been updated with a "fixed" fmtlib), so I thought maybe GCC 10 had been fixed.
Got it to repro, and, no: https://godbolt.org/z/Ps4fod GCC 10 won't issue the string-op-overflow warnings with the data array being static. bsr2log10
also no longer requires always_inline to avoid the warning, as well. Which is just trivia, as there's no reason to remove it.
Thanks for the PR. |
LGTM, merged.
Do you have a before/after assembly example demonstrating the effect of this change for future reference? |
Thank you. This is probably a subpar mechanism of conveying this, but it's a start at least. 64bit code first, for a 32bit and then a 64bit number. Followed by 32bit code for a 32bit and then a 64bit number. All are "in context" from, from this Lines 1909 to 1916 in 2591ab9
cdqe instructions are seemingly just there because the optimizer is, again, unaware of the value range of bsr .
x86-64:32bit number:Baseline:; int num_digits = count_digits(abs_value);
mov eax,esi
mov ecx,1Fh
or eax,1
xor edi,edi
bsr edx,eax
sub ecx,edx
xor ecx,1Fh
call fmt::v7::detail::bsr2log10 (07FF713D22FE5h)
;FMT_INLINE uint16_t bsr2log10(int bsr) {
push rbp
lea rbp,[rsp-57h]
sub rsp,90h
mov rax,qword ptr [__security_cookie (07FF713DC7288h)]
xor rax,rsp
mov qword ptr [rbp+47h],rax
movsxd rax,ecx
mov dword ptr [rbp-39h],10001h
mov dword ptr [rbp-35h],20001h
mov dword ptr [rbp-31h],20002h
mov dword ptr [rbp-2Dh],30003h
mov dword ptr [rbp-29h],40003h
mov dword ptr [rbp-25h],40004h
mov dword ptr [rbp-21h],50004h
mov dword ptr [rbp-1Dh],50005h
mov dword ptr [rbp-19h],60006h
mov dword ptr [rbp-15h],70006h
mov dword ptr [rbp-11h],70007h
mov dword ptr [rbp-0Dh],80007h
mov dword ptr [rbp-9],80008h
mov dword ptr [rbp-5],90009h
mov dword ptr [rbp-1],0A0009h
mov dword ptr [rbp+3],0A000Ah
mov dword ptr [rbp+7],0B000Ah
mov dword ptr [rbp+0Bh],0B000Bh
mov dword ptr [rbp+0Fh],0C000Ch
mov dword ptr [rbp+13h],0D000Ch
mov dword ptr [rbp+17h],0D000Dh
mov dword ptr [rbp+1Bh],0E000Dh
mov dword ptr [rbp+1Fh],0E000Eh
mov dword ptr [rbp+23h],0F000Fh
mov dword ptr [rbp+27h],10000Fh
mov dword ptr [rbp+2Bh],100010h
mov dword ptr [rbp+2Fh],110010h
mov dword ptr [rbp+33h],110011h
mov dword ptr [rbp+37h],120012h
mov dword ptr [rbp+3Bh],130012h
mov dword ptr [rbp+3Fh],130013h
mov dword ptr [rbp+43h],140013h
movzx eax,word ptr [rbp+rax*2-39h]
mov rcx,qword ptr [rbp+47h]
xor rcx,rsp
call __security_check_cookie (07FF713D217C6h)
add rsp,90h
pop rbp
ret Static data:; int num_digits = count_digits(abs_value);
xor r8d,r8d
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
movzx r15d,sil
; int num_digits = count_digits(abs_value);
lea r9,[__ImageBase (07FF697110000h)]
mov eax,edi
or eax,1
bsr ecx,eax
mov eax,1Fh
sub eax,ecx
movsxd rcx,eax
xor rcx,1Fh
movzx edx,word ptr [r9+rcx*2+8AE40h]
mov ebp,edx
cmp edi,dword ptr [r9+rdx*4+89180h]
setb r8b
mov ecx,r8d
sub ebp,r8d Static data + xor:; int num_digits = count_digits(abs_value);
xor r8d,r8d
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
movzx r15d,sil
; int num_digits = count_digits(abs_value);
lea rcx,[__ImageBase (07FF6C5390000h)]
mov eax,edi
or eax,1
bsr eax,eax
cdqe
movzx edx,word ptr [rcx+rax*2+8AE40h]
mov ebp,edx
cmp edi,dword ptr [rcx+rdx*4+89180h]
setb r8b
mov ecx,r8d
sub ebp,r8d 64bit number:Baseline:; int num_digits = count_digits(abs_value);
mov ecx,3Fh
mov rax,rsi
or rax,1
xor edi,edi
bsr rdx,rax
sub ecx,edx
xor ecx,3Fh
call fmt::v7::detail::bsr2log10 (07FF713D22FE5h)
;FMT_INLINE uint16_t bsr2log10(int bsr) {
push rbp
lea rbp,[rsp-57h]
sub rsp,90h
mov rax,qword ptr [__security_cookie (07FF713DC7288h)]
xor rax,rsp
mov qword ptr [rbp+47h],rax
movsxd rax,ecx
mov dword ptr [rbp-39h],10001h
mov dword ptr [rbp-35h],20001h
mov dword ptr [rbp-31h],20002h
mov dword ptr [rbp-2Dh],30003h
mov dword ptr [rbp-29h],40003h
mov dword ptr [rbp-25h],40004h
mov dword ptr [rbp-21h],50004h
mov dword ptr [rbp-1Dh],50005h
mov dword ptr [rbp-19h],60006h
mov dword ptr [rbp-15h],70006h
mov dword ptr [rbp-11h],70007h
mov dword ptr [rbp-0Dh],80007h
mov dword ptr [rbp-9],80008h
mov dword ptr [rbp-5],90009h
mov dword ptr [rbp-1],0A0009h
mov dword ptr [rbp+3],0A000Ah
mov dword ptr [rbp+7],0B000Ah
mov dword ptr [rbp+0Bh],0B000Bh
mov dword ptr [rbp+0Fh],0C000Ch
mov dword ptr [rbp+13h],0D000Ch
mov dword ptr [rbp+17h],0D000Dh
mov dword ptr [rbp+1Bh],0E000Dh
mov dword ptr [rbp+1Fh],0E000Eh
mov dword ptr [rbp+23h],0F000Fh
mov dword ptr [rbp+27h],10000Fh
mov dword ptr [rbp+2Bh],100010h
mov dword ptr [rbp+2Fh],110010h
mov dword ptr [rbp+33h],110011h
mov dword ptr [rbp+37h],120012h
mov dword ptr [rbp+3Bh],130012h
mov dword ptr [rbp+3Fh],130013h
mov dword ptr [rbp+43h],140013h
movzx eax,word ptr [rbp+rax*2-39h]
mov rcx,qword ptr [rbp+47h]
xor rcx,rsp
call __security_check_cookie (07FF713D217C6h)
add rsp,90h
pop rbp
ret Static data:; int num_digits = count_digits(abs_value);
xor r8d,r8d
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
movzx r15d,sil
; int num_digits = count_digits(abs_value);
lea r9,[__ImageBase (07FF697110000h)]
mov rax,rdi
or rax,1
bsr rcx,rax
mov eax,3Fh
sub eax,ecx
movsxd rcx,eax
xor rcx,3Fh
movzx edx,word ptr [r9+rcx*2+8AE40h]
mov ebp,edx
cmp rdi,qword ptr [r9+rdx*8+891C0h]
setb r8b
mov ecx,r8d
sub ebp,r8d Static data + xor:; int num_digits = count_digits(abs_value);
xor r8d,r8d
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
movzx r15d,sil
lea rcx,[__ImageBase (07FF6C5390000h)]
mov rax,rdi
or rax,1
bsr rax,rax
cdqe
movzx edx,word ptr [rcx+rax*2+8AE40h]
mov ebp,edx
cmp rdi,qword ptr [rcx+rdx*8+891C0h]
setb r8b
mov ecx,r8d
sub ebp,r8d x86-32:32bit number:Baseline:; int num_digits = count_digits(abs_value);
push esi
call fmt::v7::detail::count_digits
add esp,4
mov ecx,eax
;inline int count_digits(uint32_t n) {
push ebp
mov ebp,esp
sub esp,88h
mov eax,dword ptr [__security_cookie (0272244h)]
xor eax,ebp
mov dword ptr [ebp-4],eax
mov edx,dword ptr [n]
mov eax,edx
or eax,1
mov dword ptr [ebp-84h],10001h
bsr ecx,eax
mov eax,1Fh
mov dword ptr [ebp-80h],20001h
sub eax,ecx
mov dword ptr [ebp-7Ch],20002h
xor eax,1Fh
mov dword ptr [ebp-78h],30003h
mov dword ptr [ebp-74h],40003h
mov dword ptr [ebp-70h],40004h
mov dword ptr [ebp-6Ch],50004h
mov dword ptr [ebp-68h],50005h
mov dword ptr [ebp-64h],60006h
mov dword ptr [ebp-60h],70006h
mov dword ptr [ebp-5Ch],70007h
mov dword ptr [ebp-58h],80007h
mov dword ptr [ebp-54h],80008h
mov dword ptr [ebp-50h],90009h
mov dword ptr [ebp-4Ch],0A0009h
mov dword ptr [ebp-48h],0A000Ah
mov dword ptr [ebp-44h],0B000Ah
mov dword ptr [ebp-40h],0B000Bh
mov dword ptr [ebp-3Ch],0C000Ch
mov dword ptr [ebp-38h],0D000Ch
mov dword ptr [ebp-34h],0D000Dh
mov dword ptr [ebp-30h],0E000Dh
mov dword ptr [ebp-2Ch],0E000Eh
mov dword ptr [ebp-28h],0F000Fh
mov dword ptr [ebp-24h],10000Fh
mov dword ptr [ebp-20h],100010h
mov dword ptr [ebp-1Ch],110010h
mov dword ptr [ebp-18h],110011h
mov dword ptr [ebp-14h],120012h
mov dword ptr [ebp-10h],130012h
mov dword ptr [ebp-0Ch],130013h
mov dword ptr [ebp-8],140013h
movzx eax,word ptr [ebp+eax*2-84h]
mov dword ptr [ebp-88h],0
cmp edx,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_32 (025D080h)[eax*4]
sbb ecx,ecx
neg ecx
sub eax,ecx
mov ecx,dword ptr [ebp-4]
xor ecx,ebp
call @__security_check_cookie@4
mov esp,ebp
pop ebp
ret Static data:; int num_digits = count_digits(abs_value);
mov eax,edi
; auto it = reserve(out, size);
mov esi,dword ptr [out]
; int num_digits = count_digits(abs_value);
or eax,1
mov dword ptr [ebp-1Ch],0
bsr eax,eax
mov ecx,1Fh
sub ecx,eax
xor ecx,1Fh
movzx ecx,word ptr `fmt::v7::detail::bsr2log10'::`2'::data (0A1968h)[ecx*2]
cmp edi,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_32 (09D080h)[ecx*4]
sbb eax,eax
neg eax
sub ecx,eax
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
movzx eax,bl
add eax,ecx
; int num_digits = count_digits(abs_value);
mov dword ptr [ebp-24h],ecx Static data + xor:; int num_digits = count_digits(abs_value);
mov eax,edi
; auto it = reserve(out, size);
mov esi,dword ptr [out]
; int num_digits = count_digits(abs_value);
or eax,1
mov dword ptr [ebp-1Ch],0
bsr eax,eax
movzx ecx,word ptr `fmt::v7::detail::bsr2log10'::`2'::data (05E1968h)[eax*2]
cmp edi,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_32 (05DD080h)[ecx*4]
sbb eax,eax
neg eax
sub ecx,eax
; auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
add eax,ecx
; int num_digits = count_digits(abs_value);
mov dword ptr [ebp-24h],ecx 64bit number:Baseline:; int num_digits = count_digits(abs_value);
push ebx
push edi
call fmt::v7::detail::count_digits
;inline int count_digits(uint64_t n) {
push ebp
mov ebp,esp
sub esp,88h
mov eax,dword ptr [__security_cookie (0D52244h)]
xor eax,ebp
mov dword ptr [ebp-4],eax
mov edx,dword ptr [n]
mov eax,edx
push esi
push edi
mov edi,dword ptr [ebp+0Ch]
or eax,1
bsr esi,edi
mov dword ptr [ebp-88h],0
je fmt::v7::detail::count_digits+38h (0CD84C8h)
mov ecx,1Fh
sub ecx,esi
jmp fmt::v7::detail::count_digits+42h (0CD84D2h)
bsr eax,eax
mov ecx,3Fh
sub ecx,eax
xor ecx,3Fh
mov dword ptr [ebp-84h],10001h
mov dword ptr [ebp-80h],20001h
mov dword ptr [ebp-7Ch],20002h
mov dword ptr [ebp-78h],30003h
mov dword ptr [ebp-74h],40003h
mov dword ptr [ebp-70h],40004h
mov dword ptr [ebp-6Ch],50004h
mov dword ptr [ebp-68h],50005h
mov dword ptr [ebp-64h],60006h
mov dword ptr [ebp-60h],70006h
mov dword ptr [ebp-5Ch],70007h
mov dword ptr [ebp-58h],80007h
mov dword ptr [ebp-54h],80008h
mov dword ptr [ebp-50h],90009h
mov dword ptr [ebp-4Ch],0A0009h
mov dword ptr [ebp-48h],0A000Ah
mov dword ptr [ebp-44h],0B000Ah
mov dword ptr [ebp-40h],0B000Bh
mov dword ptr [ebp-3Ch],0C000Ch
mov dword ptr [ebp-38h],0D000Ch
mov dword ptr [ebp-34h],0D000Dh
mov dword ptr [ebp-30h],0E000Dh
mov dword ptr [ebp-2Ch],0E000Eh
mov dword ptr [ebp-28h],0F000Fh
mov dword ptr [ebp-24h],10000Fh
mov dword ptr [ebp-20h],100010h
mov dword ptr [ebp-1Ch],110010h
mov dword ptr [ebp-18h],110011h
mov dword ptr [ebp-14h],120012h
mov dword ptr [ebp-10h],130012h
mov dword ptr [ebp-0Ch],130013h
mov dword ptr [ebp-8],140013h
movzx eax,word ptr [ebp+ecx*2-84h]
cmp edi,dword ptr [eax*8+0D3D0BCh]
pop edi
pop esi
ja fmt::v7::detail::count_digits+15Bh (0CD85EBh)
jb fmt::v7::detail::count_digits+146h (0CD85D6h)
cmp edx,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_64 (0D3D0B8h)[eax*8]
jae fmt::v7::detail::count_digits+15Bh (0CD85EBh)
mov ecx,1
sub eax,ecx
mov ecx,dword ptr [ebp-4]
xor ecx,ebp
call @__security_check_cookie@4 (0CD21B2h)
mov esp,ebp
pop ebp
ret
xor ecx,ecx
sub eax,ecx
mov ecx,dword ptr [ebp-4]
xor ecx,ebp
call @__security_check_cookie@4 (0CD21B2h)
mov esp,ebp
pop ebp
ret Static data:; int num_digits = count_digits(abs_value);
mov eax,edi
mov byte ptr [ebp-2Dh],dl
or eax,1
mov dword ptr [ebp-34h],0
bsr esi,ebx
je fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+56h (057DE6h)
mov ecx,1Fh
sub ecx,esi
jmp fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+60h (057DF0h)
bsr eax,eax
mov ecx,3Fh
sub ecx,eax
xor ecx,3Fh
movzx eax,word ptr `fmt::v7::detail::bsr2log10'::`2'::data (0A1968h)[ecx*2]
cmp ebx,dword ptr [eax*8+9D0BCh]
ja fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+86h (057E16h)
jb fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+7Fh (057E0Fh)
cmp edi,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_64 (09D0B8h)[eax*8]
jae fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+86h (057E16h)
mov ecx,1
jmp fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+88h (057E18h)
xor ecx,ecx Static data + xor:; int num_digits = count_digits(abs_value);
mov eax,edi
mov byte ptr [ebp-2Dh],cl
or eax,1
mov dword ptr [ebp-34h],0
bsr edx,ebx
je fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+52h (0597DE2h)
lea eax,[edx+20h]
jmp fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+55h (0597DE5h)
bsr eax,eax
xor eax,3Fh
xor eax,3Fh
movzx eax,word ptr `fmt::v7::detail::bsr2log10'::`2'::data (05E1968h)[eax*2]
cmp ebx,dword ptr [eax*8+5DD0BCh]
ja fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+7Eh (0597E0Eh)
jb fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+77h (0597E07h)
cmp edi,dword ptr fmt::v7::detail::basic_data<void>::zero_or_powers_of_10_64 (05DD0B8h)[eax*8]
jae fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+7Eh (0597E0Eh)
mov edx,1
jmp fmt::v7::detail::write<char,fmt::v7::detail::buffer_appender<char>,__int64,0>+80h (0597E10h)
xor edx,edx I don't know what this is about: xor eax,3Fh
xor eax,3Fh I'm sure the side effects on the flags are very important here. |
Thanks for the details! |
Changed the clz implementations to use xor instead of subtraction so that when
count_digits "undoes" the BSR -> CLZ translation, the optimizer is more
willing to recognize the equivalence.
Changed the data array in bsr2log10 to static since otherwise MSVC generates
code to build the array every time the function is called.
I agree that my contributions are licensed under the {fmt} license, and agree to future changes to the licensing.