-
-
Notifications
You must be signed in to change notification settings - Fork 117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf: simd neon #133
perf: simd neon #133
Conversation
ASM of vectorized offsetzIn context of .section __TEXT,__text,regular,pure_instructions
.globl httparse::simd::neon::match_uri_vectored
.p2align 2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
.cfi_startproc
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
mov x8, x0
ldr x0, [x0, #16]
ldp x9, x1, [x8]
movi.16b v0, #129
movi.16b v1, #162
movi.16b v2, #253
movi.16b v3, #60
Lloh22:
adrp x10, lCPI14_0@PAGE
Lloh23:
ldr q4, [x10, lCPI14_0@PAGEOFF]
mov w10, #16
LBB14_1:
subs x11, x1, x0
b.lo LBB14_5
cmp x11, #15
b.ls LBB14_4
ldr q5, [x9, x0]
add.16b v6, v5, v0
cmhi.16b v6, v1, v6
and.16b v5, v5, v2
cmeq.16b v5, v5, v3
orr.16b v5, v6, v5
and.16b v5, v5, v4
umaxv.16b b5, v5
fmov w11, s5
sub w11, w10, w11, uxtb
add x0, x11, x0
str x0, [x8, #16]
cmp w11, #16
b.eq LBB14_1
LBB14_4:
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
LBB14_5:
.cfi_restore_state
Lloh24:
adrp x2, l___unnamed_15@PAGE
Lloh25:
add x2, x2, l___unnamed_15@PAGEOFF
bl core::slice::index::slice_start_index_len_fail
.loh AdrpLdr Lloh22, Lloh23
.loh AdrpAdd Lloh24, Lloh25 ASM of unvectorized offsetz .globl httparse::simd::neon::match_uri_vectored
.p2align 2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
.cfi_startproc
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
ldp x1, x8, [x0, #8]
ldr x9, [x0]
movi.16b v0, #129
movi.16b v1, #162
movi.16b v2, #253
movi.16b v3, #60
b LBB14_2
mov w10, #16
add x8, x10, x8
str x8, [x0, #16]
cmp x10, #16
b.ne LBB14_42
LBB14_2:
subs x10, x1, x8
b.lo LBB14_43
cmp x10, #15
b.ls LBB14_42
ldr q4, [x9, x8]
add.16b v5, v4, v0
cmhi.16b v5, v1, v5
and.16b v4, v4, v2
cmeq.16b v4, v4, v3
orr.16b v4, v5, v4
fmov x10, d4
cbnz x10, LBB14_24
mov.d x10, v4[1]
cbz x10, LBB14_1
tst x10, #0xff
b.eq LBB14_8
mov x10, #0
b LBB14_23
LBB14_8:
tst x10, #0xff00
b.eq LBB14_10
mov w10, #1
b LBB14_23
LBB14_10:
tst x10, #0xff0000
b.eq LBB14_12
mov w10, #2
b LBB14_23
LBB14_12:
tst x10, #0xff000000
b.eq LBB14_14
mov w10, #3
b LBB14_23
LBB14_14:
tst x10, #0xff00000000
b.eq LBB14_16
mov w10, #4
b LBB14_23
LBB14_16:
tst x10, #0xff0000000000
b.eq LBB14_18
mov w10, #5
b LBB14_23
LBB14_18:
tst x10, #0xff000000000000
b.eq LBB14_20
mov w10, #6
b LBB14_23
LBB14_20:
lsr x10, x10, #56
cbnz x10, LBB14_22
mov w10, #8
b LBB14_23
mov w10, #7
LBB14_23:
add x10, x10, #8
add x8, x10, x8
str x8, [x0, #16]
cmp x10, #16
b.eq LBB14_2
b LBB14_42
tst x10, #0xff
b.eq LBB14_26
mov x9, #0
b LBB14_41
LBB14_26:
tst x10, #0xff00
b.eq LBB14_28
mov w9, #1
b LBB14_41
LBB14_28:
tst x10, #0xff0000
b.eq LBB14_30
mov w9, #2
b LBB14_41
LBB14_30:
tst x10, #0xff000000
b.eq LBB14_32
mov w9, #3
b LBB14_41
LBB14_32:
tst x10, #0xff00000000
b.eq LBB14_34
mov w9, #4
b LBB14_41
LBB14_34:
tst x10, #0xff0000000000
b.eq LBB14_36
mov w9, #5
b LBB14_41
LBB14_36:
tst x10, #0xff000000000000
b.eq LBB14_38
mov w9, #6
b LBB14_41
LBB14_38:
lsr x9, x10, #56
cbnz x9, LBB14_40
mov w9, #8
b LBB14_41
mov w9, #7
LBB14_41:
add x8, x9, x8
str x8, [x0, #16]
LBB14_42:
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
LBB14_43:
.cfi_restore_state
Lloh22:
adrp x2, l___unnamed_15@PAGE
Lloh23:
add x2, x2, l___unnamed_15@PAGEOFF
mov x0, x8
bl core::slice::index::slice_start_index_len_fail
.loh AdrpAdd Lloh22, Lloh23 |
@seanmonstar I've resolved the conflicts, this should be good to land it's not perfect but it's a solid first pass, a few specifics to revisit later. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.
I've added a CI job to run |
First pass at neon support, building off seanmonstar#132
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right on!
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
I accidentally typed `std::mem` out of habit, this wasn't caught in #133 because `neon+no_std` isn't exercised
First pass at neon support, building off #132
I shared some promising benches here: #123 (comment)
Overall it's a net-positive but not totally optimal.
Next steps we would want to land #123 and then refactor it into a SWAR SIMD backend and stack them appropriately to avoid duplicated work and cleaner logic.
Additionally, offsetz is very much central to total perf, I believe the vectorized implementation I added underperforms due to inefficiently reloading the offset lookup table instead of keeping it a register at the top of the header parsing loop.