Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: simd neon #133

Merged
merged 1 commit into from
Apr 25, 2023
Merged

perf: simd neon #133

merged 1 commit into from
Apr 25, 2023

Conversation

AaronO
Copy link
Contributor

@AaronO AaronO commented Apr 15, 2023

First pass at neon support, building off #132

I shared some promising benches here: #123 (comment)

Overall it's a net-positive but not totally optimal.

As next steps, we would want to land #123, then refactor it into a SWAR SIMD backend and stack the backends appropriately, to avoid duplicated work and get cleaner logic.

Additionally, offsetz is very much central to total perf. I believe the vectorized implementation I added underperforms because it inefficiently reloads the offset lookup table instead of keeping it in a register at the top of the header parsing loop.

@AaronO
Copy link
Contributor Author

AaronO commented Apr 15, 2023

ASM of vectorized offsetz

In context of match_uri_vectored

// httparse::simd::neon::match_uri_vectored -- listing built with the vectorized offsetz.
// Compiler-generated AArch64 code in Mach-O / Apple vector syntax.
//
// In:     x0 = pointer to a three-word record read below as
//              [x0]      base pointer of the byte buffer
//              [x0, #8]  length used for the bounds check
//              [x0, #16] current position (read, advanced, and written back)
//         NOTE(review): presumably httparse's byte cursor type -- confirm against neon.rs.
// Effect: consumes the buffer 16 bytes at a time, advancing the stored position,
//         and stops at the first block that contains a flagged byte or when fewer
//         than 16 bytes remain.
// Uses:   x0, x1, x8-x11, v0-v6, flags (x29/x30 are saved and restored).
.section __TEXT,__text,regular,pure_instructions
        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!       // push frame record (FP, LR)
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        mov x8, x0                      // x8 = record pointer (x0 is reused as the position)
        ldr x0, [x0, #16]               // x0 = current position
        ldp x9, x1, [x8]                // x9 = buffer base, x1 = length
        // Loop-invariant byte constants, splatted across all 16 lanes.
        movi.16b v0, #129               // 0x81: bias added before the range compare
        movi.16b v1, #162               // 0xA2: exclusive upper bound after the bias
        movi.16b v2, #253               // 0xFD: clears bit 1 so 0x3C and 0x3E compare equal
        movi.16b v3, #60                // 0x3C
Lloh22:
        adrp x10, lCPI14_0@PAGE
Lloh23:
        ldr q4, [x10, lCPI14_0@PAGEOFF] // v4 = 16-byte constant-pool table, loaded once before the loop
        mov w10, #16                    // w10 = block width in bytes
LBB14_1:
        // Loop head: x0 = position, x1 = length.
        subs x11, x1, x0                // x11 = length - position
        b.lo LBB14_5                    // position > length: out-of-range failure path
        cmp x11, #15
        b.ls LBB14_4                    // fewer than 16 bytes left: return
        ldr q5, [x9, x0]                // v5 = 16 input bytes at base + position
        add.16b v6, v5, v0              // v6 = byte + 0x81 (mod 256)
        cmhi.16b v6, v1, v6             // lane = 0xFF where byte <= 0x20 or byte >= 0x7F
        and.16b v5, v5, v2
        cmeq.16b v5, v5, v3             // lane = 0xFF where byte is 0x3C or 0x3E
        orr.16b v5, v6, v5              // v5 = combined per-lane mask of flagged bytes
        and.16b v5, v5, v4              // keep the table entry only in flagged lanes
        umaxv.16b b5, v5                // horizontal max of the surviving table entries
        fmov w11, s5
        sub w11, w10, w11, uxtb         // w11 = 16 - max; 16 when no lane was flagged
                                        // NOTE(review): presumably table lane i holds 16 - i, making
                                        // w11 the index of the first flagged byte -- table not shown.
        add x0, x11, x0                 // advance the position by w11 bytes
        str x0, [x8, #16]               // write the position back to the record
        cmp w11, #16
        b.eq LBB14_1                    // whole block consumed: try the next 16 bytes
LBB14_4:
        // Normal exit: pop the frame record and return.
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_5:
        // Failure path: x0 = position, x1 = length, x2 = address of l___unnamed_15.
        // NOTE(review): the callee is presumably a diverging panic helper, since
        // nothing follows the call -- confirm.
        .cfi_restore_state
Lloh24:
        adrp x2, l___unnamed_15@PAGE
Lloh25:
        add x2, x2, l___unnamed_15@PAGEOFF
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpLdr    Lloh22, Lloh23
        .loh AdrpAdd    Lloh24, Lloh25

ASM of unvectorized offsetz

        // httparse::simd::neon::match_uri_vectored -- listing built with the unvectorized offsetz.
        // Compiler-generated AArch64 code in Mach-O / Apple vector syntax.
        //
        // In:     x0 = pointer to a three-word record read below as
        //              [x0]      base pointer of the byte buffer
        //              [x0, #8]  length used for the bounds check
        //              [x0, #16] current position (read, advanced, and written back)
        //         NOTE(review): presumably httparse's byte cursor type -- confirm against neon.rs.
        // Effect: consumes the buffer 16 bytes at a time, advancing the stored position,
        //         and stops at the first block that contains a flagged byte or when fewer
        //         than 16 bytes remain.
        // Uses:   x0-x2, x8-x10, v0-v5, flags (x29/x30 are saved and restored).
        //
        // The labels LBB14_1, LBB14_22, LBB14_24 and LBB14_40 are branch targets in this
        // listing; each marks the block that directly follows an unconditional branch.
        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!       // push frame record (FP, LR)
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        ldp x1, x8, [x0, #8]            // x1 = length, x8 = current position
        ldr x9, [x0]                    // x9 = buffer base
        // Loop-invariant byte constants, splatted across all 16 lanes.
        movi.16b v0, #129               // 0x81: bias added before the range compare
        movi.16b v1, #162               // 0xA2: exclusive upper bound after the bias
        movi.16b v2, #253               // 0xFD: clears bit 1 so 0x3C and 0x3E compare equal
        movi.16b v3, #60                // 0x3C
        b LBB14_2
LBB14_1:
        // No lane flagged in either half: consume the whole 16-byte block.
        mov w10, #16
        add x8, x10, x8
        str x8, [x0, #16]               // write the position back to the record
        cmp x10, #16                    // x10 was just set to 16, so the branch below is
        b.ne LBB14_42                   // never taken and control falls into the loop head
LBB14_2:
        // Loop head: x8 = position, x1 = length.
        subs x10, x1, x8                // x10 = length - position
        b.lo LBB14_43                   // position > length: out-of-range failure path
        cmp x10, #15
        b.ls LBB14_42                   // fewer than 16 bytes left: return
        ldr q4, [x9, x8]                // v4 = 16 input bytes at base + position
        add.16b v5, v4, v0              // v5 = byte + 0x81 (mod 256)
        cmhi.16b v5, v1, v5             // lane = 0xFF where byte <= 0x20 or byte >= 0x7F
        and.16b v4, v4, v2
        cmeq.16b v4, v4, v3             // lane = 0xFF where byte is 0x3C or 0x3E
        orr.16b v4, v5, v4              // v4 = combined per-lane mask of flagged bytes
        fmov x10, d4                    // x10 = mask for bytes 0..7
        cbnz x10, LBB14_24              // a flagged byte in the low half
        mov.d x10, v4[1]                // x10 = mask for bytes 8..15
        cbz x10, LBB14_1                // nothing flagged in the block
        // High half: find the lowest non-zero mask byte; w10 = its index within the half.
        tst x10, #0xff
        b.eq LBB14_8
        mov x10, #0
        b LBB14_23
LBB14_8:
        tst x10, #0xff00
        b.eq LBB14_10
        mov w10, #1
        b LBB14_23
LBB14_10:
        tst x10, #0xff0000
        b.eq LBB14_12
        mov w10, #2
        b LBB14_23
LBB14_12:
        tst x10, #0xff000000
        b.eq LBB14_14
        mov w10, #3
        b LBB14_23
LBB14_14:
        tst x10, #0xff00000000
        b.eq LBB14_16
        mov w10, #4
        b LBB14_23
LBB14_16:
        tst x10, #0xff0000000000
        b.eq LBB14_18
        mov w10, #5
        b LBB14_23
LBB14_18:
        tst x10, #0xff000000000000
        b.eq LBB14_20
        mov w10, #6
        b LBB14_23
LBB14_20:
        lsr x10, x10, #56               // isolate mask byte 7
        cbnz x10, LBB14_22
        mov w10, #8                     // no byte set in this half
        b LBB14_23
LBB14_22:
        mov w10, #7
LBB14_23:
        add x10, x10, #8                // convert the high-half index to a block offset
        add x8, x10, x8                 // advance the position
        str x8, [x0, #16]               // write the position back to the record
        cmp x10, #16
        b.eq LBB14_2                    // whole block consumed: try the next 16 bytes
        b LBB14_42
LBB14_24:
        // Low half: find the lowest non-zero mask byte; w9 = its index.
        // x9 (buffer base) is overwritten here; every path from here returns.
        tst x10, #0xff
        b.eq LBB14_26
        mov x9, #0
        b LBB14_41
LBB14_26:
        tst x10, #0xff00
        b.eq LBB14_28
        mov w9, #1
        b LBB14_41
LBB14_28:
        tst x10, #0xff0000
        b.eq LBB14_30
        mov w9, #2
        b LBB14_41
LBB14_30:
        tst x10, #0xff000000
        b.eq LBB14_32
        mov w9, #3
        b LBB14_41
LBB14_32:
        tst x10, #0xff00000000
        b.eq LBB14_34
        mov w9, #4
        b LBB14_41
LBB14_34:
        tst x10, #0xff0000000000
        b.eq LBB14_36
        mov w9, #5
        b LBB14_41
LBB14_36:
        tst x10, #0xff000000000000
        b.eq LBB14_38
        mov w9, #6
        b LBB14_41
LBB14_38:
        lsr x9, x10, #56                // isolate mask byte 7
        cbnz x9, LBB14_40
        mov w9, #8                      // no byte set in this half
        b LBB14_41
LBB14_40:
        mov w9, #7
LBB14_41:
        add x8, x9, x8                  // advance the position to the flagged byte
        str x8, [x0, #16]               // write the position back to the record
LBB14_42:
        // Normal exit: pop the frame record and return.
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_43:
        // Failure path: x0 = position, x1 = length, x2 = address of l___unnamed_15.
        // NOTE(review): the callee is presumably a diverging panic helper, since
        // nothing follows the call -- confirm.
        .cfi_restore_state
Lloh22:
        adrp x2, l___unnamed_15@PAGE
Lloh23:
        add x2, x2, l___unnamed_15@PAGEOFF
        mov x0, x8
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpAdd    Lloh22, Lloh23

@AaronO
Copy link
Contributor Author

AaronO commented Apr 20, 2023

@seanmonstar I've resolved the conflicts, so this should be good to land. It's not perfect, but it's a solid first pass, with a few specifics to revisit later.

Copy link
Owner

@seanmonstar seanmonstar left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.

@AaronO
Copy link
Contributor Author

AaronO commented Apr 24, 2023

Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.

I've added a CI job to run aarch64/neon tests with qemu. I added tests in neon.rs that ensure the vectorized versions match their scalar counterparts. (Only some of the SWAR validators have false negatives, with a fallback to a scalar check, so they are correct in aggregate; testing for the common cases can be cheaper than exhaustive checks.)

src/simd/neon.rs Outdated Show resolved Hide resolved
src/simd/neon.rs Show resolved Hide resolved
First pass at neon support, building off seanmonstar#132
Copy link
Owner

@seanmonstar seanmonstar left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right on!

@seanmonstar seanmonstar merged commit cdabb70 into seanmonstar:master Apr 25, 2023
AaronO added a commit to AaronO/httparse that referenced this pull request Apr 25, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
@AaronO AaronO mentioned this pull request Apr 25, 2023
AaronO added a commit to AaronO/httparse that referenced this pull request May 4, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
seanmonstar pushed a commit that referenced this pull request May 4, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in #133 because `neon+no_std` isn't exercised
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants