Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
zinc: Poly1305 ARM and ARM64 implementations
These NEON and non-NEON implementations come from Andy Polyakov's implementation. They are exactly the same as Andy Polyakov's original, with the following exceptions:
- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The NEON code can jump to the scalar code when it makes sense to do so.
After '/^#/d;/^\..*[^:]$/d', the code has the following diff in actual instructions from the original.
ARM: -poly1305_init: -.Lpoly1305_init: +ENTRY(poly1305_init_arm) stmdb sp!,{r4-r11} eor r3,r3,r3 @@ -18,8 +25,6 @@ moveq r0,#0 beq .Lno_key - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap ldrb r4,[r1,#0] mov r10,#0x0fffffff ldrb r5,[r1,#1] @@ -34,8 +39,6 @@ ldrb r7,[r1,#6] and r4,r4,r10 - ldr r12,[r11,r12] @ OPENSSL_armcap_P - ldr r12,[r12] ldrb r8,[r1,#7] orr r5,r5,r6,lsl#8 ldrb r6,[r1,#8] @@ -45,22 +48,6 @@ ldrb r8,[r1,#10] and r5,r5,r3 - tst r12,#ARMV7_NEON @ check for NEON - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks - it ne - movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon - it ne - movne r12,r10 - itete eq - addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 ldrb r9,[r1,#11] orr r6,r6,r7,lsl#8 ldrb r7,[r1,#12] @@ -79,17 +66,16 @@ str r6,[r0,#8] and r7,r7,r3 str r7,[r0,#12] - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 - mov r0,#0 .Lno_key: ldmia sp!,{r4-r11} bx lr @ bx lr tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) -poly1305_blocks: -.Lpoly1305_blocks: 
+ENDPROC(poly1305_init_arm) + +ENTRY(poly1305_blocks_arm) +.Lpoly1305_blocks_arm: stmdb sp!,{r3-r11,lr} ands r2,r2,#-16 @@ -231,10 +217,11 @@ tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) -poly1305_emit: +ENDPROC(poly1305_blocks_arm) + +ENTRY(poly1305_emit_arm) stmdb sp!,{r4-r11} .Lpoly1305_emit_enter: - ldmia r0,{r3-r7} adds r8,r3,#5 @ compare to modulus adcs r9,r4,#0 @@ -305,8 +292,12 @@ tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) +ENDPROC(poly1305_emit_arm) + + -poly1305_init_neon: +ENTRY(poly1305_init_neon) +.Lpoly1305_init_neon: ldr r4,[r0,#20] @ load key base 2^32 ldr r5,[r0,#24] ldr r6,[r0,#28] @@ -515,8 +506,9 @@ vst1.32 {d8[1]},[r7] bx lr @ bx lr +ENDPROC(poly1305_init_neon) -poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr ip,[r0,#36] @ is_base2_26 ands r2,r2,#-16 beq .Lno_data_neon @@ -524,7 +516,7 @@ cmp r2,#64 bhs .Lenter_neon tst ip,ip @ is_base2_26? 
- beq .Lpoly1305_blocks + beq .Lpoly1305_blocks_arm .Lenter_neon: stmdb sp!,{r4-r7} @@ -534,7 +526,7 @@ bne .Lbase2_26_neon stmdb sp!,{r1-r3,lr} - bl poly1305_init_neon + bl .Lpoly1305_init_neon ldr r4,[r0,#0] @ load hash value base 2^32 ldr r5,[r0,#4] @@ -989,8 +981,9 @@ ldmia sp!,{r4-r7} .Lno_data_neon: bx lr @ bx lr +ENDPROC(poly1305_blocks_neon) -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr ip,[r0,#36] @ is_base2_26 stmdb sp!,{r4-r11} @@ -1055,6 +1048,6 @@ ldmia sp!,{r4-r11} bx lr @ bx lr +ENDPROC(poly1305_emit_neon)
ARM64: -poly1305_init: +ENTRY(poly1305_init_arm) cmp x1,xzr stp xzr,xzr,[x0] // zero hash value stp xzr,xzr,[x0,#16] // [along with is_base2_26] @@ -11,14 +15,9 @@ csel x0,xzr,x0,eq b.eq .Lno_key - ldrsw x11,.LOPENSSL_armcap_P - ldr x11,.LOPENSSL_armcap_P - adr x10,.LOPENSSL_armcap_P - ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 - ldr w17,[x10,x11] rev x7,x7 // flip bytes rev x8,x8 and x7,x7,x9 // &=0ffffffc0fffffff @@ -26,24 +25,11 @@ and x8,x8,x9 // &=0ffffffc0ffffffc stp x7,x8,[x0,#32] // save key value - tst w17,#ARMV7_NEON - - adr x12,poly1305_blocks - adr x7,poly1305_blocks_neon - adr x13,poly1305_emit - adr x8,poly1305_emit_neon - - csel x12,x12,x7,eq - csel x13,x13,x8,eq - - stp w12,w13,[x2] - stp x12,x13,[x2] - - mov x0,#1 .Lno_key: ret +ENDPROC(poly1305_init_arm) -poly1305_blocks: +ENTRY(poly1305_blocks_arm) ands x2,x2,#-16 b.eq .Lno_data @@ -100,8 +86,9 @@ .Lno_data: ret +ENDPROC(poly1305_blocks_arm) -poly1305_emit: +ENTRY(poly1305_emit_arm) ldp x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -124,7 +111,9 @@ stp x4,x5,[x1] // write result ret -poly1305_mult: +ENDPROC(poly1305_emit_arm) + +__poly1305_mult: mul x12,x4,x7 // h0*r0 umulh x13,x4,x7 @@ -158,7 +147,7 @@ ret -poly1305_splat: +__poly1305_splat: and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x13,x4,#26,#26 extr x14,x5,x4,#52 @@ -182,11 +171,11 @@ ret 
-poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr x17,[x0,#24] cmp x2,#128 b.hs .Lblocks_neon - cbz x17,poly1305_blocks + cbz x17,poly1305_blocks_arm .Lblocks_neon: stp x29,x30,[sp,#-80]! @@ -232,7 +221,7 @@ adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult ldr x30,[sp,#8] cbz x3,.Lstore_base2_64_neon @@ -274,7 +263,7 @@ adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult .Linit_neon: and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 @@ -301,19 +290,19 @@ mov x5,x8 mov x6,xzr add x0,x0,#48+12 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^2 + bl __poly1305_mult // r^2 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^3 + bl __poly1305_mult // r^3 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^4 + bl __poly1305_mult // r^4 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat ldr x30,[sp,#8] add x16,x1,#32 @@ -743,10 +732,11 @@ .Lno_data_neon: ldr x29,[sp],#80 ret +ENDPROC(poly1305_blocks_neon) -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr x17,[x0,#24] - cbz x17,poly1305_emit + cbz x17,poly1305_emit_arm ldp w10,w11,[x0] // load hash value base 2^26 ldp w12,w13,[x0,#8] @@ -788,6 +778,6 @@ stp x4,x5,[x1] // write result ret +ENDPROC(poly1305_emit_neon)
Signed-off-by: Jason A. Donenfeld <[email protected]>
Cc: Samuel Neves <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Greg KH <[email protected]>
Cc: Jean-Philippe Aumasson <[email protected]>
Cc: Andy Polyakov <[email protected]>
Cc: Russell King <[email protected]>
Cc: [email protected]
- Loading branch information