// Reference
// https://github.com/aklomp/base64/blob/master/lib/arch/neon64/enc_loop.c
// https://github.com/aklomp/base64/blob/master/lib/arch/neon64/dec_loop.c

//go:build arm64 && !purego

#include "textflag.h"
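
// Base64 encodes every 3 input bytes as 4 output characters: the 24 input
// bits are split into four 6-bit indices into a 64-entry alphabet. For
// example, "Man" (0x4D 0x61 0x6E) splits into 19, 22, 5, 46, which the
// standard alphabet maps to "TWFu". Decoding reverses the mapping and must
// reject any byte that does not translate back to a 6-bit value.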
DATA enc_const<>+0x00(SB)/8, $0x0405030401020001 // reshuffle mask
DATA enc_const<>+0x08(SB)/8, $0x0a0b090a07080607
DATA enc_const<>+0x10(SB)/8, $0x0FC0FC000FC0FC00 // mulhi mask
DATA enc_const<>+0x18(SB)/8, $0x0FC0FC000FC0FC00
DATA enc_const<>+0x20(SB)/8, $0x003F03F0003F03F0 // mullo mask
DATA enc_const<>+0x28(SB)/8, $0x003F03F0003F03F0
DATA enc_const<>+0x30(SB)/8, $0x1f1e1b1a17161312 // selects the high half of each 32-bit product
DATA enc_const<>+0x38(SB)/8, $0x0f0e0b0a07060302
GLOBL enc_const<>(SB), (NOPTR+RODATA), $64
DATA dec_const<>+0x00(SB)/8, $0x0140014001400140 // dec_reshuffle_const0
DATA dec_const<>+0x08(SB)/8, $0x0140014001400140
DATA dec_const<>+0x10(SB)/8, $0x0001100000011000 // dec_reshuffle_const1
DATA dec_const<>+0x18(SB)/8, $0x0001100000011000
DATA dec_const<>+0x20(SB)/8, $0x090A040506000102 // dec_reshuffle_mask
DATA dec_const<>+0x28(SB)/8, $0xFFFFFFFF0C0D0E08
GLOBL dec_const<>(SB), (NOPTR+RODATA), $48

//func encodeAsm(dst, src []byte, lut *[64]byte) int
TEXT ·encodeAsm(SB),NOSPLIT,$0
	MOVD dst_base+0(FP), R0
	MOVD src_base+24(FP), R1
	MOVD src_len+32(FP), R2
	MOVD lut+48(FP), R3

	// Load the 64-byte Base64 alphabet into V8-V11.
	VLD1 (R3), [V8.B16, V9.B16, V10.B16, V11.B16]
	MOVD $0x3F, R4
	VDUP R4, V7.B16 // 6-bit mask
	EOR R5, R5, R5  // R5 counts bytes written to dst
loop48:
	CMP $48, R2
	BLT lessThan48

	// Move the input bits to where they need to be in the outputs. Except
	// for the first output, the high two bits are not cleared.
	VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]
	VUSHR $2, V0.B16, V3.B16
	VUSHR $4, V1.B16, V4.B16
	VUSHR $6, V2.B16, V5.B16
	VSLI $4, V0.B16, V4.B16
	VSLI $2, V1.B16, V5.B16

	// Clear the high two bits in the second, third and fourth output.
	VAND V7.B16, V4.B16, V4.B16
	VAND V7.B16, V5.B16, V5.B16
	VAND V7.B16, V2.B16, V6.B16
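
	// With input bytes a = aaaaaabb, b = bbbbcccc, c = ccdddddd, the four
	// 6-bit indices now sit in:
	//   V3 = 00aaaaaa, V4 = 00bbbbbb, V5 = 00cccccc, V6 = 00dddddd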
	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet.
	// Use a 64-byte table lookup:
	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
	VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
	VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
	VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16
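
	// A four-register TBL covers indices 0..63, exactly the range of a
	// Base64 index, so one lookup translates a whole vector; out-of-range
	// indices would yield zero, but the VANDs above rule that out.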
	// Interleave and store output:
	VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)
	SUB $48, R2
	ADD $64, R5
	B loop48

lessThan48:
	// Not enough input left for the 48-byte loop; fall through to the
	// 12-byte tail loop, which needs at least 16 readable source bytes.
	CMP $16, R2
	BLT done
	MOVD $enc_const<>(SB), R4
	VLD1 (R4), [V3.B16, V4.B16, V5.B16, V6.B16]
	MOVD $0x01000010, R4
	VDUP R4, V7.S4         // mullo constant
	VSHL $2, V7.S4, V12.S4 // mulhi constant
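
	// Per 32-bit lane the multipliers are 0x01000010 (mullo) and
	// 0x04000040 (mulhi); these match the constants used by the SSSE3
	// reshuffle in the reference implementation, with the mulhi value
	// derived by a shift instead of a second constant load.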
loop12:
	// Load 16 bytes but consume only 12; the 16-byte guard above keeps
	// the load inside the source buffer.
	VLD1 (R1), [V0.B16]
	VTBL V3.B16, [V0.B16], V0.B16 // shuffle bytes
	VAND V4.B16, V0.B16, V1.B16   // AND mulhi mask
	WORD $0x2e61c182              // UMULL V1.H8, V12.H8, V2.H8
	WORD $0x6e61c181              // UMULL2 V1.H8, V12.H8, V1.H8
	VTBL V6.B16, [V1.B16, V2.B16], V1.B16
	VAND V0.B16, V5.B16, V0.B16
	WORD $0x4e609ce0              // VMUL V0.H8, V7.H8, V0.H8
	VORR V0.B16, V1.B16, V0.B16
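
	// After the byte shuffle each 32-bit lane holds the input bytes
	// [b1 b0 b2 b1] (low byte first). The masked multiplies shift the two
	// 6-bit fields of each 16-bit half into place: NEON has no 16-bit
	// "multiply, keep high half", so the widening UMULL/UMULL2 pair plus
	// the V6 lookup (which picks the high half of every 32-bit product)
	// stands in for SSE's mulhi.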
	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet.
	// Use a 64-byte table lookup:
	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
	VST1.P [V0.B16], 16(R0)
	ADD $12, R1
	ADD $16, R5
	SUB $12, R2
	CMP $16, R2
	BGE loop12

done:
	MOVD R5, ret+56(FP) // number of bytes written to dst
	RET

//func decodeAsm(dst, src []byte, lut *[128]byte) int
TEXT ·decodeAsm(SB),NOSPLIT,$0
	MOVD dst_base+0(FP), R0
	MOVD src_base+24(FP), R1
	MOVD src_len+32(FP), R2
	MOVD lut+48(FP), R3

	// Load the 128-byte decoding LUT as two 64-byte tables.
	VLD1.P 64(R3), [V8.B16, V9.B16, V10.B16, V11.B16]
	VLD1 (R3), [V12.B16, V13.B16, V14.B16, V15.B16]
	MOVD $0x40, R4
	VDUP R4, V7.B16
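
	// Each input byte is decoded in two steps: TBL on the raw byte indexes
	// the first 64 LUT entries (out-of-range lanes become 0), then TBX on
	// (byte - 0x40) overlays entries from the second 64, leaving lanes the
	// first lookup already resolved untouched. V7 holds 0x40 for both the
	// subtraction and the validity check. This assumes the Go side fills
	// invalid LUT positions with values >= 0x40 so they fail that check.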
loop64:
	CMP $64, R2
	BLT lessThan64
	VLD4.P 64(R1), [V20.B16, V21.B16, V22.B16, V23.B16]

	// Get values from the first LUT
	VTBL V20.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
	VTBL V21.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V1.B16
	VTBL V22.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V2.B16
	VTBL V23.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16

	// Get values from the second LUT
	VSUB V7.B16, V20.B16, V20.B16
	WORD $0x4e147180 // VTBX V20.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V0.B16
	VSUB V7.B16, V21.B16, V21.B16
	WORD $0x4e157181 // VTBX V21.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V1.B16
	VSUB V7.B16, V22.B16, V22.B16
	WORD $0x4e167182 // VTBX V22.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V2.B16
	VSUB V7.B16, V23.B16, V23.B16
	WORD $0x4e177183 // VTBX V23.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V3.B16

	// Check for invalid input: flag any decoded value of 64 or larger
	WORD $0x6e273c10 // VCMHS V7.B16, V0.B16, V16.B16
	WORD $0x6e273c31 // VCMHS V7.B16, V1.B16, V17.B16
	WORD $0x6e273c52 // VCMHS V7.B16, V2.B16, V18.B16
	WORD $0x6e273c73 // VCMHS V7.B16, V3.B16, V19.B16
	VORR V17.B16, V16.B16, V16.B16
	VORR V18.B16, V16.B16, V16.B16
	VORR V19.B16, V16.B16, V16.B16

	// Reduce the flags to a single byte; non-zero means invalid input:
	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
	VMOV V17.B[0], R5
	CBNZ R5, done
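
	// On invalid input this exits through done with R2 still holding the
	// count of unprocessed source bytes, so the caller sees a non-zero
	// remainder.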
	// Compress four 6-bit values into three bytes
	VSHL $2, V0.B16, V4.B16
	VUSHR $4, V1.B16, V16.B16
	VORR V16.B16, V4.B16, V4.B16
	VSHL $4, V1.B16, V5.B16
	VUSHR $2, V2.B16, V16.B16
	VORR V16.B16, V5.B16, V5.B16
	VSHL $6, V2.B16, V16.B16
	VORR V16.B16, V3.B16, V6.B16
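
	// With 6-bit values a, b, c, d the three output bytes are:
	//   V4 = aaaaaabb, V5 = bbbbcccc, V6 = ccdddddd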
	// Interleave and store the decoded result
	VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)
	SUB $64, R2
	B loop64

lessThan64:
	// Not enough input left for the 64-byte loop; fall through to the
	// 16-byte tail loop, which requires at least 24 remaining source
	// bytes so the full 16-byte store below stays inside dst.
	CMP $24, R2
	BLT done
	MOVD $dec_const<>(SB), R4
	VLD1 (R4), [V1.B16, V2.B16, V3.B16]

loop16:
	VLD1.P 16(R1), [V20.B16]

	// Get values from the first LUT
	VTBL V20.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
	VSUB V7.B16, V20.B16, V20.B16

	// Get values from the second LUT
	WORD $0x4e147180 // VTBX V20.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V0.B16

	// Check for invalid input: flag any decoded value of 64 or larger
	WORD $0x6e273c10 // VCMHS V7.B16, V0.B16, V16.B16

	// Reduce the flags to a single byte; non-zero means invalid input:
	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
	VMOV V17.B[0], R5
	CBNZ R5, done
	// Compress four 6-bit values into three bytes:
	// swap and merge adjacent 6-bit fields
	WORD $0x2e20c024 // UMULL V0.B16, V1.B16, V4.H8
	WORD $0x6e20c020 // UMULL2 V0.B16, V1.B16, V0.H8
	VADDP V0.H8, V4.H8, V0.H8

	// swap and merge 12-bit words into a 24-bit word
	WORD $0x2e60c044 // UMULL V0.H8, V2.H8, V4.S4
	WORD $0x6e60c040 // UMULL2 V0.H8, V2.H8, V0.S4
	VADDP V0.S4, V4.S4, V0.S4

	// reshuffle bytes
	VTBL V3.B16, [V0.B16], V0.B16
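
	// This mirrors the reference dec_loop reshuffle: multiplying adjacent
	// bytes by 0x40/0x01 and pairwise-adding merges neighbouring 6-bit
	// fields into 12-bit fields, the second multiply-add merges those
	// into 24-bit words (widening UMULL plus ADDP standing in for SSE's
	// maddubs/madd), and the final TBL drops the spare byte of each word
	// while restoring byte order.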
	// Store 16 bytes but advance dst by only the 12 valid ones.
	VST1 [V0.B16], (R0)
	ADD $12, R0
	SUB $16, R2
	CMP $24, R2
	BGE loop16

done:
	MOVD R2, ret+56(FP) // number of source bytes left undecoded
	RET