/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

.macro adrl reg, addr
	adrp	\reg, \addr
	add	\reg, \reg, #:lo12:\addr
.endm

#define _ALIGN_TEXT \
	.p2align 4

#define ENTRY(x) \
	.text; \
	_ALIGN_TEXT; \
	.global x; \
	.type x,@function; \
x:

#define END(x) \
	.size x, . - x

#define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
	STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	/* end ROUND */

#define STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r); \
	f(a1,b1,c1,d1, t1, r); \
	f(a2,b2,c2,d2, t2, r); \
	f(a3,b3,c3,d3, t3, r); \
	/* end of STEP */
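
/*
 * Note on the macro plumbing: ROUND's arguments are grouped by state
 * row -- the first four are the `a' words of the four independent
 * quarterrounds, the next four the `b' words, and so on -- even
 * though ROUND's parameter names suggest otherwise.  STEP's parameter
 * names reflect the actual grouping, and each STEP issues the same
 * operation for each of the four quarterrounds in turn, which is what
 * interleaves the four dependency chains.
 */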

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16	REV32 on .8h elements
 *	<<< 12	SHL/SRI (shift left, then shift right and insert)
 *	<<< 8	TBL (general byte permutation; rot8 below, passed in r)
 *	<<< 7	SHL/SRI
 */
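
/*
 * How the STEPs below map onto the quarterround above (in the paths
 * actually compiled, the STEPs marked `nothing' are no-ops):
 *
 *	STEP0		a += b
 *	STEP1		d ^= a
 *	STEP2-4		d <<<= 16	(REV32 on .8h elements)
 *	STEP5		c += d
 *	STEP6-9		b ^= c; b <<<= 12	(EOR into temp, SHL, SRI)
 *	STEP10		a += b
 *	STEP11		d ^= a
 *	STEP12-14	d <<<= 8	(TBL with the rot8 table)
 *	STEP15		c += d
 *	STEP16-19	b ^= c; b <<<= 7	(EOR into temp, SHL, SRI)
 */
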
#define STEP0(a,b,c,d, t, r)	add a##.4s, a##.4s, b##.4s
#define STEP1(a,b,c,d, t, r)	eor d##.16b, d##.16b, a##.16b
#if 0
#define STEP2(a,b,c,d, t, r)	shl t##.4s, d##.4s, #16
#define STEP3(a,b,c,d, t, r)	ushr d##.4s, d##.4s, #(32 - 16)
#define STEP4(a,b,c,d, t, r)	orr d##.16b, d##.16b, t##.16b
#else
#define STEP2(a,b,c,d, t, r)	rev32 d##.8h, d##.8h
#define STEP3(a,b,c,d, t, r)	/* nothing */
#define STEP4(a,b,c,d, t, r)	/* nothing */
#endif
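
/*
 * REV32 on .8h elements swaps the two halfwords inside every 32-bit
 * lane, which is exactly a rotation by 16: a lane holding 0xaabbccdd
 * becomes 0xccddaabb, i.e. (x << 16) | (x >> 16).
 */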

#define STEP5(a,b,c,d, t, r)	add c##.4s, c##.4s, d##.4s
#if 0
#define STEP6(a,b,c,d, t, r)	eor b##.16b, b##.16b, c##.16b
#define STEP7(a,b,c,d, t, r)	shl t##.4s, b##.4s, #12
#define STEP8(a,b,c,d, t, r)	ushr b##.4s, b##.4s, #(32 - 12)
#define STEP9(a,b,c,d, t, r)	orr b##.16b, b##.16b, t##.16b
#else
#define STEP6(a,b,c,d, t, r)	eor t##.16b, b##.16b, c##.16b
#define STEP7(a,b,c,d, t, r)	shl b##.4s, t##.4s, #12
#define STEP8(a,b,c,d, t, r)	sri b##.4s, t##.4s, #(32 - 12)
#define STEP9(a,b,c,d, t, r)	/* nothing */
#endif
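
/*
 * SHL/SRI implement the rotation without the ORR of the disabled
 * variant above: SHL writes b := t << 12 with zeroes in the low 12
 * bits, and SRI shifts t right by 20 and inserts the result into
 * those low bits while leaving the rest of b alone, so b ends up as
 * (t << 12) | (t >> 20) = t <<< 12.  The same trick handles the
 * <<< 7 rotation in STEP16-19 below.
 */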

#define STEP10(a,b,c,d, t, r)	add a##.4s, a##.4s, b##.4s
#define STEP11(a,b,c,d, t, r)	eor d##.16b, d##.16b, a##.16b
#if 0
#define STEP12(a,b,c,d, t, r)	shl t##.4s, d##.4s, #8
#define STEP13(a,b,c,d, t, r)	ushr d##.4s, d##.4s, #(32 - 8)
#define STEP14(a,b,c,d, t, r)	orr d##.16b, d##.16b, t##.16b
#else
#define STEP12(a,b,c,d, t, r)	tbl d##.16b, {d##.16b}, r##.16b
#define STEP13(a,b,c,d, t, r)	/* nothing */
#define STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define STEP15(a,b,c,d, t, r)	add c##.4s, c##.4s, d##.4s
#if 0
#define STEP16(a,b,c,d, t, r)	eor b##.16b, b##.16b, c##.16b
#define STEP17(a,b,c,d, t, r)	shl t##.4s, b##.4s, #7
#define STEP18(a,b,c,d, t, r)	ushr b##.4s, b##.4s, #(32 - 7)
#define STEP19(a,b,c,d, t, r)	orr b##.16b, b##.16b, t##.16b
#else
#define STEP16(a,b,c,d, t, r)	eor t##.16b, b##.16b, c##.16b
#define STEP17(a,b,c,d, t, r)	shl b##.4s, t##.4s, #7
#define STEP18(a,b,c,d, t, r)	sri b##.4s, t##.4s, #(32 - 7)
#define STEP19(a,b,c,d, t, r)	/* nothing */
#endif

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define HTOLE32(x)
#define LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define HTOLE32(x)	rev32 x, x
#define LE32TOH(x)	rev32 x, x
#endif
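
/*
 * The ChaCha state is defined in terms of 32-bit little-endian words.
 * On big-endian hosts, REV32 on .16b elements reverses the bytes
 * within each 32-bit lane, converting between host and little-endian
 * order; on little-endian hosts the conversion is a no-op.
 */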

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
ENTRY(chacha_stream256_neon)
	stp fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov fp, sp

	stp d8, d9, [sp, #0x10]		/* save callee-saves vectors */
	stp d10, d11, [sp, #0x20]
	stp d12, d13, [sp, #0x30]
	stp d14, d15, [sp, #0x40]

	adrl x9, v0123			/* x9 := &v0123 */
	mov x10, x4			/* x10 := c */
	mov x11, x3			/* x11 := k */
	add x12, x3, #16		/* x12 := k + 16 */
	mov x13, x2			/* x13 := nonce */

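	/*
	 * Four blocks are generated in parallel: lane i of each vector
	 * register holds one 32-bit word of block i.  LD4R/LD3R load
	 * consecutive 32-bit words and replicate each across all four
	 * lanes, so v0-v11 and v13-v15 start out uniform; only v12,
	 * the block counter, differs per lane (blkno + i).
	 */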
	ld1 {v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup v12.4s, w1			/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r {v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r {v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r {v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r {v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov v16.16b, v0.16b
	mov v17.16b, v1.16b
	mov v18.16b, v2.16b
	mov v19.16b, v3.16b
	mov v20.16b, v4.16b
	mov v21.16b, v5.16b
	mov v22.16b, v6.16b
	mov v23.16b, v7.16b
	mov v24.16b, v8.16b
	mov v25.16b, v9.16b
	mov v26.16b, v12.16b	/* saved out of order since v12 isn't a dup */
	mov w8, v10.s[0]	/* stash the rest as scalars; v27-v31 are
				 * needed as temporaries in the rounds */
	mov w9, v11.s[0]
	mov w10, v13.s[0]
	mov w11, v14.s[0]
	mov w12, v15.s[0]

1:	subs w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne 1b

	dup v27.4s, w8
	dup v28.4s, w9
	dup v29.4s, w10
	dup v30.4s, w11
	dup v31.4s, w12

	add v0.4s, v0.4s, v16.4s
	add v1.4s, v1.4s, v17.4s
	add v2.4s, v2.4s, v18.4s
	add v3.4s, v3.4s, v19.4s
	add v4.4s, v4.4s, v20.4s
	add v5.4s, v5.4s, v21.4s
	add v6.4s, v6.4s, v22.4s
	add v7.4s, v7.4s, v23.4s
	add v8.4s, v8.4s, v24.4s
	add v9.4s, v9.4s, v25.4s
	add v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add v11.4s, v11.4s, v28.4s
	add v12.4s, v12.4s, v26.4s
	add v13.4s, v13.4s, v29.4s
	add v14.4s, v14.4s, v30.4s
	add v15.4s, v15.4s, v31.4s

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

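	/*
	 * ST4 into a single lane writes the selected 32-bit lane of
	 * each of the four registers contiguously, so each group of
	 * four stores below emits the sixteen words of one block in
	 * order -- undoing the per-lane transposition of the state.
	 */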
	st4 { v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4 { v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4 { v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4 {v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4 { v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4 { v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4 { v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4 {v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4 { v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4 { v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4 { v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4 {v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4 { v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4 { v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4 { v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4 {v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp d8, d9, [sp, #0x10]		/* restore callee-saves vectors */
	ldp d10, d11, [sp, #0x20]
	ldp d12, d13, [sp, #0x30]
	ldp d14, d15, [sp, #0x40]

	ldp fp, lr, [sp], #0x50		/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
ENTRY(chacha_stream_xor256_neon)
	stp fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov fp, sp

	stp d8, d9, [sp, #0x10]		/* save callee-saves vectors */
	stp d10, d11, [sp, #0x20]
	stp d12, d13, [sp, #0x30]
	stp d14, d15, [sp, #0x40]

	adrl x9, v0123			/* x9 := &v0123 */
	mov x10, x5			/* x10 := c */
	mov x11, x4			/* x11 := k */
	add x12, x4, #16		/* x12 := k + 16 */
	mov x13, x3			/* x13 := nonce */

	ld1 {v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup v12.4s, w2			/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r {v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r {v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r {v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r {v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov v16.16b, v0.16b
	mov v17.16b, v1.16b
	mov v18.16b, v2.16b
	mov v19.16b, v3.16b
	mov v20.16b, v4.16b
	mov v21.16b, v5.16b
	mov v22.16b, v6.16b
	mov v23.16b, v7.16b
	mov v24.16b, v8.16b
	mov v25.16b, v9.16b
	mov v26.16b, v12.16b	/* saved out of order since v12 isn't a dup */
	mov w8, v10.s[0]	/* stash the rest as scalars; v27-v31 are
				 * needed as temporaries in the rounds */
	mov w9, v11.s[0]
	mov w10, v13.s[0]
	mov w11, v14.s[0]
	mov w12, v15.s[0]

1:	subs w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne 1b

	dup v27.4s, w8
	dup v28.4s, w9
	dup v29.4s, w10
	dup v30.4s, w11
	dup v31.4s, w12

	add v0.4s, v0.4s, v16.4s
	add v1.4s, v1.4s, v17.4s
	add v2.4s, v2.4s, v18.4s
	add v3.4s, v3.4s, v19.4s
	add v4.4s, v4.4s, v20.4s
	add v5.4s, v5.4s, v21.4s
	add v6.4s, v6.4s, v22.4s
	add v7.4s, v7.4s, v23.4s
	add v8.4s, v8.4s, v24.4s
	add v9.4s, v9.4s, v25.4s
	add v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add v11.4s, v11.4s, v28.4s
	add v12.4s, v12.4s, v26.4s
	add v13.4s, v13.4s, v29.4s
	add v14.4s, v14.4s, v30.4s
	add v15.4s, v15.4s, v31.4s

	/*
	 * We could replace the sixteen LD4-into-lane instructions
	 * below with four LD1-into-register instructions, but then we
	 * would need to permute the elements of v0-v15 into the right
	 * order first -- a series of ZIP1/ZIP2 on 4s-sized elements
	 * followed by ZIP1/ZIP2 on 2d-sized elements.  The net cost of
	 * those thirty-two ZIP1/ZIP2 instructions appears to exceed
	 * what the four LD1 instructions would save over sixteen LD4
	 * instructions, even if the LD1s are interleaved with the
	 * ZIPs.
	 */
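	/*
	 * The LD4-into-lane loads below gather the plaintext in the
	 * same word-sliced layout as the keystream in v0-v15 (lane i
	 * of each register holds a word of block i), so the EORs that
	 * follow pair corresponding words directly.
	 */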
	ld4 {v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4 {v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4 {v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4 {v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4 {v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4 {v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4 {v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4 {v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4 {v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4 {v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4 {v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4 {v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4 {v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4 {v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4 {v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4 {v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	eor v16.16b, v16.16b, v0.16b
	eor v17.16b, v17.16b, v1.16b
	eor v18.16b, v18.16b, v2.16b
	eor v19.16b, v19.16b, v3.16b
	eor v20.16b, v20.16b, v4.16b
	eor v21.16b, v21.16b, v5.16b
	eor v22.16b, v22.16b, v6.16b
	eor v23.16b, v23.16b, v7.16b
	eor v24.16b, v24.16b, v8.16b
	eor v25.16b, v25.16b, v9.16b
	eor v26.16b, v26.16b, v10.16b
	eor v27.16b, v27.16b, v11.16b
	eor v28.16b, v28.16b, v12.16b
	eor v29.16b, v29.16b, v13.16b
	eor v30.16b, v30.16b, v14.16b
	eor v31.16b, v31.16b, v15.16b

	st4 {v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4 {v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4 {v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4 {v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4 {v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4 {v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4 {v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4 {v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4 {v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4 {v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4 {v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4 {v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4 {v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4 {v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4 {v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4 {v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp d8, d9, [sp, #0x10]		/* restore callee-saves vectors */
	ldp d10, d11, [sp, #0x20]
	ldp d12, d13, [sp, #0x30]
	ldp d14, d15, [sp, #0x40]

	ldp fp, lr, [sp], #0x50		/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type v0123,@object
v0123:
	.long 0, 1, 2, 3
END(v0123)

/*
 * Must be immediately after v0123 -- we load them in a single
 * ld1 instruction.
 */
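/*
 * rot8 is the TBL index vector for the <<< 8 rotation in STEP12.  As
 * bytes in memory (little-endian) it reads
 *
 *	3,0,1,2  7,4,5,6  11,8,9,10  15,12,13,14
 *
 * so byte i of the result is byte rot8[i] of the source.  Within each
 * 32-bit little-endian lane this moves the top byte to the bottom and
 * the rest up, e.g. a lane holding 0xaabbccdd becomes 0xbbccddaa, i.e.
 * (x << 8) | (x >> 24).
 */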
	.type rot8,@object
rot8:
	.long 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)