/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
	STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
 *	<<< 12		SHL/SRI (shift left, then shift right and insert)
 *	<<< 8		TBL (general permutation; rot8 below stored in r)
 *	<<< 7		SHL/SRI (shift left, then shift right and insert)
 */
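
/*
 * For orientation, a rough C sketch of one quarterround, carved up the
 * same way as STEP0-STEP19 below.  rol32 and chacha_qr are illustrative
 * names only, not functions in this tree; the vector code runs four
 * copies of this at once, one per 32-bit lane.
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t
 *	rol32(uint32_t x, unsigned n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static inline void
 *	chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *	{
 *		*a += *b; *d ^= *a; *d = rol32(*d, 16);	// STEP0..4 (rev32)
 *		*c += *d; *b ^= *c; *b = rol32(*b, 12);	// STEP5..9 (shl/sri)
 *		*a += *b; *d ^= *a; *d = rol32(*d, 8);	// STEP10..14 (tbl)
 *		*c += *d; *b ^= *c; *b = rol32(*b, 7);	// STEP15..19 (shl/sri)
 *	}
 */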
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
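/*
 * As a scalar sketch only -- chacha_block_ref is a hypothetical
 * one-block helper, not a function defined in this tree: the four
 * 64-byte blocks share the constant, key, and nonce words and differ
 * only in the 32-bit block counter, which is what lets the code below
 * run them in parallel, one block per vector lane.
 *
 *	#include <stdint.h>
 *
 *	void
 *	chacha_stream256_ref(uint8_t s[256], uint32_t blkno,
 *	    const uint8_t nonce[12], const uint8_t key[32],
 *	    const uint8_t c[16], unsigned nr)
 *	{
 *		for (unsigned i = 0; i < 4; i++)
 *			chacha_block_ref(s + 64*i, blkno + i, nonce, key,
 *			    c, nr);
 *	}
 */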
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := c */
	mov	x11, x3			/* x11 := k */
	add	x12, x3, #16		/* x12 := k + 16 (second half of key) */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
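
	/*
	 * State layout after the loads above: lane i (0-3) of v0-v15
	 * holds word 0-15 of block i's ChaCha state, i.e. the four
	 * blocks run in parallel, one per lane.  Every word is the
	 * same across all lanes except v12, the per-block counter.
	 */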

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

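	/*
	 * The loop below does two rounds per iteration (so nr must be
	 * even): the first ROUND is the column round on (v0,v4,v8,v12)
	 * .. (v3,v7,v11,v15); the second is the diagonal round,
	 * obtained simply by rotating the b/c/d register arguments
	 * (v5,v6,v7,v4 and so on) rather than shuffling any lanes.
	 */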
	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

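	/*
	 * Store the output with ST4-by-lane: each group of four stores
	 * writes out lane i of v0-v15, i.e. the 64 bytes of block i,
	 * so the four blocks land at s[0], s[64], s[128], and s[192]
	 * in order, with no extra permutation step.
	 */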
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
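/*
 * Scalar sketch of the intended semantics -- chacha_stream256_ref is
 * the hypothetical reference routine sketched above, not a function in
 * this tree: generate the same 256-byte keystream as
 * chacha_stream256_neon and XOR it into the plaintext p.
 *
 *	void
 *	chacha_stream_xor256_ref(uint8_t s[256], const uint8_t p[256],
 *	    uint32_t blkno, const uint8_t nonce[12],
 *	    const uint8_t key[32], const uint8_t c[16], unsigned nr)
 *	{
 *		uint8_t buf[256];
 *
 *		chacha_stream256_ref(buf, blkno, nonce, key, c, nr);
 *		for (unsigned i = 0; i < 256; i++)
 *			s[i] = p[i] ^ buf[i];
 *	}
 *
 * The code below fuses the two steps: it loads p into v16-v31 with
 * LD4-by-lane, XORs in the keystream held in v0-v15, and stores the
 * result, so the keystream never touches memory.
 */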
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := c */
	mov	x11, x4			/* x11 := k */
	add	x12, x4, #16		/* x12 := k + 16 (second half of key) */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but then we would
	 * need to permute the elements in v0-v15 to put them in the
	 * right order.  That can be done with a series of ZIP1/ZIP2 on
	 * 4s-sized elements followed by ZIP1/ZIP2 on 2d-sized
	 * elements, but the net cost of the thirty-two ZIP1/ZIP2
	 * instructions seems to exceed what we would save by issuing
	 * four LD1 instructions instead of sixteen LD4 instructions,
	 * even if we interleave the LD1 instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

/*
 * Must be immediately after v0123 -- we load them in a single
 * ld1 instruction.
 */
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)
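
/*
 * Used as TBL indices, rot8 selects, for each 32-bit lane, bytes
 * 3,0,1,2 of that lane, which is exactly a left-rotation by 8 bits:
 *
 *	x -> (x << 8) | (x >> 24)
 *
 * i.e. the <<< 8 rotation implemented by STEP12.
 */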