/*	$NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
28 1.1 riastrad
29 1.2 riastrad #include <aarch64/asm.h>
30 1.1 riastrad
/*
 * ROUND(a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3,
 *     t0,t1,t2,t3, r)
 *
 *	Emit four interleaved ChaCha quarterrounds, on (a0,b0,c0,d0),
 *	(a1,b1,c1,d1), (a2,b2,c2,d2), and (a3,b3,c3,d3).  STEP0 through
 *	STEP19 are issued in order, and each step is applied to all
 *	four quarterrounds (via STEP) before the next step starts, so
 *	consecutive instructions are independent of one another.
 *
 *	t0-t3 are scratch vector registers, one per quarterround.  r is
 *	handed unchanged to every step; STEP12 uses it as the TBL index
 *	vector.
 *
 *	All arguments are forwarded to STEP positionally.
 */
#define	ROUND(a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r) \
STEP(STEP0,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP1,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP2,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP3,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP4,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP5,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP6,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP7,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP8,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP9,  a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP10, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP11, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP12, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP13, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP14, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP15, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP16, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP17, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP18, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
STEP(STEP19, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r); \
/* end ROUND */
53 1.1 riastrad
/*
 * STEP(op, a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3,
 *     t0,t1,t2,t3, r)
 *
 *	Apply the one-instruction step macro op to each of the four
 *	quarterrounds (a0,b0,c0,d0), (a1,b1,c1,d1), (a2,b2,c2,d2) and
 *	(a3,b3,c3,d3) in turn, giving quarterround i the scratch
 *	register ti and every quarterround the same r.
 */
#define	STEP(op,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	op(a0,b0,c0,d0, t0, r); \
	op(a1,b1,c1,d1, t1, r); \
	op(a2,b2,c2,d2, t2, r); \
	op(a3,b3,c3,d3, t3, r); \
	/* end of STEP */
60 1.1 riastrad
/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;	(STEP0 - STEP4)
 *	c += d; b ^= c; b <<<= 12;	(STEP5 - STEP9)
 *	a += b; d ^= a; d <<<= 8;	(STEP10 - STEP14)
 *	c += d; b ^= c; b <<<= 7;	(STEP15 - STEP19)
 *
 * The rotations are implemented with:
 *	<<< 16	REV32 Vn.8h (swap the two halfwords of each word)
 *	<<< 12	SHL/SRI (shift left, then shift right and insert), with
 *		the preceding XOR written to the temporary so that SRI
 *		has an unshifted source
 *	<<< 8	TBL (general byte permutation; index vector passed as
 *		idx)
 *	<<< 7	SHL/SRI, as for 12
 *
 * In each case the plain SHL/USHR/ORR rotation is kept in the disabled
 * #else branch for reference.
 *
 * Arguments of every step: a, b, c, d are the quarterround's vector
 * registers, tmp is a scratch vector register, idx is the TBL index
 * vector.  Steps that need no instruction expand to nothing.
 */
#define	STEP0(a,b,c,d, tmp, idx)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, tmp, idx)	eor	d##.16b, d##.16b, a##.16b
#if 1	/* d <<<= 16 by halfword swap */
#define	STEP2(a,b,c,d, tmp, idx)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, tmp, idx)	/* nothing */
#define	STEP4(a,b,c,d, tmp, idx)	/* nothing */
#else	/* d <<<= 16 by SHL/USHR/ORR */
#define	STEP2(a,b,c,d, tmp, idx)	shl	tmp##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, tmp, idx)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, tmp, idx)	orr	d##.16b, d##.16b, tmp##.16b
#endif

#define	STEP5(a,b,c,d, tmp, idx)	add	c##.4s, c##.4s, d##.4s
#if 1	/* b := (b ^ c) <<< 12 by SHL/SRI through tmp */
#define	STEP6(a,b,c,d, tmp, idx)	eor	tmp##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, tmp, idx)	shl	b##.4s, tmp##.4s, #12
#define	STEP8(a,b,c,d, tmp, idx)	sri	b##.4s, tmp##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, tmp, idx)	/* nothing */
#else	/* b ^= c; b <<<= 12 by SHL/USHR/ORR */
#define	STEP6(a,b,c,d, tmp, idx)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, tmp, idx)	shl	tmp##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, tmp, idx)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, tmp, idx)	orr	b##.16b, b##.16b, tmp##.16b
#endif

#define	STEP10(a,b,c,d, tmp, idx)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, tmp, idx)	eor	d##.16b, d##.16b, a##.16b
#if 1	/* d <<<= 8 by byte permutation */
#define	STEP12(a,b,c,d, tmp, idx)	tbl	d##.16b, {d##.16b}, idx##.16b
#define	STEP13(a,b,c,d, tmp, idx)	/* nothing */
#define	STEP14(a,b,c,d, tmp, idx)	/* nothing */
#else	/* d <<<= 8 by SHL/USHR/ORR */
#define	STEP12(a,b,c,d, tmp, idx)	shl	tmp##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, tmp, idx)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, tmp, idx)	orr	d##.16b, d##.16b, tmp##.16b
#endif

#define	STEP15(a,b,c,d, tmp, idx)	add	c##.4s, c##.4s, d##.4s
#if 1	/* b := (b ^ c) <<< 7 by SHL/SRI through tmp */
#define	STEP16(a,b,c,d, tmp, idx)	eor	tmp##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, tmp, idx)	shl	b##.4s, tmp##.4s, #7
#define	STEP18(a,b,c,d, tmp, idx)	sri	b##.4s, tmp##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, tmp, idx)	/* nothing */
#else	/* b ^= c; b <<<= 7 by SHL/USHR/ORR */
#define	STEP16(a,b,c,d, tmp, idx)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, tmp, idx)	shl	tmp##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, tmp, idx)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, tmp, idx)	orr	b##.16b, b##.16b, tmp##.16b
#endif
130 1.1 riastrad
/*
 * HTOLE32(x), LE32TOH(x)
 *
 *	Convert every 32-bit word of the vector register x, written
 *	with its .16b arrangement (e.g. v0.16b), between host and
 *	little-endian byte order, in place.  On a little-endian host
 *	both expand to nothing; on a big-endian host both byte-reverse
 *	each word with REV32.
 *
 *	NOTE(review): this relies on _BYTE_ORDER, _LITTLE_ENDIAN and
 *	_BIG_ENDIAN being defined by the headers included above.  If
 *	none of them is defined, cpp evaluates the first test as
 *	0 == 0 and silently selects the little-endian definitions --
 *	confirm that <aarch64/asm.h> provides them.
 */
#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
/* Fail here rather than with undefined-macro errors at each use. */
#error Unknown byte order
#endif
138 1.1 riastrad
/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 *
 *	Generate 256 bytes of ChaCha output -- four consecutive 64-byte
 *	blocks with block counters blkno, blkno+1, blkno+2, blkno+3 --
 *	and store them at s.  The four blocks are computed in parallel,
 *	one per 32-bit vector lane: vN holds state word N of all four
 *	blocks.
 *
 *	nr is the number of rounds.  It must be even and nonzero: each
 *	loop iteration does two rounds (one ROUND on the columns, one
 *	on the diagonals) and the loop exits only when the count
 *	reaches exactly zero.
 *
 *	Register use:
 *	  v0-v15	working state
 *	  v16-v25	copy of input words 0-9
 *	  v26		copy of input word 12 (block counters)
 *	  w8-w12	input words 10, 11, 13, 14, 15; these are the
 *			same in every lane, so lane 0 is enough
 *	  v27		rot8 TBL index during the rounds
 *	  v28-v31	temporaries during the rounds
 *
 *	d8-d15 are saved in and restored from the stack frame.
 */
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := const */
	mov	x11, x3			/* x11 := key */
	add	x12, x3, #16		/* x12 := key + 16 bytes */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	/*
	 * The words loaded from the constant, key and nonce byte
	 * strings are little-endian in memory; bring them into host
	 * order.  v12 is built from blkno, which arrives by value in
	 * w1 and is therefore in host order already, so it must not be
	 * swapped.
	 */
	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	/* Save the input state for the final feed-forward addition. */
	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	/* Two rounds per iteration: columns, then diagonals. */
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	/* Rebuild the input words that were parked in w8-w12. */
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	/* Feed-forward: add the input state to the permuted state. */
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/* The output stream is little-endian; convert from host order. */
	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	/*
	 * Lane i of v0-v15 is block i.  Store each block as sixteen
	 * consecutive words, four words per ST4.
	 */
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)
276 1.1 riastrad
/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 *
 *	Generate 256 bytes of ChaCha output -- four consecutive 64-byte
 *	blocks with block counters blkno, blkno+1, blkno+2, blkno+3 --
 *	XOR them with the 256 bytes at p, and store the result at s.
 *	The four blocks are computed in parallel, one per 32-bit vector
 *	lane: vN holds state word N of all four blocks.
 *
 *	nr is the number of rounds.  It must be even and nonzero: each
 *	loop iteration does two rounds (one ROUND on the columns, one
 *	on the diagonals) and the loop exits only when the count
 *	reaches exactly zero.
 *
 *	Register use:
 *	  v0-v15	working state
 *	  v16-v25	copy of input words 0-9
 *	  v26		copy of input word 12 (block counters)
 *	  w8-w12	input words 10, 11, 13, 14, 15; these are the
 *			same in every lane, so lane 0 is enough
 *	  v27		rot8 TBL index during the rounds
 *	  v28-v31	temporaries during the rounds
 *	  v16-v31	after the feed-forward: the data loaded from p,
 *			then the XOR result
 *
 *	d8-d15 are saved in and restored from the stack frame.
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := const */
	mov	x11, x4			/* x11 := key */
	add	x12, x4, #16		/* x12 := key + 16 bytes */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	/*
	 * The words loaded from the constant, key and nonce byte
	 * strings are little-endian in memory; bring them into host
	 * order.  v12 is built from blkno, which arrives by value in
	 * w2 and is therefore in host order already, so it must not be
	 * swapped.
	 */
	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	/* Save the input state for the final feed-forward addition. */
	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	/* Two rounds per iteration: columns, then diagonals. */
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	/* Rebuild the input words that were parked in w8-w12. */
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	/* Feed-forward: add the input state to the permuted state. */
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could do these sixteen LD4-into-lane instructions instead
	 * by four LD1-into-register instructions, but we would need to
	 * permute the elements in v0-v15 to put them in the right
	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings in cost from four LD1 instructions rather
	 * than sixteen LD4 instructions, even if we interleave the LD1
	 * instructions with the ZIPs.
	 *
	 * The saved input state in v16-v26 is no longer needed, so
	 * v16-v31 are reused for the data from p, laid out like
	 * v0-v15: lane i is block i.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	/* The output stream is little-endian; convert from host order. */
	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	/* XOR the ChaCha output into the data from p. */
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	/* Store block i from lane i, four words per ST4. */
	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)
459 1.1 riastrad
460 1.1 riastrad .section .rodata
461 1.1 riastrad .p2align 4
462 1.1 riastrad
463 1.1 riastrad .type v0123,@object
464 1.1 riastrad v0123:
465 1.1 riastrad .long 0, 1, 2, 3
466 1.1 riastrad END(v0123)
467 1.1 riastrad
468 1.1 riastrad /*
469 1.1 riastrad * Must be immediately after v0123 -- we load them in a single
470 1.1 riastrad * ld1 instruction.
471 1.1 riastrad */
472 1.1 riastrad .type rot8,@object
473 1.1 riastrad rot8:
474 1.1 riastrad .long 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
475 1.1 riastrad END(rot8)
476