/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

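/*
 * adrl: materialize the full PC-relative address of addr in reg, as an
 * ADRP of the containing page plus an ADD of the low 12 bits (the usual
 * long-range address sequence on AArch64).
 */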
.macro	adrl	reg, addr
	adrp	\reg, \addr
	add	\reg, \reg, #:lo12:\addr
.endm

#define	_ALIGN_TEXT	\
	.p2align 4

#define	ENTRY(x)	\
	.text;		\
	_ALIGN_TEXT;	\
	.global	x;	\
	.type	x,@function; \
x:

#define	END(x)		\
	.size x, . - x

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
	STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r); \
	f(a1,b1,c1,d1, t1, r); \
	f(a2,b2,c2,d2, t2, r); \
	f(a3,b3,c3,d3, t3, r); \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented as follows:
 *	<<< 16	REV32 on 16-bit halfwords (swaps the two halves of each word)
 *	<<< 12	SHL/SRI (shift left; shift right and insert)
 *	<<< 8	TBL (general byte permutation; the rot8 table below, passed in r)
 *	<<< 7	SHL/SRI
 */
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif
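
/*
 * For illustration, with the register assignment used for the first
 * column of the column ROUND below (a=v0, b=v4, c=v8, d=v12, t=v28,
 * r=v27), one quarterround expands to the following sequence, ignoring
 * the interleaving with the other three columns:
 *
 *	add	v0.4s, v0.4s, v4.4s		// a += b
 *	eor	v12.16b, v12.16b, v0.16b	// d ^= a
 *	rev32	v12.8h, v12.8h			// d <<<= 16
 *	add	v8.4s, v8.4s, v12.4s		// c += d
 *	eor	v28.16b, v4.16b, v8.16b		// t := b ^ c
 *	shl	v4.4s, v28.4s, #12		// b := t <<< 12 ...
 *	sri	v4.4s, v28.4s, #(32 - 12)	// ... via shift/insert
 *	add	v0.4s, v0.4s, v4.4s		// a += b
 *	eor	v12.16b, v12.16b, v0.16b	// d ^= a
 *	tbl	v12.16b, {v12.16b}, v27.16b	// d <<<= 8 (byte permute)
 *	add	v8.4s, v8.4s, v12.4s		// c += d
 *	eor	v28.16b, v4.16b, v8.16b		// t := b ^ c
 *	shl	v4.4s, v28.4s, #7		// b := t <<< 7 ...
 *	sri	v4.4s, v28.4s, #(32 - 7)	// ... via shift/insert
 */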

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *	   uint32_t blkno@w1,
 *	   const uint8_t nonce[12]@x2,
 *	   const uint8_t key[32]@x3,
 *	   const uint8_t const[16]@x4,
 *	   unsigned nr@w5)
 */
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := c */
	mov	x11, x3			/* x11 := k */
	add	x12, x3, #16		/* x12 := k + 16 */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
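
	/*
	 * The state is kept transposed: each of v0-v15 holds one of the
	 * sixteen 32-bit ChaCha state words, with lane i belonging to
	 * block blkno+i.  The ld4r/ld3r loads above broadcast each
	 * input word across all four lanes, and adding (0,1,2,3) to v12
	 * gives each lane its own block number, so four 64-byte blocks
	 * are computed in parallel.
	 */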

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

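	/*
	 * Re-broadcast the initial values of the state words that were
	 * stashed in w8-w12: their vector copies would have needed
	 * v27-v31, which held the rot8 table and the scratch registers
	 * during the rounds.
	 */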
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

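	/*
	 * Store the keystream: for each lane i (block i), the four
	 * ST4-into-lane instructions write state words 0-3, 4-7, 8-11,
	 * and 12-15 as four consecutive 16-byte groups, so each 64-byte
	 * block lands contiguously in s.
	 */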
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *	   uint32_t blkno@w2,
 *	   const uint8_t nonce[12]@x3,
 *	   const uint8_t key[32]@x4,
 *	   const uint8_t const[16]@x5,
 *	   unsigned nr@w6)
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := c */
	mov	x11, x4			/* x11 := k */
	add	x12, x4, #16		/* x12 := k + 16 */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but then we would
	 * first need to permute the elements in v0-v15 into the right
	 * order.  That can be done with a series of ZIP1/ZIP2 on
	 * 4s-sized elements followed by ZIP1/ZIP2 on 2d-sized elements,
	 * but the net cost of those thirty-two ZIP1/ZIP2 instructions
	 * seems to exceed the savings from issuing four LD1 instructions
	 * rather than sixteen LD4 instructions, even if we interleave
	 * the LD1 instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
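
	/*
	 * The LD4-into-lane loads above give the plaintext the same
	 * transposed layout as the keystream in v0-v15 (v16 pairs with
	 * v0, v17 with v1, and so on, lane for lane), so the EOR below
	 * combines matching words directly.
	 */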

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

/*
 * Must be immediately after v0123 -- we load them in a single
 * ld1 instruction.
 */
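/*
 * rot8 is the TBL index table for a left-rotation by 8 of each 32-bit
 * little-endian word: within each 4-byte lane, output byte j is taken
 * from input byte (j + 3) mod 4.
 */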
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)