/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
28 1.1 riastrad
#include <machine/asm.h>

RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")

	.fpu	neon

/*
 * One ChaCha round, split into pieces so the quarterrounds on the
 * four independent rows/diagonals can be interleaved for pipeline
 * efficiency, with spills to cope with the shortage of q registers.
 * Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented as follows:
 *	<<< 16	VREV32.16 (swap 16-bit halves of each 32-bit lane)
 *	<<< 12	VSHL/VSRI (shift left; shift right and insert)
 *	<<< 8	VTBL (byte permutation; rot8 table below, loaded via r7)
 *	<<< 7	VSHL/VSRI
 */
55 1.1 riastrad
/*
 * ROUNDLD -- reload the pair of c registers that the preceding ROUND
 * spilled to the 32-byte aligned scratch area at fp.  Split out from
 * ROUND so the load can be scheduled across the loop back-edge.
 */
.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
	vld1.32	{\c2-\c3}, [fp, :256]
.endm
59 1.1 riastrad
/*
 * ROUND -- four parallel ChaCha quarterrounds on the rows or
 * diagonals (a0,b0,c0,d0)..(a3,b3,c3,d3), operating on four
 * interleaved 16-word states at once.  c0l is a 64-bit scratch
 * d register (low half of c0) used to hold the rot8 table; d0l/d0h..
 * d3l/d3h are the d-register halves of d0..d3 for VTBL.  Uses the
 * scratch area at fp to spill first c0/c1, then c2/c3; the caller
 * (or a following ROUNDLD) must reload c2/c3.  Clobbers r7-relative
 * nothing; reads the rot8 table through r7.
 */
.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
	/* a += b; d ^= a; d <<<= 16 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vrev32.16	\d0, \d0	/* <<< 16 */
	vrev32.16	\d1, \d1
	vrev32.16	\d2, \d2
	vrev32.16	\d3, \d3

	/* c += d; b ^= c; b <<<= 12 */
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3

	vst1.32	{\c0-\c1}, [fp, :256]	/* spill c0/c1 to use as temps */

	veor	\c0, \b0, \c0
	veor	\c1, \b1, \c1
	vshl.u32	\b0, \c0, #12
	vshl.u32	\b1, \c1, #12
	vsri.u32	\b0, \c0, #(32 - 12)
	vsri.u32	\b1, \c1, #(32 - 12)

	veor	\c0, \b2, \c2
	veor	\c1, \b3, \c3
	vshl.u32	\b2, \c0, #12
	vshl.u32	\b3, \c1, #12
	vsri.u32	\b2, \c0, #(32 - 12)
	vsri.u32	\b3, \c1, #(32 - 12)

	vld1.8	{\c0l}, [r7, :64]	/* load rot8 table */

	/* a += b; d ^= a; d <<<= 8 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vtbl.8	\d0l, {\d0l}, \c0l	/* <<< 8, one 64-bit half at a time */
	vtbl.8	\d0h, {\d0h}, \c0l
	vtbl.8	\d1l, {\d1l}, \c0l
	vtbl.8	\d1h, {\d1h}, \c0l
	vtbl.8	\d2l, {\d2l}, \c0l
	vtbl.8	\d2h, {\d2h}, \c0l
	vtbl.8	\d3l, {\d3l}, \c0l
	vtbl.8	\d3h, {\d3h}, \c0l

	vld1.32	{\c0-\c1}, [fp, :256]	/* restore c0/c1 */

	/* c += d; b ^= c; b <<<= 7 */
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1

	vst1.32	{\c2-\c3}, [fp, :256]	/* spill c2/c3 to use as temps */

	veor	\c2, \b2, \c2
	veor	\c3, \b3, \c3
	vshl.u32	\b2, \c2, #7
	vshl.u32	\b3, \c3, #7
	vsri.u32	\b2, \c2, #(32 - 7)
	vsri.u32	\b3, \c3, #(32 - 7)

	veor	\c2, \b0, \c0
	veor	\c3, \b1, \c1
	vshl.u32	\b0, \c2, #7
	vshl.u32	\b1, \c3, #7
	vsri.u32	\b0, \c2, #(32 - 7)
	vsri.u32	\b1, \c3, #(32 - 7)
.endm
145 1.1 riastrad
/* Byte-order fixups: no-ops on little-endian, lane byte-swaps on big. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	vrev32.8	x, x
#define	LE32TOH(x)	vrev32.8	x, x
#endif

	.text
	.p2align 2
.Lconstants_addr:
	/* PC-relative offset to .Lconstants, for PIC-safe address formation */
	.long	.Lconstants - .
158 1.1 riastrad
/*
 * chacha_stream256_neon(uint8_t s[256]@r0,
 *     uint32_t blkno@r1,
 *     const uint8_t nonce[12]@r2,
 *     const uint8_t key[32]@r3,
 *     const uint8_t const[16]@sp[0],
 *     unsigned nr@sp[4])
 *
 *	Generate four consecutive 64-byte ChaCha blocks (counter
 *	blkno..blkno+3) into s, computed four-way parallel with the
 *	states transposed across q0-q15.
 */
ENTRY(chacha_stream256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := 32-byte-aligned scratch for two q-register spills */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters (96 = 8 GPRs + 8 d-registers saved above) */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r4]	/* q12 := constant */
	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3); r7 -> rot8 */

	/* broadcast each state word across a whole q register */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r1	/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32	q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6	/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	/* main loop: one column round + one diagonal round per iteration */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	r5, r5, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  We have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again */
	vdup.32	q8, r1	/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r4]	/* q9 := constant */
	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the feed-forward: constant into row 0 of each block */
	vadd.u32	q0, q0, q9
	vadd.u32	q4, q4, q9
	vadd.u32	q2, q2, q9
	vadd.u32	q3, q3, q9

	/* key[0:16) into row 1 of each block */
	vadd.u32	q1, q1, q8
	vadd.u32	q5, q5, q8
	vadd.u32	q6, q6, q8
	vadd.u32	q7, q7, q8

	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q3)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q6)
	LE32TOH(q7)

	vst1.32	{q0-q1}, [r0]!
	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transpose dance for the bottom half of the states */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	/* key[16:32) into row 2 of each block */
	vadd.u32	q8, q8, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	/* (counter, nonce) into row 3 of each block */
	vadd.u32	q12, q12, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q15, q15, q1

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q11)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q14)
	LE32TOH(q15)

	/* prepare to zero temporary space on stack */
	vmov.i32	q0, #0
	vmov.i32	q1, #0

	/* write the blocks out in memory order (q0-q1 already stored) */
	/* vst1.32	{q0}, [r0]! */
	/* vst1.32	{q1}, [r0]! */	/* (was q4 before vswp) */
	vst1.32	{q8}, [r0]!
	vst1.32	{q12}, [r0]!
	vst1.32	{q2}, [r0]!
	vst1.32	{q6}, [r0]!
	vst1.32	{q10}, [r0]!
	vst1.32	{q14}, [r0]!
	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
	vst1.32	{q5}, [r0]!
	vst1.32	{q9}, [r0]!
	vst1.32	{q13}, [r0]!
	vst1.32	{q3}, [r0]!
	vst1.32	{q7}, [r0]!
	vst1.32	{q11}, [r0]!
	vst1.32	{q15}, [r0]

	/* zero temporary space on the stack (don't leak key material) */
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream256_neon)
400 1.1 riastrad
/*
 * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
 *     uint32_t blkno@r2,
 *     const uint8_t nonce[12]@r3,
 *     const uint8_t key[32]@sp[0],
 *     const uint8_t const[16]@sp[4],
 *     unsigned nr@sp[8])
 *
 *	Same four-block keystream generation as chacha_stream256_neon,
 *	but XOR it into the 256 plaintext bytes at p, writing the
 *	ciphertext to s.
 */
ENTRY(chacha_stream_xor256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := 32-byte-aligned scratch for two q-register spills */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters (96 = 8 GPRs + 8 d-registers saved above) */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r5]	/* q12 := constant */
	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3); r7 -> rot8 */

	/* broadcast each state word across a whole q register */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r2	/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32	q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6	/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	/* main loop: one column round + one diagonal round per iteration */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	ip, ip, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  Now for the real fun:
	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
	 * transposed from one another, and the x[i] are in general
	 * registers and memory.  So we have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again */
	vdup.32	q8, r2	/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r5]	/* q9 := constant */
	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the feed-forward: constant into row 0 of each block */
	vadd.u32	q0, q0, q9
	vadd.u32	q4, q4, q9
	vadd.u32	q2, q2, q9
	vadd.u32	q3, q3, q9

	/* key[0:16) into row 1 of each block */
	vadd.u32	q1, q1, q8
	vadd.u32	q5, q5, q8
	vadd.u32	q6, q6, q8
	vadd.u32	q7, q7, q8

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q6)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q3)
	LE32TOH(q7)

	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */

	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transpose dance for the bottom half of the states */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vswp	q9, q12	/* free up q9 earlier for consecutive q8-q9 */

	/* key[16:32) into row 2 of each block */
	vadd.u32	q8, q8, q0
	vadd.u32	q12, q12, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	/* (counter, nonce) into row 3 of each block */
	vadd.u32	q9, q9, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q15, q15, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q14)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q11)
	LE32TOH(q15)

	/* interleave plaintext loads, XORs, and ciphertext stores */
	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
	veor	q6, q6, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
	vst1.32	{q2}, [r0]!	/* store ciphertext bytes [64:80) */

	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
	veor	q14, q14, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
	vst1.32	{q6}, [r0]!	/* store ciphertext bytes [80:96) */

	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
	veor	q5, q5, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
	vst1.32	{q10}, [r0]!	/* store ciphertext bytes [96:112) */

	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
	veor	q13, q13, q1

	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
	vst1.32	{q14}, [r0]!	/* store ciphertext bytes [112:128) */

	veor	q8, q3, q8	/* compute ciphertext bytes [192:224) */
	veor	q9, q7, q9

	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */

	veor	q0, q11, q0	/* compute ciphertext bytes [224:256) */
	veor	q1, q15, q1

	vst1.32	{q8-q9}, [r0]!	/* store ciphertext bytes [192:224) */
	vst1.32	{q0-q1}, [r0]	/* store ciphertext bytes [224:256) */

	/* zero temporary space on the stack (don't leak key material) */
	vmov.i32	q0, #0
	vmov.i32	q1, #0
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream_xor256_neon)
679 1.1 riastrad
680 1.1 riastrad .section .rodata
681 1.1 riastrad .p2align 4
682 1.1 riastrad .Lconstants:
683 1.1 riastrad
684 1.1 riastrad .type v0123,%object
685 1.1 riastrad v0123:
686 1.1 riastrad .long 0, 1, 2, 3
687 1.1 riastrad END(v0123)
688 1.1 riastrad
689 1.1 riastrad .type rot8,%object
690 1.1 riastrad rot8:
691 1.1 riastrad .long 0x02010003, 0x06050407
692 1.1 riastrad END(rot8)
693