chacha_neon_32.S revision 1.3 1 1.3 riastrad /* $NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <machine/asm.h>
30 1.1 riastrad
31 1.3 riastrad RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $")
32 1.1 riastrad
33 1.1 riastrad .fpu neon
34 1.1 riastrad
35 1.1 riastrad /*
36 1.1 riastrad * ChaCha round, split up so we can interleave the quarterrounds on
37 1.1 riastrad * independent rows/diagonals to maximize pipeline efficiency, with
38 1.1 riastrad * spills to deal with the scarcity of registers. Reference:
39 1.1 riastrad *
40 1.1 riastrad * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
41 1.1 riastrad * Record of the State of the Art in Stream Ciphers -- SASC 2008.
42 1.1 riastrad * https://cr.yp.to/papers.html#chacha
43 1.1 riastrad *
44 1.1 riastrad * a += b; d ^= a; d <<<= 16;
45 1.1 riastrad * c += d; b ^= c; b <<<= 12;
46 1.1 riastrad * a += b; d ^= a; d <<<= 8;
47 1.1 riastrad * c += d; b ^= c; b <<<= 7;
48 1.1 riastrad *
49 1.1 riastrad * The rotations are implemented with:
50 1.1 riastrad * <<< 16 VREV32.16 for 16,
51 1.1 riastrad * <<< 12 VSHL/VSRI/VORR (shift left, shift right and insert, OR)
52 1.1 riastrad * <<< 8 VTBL (general permutation; rot8 table below, address kept in r7)
53 1.1 riastrad * <<< 7 VSHL/VSRI/VORR
54 1.1 riastrad */
55 1.1 riastrad
/*
 * ROUNDLD: reload the last two "c" rows from the 32-byte spill area
 * at fp.  The tail of ROUND leaves the true values of \c2/\c3 spilled
 * on the stack (the q registers named c2/c3 hold scratch at that
 * point), so ROUNDLD must run before the next ROUND of the rotated
 * row/diagonal assignment at the call sites.
 */
56 1.1 riastrad .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
57 1.3 riastrad vld1.8 {\c2-\c3}, [fp, :256]
58 1.1 riastrad .endm
59 1.1 riastrad
/*
 * ROUND: one full ChaCha round (four vectorized quarterrounds) over
 * four interleaved 4-block states.  a*/b*/c*/d* are q registers
 * holding rows (or diagonals) of the four states.  \c0l is a 64-bit
 * d-register alias (low half of \c0) borrowed to hold the rot8 table;
 * \d0l..\d3h are the d-register halves of \d0..\d3 for vtbl.8.
 * Requires: r7 points at the rot8 table; fp points at a 32-byte,
 * 32-byte-aligned spill area.  On exit \c2/\c3 hold scratch and their
 * true values are spilled at [fp] -- ROUNDLD restores them.
 */
60 1.1 riastrad .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
61 1.1 riastrad /* a += b; d ^= a; d <<<= 16 */
62 1.1 riastrad vadd.u32 \a0, \a0, \b0
63 1.1 riastrad vadd.u32 \a1, \a1, \b1
64 1.1 riastrad vadd.u32 \a2, \a2, \b2
65 1.1 riastrad vadd.u32 \a3, \a3, \b3
66 1.1 riastrad
67 1.1 riastrad veor \d0, \d0, \a0
68 1.1 riastrad veor \d1, \d1, \a1
69 1.1 riastrad veor \d2, \d2, \a2
70 1.1 riastrad veor \d3, \d3, \a3
71 1.1 riastrad
/* VREV32.16 swaps the 16-bit halves of each 32-bit lane = rotate by 16 */
72 1.1 riastrad vrev32.16 \d0, \d0
73 1.1 riastrad vrev32.16 \d1, \d1
74 1.1 riastrad vrev32.16 \d2, \d2
75 1.1 riastrad vrev32.16 \d3, \d3
76 1.1 riastrad
77 1.1 riastrad /* c += d; b ^= c; b <<<= 12 */
78 1.1 riastrad vadd.u32 \c0, \c0, \d0
79 1.1 riastrad vadd.u32 \c1, \c1, \d1
80 1.1 riastrad vadd.u32 \c2, \c2, \d2
81 1.1 riastrad vadd.u32 \c3, \c3, \d3
82 1.1 riastrad
83 1.3 riastrad vst1.8 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */
84 1.1 riastrad
85 1.1 riastrad veor \c0, \b0, \c0
86 1.1 riastrad veor \c1, \b1, \c1
87 1.1 riastrad vshl.u32 \b0, \c0, #12
88 1.1 riastrad vshl.u32 \b1, \c1, #12
89 1.1 riastrad vsri.u32 \b0, \c0, #(32 - 12)
90 1.1 riastrad vsri.u32 \b1, \c1, #(32 - 12)
91 1.1 riastrad
/* same rotation for rows 2/3, still using c0/c1 as scratch */
92 1.1 riastrad veor \c0, \b2, \c2
93 1.1 riastrad veor \c1, \b3, \c3
94 1.1 riastrad vshl.u32 \b2, \c0, #12
95 1.1 riastrad vshl.u32 \b3, \c1, #12
96 1.1 riastrad vsri.u32 \b2, \c0, #(32 - 12)
97 1.1 riastrad vsri.u32 \b3, \c1, #(32 - 12)
98 1.1 riastrad
99 1.1 riastrad vld1.8 {\c0l}, [r7, :64] /* load rot8 table */
100 1.1 riastrad
101 1.1 riastrad /* a += b; d ^= a; d <<<= 8 */
102 1.1 riastrad vadd.u32 \a0, \a0, \b0
103 1.1 riastrad vadd.u32 \a1, \a1, \b1
104 1.1 riastrad vadd.u32 \a2, \a2, \b2
105 1.1 riastrad vadd.u32 \a3, \a3, \b3
106 1.1 riastrad
107 1.1 riastrad veor \d0, \d0, \a0
108 1.1 riastrad veor \d1, \d1, \a1
109 1.1 riastrad veor \d2, \d2, \a2
110 1.1 riastrad veor \d3, \d3, \a3
111 1.1 riastrad
/* vtbl.8 permutes bytes per the rot8 index table, one d-register at a time */
112 1.1 riastrad vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */
113 1.1 riastrad vtbl.8 \d0h, {\d0h}, \c0l
114 1.1 riastrad vtbl.8 \d1l, {\d1l}, \c0l
115 1.1 riastrad vtbl.8 \d1h, {\d1h}, \c0l
116 1.1 riastrad vtbl.8 \d2l, {\d2l}, \c0l
117 1.1 riastrad vtbl.8 \d2h, {\d2h}, \c0l
118 1.1 riastrad vtbl.8 \d3l, {\d3l}, \c0l
119 1.1 riastrad vtbl.8 \d3h, {\d3h}, \c0l
120 1.1 riastrad
121 1.3 riastrad vld1.8 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */
122 1.1 riastrad
123 1.1 riastrad /* c += d; b ^= c; b <<<= 7 */
124 1.1 riastrad vadd.u32 \c2, \c2, \d2
125 1.1 riastrad vadd.u32 \c3, \c3, \d3
126 1.1 riastrad vadd.u32 \c0, \c0, \d0
127 1.1 riastrad vadd.u32 \c1, \c1, \d1
128 1.1 riastrad
129 1.3 riastrad vst1.8 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */
130 1.1 riastrad
/* from here on c2/c3 are scratch; their true values sit at [fp] until ROUNDLD */
131 1.1 riastrad veor \c2, \b2, \c2
132 1.1 riastrad veor \c3, \b3, \c3
133 1.1 riastrad vshl.u32 \b2, \c2, #7
134 1.1 riastrad vshl.u32 \b3, \c3, #7
135 1.1 riastrad vsri.u32 \b2, \c2, #(32 - 7)
136 1.1 riastrad vsri.u32 \b3, \c3, #(32 - 7)
137 1.1 riastrad
138 1.1 riastrad veor \c2, \b0, \c0
139 1.1 riastrad veor \c3, \b1, \c1
140 1.1 riastrad vshl.u32 \b0, \c2, #7
141 1.1 riastrad vshl.u32 \b1, \c3, #7
142 1.1 riastrad vsri.u32 \b0, \c2, #(32 - 7)
143 1.1 riastrad vsri.u32 \b1, \c3, #(32 - 7)
144 1.1 riastrad .endm
145 1.1 riastrad
146 1.1 riastrad .text
147 1.1 riastrad .p2align 2
/*
 * PC-relative offset from .Lconstants_addr to the .rodata constants.
 * Loaded with ldr and combined with adr at run time so the constants
 * can be located position-independently (see function prologues).
 */
148 1.1 riastrad .Lconstants_addr:
149 1.1 riastrad .long .Lconstants - .
150 1.1 riastrad
151 1.1 riastrad /*
152 1.1 riastrad * chacha_stream256_neon(uint8_t s[256]@r0,
153 1.1 riastrad * uint32_t blkno@r1,
154 1.1 riastrad * const uint8_t nonce[12]@r2,
155 1.1 riastrad * const uint8_t key[32]@r3,
156 1.1 riastrad * const uint8_t const[16]@sp[0],
157 1.1 riastrad * unsigned nr@sp[4])
158 1.1 riastrad */
/*
 * Generates four consecutive 64-byte ChaCha blocks (blkno..blkno+3)
 * into s[256].  AAPCS: r0-r3 carry the first four arguments, the rest
 * are on the caller's stack.  Saves/restores r4-r8, r10, fp, lr and
 * d8-d15; fp is repurposed as a pointer to a 32-byte-aligned spill
 * area below sp (zeroed before return so no key material leaks).
 */
159 1.1 riastrad ENTRY(chacha_stream256_neon)
160 1.1 riastrad /* save callee-saves registers */
161 1.1 riastrad push {r4, r5, r6, r7, r8, r10, fp, lr}
162 1.1 riastrad vpush {d8-d15}
163 1.1 riastrad
164 1.1 riastrad /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
165 1.1 riastrad ldr r7, .Lconstants_addr
166 1.1 riastrad adr r6, .Lconstants_addr
167 1.1 riastrad
168 1.1 riastrad /* reserve space for two 128-bit/16-byte q registers */
169 1.1 riastrad sub fp, sp, #0x20
170 1.1 riastrad bic fp, fp, #0x1f /* align */
171 1.1 riastrad
172 1.1 riastrad /* get parameters */
173 1.1 riastrad add ip, sp, #96 /* 96 = 8 GPRs pushed + 64 bytes of d8-d15 */
174 1.1 riastrad add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
175 1.1 riastrad ldm ip, {r4, r5} /* r4 := const, r5 := nr */
176 1.1 riastrad ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
177 1.1 riastrad
178 1.3 riastrad vld1.8 {q12}, [r4] /* q12 := constant */
179 1.3 riastrad vld1.8 {q13-q14}, [r3] /* q13-q14 := key */
180 1.1 riastrad vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
181 1.1 riastrad
/* nonce words were loaded with ldm (word order); byte-swap on BE */
182 1.3 riastrad #ifdef __ARM_BIG_ENDIAN
183 1.3 riastrad rev r6, r6
184 1.3 riastrad rev r8, r8
185 1.3 riastrad rev r10, r10
186 1.3 riastrad #endif
187 1.3 riastrad
188 1.1 riastrad vdup.32 q0, d24[0] /* q0-q3 := constant */
189 1.1 riastrad vdup.32 q1, d24[1]
190 1.1 riastrad vdup.32 q2, d25[0]
191 1.1 riastrad vdup.32 q3, d25[1]
192 1.1 riastrad vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */
193 1.1 riastrad vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
194 1.1 riastrad vdup.32 q5, d26[1]
195 1.1 riastrad vdup.32 q6, d27[0]
196 1.1 riastrad vdup.32 q7, d27[1]
197 1.1 riastrad vdup.32 q8, d28[0]
198 1.1 riastrad vdup.32 q9, d28[1]
199 1.1 riastrad vdup.32 q10, d29[0]
200 1.1 riastrad vdup.32 q11, d29[1]
201 1.1 riastrad vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
202 1.1 riastrad vdup.32 q13, r6 /* q13-q15 := nonce */
203 1.1 riastrad vdup.32 q14, r8
204 1.1 riastrad vdup.32 q15, r10
205 1.1 riastrad
206 1.1 riastrad b 2f
207 1.1 riastrad
/* main loop: two ChaCha rounds (column + diagonal) per iteration */
208 1.1 riastrad _ALIGN_TEXT
209 1.1 riastrad 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
210 1.1 riastrad 2: subs r5, r5, #2
211 1.1 riastrad ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
212 1.1 riastrad d16, d24,d25, d26,d27, d28,d29, d30,d31
213 1.1 riastrad ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
214 1.1 riastrad ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
215 1.1 riastrad d20, d30,d31, d24,d25, d26,d27, d28,d29
216 1.1 riastrad bne 1b
217 1.1 riastrad
218 1.1 riastrad /*
219 1.1 riastrad * q8-q9 are free / saved on the stack. We have:
220 1.1 riastrad *
221 1.1 riastrad * q0 = (x0[0], x1[0]; x2[0], x3[0])
222 1.1 riastrad * q1 = (x0[1], x1[1]; x2[1], x3[1])
223 1.1 riastrad * q2 = (x0[2], x1[2]; x2[2], x3[2])
224 1.1 riastrad * q3 = (x0[3], x1[3]; x2[3], x3[3])
225 1.1 riastrad * ...
226 1.1 riastrad * q15 = (x0[15], x1[15]; x2[15], x3[15])
227 1.1 riastrad *
228 1.1 riastrad * where xi[j] is the jth word of the ith 16-word block. Zip
229 1.1 riastrad * consecutive pairs with vzip.32, and you get:
230 1.1 riastrad *
231 1.1 riastrad * q0 = (x0[0], x0[1]; x1[0], x1[1])
232 1.1 riastrad * q1 = (x2[0], x2[1]; x3[0], x3[1])
233 1.1 riastrad * q2 = (x0[2], x0[3]; x1[2], x1[3])
234 1.1 riastrad * q3 = (x2[2], x2[3]; x3[2], x3[3])
235 1.1 riastrad * ...
236 1.1 riastrad * q15 = (x2[14], x2[15]; x3[14], x3[15])
237 1.1 riastrad *
238 1.1 riastrad * As 64-bit d registers, this is:
239 1.1 riastrad *
240 1.1 riastrad * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1])
241 1.1 riastrad * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1])
242 1.1 riastrad * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3])
243 1.1 riastrad * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3])
244 1.1 riastrad * ...
245 1.1 riastrad * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15])
246 1.1 riastrad *
247 1.1 riastrad * Swap d1<->d4, d3<->d6, ..., and you get:
248 1.1 riastrad *
249 1.1 riastrad * q0 = (x0[0], x0[1]; x0[2], x0[3])
250 1.1 riastrad * q1 = (x2[0], x2[1]; x2[2], x2[3])
251 1.1 riastrad * q2 = (x1[0], x1[1]; x1[2], x1[3])
252 1.1 riastrad * q3 = (x3[0], x3[1]; x3[2], x3[3])
253 1.1 riastrad * ...
254 1.1 riastrad * q15 = (x15[0], x15[1]; x15[2], x15[3])
255 1.1 riastrad */
256 1.1 riastrad
257 1.1 riastrad sub r7, r7, #0x10
258 1.1 riastrad vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */
259 1.1 riastrad vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
260 1.1 riastrad
261 1.1 riastrad vzip.32 q0, q1
262 1.1 riastrad vzip.32 q2, q3
263 1.1 riastrad vzip.32 q4, q5
264 1.1 riastrad vzip.32 q6, q7
265 1.1 riastrad
266 1.1 riastrad vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
267 1.3 riastrad vld1.8 {q9}, [r4] /* q9 := constant */
268 1.1 riastrad vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
269 1.3 riastrad vld1.8 {q8}, [r3]! /* q8 := key[0:16) */
270 1.1 riastrad
271 1.1 riastrad vswp d1, d4
272 1.1 riastrad vswp d9, d12
273 1.1 riastrad vswp d3, d6
274 1.1 riastrad vswp d11, d14
275 1.1 riastrad
276 1.1 riastrad /*
277 1.1 riastrad * At this point, the blocks are:
278 1.1 riastrad *
279 1.1 riastrad * q0 = (x0[0], x0[1]; x0[2], x0[3])
280 1.1 riastrad * q1 = (x2[0], x2[1]; x2[2], x2[3])
281 1.1 riastrad * q2 = (x1[0], x1[1]; x1[2], x1[3])
282 1.1 riastrad * q3 = (x3[0], x3[1]; x3[2], x3[3])
283 1.1 riastrad * q4 = (x0[4], x0[5]; x0[6], x0[7])
284 1.1 riastrad * q5 = (x2[4], x2[5]; x2[6], x2[7])
285 1.1 riastrad * q6 = (x1[4], x1[5]; x1[6], x1[7])
286 1.1 riastrad * q7 = (x3[4], x3[5]; x3[6], x3[7])
287 1.1 riastrad *
288 1.1 riastrad * The first two rows to write out are q0 = x0[0:4) and q4 =
289 1.2 riastrad * x0[4:8). Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
290 1.2 riastrad * enables us to issue all stores in consecutive pairs:
291 1.2 riastrad * x0 in q0-q1
292 1.2 riastrad * x1 in q8-q9
293 1.2 riastrad * x2 in q2-q3
294 1.2 riastrad * x3 in q10-q11
295 1.2 riastrad * x4 in q4-q5
296 1.2 riastrad * x5 in q12-q13
297 1.2 riastrad * x6 in q6-q7
298 1.2 riastrad * x7 in q14-q15
299 1.1 riastrad */
300 1.1 riastrad
301 1.1 riastrad vswp q1, q4
302 1.2 riastrad vswp q3, q6
303 1.1 riastrad
/* add back the constant (q9) and key[0:16) (q8) per the feedforward */
304 1.1 riastrad vadd.u32 q0, q0, q9
305 1.1 riastrad vadd.u32 q4, q4, q9
306 1.1 riastrad vadd.u32 q2, q2, q9
307 1.2 riastrad vadd.u32 q6, q6, q9
308 1.1 riastrad
309 1.1 riastrad vadd.u32 q1, q1, q8
310 1.1 riastrad vadd.u32 q5, q5, q8
311 1.2 riastrad vadd.u32 q3, q3, q8
312 1.1 riastrad vadd.u32 q7, q7, q8
313 1.1 riastrad
314 1.3 riastrad vld1.8 {q8-q9}, [fp, :256] /* restore q8-q9 */
315 1.1 riastrad
316 1.3 riastrad vst1.8 {q0-q1}, [r0]! /* store s[0:32) */
317 1.3 riastrad vld1.8 {q0}, [r3] /* q0 := key[16:32) */
318 1.1 riastrad mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
319 1.1 riastrad vmov d2, r3, r6
320 1.1 riastrad vmov d3, r8, r10
321 1.1 riastrad
/* transpose the second halves (words 8-15) the same way as q0-q7 */
322 1.1 riastrad vzip.32 q8, q9
323 1.1 riastrad vzip.32 q10, q11
324 1.1 riastrad vzip.32 q12, q13
325 1.1 riastrad vzip.32 q14, q15
326 1.1 riastrad
327 1.1 riastrad vswp d17, d20
328 1.1 riastrad vswp d25, d28
329 1.1 riastrad vswp d19, d22
330 1.1 riastrad vswp d27, d30
331 1.1 riastrad
332 1.2 riastrad vswp q9, q12
333 1.2 riastrad vswp q11, q14
334 1.2 riastrad
/* add back key[16:32) (q0) and (counter, nonce) row (q1) */
335 1.1 riastrad vadd.u32 q8, q8, q0
336 1.2 riastrad vadd.u32 q12, q12, q0
337 1.1 riastrad vadd.u32 q10, q10, q0
338 1.2 riastrad vadd.u32 q14, q14, q0
339 1.1 riastrad
340 1.2 riastrad vadd.u32 q9, q9, q1
341 1.1 riastrad vadd.u32 q13, q13, q1
342 1.2 riastrad vadd.u32 q11, q11, q1
343 1.1 riastrad vadd.u32 q15, q15, q1
344 1.1 riastrad
345 1.3 riastrad /* vst1.8 {q0-q1}, [r0]! */
346 1.3 riastrad vst1.8 {q8-q9}, [r0]! /* store s[32:64) */
347 1.3 riastrad vst1.8 {q2-q3}, [r0]! /* store s[64:96) */
348 1.3 riastrad vst1.8 {q10-q11}, [r0]! /* store s[96:128) */
349 1.3 riastrad vst1.8 {q4-q5}, [r0]! /* store s[128:160) */
350 1.3 riastrad vst1.8 {q12-q13}, [r0]! /* store s[160:192) */
351 1.3 riastrad vst1.8 {q6-q7}, [r0]! /* store s[192:224) */
352 1.3 riastrad vst1.8 {q14-q15}, [r0] /* store s[224:256) */
353 1.2 riastrad
354 1.2 riastrad /* zero temporary space on the stack */
355 1.1 riastrad vmov.i32 q0, #0
356 1.1 riastrad vmov.i32 q1, #0
357 1.1 riastrad vst1.8 {q0-q1}, [fp, :256]
358 1.1 riastrad
359 1.1 riastrad /* restore callee-saves registers and stack */
360 1.1 riastrad vpop {d8-d15}
361 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, fp, lr}
362 1.1 riastrad bx lr
363 1.1 riastrad END(chacha_stream256_neon)
364 1.1 riastrad
365 1.1 riastrad /*
366 1.1 riastrad * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
367 1.1 riastrad * uint32_t blkno@r2,
368 1.1 riastrad * const uint8_t nonce[12]@r3,
369 1.1 riastrad * const uint8_t key[32]@sp[0],
370 1.1 riastrad * const uint8_t const[16]@sp[4],
371 1.1 riastrad * unsigned nr@sp[8])
372 1.1 riastrad */
/*
 * Same keystream generation as chacha_stream256_neon, but XORs the
 * 256 keystream bytes into plaintext p[256]@r1 and writes the result
 * to s[256]@r0.  Loads of p and stores of s are interleaved with the
 * feedforward/transpose to hide memory latency.  Same ABI and
 * callee-save discipline as above; the spill area at fp is zeroed
 * before return.
 */
373 1.1 riastrad ENTRY(chacha_stream_xor256_neon)
374 1.1 riastrad /* save callee-saves registers */
375 1.1 riastrad push {r4, r5, r6, r7, r8, r10, fp, lr}
376 1.1 riastrad vpush {d8-d15}
377 1.1 riastrad
378 1.1 riastrad /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
379 1.1 riastrad ldr r7, .Lconstants_addr
380 1.1 riastrad adr r6, .Lconstants_addr
381 1.1 riastrad
382 1.1 riastrad /* reserve space for two 128-bit/16-byte q registers */
383 1.1 riastrad sub fp, sp, #0x20
384 1.1 riastrad bic fp, fp, #0x1f /* align */
385 1.1 riastrad
386 1.1 riastrad /* get parameters */
387 1.1 riastrad add ip, sp, #96 /* 96 = 8 GPRs pushed + 64 bytes of d8-d15 */
388 1.1 riastrad add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
389 1.1 riastrad ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */
390 1.1 riastrad ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
391 1.1 riastrad
392 1.3 riastrad vld1.8 {q12}, [r5] /* q12 := constant */
393 1.3 riastrad vld1.8 {q13-q14}, [r4] /* q13-q14 := key */
394 1.1 riastrad vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
395 1.1 riastrad
/* nonce words were loaded with ldm (word order); byte-swap on BE */
396 1.3 riastrad #ifdef __ARM_BIG_ENDIAN
397 1.3 riastrad rev r6, r6
398 1.3 riastrad rev r8, r8
399 1.3 riastrad rev r10, r10
400 1.3 riastrad #endif
401 1.3 riastrad
402 1.1 riastrad vdup.32 q0, d24[0] /* q0-q3 := constant */
403 1.1 riastrad vdup.32 q1, d24[1]
404 1.1 riastrad vdup.32 q2, d25[0]
405 1.1 riastrad vdup.32 q3, d25[1]
406 1.1 riastrad vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */
407 1.1 riastrad vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
408 1.1 riastrad vdup.32 q5, d26[1]
409 1.1 riastrad vdup.32 q6, d27[0]
410 1.1 riastrad vdup.32 q7, d27[1]
411 1.1 riastrad vdup.32 q8, d28[0]
412 1.1 riastrad vdup.32 q9, d28[1]
413 1.1 riastrad vdup.32 q10, d29[0]
414 1.1 riastrad vdup.32 q11, d29[1]
415 1.1 riastrad vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
416 1.1 riastrad vdup.32 q13, r6 /* q13-q15 := nonce */
417 1.1 riastrad vdup.32 q14, r8
418 1.1 riastrad vdup.32 q15, r10
419 1.1 riastrad
420 1.1 riastrad b 2f
421 1.1 riastrad
/* main loop: two ChaCha rounds (column + diagonal) per iteration */
422 1.1 riastrad _ALIGN_TEXT
423 1.1 riastrad 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
424 1.1 riastrad 2: subs ip, ip, #2
425 1.1 riastrad ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
426 1.1 riastrad d16, d24,d25, d26,d27, d28,d29, d30,d31
427 1.1 riastrad ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
428 1.1 riastrad ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
429 1.1 riastrad d20, d30,d31, d24,d25, d26,d27, d28,d29
430 1.1 riastrad bne 1b
431 1.1 riastrad
432 1.1 riastrad /*
433 1.1 riastrad * q8-q9 are free / saved on the stack. Now for the real fun:
434 1.1 riastrad * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
435 1.1 riastrad * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are
436 1.1 riastrad * transposed from one another, and the x[i] are in general
437 1.2 riastrad * registers and memory. See comments in chacha_stream256_neon
438 1.2 riastrad * for the layout with swaps.
439 1.1 riastrad */
440 1.1 riastrad
441 1.1 riastrad sub r7, r7, #0x10
442 1.1 riastrad vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */
443 1.1 riastrad vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
444 1.1 riastrad
445 1.1 riastrad vzip.32 q0, q1
446 1.1 riastrad vzip.32 q2, q3
447 1.1 riastrad vzip.32 q4, q5
448 1.1 riastrad vzip.32 q6, q7
449 1.1 riastrad
450 1.1 riastrad vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
451 1.3 riastrad vld1.8 {q9}, [r5] /* q9 := constant */
452 1.1 riastrad vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
453 1.3 riastrad vld1.8 {q8}, [r4]! /* q8 := key[0:16) */
454 1.1 riastrad
455 1.2 riastrad vswp d3, d6
456 1.2 riastrad vswp d9, d12
457 1.1 riastrad vswp d1, d4
458 1.1 riastrad vswp d11, d14
459 1.1 riastrad
460 1.1 riastrad vswp q1, q4
461 1.2 riastrad vswp q3, q6
462 1.1 riastrad
/* feedforward: add back the constant (q9) and key[0:16) (q8) */
463 1.1 riastrad vadd.u32 q0, q0, q9
464 1.1 riastrad vadd.u32 q4, q4, q9
465 1.1 riastrad vadd.u32 q2, q2, q9
466 1.2 riastrad vadd.u32 q6, q6, q9
467 1.1 riastrad
468 1.1 riastrad vadd.u32 q1, q1, q8
469 1.1 riastrad vadd.u32 q5, q5, q8
470 1.2 riastrad vadd.u32 q3, q3, q8
471 1.1 riastrad vadd.u32 q7, q7, q8
472 1.1 riastrad
473 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */
474 1.1 riastrad
475 1.1 riastrad veor q0, q0, q8 /* compute ciphertext bytes [0:32) */
476 1.1 riastrad veor q1, q1, q9
477 1.1 riastrad
478 1.3 riastrad vld1.8 {q8-q9}, [fp, :256] /* restore q8-q9 */
479 1.1 riastrad
480 1.3 riastrad vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */
481 1.3 riastrad vld1.8 {q0}, [r4] /* q0 := key[16:32) */
482 1.1 riastrad mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
483 1.1 riastrad vmov d2, r3, r6
484 1.1 riastrad vmov d3, r8, r10
485 1.1 riastrad
/* transpose the second halves (words 8-15) the same way as q0-q7 */
486 1.1 riastrad vzip.32 q8, q9
487 1.1 riastrad vzip.32 q10, q11
488 1.1 riastrad vzip.32 q12, q13
489 1.1 riastrad vzip.32 q14, q15
490 1.1 riastrad
491 1.2 riastrad vswp d19, d22
492 1.2 riastrad vswp d25, d28
493 1.1 riastrad vswp d17, d20
494 1.1 riastrad vswp d27, d30
495 1.1 riastrad
496 1.1 riastrad vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */
497 1.2 riastrad vswp q11, q14
498 1.1 riastrad
/* feedforward: add back key[16:32) (q0) and (counter, nonce) row (q1) */
499 1.1 riastrad vadd.u32 q8, q8, q0
500 1.1 riastrad vadd.u32 q12, q12, q0
501 1.1 riastrad vadd.u32 q10, q10, q0
502 1.2 riastrad vadd.u32 q14, q14, q0
503 1.1 riastrad
504 1.1 riastrad vadd.u32 q9, q9, q1
505 1.1 riastrad vadd.u32 q13, q13, q1
506 1.2 riastrad vadd.u32 q11, q11, q1
507 1.1 riastrad vadd.u32 q15, q15, q1
508 1.1 riastrad
509 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */
510 1.1 riastrad
511 1.1 riastrad veor q0, q0, q8 /* compute ciphertext bytes [32:64) */
512 1.1 riastrad veor q1, q1, q9
513 1.1 riastrad
514 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */
515 1.3 riastrad vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */
516 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */
517 1.1 riastrad
518 1.1 riastrad veor q2, q2, q8 /* compute ciphertext bytes [64:96) */
519 1.2 riastrad veor q3, q3, q9
520 1.1 riastrad
521 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */
522 1.3 riastrad vst1.8 {q2-q3}, [r0]! /* store ciphertext bytes [64:96) */
523 1.1 riastrad
524 1.1 riastrad veor q10, q10, q0 /* compute ciphertext bytes [96:128) */
525 1.2 riastrad veor q11, q11, q1
526 1.1 riastrad
527 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */
528 1.3 riastrad vst1.8 {q10-q11}, [r0]! /* store ciphertext bytes [96:128) */
529 1.1 riastrad
530 1.1 riastrad veor q4, q4, q8 /* compute ciphertext bytes [128:160) */
531 1.1 riastrad veor q5, q5, q9
532 1.1 riastrad
533 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */
534 1.3 riastrad vst1.8 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */
535 1.1 riastrad
536 1.1 riastrad veor q12, q12, q0 /* compute ciphertext bytes [160:192) */
537 1.1 riastrad veor q13, q13, q1
538 1.1 riastrad
539 1.3 riastrad vld1.8 {q0-q1}, [r1] /* load plaintext bytes [224:256) */
540 1.3 riastrad vst1.8 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */
541 1.1 riastrad
542 1.2 riastrad veor q6, q6, q8 /* compute ciphertext bytes [192:224) */
543 1.2 riastrad veor q7, q7, q9
544 1.1 riastrad
545 1.3 riastrad vst1.8 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */
546 1.1 riastrad
547 1.2 riastrad veor q14, q14, q0 /* compute ciphertext bytes [224:256) */
548 1.2 riastrad veor q15, q15, q1
549 1.1 riastrad
550 1.3 riastrad vst1.8 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */
551 1.1 riastrad
552 1.1 riastrad /* zero temporary space on the stack */
553 1.1 riastrad vmov.i32 q0, #0
554 1.1 riastrad vmov.i32 q1, #0
555 1.1 riastrad vst1.8 {q0-q1}, [fp, :256]
556 1.1 riastrad
557 1.1 riastrad /* restore callee-saves registers and stack */
558 1.1 riastrad vpop {d8-d15}
559 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, fp, lr}
560 1.1 riastrad bx lr
561 1.1 riastrad END(chacha_stream_xor256_neon)
562 1.1 riastrad
563 1.1 riastrad .section .rodata
564 1.1 riastrad .p2align 4
565 1.1 riastrad .Lconstants:
566 1.1 riastrad
/* v0123: lane increments (0,1,2,3) added to blkno to number the four blocks */
567 1.1 riastrad .type v0123,%object
568 1.1 riastrad v0123:
569 1.1 riastrad .long 0, 1, 2, 3
570 1.1 riastrad END(v0123)
571 1.1 riastrad
/*
 * rot8: vtbl.8 index table implementing <<< 8 within each 32-bit
 * lane: result byte i takes source byte (i + 3) mod 4 of its 4-byte
 * group (indices 3,0,1,2 and 7,4,5,6 for the two groups of a
 * d register).
 */
572 1.1 riastrad .type rot8,%object
573 1.1 riastrad rot8:
574 1.3 riastrad .byte 3,0,1,2, 7,4,5,6
575 1.1 riastrad END(rot8)
576