1 1.4 riastrad /* $NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <machine/asm.h>
30 1.1 riastrad
31 1.4 riastrad RCSID("$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $")
32 1.1 riastrad
33 1.1 riastrad .fpu neon
34 1.1 riastrad
35 1.1 riastrad /*
36 1.1 riastrad * ChaCha round, split up so we can interleave the quarterrounds on
37 1.1 riastrad * independent rows/diagonals to maximize pipeline efficiency, with
38 1.1 riastrad * spills to deal with the scarcity of registers. Reference:
39 1.1 riastrad *
40 1.1 riastrad * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
41 1.1 riastrad * Record of the State of the Art in Stream Ciphers -- SASC 2008.
42 1.1 riastrad * https://cr.yp.to/papers.html#chacha
43 1.1 riastrad *
44 1.1 riastrad * a += b; d ^= a; d <<<= 16;
45 1.1 riastrad * c += d; b ^= c; b <<<= 12;
46 1.1 riastrad * a += b; d ^= a; d <<<= 8;
47 1.1 riastrad * c += d; b ^= c; b <<<= 7;
48 1.1 riastrad *
49 1.1 riastrad * The rotations are implemented with:
50 1.1 riastrad * <<< 16 VREV32.16 for 16,
51 1.1 riastrad * <<< 12 VSHL/VSRI/VORR (shift left, shift right and insert, OR)
52 1.1 riastrad * <<< 8 TBL (general permutation; rot8 below stored in r)
53 1.1 riastrad * <<< 7 VSHL/VSRI/VORR
54 1.1 riastrad */
55 1.1 riastrad
/*
 * ROUNDLD -- reload \c2 and \c3 from the 32-byte aligned stack slot
 * where the preceding ROUND spilled them, so the next ROUND can use
 * all four c rows again.
 */
56 1.1 riastrad .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
57 1.4 riastrad vld1.8 {\c2-\c3}, [sp, :256]
58 1.1 riastrad .endm
59 1.1 riastrad
/*
 * ROUND -- one ChaCha round (the a/b/c/d quarterround steps from the
 * comment above) over four interleaved 64-byte states held in q
 * registers.
 *
 * a0-a3, b0-b3, c0-c3, d0-d3 are q registers holding the four rows.
 * c0l is the low d half of c0, used as a temporary to hold the rot8
 * vtbl index table; d0l/d0h .. d3l/d3h are the d-register halves of
 * d0..d3, needed because vtbl.8 operates on d registers.
 *
 * Expects r7 to point at the 8-byte rot8 table in .rodata, and sp to
 * point at a 32-byte-aligned 32-byte spill slot.  c0/c1 are spilled
 * and restored within the macro; c2/c3 are left spilled on the stack
 * on exit -- the caller must issue ROUNDLD before they are next used.
 */
60 1.1 riastrad .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
61 1.1 riastrad /* a += b; d ^= a; d <<<= 16 */
62 1.1 riastrad vadd.u32 \a0, \a0, \b0
63 1.1 riastrad vadd.u32 \a1, \a1, \b1
64 1.1 riastrad vadd.u32 \a2, \a2, \b2
65 1.1 riastrad vadd.u32 \a3, \a3, \b3
66 1.1 riastrad
67 1.1 riastrad veor \d0, \d0, \a0
68 1.1 riastrad veor \d1, \d1, \a1
69 1.1 riastrad veor \d2, \d2, \a2
70 1.1 riastrad veor \d3, \d3, \a3
71 1.1 riastrad
72 1.1 riastrad vrev32.16 \d0, \d0
73 1.1 riastrad vrev32.16 \d1, \d1
74 1.1 riastrad vrev32.16 \d2, \d2
75 1.1 riastrad vrev32.16 \d3, \d3
76 1.1 riastrad
77 1.1 riastrad /* c += d; b ^= c; b <<<= 12 */
78 1.1 riastrad vadd.u32 \c0, \c0, \d0
79 1.1 riastrad vadd.u32 \c1, \c1, \d1
80 1.1 riastrad vadd.u32 \c2, \c2, \d2
81 1.1 riastrad vadd.u32 \c3, \c3, \d3
82 1.1 riastrad
83 1.4 riastrad vst1.8 {\c0-\c1}, [sp, :256] /* free c0 and c1 as temps */
84 1.1 riastrad
85 1.1 riastrad veor \c0, \b0, \c0
86 1.1 riastrad veor \c1, \b1, \c1
87 1.1 riastrad vshl.u32 \b0, \c0, #12
88 1.1 riastrad vshl.u32 \b1, \c1, #12
89 1.1 riastrad vsri.u32 \b0, \c0, #(32 - 12)
90 1.1 riastrad vsri.u32 \b1, \c1, #(32 - 12)
91 1.1 riastrad
92 1.1 riastrad veor \c0, \b2, \c2
93 1.1 riastrad veor \c1, \b3, \c3
94 1.1 riastrad vshl.u32 \b2, \c0, #12
95 1.1 riastrad vshl.u32 \b3, \c1, #12
96 1.1 riastrad vsri.u32 \b2, \c0, #(32 - 12)
97 1.1 riastrad vsri.u32 \b3, \c1, #(32 - 12)
98 1.1 riastrad
99 1.1 riastrad vld1.8 {\c0l}, [r7, :64] /* load rot8 table */
100 1.1 riastrad
101 1.1 riastrad /* a += b; d ^= a; d <<<= 8 */
102 1.1 riastrad vadd.u32 \a0, \a0, \b0
103 1.1 riastrad vadd.u32 \a1, \a1, \b1
104 1.1 riastrad vadd.u32 \a2, \a2, \b2
105 1.1 riastrad vadd.u32 \a3, \a3, \b3
106 1.1 riastrad
107 1.1 riastrad veor \d0, \d0, \a0
108 1.1 riastrad veor \d1, \d1, \a1
109 1.1 riastrad veor \d2, \d2, \a2
110 1.1 riastrad veor \d3, \d3, \a3
111 1.1 riastrad
112 1.1 riastrad vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */
113 1.1 riastrad vtbl.8 \d0h, {\d0h}, \c0l
114 1.1 riastrad vtbl.8 \d1l, {\d1l}, \c0l
115 1.1 riastrad vtbl.8 \d1h, {\d1h}, \c0l
116 1.1 riastrad vtbl.8 \d2l, {\d2l}, \c0l
117 1.1 riastrad vtbl.8 \d2h, {\d2h}, \c0l
118 1.1 riastrad vtbl.8 \d3l, {\d3l}, \c0l
119 1.1 riastrad vtbl.8 \d3h, {\d3h}, \c0l
120 1.1 riastrad
121 1.4 riastrad vld1.8 {\c0-\c1}, [sp, :256] /* restore c0 and c1 */
122 1.1 riastrad
123 1.1 riastrad /* c += d; b ^= c; b <<<= 7 */
124 1.1 riastrad vadd.u32 \c2, \c2, \d2
125 1.1 riastrad vadd.u32 \c3, \c3, \d3
126 1.1 riastrad vadd.u32 \c0, \c0, \d0
127 1.1 riastrad vadd.u32 \c1, \c1, \d1
128 1.1 riastrad
129 1.4 riastrad vst1.8 {\c2-\c3}, [sp, :256] /* free c2 and c3 as temps */
130 1.1 riastrad
131 1.1 riastrad veor \c2, \b2, \c2
132 1.1 riastrad veor \c3, \b3, \c3
133 1.1 riastrad vshl.u32 \b2, \c2, #7
134 1.1 riastrad vshl.u32 \b3, \c3, #7
135 1.1 riastrad vsri.u32 \b2, \c2, #(32 - 7)
136 1.1 riastrad vsri.u32 \b3, \c3, #(32 - 7)
137 1.1 riastrad
138 1.1 riastrad veor \c2, \b0, \c0
139 1.1 riastrad veor \c3, \b1, \c1
140 1.1 riastrad vshl.u32 \b0, \c2, #7
141 1.1 riastrad vshl.u32 \b1, \c3, #7
142 1.1 riastrad vsri.u32 \b0, \c2, #(32 - 7)
143 1.1 riastrad vsri.u32 \b1, \c3, #(32 - 7)
144 1.1 riastrad .endm
145 1.1 riastrad
/*
 * PC-relative offset to the constants in .rodata.  Code below loads
 * this word and adds the address of .Lconstants_addr itself (via adr)
 * to locate .Lconstants position-independently.
 */
146 1.1 riastrad .text
147 1.1 riastrad .p2align 2
148 1.1 riastrad .Lconstants_addr:
149 1.1 riastrad .long .Lconstants - .
150 1.1 riastrad
151 1.1 riastrad /*
152 1.1 riastrad * chacha_stream256_neon(uint8_t s[256]@r0,
153 1.1 riastrad * uint32_t blkno@r1,
154 1.1 riastrad * const uint8_t nonce[12]@r2,
155 1.1 riastrad * const uint8_t key[32]@r3,
156 1.1 riastrad * const uint8_t const[16]@sp[0],
157 1.1 riastrad * unsigned nr@sp[4])
158 1.1 riastrad */
159 1.1 riastrad ENTRY(chacha_stream256_neon)
160 1.1 riastrad /* save callee-saves registers */
161 1.1 riastrad push {r4, r5, r6, r7, r8, r10, fp, lr}
162 1.1 riastrad vpush {d8-d15}
163 1.4 riastrad mov fp, sp /* fp := frame base, for sp-args and epilogue */
164 1.1 riastrad
165 1.1 riastrad /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
166 1.1 riastrad ldr r7, .Lconstants_addr
167 1.1 riastrad adr r6, .Lconstants_addr
168 1.1 riastrad
169 1.1 riastrad /* reserve space for two 128-bit/16-byte q registers */
170 1.4 riastrad sub sp, sp, #0x20
171 1.4 riastrad bic sp, sp, #0x1f /* align */
172 1.1 riastrad
173 1.1 riastrad /* get parameters */
174 1.4 riastrad add ip, fp, #96 /* sp-args follow 8 GPRs + d8-d15 = 96 bytes */
175 1.1 riastrad add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
176 1.1 riastrad ldm ip, {r4, r5} /* r4 := const, r5 := nr */
177 1.1 riastrad ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
178 1.1 riastrad
179 1.3 riastrad vld1.8 {q12}, [r4] /* q12 := constant */
180 1.3 riastrad vld1.8 {q13-q14}, [r3] /* q13-q14 := key */
181 1.1 riastrad vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
182 1.1 riastrad
183 1.3 riastrad #ifdef __ARM_BIG_ENDIAN
184 1.3 riastrad rev r6, r6
185 1.3 riastrad rev r8, r8
186 1.3 riastrad rev r10, r10
187 1.3 riastrad #endif
188 1.3 riastrad
189 1.1 riastrad vdup.32 q0, d24[0] /* q0-q3 := constant */
190 1.1 riastrad vdup.32 q1, d24[1]
191 1.1 riastrad vdup.32 q2, d25[0]
192 1.1 riastrad vdup.32 q3, d25[1]
193 1.1 riastrad vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */
194 1.1 riastrad vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
195 1.1 riastrad vdup.32 q5, d26[1]
196 1.1 riastrad vdup.32 q6, d27[0]
197 1.1 riastrad vdup.32 q7, d27[1]
198 1.1 riastrad vdup.32 q8, d28[0]
199 1.1 riastrad vdup.32 q9, d28[1]
200 1.1 riastrad vdup.32 q10, d29[0]
201 1.1 riastrad vdup.32 q11, d29[1]
202 1.1 riastrad vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
203 1.1 riastrad vdup.32 q13, r6 /* q13-q15 := nonce */
204 1.1 riastrad vdup.32 q14, r8
205 1.1 riastrad vdup.32 q15, r10
206 1.1 riastrad
207 1.1 riastrad b 2f
208 1.1 riastrad
209 1.1 riastrad _ALIGN_TEXT
210 1.1 riastrad 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
211 1.1 riastrad 2: subs r5, r5, #2 /* two rounds (one column, one diagonal) per iteration */
212 1.1 riastrad ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
213 1.1 riastrad d16, d24,d25, d26,d27, d28,d29, d30,d31
214 1.1 riastrad ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
215 1.1 riastrad ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
216 1.1 riastrad d20, d30,d31, d24,d25, d26,d27, d28,d29
217 1.1 riastrad bne 1b
218 1.1 riastrad
219 1.1 riastrad /*
220 1.1 riastrad * q8-q9 are free / saved on the stack. We have:
221 1.1 riastrad *
222 1.1 riastrad * q0 = (x0[0], x1[0]; x2[0], x3[0])
223 1.1 riastrad * q1 = (x0[1], x1[1]; x2[1], x3[1])
224 1.1 riastrad * q2 = (x0[2], x1[2]; x2[2], x3[2])
225 1.1 riastrad * q3 = (x0[3], x1[3]; x2[3], x3[3])
226 1.1 riastrad * ...
227 1.1 riastrad * q15 = (x0[15], x1[15]; x2[15], x3[15])
228 1.1 riastrad *
229 1.1 riastrad * where xi[j] is the jth word of the ith 16-word block. Zip
230 1.1 riastrad * consecutive pairs with vzip.32, and you get:
231 1.1 riastrad *
232 1.1 riastrad * q0 = (x0[0], x0[1]; x1[0], x1[1])
233 1.1 riastrad * q1 = (x2[0], x2[1]; x3[0], x3[1])
234 1.1 riastrad * q2 = (x0[2], x0[3]; x1[2], x1[3])
235 1.1 riastrad * q3 = (x2[2], x2[3]; x3[2], x3[3])
236 1.1 riastrad * ...
237 1.1 riastrad * q15 = (x2[14], x2[15]; x3[14], x3[15])
238 1.1 riastrad *
239 1.1 riastrad * As 64-bit d registers, this is:
240 1.1 riastrad *
241 1.1 riastrad * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1])
242 1.1 riastrad * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1])
243 1.1 riastrad * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3])
244 1.1 riastrad * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3])
245 1.1 riastrad * ...
246 1.1 riastrad * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15])
247 1.1 riastrad *
248 1.1 riastrad * Swap d1<->d4, d3<->d6, ..., and you get:
249 1.1 riastrad *
250 1.1 riastrad * q0 = (x0[0], x0[1]; x0[2], x0[3])
251 1.1 riastrad * q1 = (x2[0], x2[1]; x2[2], x2[3])
252 1.1 riastrad * q2 = (x1[0], x1[1]; x1[2], x1[3])
253 1.1 riastrad * q3 = (x3[0], x3[1]; x3[2], x3[3])
254 1.1 riastrad * ...
255 1.1 riastrad * q15 = (x15[0], x15[1]; x15[2], x15[3])
256 1.1 riastrad */
257 1.1 riastrad
258 1.1 riastrad sub r7, r7, #0x10 /* rewind r7 to v0123 (vld1 above post-incremented past it) */
259 1.1 riastrad vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */
260 1.1 riastrad vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
261 1.1 riastrad
262 1.1 riastrad vzip.32 q0, q1
263 1.1 riastrad vzip.32 q2, q3
264 1.1 riastrad vzip.32 q4, q5
265 1.1 riastrad vzip.32 q6, q7
266 1.1 riastrad
267 1.1 riastrad vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
268 1.3 riastrad vld1.8 {q9}, [r4] /* q9 := constant */
269 1.1 riastrad vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
270 1.3 riastrad vld1.8 {q8}, [r3]! /* q8 := key[0:16) */
271 1.1 riastrad
272 1.1 riastrad vswp d1, d4
273 1.1 riastrad vswp d9, d12
274 1.1 riastrad vswp d3, d6
275 1.1 riastrad vswp d11, d14
276 1.1 riastrad
277 1.1 riastrad /*
278 1.1 riastrad * At this point, the blocks are:
279 1.1 riastrad *
280 1.1 riastrad * q0 = (x0[0], x0[1]; x0[2], x0[3])
281 1.1 riastrad * q1 = (x2[0], x2[1]; x2[2], x2[3])
282 1.1 riastrad * q2 = (x1[0], x1[1]; x1[2], x1[3])
283 1.1 riastrad * q3 = (x3[0], x3[1]; x3[2], x3[3])
284 1.1 riastrad * q4 = (x0[4], x0[5]; x0[6], x0[7])
285 1.1 riastrad * q5 = (x2[4], x2[5]; x2[6], x2[7])
286 1.1 riastrad * q6 = (x1[4], x1[5]; x1[6], x1[7])
287 1.1 riastrad * q7 = (x3[4], x3[5]; x3[6], x3[7])
288 1.1 riastrad *
289 1.1 riastrad * The first two rows to write out are q0 = x0[0:4) and q4 =
290 1.2 riastrad * x0[4:8). Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
291 1.2 riastrad * enables us to issue all stores in consecutive pairs:
292 1.2 riastrad * x0 in q0-q1
293 1.2 riastrad * x1 in q8-q9
294 1.2 riastrad * x2 in q2-q3
295 1.2 riastrad * x3 in q10-q11
296 1.2 riastrad * x4 in q4-q5
297 1.2 riastrad * x5 in q12-q13
298 1.2 riastrad * x6 in q6-q7
299 1.2 riastrad * x7 in q14-q15
300 1.1 riastrad */
301 1.1 riastrad
302 1.1 riastrad vswp q1, q4
303 1.2 riastrad vswp q3, q6
304 1.1 riastrad
305 1.1 riastrad vadd.u32 q0, q0, q9 /* add constant back in to words [0:4) of each block */
306 1.1 riastrad vadd.u32 q4, q4, q9
307 1.1 riastrad vadd.u32 q2, q2, q9
308 1.2 riastrad vadd.u32 q6, q6, q9
309 1.1 riastrad
310 1.1 riastrad vadd.u32 q1, q1, q8 /* add key[0:16) back in to words [4:8) */
311 1.1 riastrad vadd.u32 q5, q5, q8
312 1.2 riastrad vadd.u32 q3, q3, q8
313 1.1 riastrad vadd.u32 q7, q7, q8
314 1.1 riastrad
315 1.4 riastrad vld1.8 {q8-q9}, [sp, :256] /* restore q8-q9 */
316 1.1 riastrad
317 1.3 riastrad vst1.8 {q0-q1}, [r0]!
318 1.3 riastrad vld1.8 {q0}, [r3] /* q0 := key[16:32) */
319 1.1 riastrad mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
320 1.1 riastrad vmov d2, r3, r6
321 1.1 riastrad vmov d3, r8, r10
322 1.1 riastrad
323 1.1 riastrad vzip.32 q8, q9
324 1.1 riastrad vzip.32 q10, q11
325 1.1 riastrad vzip.32 q12, q13
326 1.1 riastrad vzip.32 q14, q15
327 1.1 riastrad
328 1.1 riastrad vswp d17, d20
329 1.1 riastrad vswp d25, d28
330 1.1 riastrad vswp d19, d22
331 1.1 riastrad vswp d27, d30
332 1.1 riastrad
333 1.2 riastrad vswp q9, q12
334 1.2 riastrad vswp q11, q14
335 1.2 riastrad
336 1.1 riastrad vadd.u32 q8, q8, q0 /* add key[16:32) back in to words [8:12) */
337 1.2 riastrad vadd.u32 q12, q12, q0
338 1.1 riastrad vadd.u32 q10, q10, q0
339 1.2 riastrad vadd.u32 q14, q14, q0
340 1.1 riastrad
341 1.2 riastrad vadd.u32 q9, q9, q1 /* add (0,nonce) back in to words [12:16); blkno was added above */
342 1.1 riastrad vadd.u32 q13, q13, q1
343 1.2 riastrad vadd.u32 q11, q11, q1
344 1.1 riastrad vadd.u32 q15, q15, q1
345 1.1 riastrad
346 1.3 riastrad /* vst1.8 {q0-q1}, [r0]! */
347 1.3 riastrad vst1.8 {q8-q9}, [r0]!
348 1.3 riastrad vst1.8 {q2-q3}, [r0]!
349 1.3 riastrad vst1.8 {q10-q11}, [r0]!
350 1.3 riastrad vst1.8 {q4-q5}, [r0]!
351 1.3 riastrad vst1.8 {q12-q13}, [r0]!
352 1.3 riastrad vst1.8 {q6-q7}, [r0]!
353 1.3 riastrad vst1.8 {q14-q15}, [r0]
354 1.2 riastrad
355 1.2 riastrad /* zero temporary space on the stack */
356 1.1 riastrad vmov.i32 q0, #0
357 1.1 riastrad vmov.i32 q1, #0
358 1.4 riastrad vst1.8 {q0-q1}, [sp, :256]
359 1.1 riastrad
360 1.1 riastrad /* restore callee-saves registers and stack */
361 1.4 riastrad mov sp, fp
362 1.1 riastrad vpop {d8-d15}
363 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, fp, lr}
364 1.1 riastrad bx lr
365 1.1 riastrad END(chacha_stream256_neon)
366 1.1 riastrad
367 1.1 riastrad /*
368 1.1 riastrad * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
369 1.1 riastrad * uint32_t blkno@r2,
370 1.1 riastrad * const uint8_t nonce[12]@r3,
371 1.1 riastrad * const uint8_t key[32]@sp[0],
372 1.1 riastrad * const uint8_t const[16]@sp[4],
373 1.1 riastrad * unsigned nr@sp[8])
374 1.1 riastrad */
375 1.1 riastrad ENTRY(chacha_stream_xor256_neon)
376 1.1 riastrad /* save callee-saves registers */
377 1.1 riastrad push {r4, r5, r6, r7, r8, r10, fp, lr}
378 1.1 riastrad vpush {d8-d15}
379 1.4 riastrad mov fp, sp /* fp := frame base, for sp-args and epilogue */
380 1.1 riastrad
381 1.1 riastrad /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
382 1.1 riastrad ldr r7, .Lconstants_addr
383 1.1 riastrad adr r6, .Lconstants_addr
384 1.1 riastrad
385 1.1 riastrad /* reserve space for two 128-bit/16-byte q registers */
386 1.4 riastrad sub sp, sp, #0x20
387 1.4 riastrad bic sp, sp, #0x1f /* align */
388 1.1 riastrad
389 1.1 riastrad /* get parameters */
390 1.4 riastrad add ip, fp, #96 /* sp-args follow 8 GPRs + d8-d15 = 96 bytes */
391 1.1 riastrad add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
392 1.1 riastrad ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */
393 1.1 riastrad ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
394 1.1 riastrad
395 1.3 riastrad vld1.8 {q12}, [r5] /* q12 := constant */
396 1.3 riastrad vld1.8 {q13-q14}, [r4] /* q13-q14 := key */
397 1.1 riastrad vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
398 1.1 riastrad
399 1.3 riastrad #ifdef __ARM_BIG_ENDIAN
400 1.3 riastrad rev r6, r6
401 1.3 riastrad rev r8, r8
402 1.3 riastrad rev r10, r10
403 1.3 riastrad #endif
404 1.3 riastrad
405 1.1 riastrad vdup.32 q0, d24[0] /* q0-q3 := constant */
406 1.1 riastrad vdup.32 q1, d24[1]
407 1.1 riastrad vdup.32 q2, d25[0]
408 1.1 riastrad vdup.32 q3, d25[1]
409 1.1 riastrad vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */
410 1.1 riastrad vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
411 1.1 riastrad vdup.32 q5, d26[1]
412 1.1 riastrad vdup.32 q6, d27[0]
413 1.1 riastrad vdup.32 q7, d27[1]
414 1.1 riastrad vdup.32 q8, d28[0]
415 1.1 riastrad vdup.32 q9, d28[1]
416 1.1 riastrad vdup.32 q10, d29[0]
417 1.1 riastrad vdup.32 q11, d29[1]
418 1.1 riastrad vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
419 1.1 riastrad vdup.32 q13, r6 /* q13-q15 := nonce */
420 1.1 riastrad vdup.32 q14, r8
421 1.1 riastrad vdup.32 q15, r10
422 1.1 riastrad
423 1.1 riastrad b 2f
424 1.1 riastrad
425 1.1 riastrad _ALIGN_TEXT
426 1.1 riastrad 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
427 1.1 riastrad 2: subs ip, ip, #2 /* two rounds (one column, one diagonal) per iteration */
428 1.1 riastrad ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
429 1.1 riastrad d16, d24,d25, d26,d27, d28,d29, d30,d31
430 1.1 riastrad ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
431 1.1 riastrad ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
432 1.1 riastrad d20, d30,d31, d24,d25, d26,d27, d28,d29
433 1.1 riastrad bne 1b
434 1.1 riastrad
435 1.1 riastrad /*
436 1.1 riastrad * q8-q9 are free / saved on the stack. Now for the real fun:
437 1.1 riastrad * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
438 1.1 riastrad * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are
439 1.1 riastrad * transposed from one another, and the x[i] are in general
440 1.2 riastrad * registers and memory. See comments in chacha_stream256_neon
441 1.2 riastrad * for the layout with swaps.
442 1.1 riastrad */
443 1.1 riastrad
444 1.1 riastrad sub r7, r7, #0x10 /* rewind r7 to v0123 (vld1 above post-incremented past it) */
445 1.1 riastrad vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */
446 1.1 riastrad vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
447 1.1 riastrad
448 1.1 riastrad vzip.32 q0, q1
449 1.1 riastrad vzip.32 q2, q3
450 1.1 riastrad vzip.32 q4, q5
451 1.1 riastrad vzip.32 q6, q7
452 1.1 riastrad
453 1.1 riastrad vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
454 1.3 riastrad vld1.8 {q9}, [r5] /* q9 := constant */
455 1.1 riastrad vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
456 1.3 riastrad vld1.8 {q8}, [r4]! /* q8 := key[0:16) */
457 1.1 riastrad
458 1.2 riastrad vswp d3, d6
459 1.2 riastrad vswp d9, d12
460 1.1 riastrad vswp d1, d4
461 1.1 riastrad vswp d11, d14
462 1.1 riastrad
463 1.1 riastrad vswp q1, q4
464 1.2 riastrad vswp q3, q6
465 1.1 riastrad
466 1.1 riastrad vadd.u32 q0, q0, q9 /* add constant back in to words [0:4) of each block */
467 1.1 riastrad vadd.u32 q4, q4, q9
468 1.1 riastrad vadd.u32 q2, q2, q9
469 1.2 riastrad vadd.u32 q6, q6, q9
470 1.1 riastrad
471 1.1 riastrad vadd.u32 q1, q1, q8 /* add key[0:16) back in to words [4:8) */
472 1.1 riastrad vadd.u32 q5, q5, q8
473 1.2 riastrad vadd.u32 q3, q3, q8
474 1.1 riastrad vadd.u32 q7, q7, q8
475 1.1 riastrad
476 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */
477 1.1 riastrad
478 1.1 riastrad veor q0, q0, q8 /* compute ciphertext bytes [0:32) */
479 1.1 riastrad veor q1, q1, q9
480 1.1 riastrad
481 1.4 riastrad vld1.8 {q8-q9}, [sp, :256] /* restore q8-q9 */
482 1.1 riastrad
483 1.3 riastrad vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */
484 1.3 riastrad vld1.8 {q0}, [r4] /* q0 := key[16:32) */
485 1.1 riastrad mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
486 1.1 riastrad vmov d2, r3, r6
487 1.1 riastrad vmov d3, r8, r10
488 1.1 riastrad
489 1.1 riastrad vzip.32 q8, q9
490 1.1 riastrad vzip.32 q10, q11
491 1.1 riastrad vzip.32 q12, q13
492 1.1 riastrad vzip.32 q14, q15
493 1.1 riastrad
494 1.2 riastrad vswp d19, d22
495 1.2 riastrad vswp d25, d28
496 1.1 riastrad vswp d17, d20
497 1.1 riastrad vswp d27, d30
498 1.1 riastrad
499 1.1 riastrad vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */
500 1.2 riastrad vswp q11, q14
501 1.1 riastrad
502 1.1 riastrad vadd.u32 q8, q8, q0 /* add key[16:32) back in to words [8:12) */
503 1.1 riastrad vadd.u32 q12, q12, q0
504 1.1 riastrad vadd.u32 q10, q10, q0
505 1.2 riastrad vadd.u32 q14, q14, q0
506 1.1 riastrad
507 1.1 riastrad vadd.u32 q9, q9, q1 /* add (0,nonce) back in to words [12:16); blkno was added above */
508 1.1 riastrad vadd.u32 q13, q13, q1
509 1.2 riastrad vadd.u32 q11, q11, q1
510 1.1 riastrad vadd.u32 q15, q15, q1
511 1.1 riastrad
512 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */
513 1.1 riastrad
514 1.1 riastrad veor q0, q0, q8 /* compute ciphertext bytes [32:64) */
515 1.1 riastrad veor q1, q1, q9
516 1.1 riastrad
517 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */
518 1.3 riastrad vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */
519 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */
520 1.1 riastrad
521 1.1 riastrad veor q2, q2, q8 /* compute ciphertext bytes [64:96) */
522 1.2 riastrad veor q3, q3, q9
523 1.1 riastrad
524 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */
525 1.3 riastrad vst1.8 {q2-q3}, [r0]! /* store ciphertext bytes [64:96) */
526 1.1 riastrad
527 1.1 riastrad veor q10, q10, q0 /* compute ciphertext bytes [96:128) */
528 1.2 riastrad veor q11, q11, q1
529 1.1 riastrad
530 1.3 riastrad vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */
531 1.3 riastrad vst1.8 {q10-q11}, [r0]! /* store ciphertext bytes [96:128) */
532 1.1 riastrad
533 1.1 riastrad veor q4, q4, q8 /* compute ciphertext bytes [128:160) */
534 1.1 riastrad veor q5, q5, q9
535 1.1 riastrad
536 1.3 riastrad vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */
537 1.3 riastrad vst1.8 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */
538 1.1 riastrad
539 1.1 riastrad veor q12, q12, q0 /* compute ciphertext bytes [160:192) */
540 1.1 riastrad veor q13, q13, q1
541 1.1 riastrad
542 1.3 riastrad vld1.8 {q0-q1}, [r1] /* load plaintext bytes [224:256) */
543 1.3 riastrad vst1.8 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */
544 1.1 riastrad
545 1.2 riastrad veor q6, q6, q8 /* compute ciphertext bytes [192:224) */
546 1.2 riastrad veor q7, q7, q9
547 1.1 riastrad
548 1.3 riastrad vst1.8 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */
549 1.1 riastrad
550 1.2 riastrad veor q14, q14, q0 /* compute ciphertext bytes [224:256) */
551 1.2 riastrad veor q15, q15, q1
552 1.1 riastrad
553 1.3 riastrad vst1.8 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */
554 1.1 riastrad
555 1.1 riastrad /* zero temporary space on the stack */
556 1.1 riastrad vmov.i32 q0, #0
557 1.1 riastrad vmov.i32 q1, #0
558 1.4 riastrad vst1.8 {q0-q1}, [sp, :256]
559 1.1 riastrad
560 1.1 riastrad /* restore callee-saves registers and stack */
561 1.4 riastrad mov sp, fp
562 1.1 riastrad vpop {d8-d15}
563 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, fp, lr}
564 1.1 riastrad bx lr
565 1.1 riastrad END(chacha_stream_xor256_neon)
566 1.1 riastrad
567 1.1 riastrad .section .rodata
568 1.1 riastrad .p2align 4
569 1.1 riastrad .Lconstants:
570 1.1 riastrad
571 1.1 riastrad .type v0123,%object
572 1.1 riastrad v0123:
/* Per-lane block-number increments, added to the replicated blkno. */
573 1.1 riastrad .long 0, 1, 2, 3
574 1.1 riastrad END(v0123)
575 1.1 riastrad
/* vtbl.8 index table: rotates each 32-bit lane left by 8 bits. */
576 1.1 riastrad .type rot8,%object
577 1.1 riastrad rot8:
578 1.3 riastrad .byte 3,0,1,2, 7,4,5,6
579 1.1 riastrad END(rot8)
580