/*	$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
	STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
	/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r); \
	f(a1,b1,c1,d1, t1, r); \
	f(a2,b2,c2,d2, t2, r); \
	f(a3,b3,c3,d3, t3, r); \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented as follows:
 *	<<< 16	REV32 Vn.8h (swap the 16-bit halves of each 32-bit word)
 *	<<< 12	SHL into a temporary, then SRI (shift right and insert)
 *	<<< 8	TBL (general byte permutation; the rot8 table below is in r)
 *	<<< 7	SHL into a temporary, then SRI
 */
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif
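
/*
 * For reference only, a minimal scalar C sketch of one ChaCha double
 * round over the 16-word state x[0..15] (assumes <stdint.h>; not part
 * of this file).  The two ROUND invocations in the loops below are the
 * column pass and the diagonal pass of exactly this double round, run
 * on four blocks at once:
 *
 *	static uint32_t
 *	rotl32(uint32_t v, unsigned c)
 *	{
 *		return v << c | v >> (32 - c);
 *	}
 *
 *	static void
 *	chacha_doubleround(uint32_t x[16])
 *	{
 *		static const uint8_t i4[8][4] = {
 *			{ 0, 4,  8, 12 }, { 1, 5,  9, 13 },	// columns
 *			{ 2, 6, 10, 14 }, { 3, 7, 11, 15 },
 *			{ 0, 5, 10, 15 }, { 1, 6, 11, 12 },	// diagonals
 *			{ 2, 7,  8, 13 }, { 3, 4,  9, 14 },
 *		};
 *		unsigned i;
 *
 *		for (i = 0; i < 8; i++) {
 *			uint32_t *a = &x[i4[i][0]], *b = &x[i4[i][1]];
 *			uint32_t *c = &x[i4[i][2]], *d = &x[i4[i][3]];
 *
 *			*a += *b; *d ^= *a; *d = rotl32(*d, 16);
 *			*c += *d; *b ^= *c; *b = rotl32(*b, 12);
 *			*a += *b; *d ^= *a; *d = rotl32(*d, 8);
 *			*c += *d; *b ^= *c; *b = rotl32(*b, 7);
 *		}
 *	}
 */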

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *	uint32_t blkno@w1,
 *	const uint8_t nonce[12]@x2,
 *	const uint8_t key[32]@x3,
 *	const uint8_t const[16]@x4,
 *	unsigned nr@w5)
 */
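/*
 * Strategy: compute four consecutive 64-byte ChaCha blocks in
 * parallel, one per 32-bit lane.  After the ld4r/ld3r loads below,
 * register vN holds word N of the state for all four blocks (v12
 * holds the four block numbers blkno+0..3), so each vector
 * instruction in the round loop advances all four blocks at once.
 */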
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := const */
	mov	x11, x3			/* x11 := key */
	add	x12, x3, #16		/* x12 := key + 16 */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]
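	/*
	 * All 32 vector registers are now spoken for: v0-v15 are the
	 * working state, v16-v26 save input words 0-9 and 12 for the
	 * final addition, v27 is the rot8 table, and v28-v31 are the
	 * round temporaries.  Park the remaining input words (10, 11,
	 * 13, 14, 15) in w8-w12 and broadcast them back after the
	 * loop.  Each loop iteration below does one column round and
	 * one diagonal round, i.e. a ChaCha double round, so the round
	 * count nr must be even.
	 */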

	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

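	/*
	 * Add the original input back in (the ChaCha feedforward):
	 * v16-v25 still hold words 0-9, v26 the initial per-lane block
	 * counters (word 12), and v27-v31 have just been re-broadcast
	 * from w8-w12 (words 10, 11, 13, 14, 15).
	 */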
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

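	/*
	 * Store the keystream.  Lane i of v0-v15 is block i, so each
	 * ST4-into-lane writes 16 contiguous bytes of one block and
	 * the sixteen stores below emit the four 64-byte blocks in
	 * order.
	 */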
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *	uint32_t blkno@w2,
 *	const uint8_t nonce[12]@x3,
 *	const uint8_t key[32]@x4,
 *	const uint8_t const[16]@x5,
 *	unsigned nr@w6)
 */
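/*
 * Same structure as chacha_stream256_neon above, except that the 256
 * bytes at p are loaded into v16-v31, XORed with the keystream, and
 * the result is stored at s.
 */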
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := const */
	mov	x11, x4			/* x11 := key */
	add	x12, x4, #16		/* x12 := key + 16 */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but then we would
	 * need to permute the elements of v0-v15 into the right order.
	 * That can be done with a series of ZIP1/ZIP2 on 4s-sized
	 * elements followed by ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings from issuing four LD1 instructions rather
	 * than sixteen LD4 instructions, even if the LD1s are
	 * interleaved with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
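	/*
	 * The LD4-into-lane loads above leave the input in the same
	 * word-sliced layout as the keystream: lane i of v16-v31 holds
	 * words 0-15 of input block i, matching words 0-15 of
	 * keystream block i in v0-v15, so one EOR per register pairs
	 * them up.
	 */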

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

/*
 * Must be immediately after v0123 -- we load them in a single
 * ld1 instruction.
 */
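/*
 * Byte-index table for the TBL-based <<< 8: within each 4-byte lane,
 * every source byte moves up one position (with wraparound), which
 * rotates each little-endian 32-bit word left by 8 bits.
 */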
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)