chacha_neon.c revision 1.4 1 1.4 riastrad /* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <sys/types.h>
30 1.1 riastrad #include <sys/endian.h>
31 1.1 riastrad
32 1.1 riastrad #include "arm_neon.h"
33 1.1 riastrad #include "chacha_neon.h"
34 1.1 riastrad
35 1.1 riastrad static inline uint32x4_t
36 1.1 riastrad vrolq_n_u32(uint32x4_t x, uint8_t n)
37 1.1 riastrad {
38 1.1 riastrad
39 1.1 riastrad return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
40 1.1 riastrad }
41 1.1 riastrad
42 1.1 riastrad static inline uint32x4_t
43 1.1 riastrad vhtole_u32(uint32x4_t x)
44 1.1 riastrad {
45 1.1 riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN
46 1.1 riastrad return x;
47 1.1 riastrad #elif _BYTE_ORDER == _BIG_ENDIAN
48 1.1 riastrad return vrev32q_u8(x);
49 1.1 riastrad #endif
50 1.1 riastrad }
51 1.1 riastrad
52 1.1 riastrad static inline uint32x4_t
53 1.1 riastrad vletoh_u32(uint32x4_t x)
54 1.1 riastrad {
55 1.1 riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN
56 1.1 riastrad return x;
57 1.1 riastrad #elif _BYTE_ORDER == _BIG_ENDIAN
58 1.1 riastrad return vrev32q_u8(x);
59 1.1 riastrad #endif
60 1.1 riastrad }
61 1.1 riastrad
62 1.4 riastrad static inline uint32x4_t
64 1.4 riastrad rol16(uint32x4_t x)
65 1.4 riastrad {
66 1.4 riastrad uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
67 1.4 riastrad
68 1.4 riastrad y16 = vrev32q_u16(x16);
69 1.4 riastrad
70 1.4 riastrad return vreinterpretq_u32_u16(y16);
71 1.4 riastrad }
72 1.4 riastrad
73 1.4 riastrad static inline uint32x4_t
74 1.4 riastrad rol12(uint32x4_t x)
75 1.4 riastrad {
76 1.4 riastrad
77 1.4 riastrad return vrolq_n_u32(x, 12);
78 1.4 riastrad }
79 1.4 riastrad
80 1.4 riastrad static inline uint32x4_t
81 1.4 riastrad rol8(uint32x4_t x)
82 1.4 riastrad {
83 1.4 riastrad #if defined(__aarch64__)
84 1.4 riastrad static const uint8x16_t rol8_tab = {
85 1.4 riastrad 3, 0, 1, 2, 7, 4, 5, 6,
86 1.4 riastrad 11, 8, 9,10, 15,12,13,14,
87 1.4 riastrad };
88 1.4 riastrad uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
89 1.4 riastrad
90 1.4 riastrad y8 = vqtbl1q_u8(x8, rol8_tab);
91 1.4 riastrad
92 1.4 riastrad return vreinterpretq_u32_u8(y8);
93 1.4 riastrad #elif 0
94 1.4 riastrad /*
95 1.4 riastrad * GCC does a lousy job with this, spilling two 64-bit vector
96 1.4 riastrad * registers to the stack every time. There should be plenty
97 1.4 riastrad * of vector registers free, requiring no spills at all, and
98 1.4 riastrad * GCC should be able to hoist the load of rol8_tab out of any
99 1.4 riastrad * loops, but it doesn't and so attempting to use VTBL hurts
100 1.4 riastrad * more than it helps.
101 1.4 riastrad */
102 1.4 riastrad static const uint8x8_t rol8_tab = {
103 1.4 riastrad 3, 0, 1, 2, 7, 4, 5, 6,
104 1.4 riastrad };
105 1.4 riastrad
106 1.4 riastrad uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
107 1.4 riastrad
108 1.4 riastrad y64 = (uint64x2_t) {
109 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
110 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
111 1.4 riastrad };
112 1.4 riastrad
113 1.4 riastrad return vreinterpretq_u32_u64(y64);
114 1.4 riastrad #else
115 1.4 riastrad return vrolq_n_u32(x, 8);
116 1.4 riastrad #endif
117 1.4 riastrad }
118 1.4 riastrad
119 1.4 riastrad static inline uint32x4_t
120 1.4 riastrad rol7(uint32x4_t x)
121 1.4 riastrad {
122 1.4 riastrad
123 1.4 riastrad return vrolq_n_u32(x, 7);
124 1.4 riastrad }
125 1.1 riastrad
126 1.1 riastrad static inline void
128 1.1 riastrad chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
129 1.1 riastrad unsigned nr)
130 1.1 riastrad {
131 1.1 riastrad uint32x4_t r0, r1, r2, r3;
132 1.1 riastrad uint32x4_t c0, c1, c2, c3;
133 1.1 riastrad
134 1.1 riastrad r0 = *p0;
135 1.1 riastrad r1 = *p1;
136 1.1 riastrad r2 = *p2;
137 1.1 riastrad r3 = *p3;
138 1.4 riastrad
139 1.4 riastrad for (; nr > 0; nr -= 2) {
140 1.4 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
141 1.4 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
142 1.1 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
143 1.1 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
144 1.1 riastrad
145 1.1 riastrad c0 = r0;
146 1.1 riastrad c1 = vextq_u32(r1, r1, 1);
147 1.1 riastrad c2 = vextq_u32(r2, r2, 2);
148 1.4 riastrad c3 = vextq_u32(r3, r3, 3);
149 1.4 riastrad
150 1.4 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
151 1.4 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
152 1.1 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
153 1.1 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
154 1.1 riastrad
155 1.1 riastrad r0 = c0;
156 1.1 riastrad r1 = vextq_u32(c1, c1, 3);
157 1.1 riastrad r2 = vextq_u32(c2, c2, 2);
158 1.1 riastrad r3 = vextq_u32(c3, c3, 1);
159 1.1 riastrad }
160 1.1 riastrad
161 1.1 riastrad *p0 = r0;
162 1.1 riastrad *p1 = r1;
163 1.1 riastrad *p2 = r2;
164 1.1 riastrad *p3 = r3;
165 1.1 riastrad }
166 1.1 riastrad
167 1.1 riastrad void
169 1.1 riastrad chacha_core_neon(uint8_t out[restrict static 64],
170 1.1 riastrad const uint8_t in[static 16],
171 1.1 riastrad const uint8_t k[static 32],
172 1.1 riastrad const uint8_t c[static 16],
173 1.1 riastrad unsigned nr)
174 1.1 riastrad {
175 1.1 riastrad uint32x4_t in0, in1, in2, in3;
176 1.1 riastrad uint32x4_t r0, r1, r2, r3;
177 1.1 riastrad
178 1.1 riastrad r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
179 1.1 riastrad r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
180 1.1 riastrad r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
181 1.1 riastrad r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
182 1.1 riastrad
183 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
184 1.1 riastrad
185 1.1 riastrad vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
186 1.1 riastrad vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
187 1.1 riastrad vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
188 1.1 riastrad vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
189 1.1 riastrad }
190 1.1 riastrad
191 1.1 riastrad void
192 1.1 riastrad hchacha_neon(uint8_t out[restrict static 32],
193 1.1 riastrad const uint8_t in[static 16],
194 1.1 riastrad const uint8_t k[static 32],
195 1.1 riastrad const uint8_t c[static 16],
196 1.1 riastrad unsigned nr)
197 1.1 riastrad {
198 1.1 riastrad uint32x4_t r0, r1, r2, r3;
199 1.1 riastrad
200 1.1 riastrad r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
201 1.1 riastrad r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
202 1.1 riastrad r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
203 1.1 riastrad r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
204 1.1 riastrad
205 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
206 1.1 riastrad
207 1.1 riastrad vst1q_u32((uint32_t *)out + 0, r0);
208 1.1 riastrad vst1q_u32((uint32_t *)out + 4, r3);
209 1.1 riastrad }
210 1.1 riastrad
211 1.1 riastrad void
213 1.1 riastrad chacha_stream_neon(uint8_t *restrict s, size_t n,
214 1.1 riastrad uint32_t blkno,
215 1.1 riastrad const uint8_t nonce[static 12],
216 1.3 riastrad const uint8_t k[static 32],
217 1.1 riastrad unsigned nr)
218 1.1 riastrad {
219 1.3 riastrad
220 1.1 riastrad #ifdef __aarch64__
221 1.1 riastrad for (; n >= 256; s += 256, n -= 256, blkno += 4)
222 1.1 riastrad chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
223 1.1 riastrad #endif
224 1.1 riastrad
225 1.1 riastrad if (n) {
226 1.1 riastrad const uint32x4_t blkno_inc = {1,0,0,0};
227 1.1 riastrad uint32x4_t in0, in1, in2, in3;
228 1.1 riastrad uint32x4_t r0, r1, r2, r3;
229 1.1 riastrad
230 1.1 riastrad in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
231 1.1 riastrad in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
232 1.1 riastrad in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
233 1.1 riastrad in3 = (uint32x4_t) {
234 1.1 riastrad blkno,
235 1.1 riastrad le32dec(nonce),
236 1.2 riastrad le32dec(nonce + 4),
237 1.1 riastrad le32dec(nonce + 8)
238 1.1 riastrad };
239 1.1 riastrad
240 1.1 riastrad for (; n; s += 64, n -= 64) {
241 1.1 riastrad r0 = in0;
242 1.1 riastrad r1 = in1;
243 1.1 riastrad r2 = in2;
244 1.1 riastrad r3 = in3;
245 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
246 1.2 riastrad r0 = vhtole_u32(vaddq_u32(r0, in0));
247 1.2 riastrad r1 = vhtole_u32(vaddq_u32(r1, in1));
248 1.2 riastrad r2 = vhtole_u32(vaddq_u32(r2, in2));
249 1.2 riastrad r3 = vhtole_u32(vaddq_u32(r3, in3));
250 1.2 riastrad
251 1.2 riastrad if (n < 64) {
252 1.2 riastrad uint8_t buf[64] __aligned(16);
253 1.2 riastrad
254 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*0, r0);
255 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*1, r1);
256 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*2, r2);
257 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*3, r3);
258 1.2 riastrad memcpy(s, buf, n);
259 1.1 riastrad
260 1.1 riastrad break;
261 1.1 riastrad }
262 1.1 riastrad
263 1.1 riastrad vst1q_u32((uint32_t *)s + 4*0, r0);
264 1.1 riastrad vst1q_u32((uint32_t *)s + 4*1, r1);
265 1.1 riastrad vst1q_u32((uint32_t *)s + 4*2, r2);
266 1.1 riastrad vst1q_u32((uint32_t *)s + 4*3, r3);
267 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
268 1.1 riastrad }
269 1.1 riastrad }
270 1.1 riastrad }
271 1.1 riastrad
272 1.1 riastrad void
274 1.1 riastrad chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
275 1.1 riastrad uint32_t blkno,
276 1.3 riastrad const uint8_t nonce[static 12],
277 1.1 riastrad const uint8_t k[static 32],
278 1.1 riastrad unsigned nr)
279 1.1 riastrad {
280 1.3 riastrad
281 1.1 riastrad #ifdef __aarch64__
282 1.1 riastrad for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
283 1.1 riastrad chacha_stream_xor256_neon(s, p, blkno, nonce, k,
284 1.1 riastrad chacha_const32, nr);
285 1.1 riastrad #endif
286 1.1 riastrad
287 1.1 riastrad if (n) {
288 1.1 riastrad const uint32x4_t blkno_inc = {1,0,0,0};
289 1.1 riastrad uint32x4_t in0, in1, in2, in3;
290 1.1 riastrad uint32x4_t r0, r1, r2, r3;
291 1.1 riastrad
292 1.1 riastrad in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
293 1.1 riastrad in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
294 1.1 riastrad in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
295 1.1 riastrad in3 = (uint32x4_t) {
296 1.1 riastrad blkno,
297 1.2 riastrad le32dec(nonce),
298 1.1 riastrad le32dec(nonce + 4),
299 1.1 riastrad le32dec(nonce + 8)
300 1.1 riastrad };
301 1.1 riastrad
302 1.1 riastrad for (; n; s += 64, p += 64, n -= 64) {
303 1.1 riastrad r0 = in0;
304 1.1 riastrad r1 = in1;
305 1.1 riastrad r2 = in2;
306 1.1 riastrad r3 = in3;
307 1.2 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
308 1.2 riastrad r0 = vhtole_u32(vaddq_u32(r0, in0));
309 1.2 riastrad r1 = vhtole_u32(vaddq_u32(r1, in1));
310 1.2 riastrad r2 = vhtole_u32(vaddq_u32(r2, in2));
311 1.2 riastrad r3 = vhtole_u32(vaddq_u32(r3, in3));
312 1.2 riastrad
313 1.2 riastrad if (n < 64) {
314 1.2 riastrad uint8_t buf[64] __aligned(16);
315 1.2 riastrad unsigned i;
316 1.2 riastrad
317 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*0, r0);
318 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*1, r1);
319 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*2, r2);
320 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*3, r3);
321 1.2 riastrad
322 1.2 riastrad for (i = 0; i < n - n%4; i += 4)
323 1.2 riastrad le32enc(s + i,
324 1.2 riastrad le32dec(p + i) ^ le32dec(buf + i));
325 1.2 riastrad for (; i < n; i++)
326 1.1 riastrad s[i] = p[i] ^ buf[i];
327 1.1 riastrad
328 1.1 riastrad break;
329 1.1 riastrad }
330 1.1 riastrad
331 1.1 riastrad r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
332 1.1 riastrad r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
333 1.1 riastrad r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
334 1.1 riastrad r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
335 1.1 riastrad vst1q_u32((uint32_t *)s + 4*0, r0);
336 1.1 riastrad vst1q_u32((uint32_t *)s + 4*1, r1);
337 1.1 riastrad vst1q_u32((uint32_t *)s + 4*2, r2);
338 1.1 riastrad vst1q_u32((uint32_t *)s + 4*3, r3);
339 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
340 1.1 riastrad }
341 1.1 riastrad }
342 1.1 riastrad }
343 1.1 riastrad
344 1.1 riastrad void
346 1.1 riastrad xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
347 1.1 riastrad uint32_t blkno,
348 1.1 riastrad const uint8_t nonce[static 24],
349 1.1 riastrad const uint8_t k[static 32],
350 1.1 riastrad unsigned nr)
351 1.1 riastrad {
352 1.1 riastrad uint8_t subkey[32];
353 1.1 riastrad uint8_t subnonce[12];
354 1.1 riastrad
355 1.1 riastrad hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
356 1.1 riastrad memset(subnonce, 0, 4);
357 1.1 riastrad memcpy(subnonce + 4, nonce + 16, 8);
358 1.1 riastrad chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
359 1.1 riastrad }
360 1.1 riastrad
361 1.1 riastrad void
362 1.1 riastrad xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
363 1.1 riastrad uint32_t blkno,
364 1.1 riastrad const uint8_t nonce[static 24],
365 1.1 riastrad const uint8_t k[static 32],
366 1.1 riastrad unsigned nr)
367 1.1 riastrad {
368 1.1 riastrad uint8_t subkey[32];
369 1.1 riastrad uint8_t subnonce[12];
370
371 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
372 memset(subnonce, 0, 4);
373 memcpy(subnonce + 4, nonce + 16, 8);
374 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
375 }
376