/*	$NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include "arm_neon.h"
#include "chacha_neon.h"

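/*
 * vrolq_n_u32(x, n)
 *
 *	Rotate each 32-bit lane of x left by n bits.
 */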
static inline uint32x4_t
vrolq_n_u32(uint32x4_t x, uint8_t n)
{

	/*
	 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
	 * practice it hurts performance at least on Cortex-A8.
	 */
#if 1
	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
#else
	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
#endif
}

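/*
 * vhtole_u32(x), vletoh_u32(x)
 *
 *	Convert each 32-bit lane of x between host and little-endian
 *	byte order.  These are no-ops on little-endian hosts.
 */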
static inline uint32x4_t
vhtole_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
	return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

static inline uint32x4_t
vletoh_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
	return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

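/*
 * rol16(x)
 *
 *	Rotate each 32-bit lane of x left by 16, by swapping the
 *	16-bit halves of each lane (VREV32.16).
 */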
static inline uint32x4_t
rol16(uint32x4_t x)
{
	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

	y16 = vrev32q_u16(x16);

	return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

	return vrolq_n_u32(x, 12);
}

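/*
 * rol8(x)
 *
 *	Rotate each 32-bit lane of x left by 8.  On AArch64 this is a
 *	single TBL byte permutation; elsewhere, fall back to shifts.
 */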
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	static const uint8x16_t rol8_tab = {
		 3, 0, 1, 2,  7, 4, 5, 6,
		11, 8, 9,10, 15,12,13,14,
	};
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = {
		 3, 0, 1, 2,  7, 4, 5, 6,
	};

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

	return vrolq_n_u32(x, 7);
}

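/*
 * chacha_permute(p0, p1, p2, p3, nr)
 *
 *	Run nr rounds of the ChaCha permutation over the state rows
 *	*p0..*p3.  Each loop iteration does one column round and one
 *	diagonal round: the quarter-round acts on all four columns at
 *	once, then the rows are rotated with VEXT so the diagonals
 *	line up as columns, and rotated back afterwards.
 */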
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

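/*
 * chacha_core_neon(out, in, k, c, nr)
 *
 *	Compute one 64-byte ChaCha block: load the state rows
 *	(constant c, key k, and counter/nonce in), run nr rounds of
 *	the permutation, add the input state back in, and store the
 *	result little-endian at out.
 */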
void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t in0, in1, in2, in3;
	uint32x4_t r0, r1, r2, r3;

	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
}

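/*
 * hchacha_neon(out, in, k, c, nr)
 *
 *	HChaCha: like the ChaCha core but with no feedforward -- only
 *	the first and last rows of the permuted state are returned.
 *	Used to derive the XChaCha subkey.
 */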
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;

	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u32((uint32_t *)out + 0, vhtole_u32(r0));
	vst1q_u32((uint32_t *)out + 4, vhtole_u32(r3));
}

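/*
 * chacha_stream_neon(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha keystream at s, starting at block
 *	number blkno.  On AArch64, handle four blocks at a time first;
 *	then do one block at a time, using a bounce buffer for a
 *	partial final block.
 */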
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
#endif

	if (n) {
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				vst1q_u32((uint32_t *)buf + 4*0, r0);
				vst1q_u32((uint32_t *)buf + 4*1, r1);
				vst1q_u32((uint32_t *)buf + 4*2, r2);
				vst1q_u32((uint32_t *)buf + 4*3, r3);
				memcpy(s, buf, n);

				break;
			}

			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

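/*
 * chacha_stream_xor_neon(s, p, n, blkno, nonce, k, nr)
 *
 *	Same as chacha_stream_neon, but XOR the keystream into the n
 *	bytes at p and store the result at s.
 */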
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);
#endif

	if (n) {
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u32((uint32_t *)buf + 4*0, r0);
				vst1q_u32((uint32_t *)buf + 4*1, r1);
				vst1q_u32((uint32_t *)buf + 4*2, r2);
				vst1q_u32((uint32_t *)buf + 4*3, r3);

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
			r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

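/*
 * xchacha_stream_neon(s, nbytes, blkno, nonce, k, nr)
 *
 *	XChaCha: derive a subkey with HChaCha from the first 16 bytes
 *	of the 24-byte nonce, then run ChaCha with the subkey and a
 *	12-byte nonce consisting of four zero bytes followed by the
 *	remaining 8 nonce bytes.
 */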
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

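/*
 * xchacha_stream_xor_neon(c, p, nbytes, blkno, nonce, k, nr)
 *
 *	Same as xchacha_stream_neon, but XOR the keystream into the
 *	nbytes at p and store the result at c.
 */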
void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}