/*	$NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include "arm_neon.h"
#include "chacha_neon.h"

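/*
 * vrolq_n_u32(x, n)
 *
 *	Rotate each 32-bit lane of x left by n bits, as a shift-left
 *	OR'd with the complementary shift-right.  The vshlq_n_u32 and
 *	vshrq_n_u32 intrinsics take immediate shift counts, so n must
 *	be a compile-time constant at every call site once this is
 *	inlined.
 */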
static inline uint32x4_t
vrolq_n_u32(uint32x4_t x, uint8_t n)
{

        return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
}

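/*
 * vhtole_u32(x), vletoh_u32(x)
 *
 *	Convert a vector of 32-bit words between host and little-endian
 *	lane representation.  ChaCha is defined in terms of little-endian
 *	32-bit words, so these are no-ops on little-endian hosts and a
 *	byte swap within each 32-bit lane on big-endian hosts.
 */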
static inline uint32x4_t
vhtole_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
        return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
        /* Swap the four bytes within each 32-bit lane.  */
        return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

static inline uint32x4_t
vletoh_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
        return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
        /* Swap the four bytes within each 32-bit lane.  */
        return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

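/*
 * rol16(x)
 *
 *	Rotate each 32-bit lane left by 16.  A rotation by exactly half
 *	the lane width is a halfword swap (VREV32.16), one instruction
 *	instead of the three for a shift/shift/or rotate.
 */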
static inline uint32x4_t
rol16(uint32x4_t x)
{
        uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

        y16 = vrev32q_u16(x16);

        return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

        return vrolq_n_u32(x, 12);
}

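/*
 * rol8(x)
 *
 *	Rotate each 32-bit lane left by 8.  On AArch64 this is a single
 *	TBL byte shuffle; elsewhere we fall back to shifts, since the
 *	32-bit VTBL variant turned out to be a pessimization (see the
 *	disabled branch below).
 */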
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
        static const uint8x16_t rol8_tab = {
                 3, 0, 1, 2,  7, 4, 5, 6,
                11, 8, 9,10, 15,12,13,14,
        };
        uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

        y8 = vqtbl1q_u8(x8, rol8_tab);

        return vreinterpretq_u32_u8(y8);
#elif 0
        /*
         * GCC does a lousy job with this, spilling two 64-bit vector
         * registers to the stack every time.  There should be plenty
         * of vector registers free, requiring no spills at all, and
         * GCC should be able to hoist the load of rol8_tab out of any
         * loops, but it doesn't and so attempting to use VTBL hurts
         * more than it helps.
         */
        static const uint8x8_t rol8_tab = {
                 3, 0, 1, 2,  7, 4, 5, 6,
        };

        uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

        y64 = (uint64x2_t) {
                (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
                (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
        };

        return vreinterpretq_u32_u64(y64);
#else
        return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

        return vrolq_n_u32(x, 7);
}

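/*
 * chacha_permute(p0, p1, p2, p3, nr)
 *
 *	Apply nr rounds of the ChaCha permutation to the 4x4 matrix of
 *	32-bit words whose rows are held in *p0..*p3, one word per lane.
 *	The scalar quarter-round on words (a,b,c,d) is
 *
 *		a += b; d ^= a; d = rol(d, 16);
 *		c += d; b ^= c; b = rol(b, 12);
 *		a += b; d ^= a; d = rol(d, 8);
 *		c += d; b ^= c; b = rol(b, 7);
 *
 *	With one row per vector, a single sequence of vector add/xor/rol
 *	performs four quarter-rounds at once: down the columns in the
 *	first half of each loop iteration, and down the diagonals in the
 *	second half after rotating the lanes of rows 1-3.  Each loop
 *	iteration therefore does two rounds, so nr must be even.
 */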
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
        uint32x4_t r0, r1, r2, r3;
        uint32x4_t c0, c1, c2, c3;

        r0 = *p0;
        r1 = *p1;
        r2 = *p2;
        r3 = *p3;

        for (; nr > 0; nr -= 2) {
                /* Column round: quarter-round on each column.  */
                r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
                r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
                r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
                r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

                /* Rotate rows 1,2,3 left by 1,2,3 lanes to diagonalize.  */
                c0 = r0;
                c1 = vextq_u32(r1, r1, 1);
                c2 = vextq_u32(r2, r2, 2);
                c3 = vextq_u32(r3, r3, 3);

                /* Diagonal round: quarter-round on each diagonal.  */
                c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
                c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
                c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
                c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

                /* Rotate the lanes back into row order.  */
                r0 = c0;
                r1 = vextq_u32(c1, c1, 3);
                r2 = vextq_u32(c2, c2, 2);
                r3 = vextq_u32(c3, c3, 1);
        }

        *p0 = r0;
        *p1 = r1;
        *p2 = r2;
        *p3 = r3;
}

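/*
 * chacha_core_neon(out, in, k, c, nr)
 *
 *	Compute one 64-byte ChaCha block: load the 16-byte constant c,
 *	the 32-byte key k, and the 16-byte counter/nonce block in into
 *	the four rows of the state, run nr rounds of the permutation,
 *	add the initial state back in (the feed-forward that makes the
 *	block function one-way), and store the result little-endian to
 *	out.  The vld1q_u32/vst1q_u32 accesses through casted pointers
 *	assume the buffers are 32-bit aligned.
 */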
void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
        uint32x4_t in0, in1, in2, in3;
        uint32x4_t r0, r1, r2, r3;

        r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
        r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
        r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
        r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

        chacha_permute(&r0, &r1, &r2, &r3, nr);

        vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
        vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
        vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
        vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
}

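/*
 * hchacha_neon(out, in, k, c, nr)
 *
 *	HChaCha: like the core block function, but without the final
 *	feed-forward, returning only rows 0 and 3 (the constant and
 *	input rows) of the permuted state as the 32-byte output.  Used
 *	to derive the XChaCha subkey.
 */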
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
        uint32x4_t r0, r1, r2, r3;

        r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
        r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
        r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
        r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

        chacha_permute(&r0, &r1, &r2, &r3, nr);

        /* Serialize little-endian, symmetric with the vletoh loads.  */
        vst1q_u32((uint32_t *)out + 0, vhtole_u32(r0));
        vst1q_u32((uint32_t *)out + 4, vhtole_u32(r3));
}

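/*
 * chacha_stream_neon(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha keystream into s, starting at block
 *	number blkno with a 96-bit nonce.  On AArch64, full groups of
 *	four blocks (256 bytes) go through the four-way parallel path;
 *	the remainder is done one 64-byte block at a time, bumping the
 *	counter in lane 0 of the input row after each block, with any
 *	final partial block staged through an aligned buffer.
 */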
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
        for (; n >= 256; s += 256, n -= 256, blkno += 4)
                chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
#endif

        if (n) {
                /* Adds 1 to the 32-bit block counter in lane 0.  */
                const uint32x4_t blkno_inc = {1,0,0,0};
                uint32x4_t in0, in1, in2, in3;
                uint32x4_t r0, r1, r2, r3;

                in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
                in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
                in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
                in3 = (uint32x4_t) {
                        blkno,
                        le32dec(nonce),
                        le32dec(nonce + 4),
                        le32dec(nonce + 8)
                };

                for (; n; s += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
                        r3 = in3;
                        chacha_permute(&r0, &r1, &r2, &r3, nr);
                        r0 = vhtole_u32(vaddq_u32(r0, in0));
                        r1 = vhtole_u32(vaddq_u32(r1, in1));
                        r2 = vhtole_u32(vaddq_u32(r2, in2));
                        r3 = vhtole_u32(vaddq_u32(r3, in3));

                        if (n < 64) {
                                /* Partial block: stage it, copy a prefix.  */
                                uint8_t buf[64] __aligned(16);

                                vst1q_u32((uint32_t *)buf + 4*0, r0);
                                vst1q_u32((uint32_t *)buf + 4*1, r1);
                                vst1q_u32((uint32_t *)buf + 4*2, r2);
                                vst1q_u32((uint32_t *)buf + 4*3, r3);
                                memcpy(s, buf, n);

                                break;
                        }

                        vst1q_u32((uint32_t *)s + 4*0, r0);
                        vst1q_u32((uint32_t *)s + 4*1, r1);
                        vst1q_u32((uint32_t *)s + 4*2, r2);
                        vst1q_u32((uint32_t *)s + 4*3, r3);
                        in3 = vaddq_u32(in3, blkno_inc);
                }
        }
}

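/*
 * chacha_stream_xor_neon(s, p, n, blkno, nonce, k, nr)
 *
 *	XOR n bytes of plaintext p with ChaCha keystream into s, with
 *	the same block structure as chacha_stream_neon: a four-way
 *	256-byte path on AArch64, then one block at a time, staging
 *	any final partial block and XORing it word by word, then byte
 *	by byte.
 */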
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
        for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
                chacha_stream_xor256_neon(s, p, blkno, nonce, k,
                    chacha_const32, nr);
#endif

        if (n) {
                /* Adds 1 to the 32-bit block counter in lane 0.  */
                const uint32x4_t blkno_inc = {1,0,0,0};
                uint32x4_t in0, in1, in2, in3;
                uint32x4_t r0, r1, r2, r3;

                in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
                in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
                in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
                in3 = (uint32x4_t) {
                        blkno,
                        le32dec(nonce),
                        le32dec(nonce + 4),
                        le32dec(nonce + 8)
                };

                for (; n; s += 64, p += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
                        r3 = in3;
                        chacha_permute(&r0, &r1, &r2, &r3, nr);
                        r0 = vhtole_u32(vaddq_u32(r0, in0));
                        r1 = vhtole_u32(vaddq_u32(r1, in1));
                        r2 = vhtole_u32(vaddq_u32(r2, in2));
                        r3 = vhtole_u32(vaddq_u32(r3, in3));

                        if (n < 64) {
                                /*
                                 * Partial block: stage the keystream and
                                 * XOR a word at a time, then a byte at a
                                 * time for the ragged end.
                                 */
                                uint8_t buf[64] __aligned(16);
                                unsigned i;

                                vst1q_u32((uint32_t *)buf + 4*0, r0);
                                vst1q_u32((uint32_t *)buf + 4*1, r1);
                                vst1q_u32((uint32_t *)buf + 4*2, r2);
                                vst1q_u32((uint32_t *)buf + 4*3, r3);

                                for (i = 0; i < n - n%4; i += 4)
                                        le32enc(s + i,
                                            le32dec(p + i) ^ le32dec(buf + i));
                                for (; i < n; i++)
                                        s[i] = p[i] ^ buf[i];

                                break;
                        }

                        r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
                        r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
                        r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
                        r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
                        vst1q_u32((uint32_t *)s + 4*0, r0);
                        vst1q_u32((uint32_t *)s + 4*1, r1);
                        vst1q_u32((uint32_t *)s + 4*2, r2);
                        vst1q_u32((uint32_t *)s + 4*3, r3);
                        in3 = vaddq_u32(in3, blkno_inc);
                }
        }
}

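/*
 * xchacha_stream_neon(s, nbytes, blkno, nonce, k, nr)
 *
 *	XChaCha keystream with a 192-bit nonce: derive a subkey by
 *	running HChaCha over the key and the first 16 bytes of the
 *	nonce, then run ordinary ChaCha with that subkey and a 96-bit
 *	subnonce made of four zero bytes followed by the remaining
 *	8 bytes of the nonce.
 */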
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
        uint8_t subkey[32];
        uint8_t subnonce[12];

        hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
        memset(subnonce, 0, 4);
        memcpy(subnonce + 4, nonce + 16, 8);
        chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

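/*
 * xchacha_stream_xor_neon(c, p, nbytes, blkno, nonce, k, nr)
 *
 *	XChaCha encryption/decryption: same subkey and subnonce
 *	derivation as xchacha_stream_neon, then XOR the plaintext p
 *	with the stream into c.
 */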
void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
        uint8_t subkey[32];
        uint8_t subnonce[12];

        hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
        memset(subnonce, 0, 4);
        memcpy(subnonce + 4, nonce + 16, 8);
        chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}