chacha_neon.c revision 1.9 1 1.9 rin /* $NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <sys/types.h>
30 1.1 riastrad #include <sys/endian.h>
31 1.1 riastrad
32 1.9 rin #include <crypto/arch/arm/arm_neon.h>
33 1.9 rin #include <crypto/arch/arm/arm_neon_imm.h>
34 1.1 riastrad #include "chacha_neon.h"
35 1.1 riastrad
/*
 * vrolq_n_u32(x, n): rotate every 32-bit lane of x left by the
 * immediate n.
 *
 * Two formulations are available.  The VSHL/VSRI one is tempting, but
 * in practice it hurts performance at least on Cortex-A8, so the
 * VSHL/VSHR/VORR one is the one compiled in.
 */
#if 0
#define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
#else
#define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
#endif
45 1.1 riastrad
46 1.4 riastrad static inline uint32x4_t
48 1.4 riastrad rol16(uint32x4_t x)
49 1.4 riastrad {
50 1.4 riastrad uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
51 1.4 riastrad
52 1.4 riastrad y16 = vrev32q_u16(x16);
53 1.4 riastrad
54 1.4 riastrad return vreinterpretq_u32_u16(y16);
55 1.4 riastrad }
56 1.4 riastrad
57 1.4 riastrad static inline uint32x4_t
58 1.4 riastrad rol12(uint32x4_t x)
59 1.4 riastrad {
60 1.4 riastrad
61 1.4 riastrad return vrolq_n_u32(x, 12);
62 1.4 riastrad }
63 1.4 riastrad
64 1.4 riastrad static inline uint32x4_t
65 1.4 riastrad rol8(uint32x4_t x)
66 1.4 riastrad {
67 1.8 riastrad #if defined(__aarch64__)
68 1.4 riastrad static const uint8x16_t rol8_tab = VQ_N_U8(
69 1.8 riastrad 3, 0, 1, 2, 7, 4, 5, 6,
70 1.8 riastrad 11, 8, 9,10, 15,12,13,14
71 1.4 riastrad );
72 1.4 riastrad uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
73 1.4 riastrad
74 1.4 riastrad y8 = vqtbl1q_u8(x8, rol8_tab);
75 1.4 riastrad
76 1.4 riastrad return vreinterpretq_u32_u8(y8);
77 1.4 riastrad #elif 0
78 1.4 riastrad /*
79 1.4 riastrad * GCC does a lousy job with this, spilling two 64-bit vector
80 1.4 riastrad * registers to the stack every time. There should be plenty
81 1.4 riastrad * of vector registers free, requiring no spills at all, and
82 1.4 riastrad * GCC should be able to hoist the load of rol8_tab out of any
83 1.4 riastrad * loops, but it doesn't and so attempting to use VTBL hurts
84 1.4 riastrad * more than it helps.
85 1.8 riastrad */
86 1.8 riastrad static const uint8x8_t rol8_tab = V_N_U8(
87 1.8 riastrad 3, 0, 1, 2, 7, 4, 5, 6
88 1.4 riastrad );
89 1.4 riastrad
90 1.4 riastrad uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
91 1.4 riastrad
92 1.4 riastrad y64 = (uint64x2_t) {
93 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
94 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
95 1.4 riastrad };
96 1.4 riastrad
97 1.4 riastrad return vreinterpretq_u32_u64(y64);
98 1.4 riastrad #else
99 1.4 riastrad return vrolq_n_u32(x, 8);
100 1.4 riastrad #endif
101 1.4 riastrad }
102 1.4 riastrad
103 1.4 riastrad static inline uint32x4_t
104 1.4 riastrad rol7(uint32x4_t x)
105 1.4 riastrad {
106 1.4 riastrad
107 1.4 riastrad return vrolq_n_u32(x, 7);
108 1.4 riastrad }
109 1.1 riastrad
110 1.1 riastrad static inline void
112 1.1 riastrad chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
113 1.1 riastrad unsigned nr)
114 1.1 riastrad {
115 1.1 riastrad uint32x4_t r0, r1, r2, r3;
116 1.1 riastrad uint32x4_t c0, c1, c2, c3;
117 1.1 riastrad
118 1.1 riastrad r0 = *p0;
119 1.1 riastrad r1 = *p1;
120 1.1 riastrad r2 = *p2;
121 1.1 riastrad r3 = *p3;
122 1.4 riastrad
123 1.4 riastrad for (; nr > 0; nr -= 2) {
124 1.4 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
125 1.4 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
126 1.1 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
127 1.1 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
128 1.1 riastrad
129 1.1 riastrad c0 = r0;
130 1.1 riastrad c1 = vextq_u32(r1, r1, 1);
131 1.1 riastrad c2 = vextq_u32(r2, r2, 2);
132 1.4 riastrad c3 = vextq_u32(r3, r3, 3);
133 1.4 riastrad
134 1.4 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
135 1.4 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
136 1.1 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
137 1.1 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
138 1.1 riastrad
139 1.1 riastrad r0 = c0;
140 1.1 riastrad r1 = vextq_u32(c1, c1, 3);
141 1.1 riastrad r2 = vextq_u32(c2, c2, 2);
142 1.1 riastrad r3 = vextq_u32(c3, c3, 1);
143 1.1 riastrad }
144 1.1 riastrad
145 1.1 riastrad *p0 = r0;
146 1.1 riastrad *p1 = r1;
147 1.1 riastrad *p2 = r2;
148 1.1 riastrad *p3 = r3;
149 1.1 riastrad }
150 1.1 riastrad
151 1.1 riastrad void
153 1.1 riastrad chacha_core_neon(uint8_t out[restrict static 64],
154 1.1 riastrad const uint8_t in[static 16],
155 1.1 riastrad const uint8_t k[static 32],
156 1.1 riastrad const uint8_t c[static 16],
157 1.1 riastrad unsigned nr)
158 1.1 riastrad {
159 1.8 riastrad uint32x4_t in0, in1, in2, in3;
160 1.8 riastrad uint32x4_t r0, r1, r2, r3;
161 1.8 riastrad
162 1.8 riastrad r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
163 1.1 riastrad r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
164 1.1 riastrad r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
165 1.1 riastrad r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));
166 1.8 riastrad
167 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
168 1.8 riastrad
169 1.8 riastrad vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
170 1.1 riastrad vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
171 1.1 riastrad vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
172 1.1 riastrad vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
173 1.1 riastrad }
174 1.1 riastrad
175 1.1 riastrad void
176 1.1 riastrad hchacha_neon(uint8_t out[restrict static 32],
177 1.1 riastrad const uint8_t in[static 16],
178 1.1 riastrad const uint8_t k[static 32],
179 1.1 riastrad const uint8_t c[static 16],
180 1.1 riastrad unsigned nr)
181 1.8 riastrad {
182 1.8 riastrad uint32x4_t r0, r1, r2, r3;
183 1.8 riastrad
184 1.8 riastrad r0 = vreinterpretq_u32_u8(vld1q_u8(c));
185 1.1 riastrad r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
186 1.1 riastrad r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
187 1.1 riastrad r3 = vreinterpretq_u32_u8(vld1q_u8(in));
188 1.8 riastrad
189 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
190 1.1 riastrad
191 1.1 riastrad vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
192 1.1 riastrad vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
193 1.1 riastrad }
194 1.1 riastrad
195 1.1 riastrad void
197 1.1 riastrad chacha_stream_neon(uint8_t *restrict s, size_t n,
198 1.1 riastrad uint32_t blkno,
199 1.1 riastrad const uint8_t nonce[static 12],
200 1.1 riastrad const uint8_t k[static 32],
201 1.1 riastrad unsigned nr)
202 1.1 riastrad {
203 1.1 riastrad
204 1.8 riastrad for (; n >= 256; s += 256, n -= 256, blkno += 4)
205 1.8 riastrad chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
206 1.1 riastrad
207 1.1 riastrad if (n) {
208 1.1 riastrad const uint32x4_t blkno_inc = /* (1,0,0,0) */
209 1.8 riastrad vsetq_lane_u32(1, vdupq_n_u32(0), 0);
210 1.8 riastrad uint32x4_t in0, in1, in2, in3;
211 1.8 riastrad uint32x4_t r0, r1, r2, r3;
212 1.8 riastrad
213 1.1 riastrad in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
214 1.1 riastrad in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
215 1.1 riastrad in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
216 1.1 riastrad in3 = (uint32x4_t) VQ_N_U32(
217 1.8 riastrad blkno,
218 1.1 riastrad le32dec(nonce),
219 1.2 riastrad le32dec(nonce + 4),
220 1.1 riastrad le32dec(nonce + 8)
221 1.1 riastrad );
222 1.1 riastrad
223 1.1 riastrad for (; n; s += 64, n -= 64) {
224 1.1 riastrad r0 = in0;
225 1.8 riastrad r1 = in1;
226 1.8 riastrad r2 = in2;
227 1.8 riastrad r3 = in3;
228 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
229 1.2 riastrad r0 = vaddq_u32(r0, in0);
230 1.2 riastrad r1 = vaddq_u32(r1, in1);
231 1.2 riastrad r2 = vaddq_u32(r2, in2);
232 1.2 riastrad r3 = vaddq_u32(r3, in3);
233 1.8 riastrad
234 1.8 riastrad if (n < 64) {
235 1.8 riastrad uint8_t buf[64] __aligned(16);
236 1.8 riastrad
237 1.2 riastrad vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
238 1.2 riastrad vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
239 1.2 riastrad vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
240 1.2 riastrad vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
241 1.2 riastrad memcpy(s, buf, n);
242 1.8 riastrad
243 1.8 riastrad break;
244 1.8 riastrad }
245 1.8 riastrad
246 1.1 riastrad vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
247 1.1 riastrad vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
248 1.1 riastrad vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
249 1.1 riastrad vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
250 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
251 1.1 riastrad }
252 1.1 riastrad }
253 1.1 riastrad }
254 1.1 riastrad
255 1.1 riastrad void
257 1.1 riastrad chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
258 1.1 riastrad uint32_t blkno,
259 1.1 riastrad const uint8_t nonce[static 12],
260 1.1 riastrad const uint8_t k[static 32],
261 1.1 riastrad unsigned nr)
262 1.1 riastrad {
263 1.1 riastrad
264 1.8 riastrad for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
265 1.8 riastrad chacha_stream_xor256_neon(s, p, blkno, nonce, k,
266 1.1 riastrad chacha_const32, nr);
267 1.1 riastrad
268 1.1 riastrad if (n) {
269 1.8 riastrad const uint32x4_t blkno_inc = /* (1,0,0,0) */
270 1.8 riastrad vsetq_lane_u32(1, vdupq_n_u32(0), 0);
271 1.8 riastrad uint32x4_t in0, in1, in2, in3;
272 1.8 riastrad uint32x4_t r0, r1, r2, r3;
273 1.1 riastrad
274 1.1 riastrad in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
275 1.1 riastrad in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
276 1.1 riastrad in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
277 1.8 riastrad in3 = (uint32x4_t) VQ_N_U32(
278 1.1 riastrad blkno,
279 1.2 riastrad le32dec(nonce),
280 1.1 riastrad le32dec(nonce + 4),
281 1.1 riastrad le32dec(nonce + 8)
282 1.1 riastrad );
283 1.1 riastrad
284 1.1 riastrad for (; n; s += 64, p += 64, n -= 64) {
285 1.8 riastrad r0 = in0;
286 1.8 riastrad r1 = in1;
287 1.8 riastrad r2 = in2;
288 1.8 riastrad r3 = in3;
289 1.2 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
290 1.2 riastrad r0 = vaddq_u32(r0, in0);
291 1.2 riastrad r1 = vaddq_u32(r1, in1);
292 1.2 riastrad r2 = vaddq_u32(r2, in2);
293 1.2 riastrad r3 = vaddq_u32(r3, in3);
294 1.8 riastrad
295 1.8 riastrad if (n < 64) {
296 1.8 riastrad uint8_t buf[64] __aligned(16);
297 1.8 riastrad unsigned i;
298 1.2 riastrad
299 1.2 riastrad vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
300 1.2 riastrad vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
301 1.2 riastrad vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
302 1.2 riastrad vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
303 1.2 riastrad
304 1.2 riastrad for (i = 0; i < n - n%4; i += 4)
305 1.2 riastrad le32enc(s + i,
306 1.2 riastrad le32dec(p + i) ^ le32dec(buf + i));
307 1.2 riastrad for (; i < n; i++)
308 1.8 riastrad s[i] = p[i] ^ buf[i];
309 1.8 riastrad
310 1.8 riastrad break;
311 1.8 riastrad }
312 1.8 riastrad
313 1.8 riastrad r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
314 1.8 riastrad r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
315 1.8 riastrad r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
316 1.1 riastrad r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
317 1.1 riastrad vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
318 1.1 riastrad vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
319 1.1 riastrad vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
320 1.1 riastrad vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
321 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
322 1.1 riastrad }
323 1.1 riastrad }
324 1.1 riastrad }
325 1.1 riastrad
326 1.1 riastrad void
328 1.1 riastrad xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
329 1.1 riastrad uint32_t blkno,
330 1.1 riastrad const uint8_t nonce[static 24],
331 1.1 riastrad const uint8_t k[static 32],
332 1.1 riastrad unsigned nr)
333 1.1 riastrad {
334 1.1 riastrad uint8_t subkey[32];
335 1.1 riastrad uint8_t subnonce[12];
336 1.1 riastrad
337 1.1 riastrad hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
338 1.1 riastrad memset(subnonce, 0, 4);
339 1.1 riastrad memcpy(subnonce + 4, nonce + 16, 8);
340 1.1 riastrad chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
341 1.1 riastrad }
342 1.1 riastrad
343 1.1 riastrad void
344 1.1 riastrad xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
345 1.1 riastrad uint32_t blkno,
346 1.1 riastrad const uint8_t nonce[static 24],
347 1.1 riastrad const uint8_t k[static 32],
348 1.1 riastrad unsigned nr)
349 1.1 riastrad {
350 1.1 riastrad uint8_t subkey[32];
351 1.1 riastrad uint8_t subnonce[12];
352
353 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
354 memset(subnonce, 0, 4);
355 memcpy(subnonce + 4, nonce + 16, 8);
356 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
357 }
358