/*	$NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include <crypto/arch/arm/arm_neon.h>
#include <crypto/arch/arm/arm_neon_imm.h>
#include "chacha_neon.h"

/*
 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice
 * it hurts performance at least on Cortex-A8.
 */
#if 1
#define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
#else
#define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
#endif

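/*
 * Rotating each 32-bit lane left by 16 is the same as swapping its two
 * 16-bit halves, so it takes a single VREV32.16 rather than a
 * shift/shift/or sequence.
 */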
static inline uint32x4_t
rol16(uint32x4_t x)
{
	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

	y16 = vrev32q_u16(x16);

	return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

	return vrolq_n_u32(x, 12);
}

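/*
 * On AArch64, rotating each 32-bit lane left by 8 is a single
 * byte-table lookup: output byte i comes from input byte rol8_tab[i],
 * which rotates the bytes within each little-endian lane.
 */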
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	static const uint8x16_t rol8_tab = VQ_N_U8(
		 3, 0, 1, 2,  7, 4, 5, 6,
		11, 8, 9,10, 15,12,13,14
	);
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = V_N_U8(
		3, 0, 1, 2, 7, 4, 5, 6
	);

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

	return vrolq_n_u32(x, 7);
}

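/*
 * Run nr rounds (nr/2 double rounds) of the ChaCha permutation over the
 * four rows *p0..*p3.  Each iteration does a column round, then uses
 * VEXT to rotate rows 1..3 left by 1..3 lanes so the diagonals line up
 * as columns, does another round of quarter-rounds, and rotates the
 * rows back into place.
 */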
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

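/*
 * Compute one 64-byte ChaCha block.  The 4x4 state matrix is held as
 * four 128-bit rows: the 16-byte constant c, the 32-byte key k, and the
 * 16-byte counter/nonce block in; the input state is added back in
 * after the permutation.
 */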
void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t in0, in1, in2, in3;
	uint32x4_t r0, r1, r2, r3;

	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
}

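/*
 * HChaCha: like the ChaCha core but with no feed-forward; only the
 * first and last rows of the permuted state are emitted, yielding the
 * 32-byte subkey used by XChaCha.
 */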
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;

	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
}

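/*
 * Generate n bytes of keystream starting at 64-byte block blkno.
 * Batches of four blocks go through chacha_stream256_neon; any
 * remainder is produced one block at a time, incrementing the 32-bit
 * block counter in lane 0 of row 3 as we go.
 */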
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

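			/*
			 * Partial final block: write the keystream to
			 * an aligned buffer first and copy out only the
			 * n bytes the caller asked for.
			 */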
			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
				memcpy(s, buf, n);

				break;
			}

			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

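/*
 * XOR n bytes of plaintext p with the keystream and write the result
 * to s.  The structure mirrors chacha_stream_neon: a four-block fast
 * path, then a single-block tail.
 */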
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

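				/*
				 * XOR a word at a time; le32dec/le32enc
				 * give portable unaligned access, and
				 * the byte order cancels out in the
				 * XOR.  Finish any remainder byte by
				 * byte.
				 */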
				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
			r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
			r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
			r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

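/*
 * XChaCha: derive a subkey from the key and the first 16 bytes of the
 * 24-byte nonce with HChaCha, then run ordinary ChaCha under that
 * subkey using the remaining 8 nonce bytes, zero-padded on the left to
 * 12 bytes.
 */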
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}