chacha_neon.c revision 1.3 1 /* $NetBSD: chacha_neon.c,v 1.3 2020/07/27 20:51:29 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <sys/types.h>
30 #include <sys/endian.h>
31
32 #include "arm_neon.h"
33 #include "chacha_neon.h"
34
35 static inline uint32x4_t
36 vrolq_n_u32(uint32x4_t x, uint8_t n)
37 {
38
39 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
40 }
41
42 static inline uint32x4_t
43 vhtole_u32(uint32x4_t x)
44 {
45 #if _BYTE_ORDER == _LITTLE_ENDIAN
46 return x;
47 #elif _BYTE_ORDER == _BIG_ENDIAN
48 return vrev32q_u8(x);
49 #endif
50 }
51
52 static inline uint32x4_t
53 vletoh_u32(uint32x4_t x)
54 {
55 #if _BYTE_ORDER == _LITTLE_ENDIAN
56 return x;
57 #elif _BYTE_ORDER == _BIG_ENDIAN
58 return vrev32q_u8(x);
59 #endif
60 }
61
62 static inline void
64 chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
65 unsigned nr)
66 {
67 uint32x4_t r0, r1, r2, r3;
68 uint32x4_t c0, c1, c2, c3;
69
70 r0 = *p0;
71 r1 = *p1;
72 r2 = *p2;
73 r3 = *p3;
74
75 for (; nr > 0; nr -= 2) {
76 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16);
77 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12);
78 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8);
79 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7);
80
81 c0 = r0;
82 c1 = vextq_u32(r1, r1, 1);
83 c2 = vextq_u32(r2, r2, 2);
84 c3 = vextq_u32(r3, r3, 3);
85
86 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16);
87 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12);
88 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8);
89 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7);
90
91 r0 = c0;
92 r1 = vextq_u32(c1, c1, 3);
93 r2 = vextq_u32(c2, c2, 2);
94 r3 = vextq_u32(c3, c3, 1);
95 }
96
97 *p0 = r0;
98 *p1 = r1;
99 *p2 = r2;
100 *p3 = r3;
101 }
102
103 void
105 chacha_core_neon(uint8_t out[restrict static 64],
106 const uint8_t in[static 16],
107 const uint8_t k[static 32],
108 const uint8_t c[static 16],
109 unsigned nr)
110 {
111 uint32x4_t in0, in1, in2, in3;
112 uint32x4_t r0, r1, r2, r3;
113
114 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
115 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
116 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
117 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
118
119 chacha_permute(&r0, &r1, &r2, &r3, nr);
120
121 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
122 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
123 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
124 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
125 }
126
127 void
128 hchacha_neon(uint8_t out[restrict static 32],
129 const uint8_t in[static 16],
130 const uint8_t k[static 32],
131 const uint8_t c[static 16],
132 unsigned nr)
133 {
134 uint32x4_t r0, r1, r2, r3;
135
136 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
137 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
138 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
139 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
140
141 chacha_permute(&r0, &r1, &r2, &r3, nr);
142
143 vst1q_u32((uint32_t *)out + 0, r0);
144 vst1q_u32((uint32_t *)out + 4, r3);
145 }
146
147 void
149 chacha_stream_neon(uint8_t *restrict s, size_t n,
150 uint32_t blkno,
151 const uint8_t nonce[static 12],
152 const uint8_t k[static 32],
153 unsigned nr)
154 {
155
156 #ifdef __aarch64__
157 for (; n >= 256; s += 256, n -= 256, blkno += 4)
158 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
159 #endif
160
161 if (n) {
162 const uint32x4_t blkno_inc = {1,0,0,0};
163 uint32x4_t in0, in1, in2, in3;
164 uint32x4_t r0, r1, r2, r3;
165
166 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
167 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
168 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
169 in3 = (uint32x4_t) {
170 blkno,
171 le32dec(nonce),
172 le32dec(nonce + 4),
173 le32dec(nonce + 8)
174 };
175
176 for (; n; s += 64, n -= 64) {
177 r0 = in0;
178 r1 = in1;
179 r2 = in2;
180 r3 = in3;
181 chacha_permute(&r0, &r1, &r2, &r3, nr);
182 r0 = vhtole_u32(vaddq_u32(r0, in0));
183 r1 = vhtole_u32(vaddq_u32(r1, in1));
184 r2 = vhtole_u32(vaddq_u32(r2, in2));
185 r3 = vhtole_u32(vaddq_u32(r3, in3));
186
187 if (n < 64) {
188 uint8_t buf[64] __aligned(16);
189
190 vst1q_u32((uint32_t *)buf + 4*0, r0);
191 vst1q_u32((uint32_t *)buf + 4*1, r1);
192 vst1q_u32((uint32_t *)buf + 4*2, r2);
193 vst1q_u32((uint32_t *)buf + 4*3, r3);
194 memcpy(s, buf, n);
195
196 break;
197 }
198
199 vst1q_u32((uint32_t *)s + 4*0, r0);
200 vst1q_u32((uint32_t *)s + 4*1, r1);
201 vst1q_u32((uint32_t *)s + 4*2, r2);
202 vst1q_u32((uint32_t *)s + 4*3, r3);
203 in3 = vaddq_u32(in3, blkno_inc);
204 }
205 }
206 }
207
208 void
210 chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
211 uint32_t blkno,
212 const uint8_t nonce[static 12],
213 const uint8_t k[static 32],
214 unsigned nr)
215 {
216
217 #ifdef __aarch64__
218 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
219 chacha_stream_xor256_neon(s, p, blkno, nonce, k,
220 chacha_const32, nr);
221 #endif
222
223 if (n) {
224 const uint32x4_t blkno_inc = {1,0,0,0};
225 uint32x4_t in0, in1, in2, in3;
226 uint32x4_t r0, r1, r2, r3;
227
228 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
229 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
230 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
231 in3 = (uint32x4_t) {
232 blkno,
233 le32dec(nonce),
234 le32dec(nonce + 4),
235 le32dec(nonce + 8)
236 };
237
238 for (; n; s += 64, p += 64, n -= 64) {
239 r0 = in0;
240 r1 = in1;
241 r2 = in2;
242 r3 = in3;
243 chacha_permute(&r0, &r1, &r2, &r3, nr);
244 r0 = vhtole_u32(vaddq_u32(r0, in0));
245 r1 = vhtole_u32(vaddq_u32(r1, in1));
246 r2 = vhtole_u32(vaddq_u32(r2, in2));
247 r3 = vhtole_u32(vaddq_u32(r3, in3));
248
249 if (n < 64) {
250 uint8_t buf[64] __aligned(16);
251 unsigned i;
252
253 vst1q_u32((uint32_t *)buf + 4*0, r0);
254 vst1q_u32((uint32_t *)buf + 4*1, r1);
255 vst1q_u32((uint32_t *)buf + 4*2, r2);
256 vst1q_u32((uint32_t *)buf + 4*3, r3);
257
258 for (i = 0; i < n - n%4; i += 4)
259 le32enc(s + i,
260 le32dec(p + i) ^ le32dec(buf + i));
261 for (; i < n; i++)
262 s[i] = p[i] ^ buf[i];
263
264 break;
265 }
266
267 r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
268 r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
269 r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
270 r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
271 vst1q_u32((uint32_t *)s + 4*0, r0);
272 vst1q_u32((uint32_t *)s + 4*1, r1);
273 vst1q_u32((uint32_t *)s + 4*2, r2);
274 vst1q_u32((uint32_t *)s + 4*3, r3);
275 in3 = vaddq_u32(in3, blkno_inc);
276 }
277 }
278 }
279
280 void
282 xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
283 uint32_t blkno,
284 const uint8_t nonce[static 24],
285 const uint8_t k[static 32],
286 unsigned nr)
287 {
288 uint8_t subkey[32];
289 uint8_t subnonce[12];
290
291 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
292 memset(subnonce, 0, 4);
293 memcpy(subnonce + 4, nonce + 16, 8);
294 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
295 }
296
297 void
298 xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
299 uint32_t blkno,
300 const uint8_t nonce[static 24],
301 const uint8_t k[static 32],
302 unsigned nr)
303 {
304 uint8_t subkey[32];
305 uint8_t subnonce[12];
306
307 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
308 memset(subnonce, 0, 4);
309 memcpy(subnonce + 4, nonce + 16, 8);
310 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
311 }
312