/*	$NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <sys/types.h>
30 #include <sys/endian.h>
31
32 #include "arm_neon.h"
33 #include "chacha_neon.h"
34
35 static inline uint32x4_t
36 vrolq_n_u32(uint32x4_t x, uint8_t n)
37 {
38
39 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
40 }
41
42 static inline uint32x4_t
43 vhtole_u32(uint32x4_t x)
44 {
45 #if _BYTE_ORDER == _LITTLE_ENDIAN
46 return x;
47 #elif _BYTE_ORDER == _BIG_ENDIAN
48 return vrev32q_u8(x);
49 #endif
50 }
51
52 static inline uint32x4_t
53 vletoh_u32(uint32x4_t x)
54 {
55 #if _BYTE_ORDER == _LITTLE_ENDIAN
56 return x;
57 #elif _BYTE_ORDER == _BIG_ENDIAN
58 return vrev32q_u8(x);
59 #endif
60 }
61
62 static inline void
64 chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
65 unsigned nr)
66 {
67 uint32x4_t r0, r1, r2, r3;
68 uint32x4_t c0, c1, c2, c3;
69
70 r0 = *p0;
71 r1 = *p1;
72 r2 = *p2;
73 r3 = *p3;
74
75 for (; nr > 0; nr -= 2) {
76 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16);
77 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12);
78 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8);
79 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7);
80
81 c0 = r0;
82 c1 = vextq_u32(r1, r1, 1);
83 c2 = vextq_u32(r2, r2, 2);
84 c3 = vextq_u32(r3, r3, 3);
85
86 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16);
87 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12);
88 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8);
89 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7);
90
91 r0 = c0;
92 r1 = vextq_u32(c1, c1, 3);
93 r2 = vextq_u32(c2, c2, 2);
94 r3 = vextq_u32(c3, c3, 1);
95 }
96
97 *p0 = r0;
98 *p1 = r1;
99 *p2 = r2;
100 *p3 = r3;
101 }
102
103 void
105 chacha_core_neon(uint8_t out[restrict static 64],
106 const uint8_t in[static 16],
107 const uint8_t k[static 32],
108 const uint8_t c[static 16],
109 unsigned nr)
110 {
111 uint32x4_t in0, in1, in2, in3;
112 uint32x4_t r0, r1, r2, r3;
113
114 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
115 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
116 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
117 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
118
119 chacha_permute(&r0, &r1, &r2, &r3, nr);
120
121 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
122 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
123 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
124 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
125 }
126
127 void
128 hchacha_neon(uint8_t out[restrict static 32],
129 const uint8_t in[static 16],
130 const uint8_t k[static 32],
131 const uint8_t c[static 16],
132 unsigned nr)
133 {
134 uint32x4_t r0, r1, r2, r3;
135
136 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
137 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
138 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
139 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
140
141 chacha_permute(&r0, &r1, &r2, &r3, nr);
142
143 vst1q_u32((uint32_t *)out + 0, r0);
144 vst1q_u32((uint32_t *)out + 4, r3);
145 }
146
/*
 * chacha_stream_neon(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha keystream into s, using nr rounds,
 *	key k, 96-bit nonce, and 32-bit block counter starting at
 *	blkno.  Handles 256-byte groups with the 4-way routine, then
 *	single 64-byte blocks, then a final partial block.
 */
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	/* Four blocks (256 bytes) at a time while we have them. */
	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);

	if (n) {
		/* Adding this bumps the block counter in lane 0 of row 3. */
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		/* Rows 0-2: constant, key low half, key high half. */
		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		/* Row 3: 32-bit block counter, then the 96-bit nonce. */
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		/* Whole 64-byte blocks. */
		for (; n >= 64; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			/* Feed-forward, then convert to little-endian. */
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));
			/*
			 * NOTE(review): the cast assumes vst1q_u32 is
			 * safe for the alignment of s on the targets
			 * this is built for -- confirm.
			 */
			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}

		/* Partial final block: generate into buf, copy n bytes. */
		if (n) {
			uint8_t buf[64];

			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));
			vst1q_u32((uint32_t *)buf + 4*0, r0);
			vst1q_u32((uint32_t *)buf + 4*1, r1);
			vst1q_u32((uint32_t *)buf + 4*2, r2);
			vst1q_u32((uint32_t *)buf + 4*3, r3);

			memcpy(s, buf, n);
		}
	}
}
212
/*
 * chacha_stream_xor_neon(s, p, n, blkno, nonce, k, nr)
 *
 *	XOR n bytes of plaintext at p with ChaCha keystream (nr
 *	rounds, key k, 96-bit nonce, block counter starting at blkno)
 *	and store the result at s.  Same block structure as
 *	chacha_stream_neon: 4-way groups, whole blocks, partial tail.
 */
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	/* Four blocks (256 bytes) at a time while we have them. */
	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);

	if (n) {
		/* Adding this bumps the block counter in lane 0 of row 3. */
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		/* Rows 0-2: constant, key low half, key high half. */
		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		/* Row 3: 32-bit block counter, then the 96-bit nonce. */
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		/* Whole 64-byte blocks: keystream XOR plaintext. */
		for (; n >= 64; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			/* Feed-forward, then convert to little-endian. */
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));
			/*
			 * NOTE(review): the casts assume vld1q/vst1q
			 * are safe for the alignment of p and s on the
			 * targets this is built for -- confirm.
			 */
			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
			r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}

		/* Partial final block: keystream into buf, XOR n bytes. */
		if (n) {
			uint8_t buf[64];
			unsigned i;

			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));
			vst1q_u32((uint32_t *)buf + 4*0, r0);
			vst1q_u32((uint32_t *)buf + 4*1, r1);
			vst1q_u32((uint32_t *)buf + 4*2, r2);
			vst1q_u32((uint32_t *)buf + 4*3, r3);

			/* Word-at-a-time, then byte-wise for the tail. */
			for (i = 0; i < n - n%4; i += 4)
				le32enc(s + i,
				    le32dec(p + i) ^ le32dec(buf + i));
			for (; i < n; i++)
				s[i] = p[i] ^ buf[i];
		}
	}
}
288
289 void
291 xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
292 uint32_t blkno,
293 const uint8_t nonce[static 24],
294 const uint8_t k[static 32],
295 unsigned nr)
296 {
297 uint8_t subkey[32];
298 uint8_t subnonce[12];
299
300 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
301 memset(subnonce, 0, 4);
302 memcpy(subnonce + 4, nonce + 16, 8);
303 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
304 }
305
306 void
307 xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
308 uint32_t blkno,
309 const uint8_t nonce[static 24],
310 const uint8_t k[static 32],
311 unsigned nr)
312 {
313 uint8_t subkey[32];
314 uint8_t subnonce[12];
315
316 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
317 memset(subnonce, 0, 4);
318 memcpy(subnonce + 4, nonce + 16, 8);
319 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
320 }
321