chacha_neon.c revision 1.2 1 1.2 riastrad /* $NetBSD: chacha_neon.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <sys/types.h>
30 1.1 riastrad #include <sys/endian.h>
31 1.1 riastrad
32 1.1 riastrad #include "arm_neon.h"
33 1.1 riastrad #include "chacha_neon.h"
34 1.1 riastrad
35 1.1 riastrad static inline uint32x4_t
36 1.1 riastrad vrolq_n_u32(uint32x4_t x, uint8_t n)
37 1.1 riastrad {
38 1.1 riastrad
39 1.1 riastrad return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
40 1.1 riastrad }
41 1.1 riastrad
42 1.1 riastrad static inline uint32x4_t
43 1.1 riastrad vhtole_u32(uint32x4_t x)
44 1.1 riastrad {
45 1.1 riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN
46 1.1 riastrad return x;
47 1.1 riastrad #elif _BYTE_ORDER == _BIG_ENDIAN
48 1.1 riastrad return vrev32q_u8(x);
49 1.1 riastrad #endif
50 1.1 riastrad }
51 1.1 riastrad
52 1.1 riastrad static inline uint32x4_t
53 1.1 riastrad vletoh_u32(uint32x4_t x)
54 1.1 riastrad {
55 1.1 riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN
56 1.1 riastrad return x;
57 1.1 riastrad #elif _BYTE_ORDER == _BIG_ENDIAN
58 1.1 riastrad return vrev32q_u8(x);
59 1.1 riastrad #endif
60 1.1 riastrad }
61 1.1 riastrad
62 1.1 riastrad static inline void
64 1.1 riastrad chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
65 1.1 riastrad unsigned nr)
66 1.1 riastrad {
67 1.1 riastrad uint32x4_t r0, r1, r2, r3;
68 1.1 riastrad uint32x4_t c0, c1, c2, c3;
69 1.1 riastrad
70 1.1 riastrad r0 = *p0;
71 1.1 riastrad r1 = *p1;
72 1.1 riastrad r2 = *p2;
73 1.1 riastrad r3 = *p3;
74 1.1 riastrad
75 1.1 riastrad for (; nr > 0; nr -= 2) {
76 1.1 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16);
77 1.1 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12);
78 1.1 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8);
79 1.1 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7);
80 1.1 riastrad
81 1.1 riastrad c0 = r0;
82 1.1 riastrad c1 = vextq_u32(r1, r1, 1);
83 1.1 riastrad c2 = vextq_u32(r2, r2, 2);
84 1.1 riastrad c3 = vextq_u32(r3, r3, 3);
85 1.1 riastrad
86 1.1 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16);
87 1.1 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12);
88 1.1 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8);
89 1.1 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7);
90 1.1 riastrad
91 1.1 riastrad r0 = c0;
92 1.1 riastrad r1 = vextq_u32(c1, c1, 3);
93 1.1 riastrad r2 = vextq_u32(c2, c2, 2);
94 1.1 riastrad r3 = vextq_u32(c3, c3, 1);
95 1.1 riastrad }
96 1.1 riastrad
97 1.1 riastrad *p0 = r0;
98 1.1 riastrad *p1 = r1;
99 1.1 riastrad *p2 = r2;
100 1.1 riastrad *p3 = r3;
101 1.1 riastrad }
102 1.1 riastrad
103 1.1 riastrad void
105 1.1 riastrad chacha_core_neon(uint8_t out[restrict static 64],
106 1.1 riastrad const uint8_t in[static 16],
107 1.1 riastrad const uint8_t k[static 32],
108 1.1 riastrad const uint8_t c[static 16],
109 1.1 riastrad unsigned nr)
110 1.1 riastrad {
111 1.1 riastrad uint32x4_t in0, in1, in2, in3;
112 1.1 riastrad uint32x4_t r0, r1, r2, r3;
113 1.1 riastrad
114 1.1 riastrad r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
115 1.1 riastrad r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
116 1.1 riastrad r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
117 1.1 riastrad r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
118 1.1 riastrad
119 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
120 1.1 riastrad
121 1.1 riastrad vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
122 1.1 riastrad vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
123 1.1 riastrad vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
124 1.1 riastrad vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
125 1.1 riastrad }
126 1.1 riastrad
127 1.1 riastrad void
128 1.1 riastrad hchacha_neon(uint8_t out[restrict static 32],
129 1.1 riastrad const uint8_t in[static 16],
130 1.1 riastrad const uint8_t k[static 32],
131 1.1 riastrad const uint8_t c[static 16],
132 1.1 riastrad unsigned nr)
133 1.1 riastrad {
134 1.1 riastrad uint32x4_t r0, r1, r2, r3;
135 1.1 riastrad
136 1.1 riastrad r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
137 1.1 riastrad r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
138 1.1 riastrad r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
139 1.1 riastrad r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
140 1.1 riastrad
141 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
142 1.1 riastrad
143 1.1 riastrad vst1q_u32((uint32_t *)out + 0, r0);
144 1.1 riastrad vst1q_u32((uint32_t *)out + 4, r3);
145 1.1 riastrad }
146 1.1 riastrad
147 1.1 riastrad void
149 1.1 riastrad chacha_stream_neon(uint8_t *restrict s, size_t n,
150 1.1 riastrad uint32_t blkno,
151 1.1 riastrad const uint8_t nonce[static 12],
152 1.1 riastrad const uint8_t k[static 32],
153 1.1 riastrad unsigned nr)
154 1.1 riastrad {
155 1.1 riastrad
156 1.1 riastrad for (; n >= 256; s += 256, n -= 256, blkno += 4)
157 1.1 riastrad chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
158 1.1 riastrad
159 1.1 riastrad if (n) {
160 1.1 riastrad const uint32x4_t blkno_inc = {1,0,0,0};
161 1.1 riastrad uint32x4_t in0, in1, in2, in3;
162 1.1 riastrad uint32x4_t r0, r1, r2, r3;
163 1.1 riastrad
164 1.1 riastrad in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
165 1.1 riastrad in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
166 1.1 riastrad in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
167 1.1 riastrad in3 = (uint32x4_t) {
168 1.1 riastrad blkno,
169 1.1 riastrad le32dec(nonce),
170 1.1 riastrad le32dec(nonce + 4),
171 1.2 riastrad le32dec(nonce + 8)
172 1.1 riastrad };
173 1.1 riastrad
174 1.1 riastrad for (; n; s += 64, n -= 64) {
175 1.1 riastrad r0 = in0;
176 1.1 riastrad r1 = in1;
177 1.1 riastrad r2 = in2;
178 1.1 riastrad r3 = in3;
179 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
180 1.1 riastrad r0 = vhtole_u32(vaddq_u32(r0, in0));
181 1.2 riastrad r1 = vhtole_u32(vaddq_u32(r1, in1));
182 1.2 riastrad r2 = vhtole_u32(vaddq_u32(r2, in2));
183 1.2 riastrad r3 = vhtole_u32(vaddq_u32(r3, in3));
184 1.2 riastrad
185 1.2 riastrad if (n < 64) {
186 1.2 riastrad uint8_t buf[64] __aligned(16);
187 1.2 riastrad
188 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*0, r0);
189 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*1, r1);
190 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*2, r2);
191 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*3, r3);
192 1.2 riastrad memcpy(s, buf, n);
193 1.2 riastrad
194 1.1 riastrad break;
195 1.1 riastrad }
196 1.1 riastrad
197 1.1 riastrad vst1q_u32((uint32_t *)s + 4*0, r0);
198 1.1 riastrad vst1q_u32((uint32_t *)s + 4*1, r1);
199 1.1 riastrad vst1q_u32((uint32_t *)s + 4*2, r2);
200 1.1 riastrad vst1q_u32((uint32_t *)s + 4*3, r3);
201 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
202 1.1 riastrad }
203 1.1 riastrad }
204 1.1 riastrad }
205 1.1 riastrad
206 1.1 riastrad void
208 1.1 riastrad chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
209 1.1 riastrad uint32_t blkno,
210 1.1 riastrad const uint8_t nonce[static 12],
211 1.1 riastrad const uint8_t k[static 32],
212 1.1 riastrad unsigned nr)
213 1.1 riastrad {
214 1.1 riastrad
215 1.1 riastrad for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
216 1.1 riastrad chacha_stream_xor256_neon(s, p, blkno, nonce, k,
217 1.1 riastrad chacha_const32, nr);
218 1.1 riastrad
219 1.1 riastrad if (n) {
220 1.1 riastrad const uint32x4_t blkno_inc = {1,0,0,0};
221 1.1 riastrad uint32x4_t in0, in1, in2, in3;
222 1.1 riastrad uint32x4_t r0, r1, r2, r3;
223 1.1 riastrad
224 1.1 riastrad in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
225 1.1 riastrad in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
226 1.1 riastrad in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
227 1.1 riastrad in3 = (uint32x4_t) {
228 1.1 riastrad blkno,
229 1.1 riastrad le32dec(nonce),
230 1.2 riastrad le32dec(nonce + 4),
231 1.1 riastrad le32dec(nonce + 8)
232 1.1 riastrad };
233 1.1 riastrad
234 1.1 riastrad for (; n; s += 64, p += 64, n -= 64) {
235 1.1 riastrad r0 = in0;
236 1.1 riastrad r1 = in1;
237 1.1 riastrad r2 = in2;
238 1.1 riastrad r3 = in3;
239 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr);
240 1.2 riastrad r0 = vhtole_u32(vaddq_u32(r0, in0));
241 1.2 riastrad r1 = vhtole_u32(vaddq_u32(r1, in1));
242 1.2 riastrad r2 = vhtole_u32(vaddq_u32(r2, in2));
243 1.2 riastrad r3 = vhtole_u32(vaddq_u32(r3, in3));
244 1.2 riastrad
245 1.2 riastrad if (n < 64) {
246 1.2 riastrad uint8_t buf[64] __aligned(16);
247 1.2 riastrad unsigned i;
248 1.2 riastrad
249 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*0, r0);
250 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*1, r1);
251 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*2, r2);
252 1.2 riastrad vst1q_u32((uint32_t *)buf + 4*3, r3);
253 1.2 riastrad
254 1.2 riastrad for (i = 0; i < n - n%4; i += 4)
255 1.2 riastrad le32enc(s + i,
256 1.2 riastrad le32dec(p + i) ^ le32dec(buf + i));
257 1.2 riastrad for (; i < n; i++)
258 1.2 riastrad s[i] = p[i] ^ buf[i];
259 1.1 riastrad
260 1.1 riastrad break;
261 1.1 riastrad }
262 1.1 riastrad
263 1.1 riastrad r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
264 1.1 riastrad r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
265 1.1 riastrad r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
266 1.1 riastrad r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
267 1.1 riastrad vst1q_u32((uint32_t *)s + 4*0, r0);
268 1.1 riastrad vst1q_u32((uint32_t *)s + 4*1, r1);
269 1.1 riastrad vst1q_u32((uint32_t *)s + 4*2, r2);
270 1.1 riastrad vst1q_u32((uint32_t *)s + 4*3, r3);
271 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc);
272 1.1 riastrad }
273 1.1 riastrad }
274 1.1 riastrad }
275 1.1 riastrad
276 1.1 riastrad void
278 1.1 riastrad xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
279 1.1 riastrad uint32_t blkno,
280 1.1 riastrad const uint8_t nonce[static 24],
281 1.1 riastrad const uint8_t k[static 32],
282 1.1 riastrad unsigned nr)
283 1.1 riastrad {
284 1.1 riastrad uint8_t subkey[32];
285 1.1 riastrad uint8_t subnonce[12];
286 1.1 riastrad
287 1.1 riastrad hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
288 1.1 riastrad memset(subnonce, 0, 4);
289 1.1 riastrad memcpy(subnonce + 4, nonce + 16, 8);
290 1.1 riastrad chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
291 1.1 riastrad }
292 1.1 riastrad
293 1.1 riastrad void
294 1.1 riastrad xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
295 1.1 riastrad uint32_t blkno,
296 1.1 riastrad const uint8_t nonce[static 24],
297 1.1 riastrad const uint8_t k[static 32],
298 1.1 riastrad unsigned nr)
299 1.1 riastrad {
300 1.1 riastrad uint8_t subkey[32];
301 1.1 riastrad uint8_t subnonce[12];
302 1.1 riastrad
303 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
304 memset(subnonce, 0, 4);
305 memcpy(subnonce + 4, nonce + 16, 8);
306 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
307 }
308