/*	$NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include "immintrin.h"

#include "chacha_sse2.h"

static inline __m128i
rol32(__m128i x, uint8_t n)
{

	return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
}
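
/*
 * SSE2 has no vector rotate instruction, so rol32 composes left and
 * right shifts, the lane-wise analogue of the familiar scalar idiom
 * (illustrative sketch only, not part of this file's API):
 *
 *	uint32_t
 *	rol32_scalar(uint32_t x, uint8_t n)	// hypothetical helper
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 * The | on __m128i values relies on the GCC/Clang vector-extension
 * semantics assumed throughout this file and compiles to POR.
 */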

static inline void
chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
    unsigned nr)
{
	__m128i r0, r1, r2, r3;
	__m128i c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);

		c0 = r0;
		c1 = _mm_shuffle_epi32(r1, 0x39);
		c2 = _mm_shuffle_epi32(r2, 0x4e);
		c3 = _mm_shuffle_epi32(r3, 0x93);

		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);

		r0 = c0;
		r1 = _mm_shuffle_epi32(c1, 0x93);
		r2 = _mm_shuffle_epi32(c2, 0x4e);
		r3 = _mm_shuffle_epi32(c3, 0x39);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}
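
/*
 * Each of r0..r3 holds one row of the 4x4 ChaCha state, so the first
 * batch of quarterrounds above operates on all four columns at once.
 * The _mm_shuffle_epi32 calls with 0x39, 0x4e, and 0x93 rotate rows
 * 1, 2, and 3 by one, two, and three lanes, which lines the state's
 * diagonals up as columns for the second batch; the inverse shuffles
 * (0x93, 0x4e, 0x39) restore row order before the next round pair.
 */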

void
chacha_core_sse2(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	__m128i in0, in1, in2, in3;
	__m128i r0, r1, r2, r3;

	r0 = in0 = _mm_loadu_si128((const __m128i *)c);
	r1 = in1 = _mm_loadu_si128((const __m128i *)k);
	r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
	r3 = in3 = _mm_loadu_si128((const __m128i *)in);

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	_mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
	_mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
	_mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
	_mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
}
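
/*
 * Illustrative use, computing one 64-byte ChaCha20 block; the key and
 * counter/nonce buffer names here are hypothetical, while
 * chacha_const32 ("expand 32-byte k") is the constant used throughout
 * this file:
 *
 *	uint8_t block[64];
 *
 *	chacha_core_sse2(block, counter_and_nonce16, key32,
 *	    chacha_const32, 20);
 *
 * The final _mm_add_epi32 feed-forward is what makes the block
 * function one-way rather than an invertible permutation.
 */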

void
hchacha_sse2(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	__m128i r0, r1, r2, r3;

	r0 = _mm_loadu_si128((const __m128i *)c);
	r1 = _mm_loadu_si128((const __m128i *)k);
	r2 = _mm_loadu_si128((const __m128i *)k + 1);
	r3 = _mm_loadu_si128((const __m128i *)in);

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	_mm_storeu_si128((__m128i *)out + 0, r0);
	_mm_storeu_si128((__m128i *)out + 1, r3);
}
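
/*
 * HChaCha deliberately omits the feed-forward and outputs only the
 * first and last rows of the permuted state; those 32 bytes serve as
 * the derived subkey in the XChaCha construction at the bottom of
 * this file.
 */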

#define	CHACHA_QUARTERROUND(a, b, c, d) do				      \
{									      \
	(a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 16);     \
	(c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 12);     \
	(a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 8);      \
	(c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 7);      \
} while (/*CONSTCOND*/0)
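
/*
 * In the four-block stream loops below, each __m128i argument holds
 * the same state word from four independent blocks, so one expansion
 * of this macro performs the scalar ChaCha quarterround on all four
 * blocks in parallel.
 */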

static inline __m128i
load1_epi32(const void *p)
{
	return (__m128i)_mm_load1_ps(p);
}

static inline __m128i
loadu_epi32(const void *p)
{
	return _mm_loadu_si128(p);
}

static inline void
storeu_epi32(void *p, __m128i v)
{
	_mm_storeu_si128(p, v);
}

static inline __m128i
unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */
	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */

	/* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
	return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */
	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */

	/* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
	return (__m128i)_mm_movehl_ps(hi, lo);
}

static inline __m128i
unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */
	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */

	/* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
	return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */
	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */

	/* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
	return (__m128i)_mm_movehl_ps(hi, lo);
}
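
/*
 * Together, unpack{0,1,2,3}_epi32(a, b, c, d) select lane 0, 1, 2, or
 * 3 from each of a, b, c, and d: they transpose a 4x4 matrix of
 * 32-bit words.  The stream loops below use them to turn state words
 * held word-sliced across four registers back into contiguous
 * 16-byte rows of output, roughly:
 *
 *	row0 = unpack0_epi32(a, b, c, d);	(a[0], b[0], c[0], d[0])
 *	row1 = unpack1_epi32(a, b, c, d);	(a[1], b[1], c[1], d[1])
 *	...
 *
 * The casts to __m128 exist only to reach MOVLHPS/MOVHLPS, which have
 * no integer-domain equivalent in SSE2.
 */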

void
chacha_stream_sse2(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
	unsigned r;

	if (n < 256)
		goto out;

	x0 = load1_epi32(chacha_const32 + 0);
	x1 = load1_epi32(chacha_const32 + 4);
	x2 = load1_epi32(chacha_const32 + 8);
	x3 = load1_epi32(chacha_const32 + 12);
	x4 = load1_epi32(k + 0);
	x5 = load1_epi32(k + 4);
	x6 = load1_epi32(k + 8);
	x7 = load1_epi32(k + 12);
	x8 = load1_epi32(k + 16);
	x9 = load1_epi32(k + 20);
	x10 = load1_epi32(k + 24);
	x11 = load1_epi32(k + 28);
	/* x12 set in the loop */
	x13 = load1_epi32(nonce + 0);
	x14 = load1_epi32(nonce + 4);
	x15 = load1_epi32(nonce + 8);

	for (; n >= 256; s += 256, n -= 256, blkno += 4) {
		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
		    _mm_set_epi32(3,2,1,0));
		y0 = x0;
		y1 = x1;
		y2 = x2;
		y3 = x3;
		y4 = x4;
		y5 = x5;
		y6 = x6;
		y7 = x7;
		y8 = x8;
		y9 = x9;
		y10 = x10;
		y11 = x11;
		y12 = x12;
		y13 = x13;
		y14 = x14;
		y15 = x15;
		for (r = nr; r > 0; r -= 2) {
			CHACHA_QUARTERROUND( y0, y4, y8,y12);
			CHACHA_QUARTERROUND( y1, y5, y9,y13);
			CHACHA_QUARTERROUND( y2, y6,y10,y14);
			CHACHA_QUARTERROUND( y3, y7,y11,y15);
			CHACHA_QUARTERROUND( y0, y5,y10,y15);
			CHACHA_QUARTERROUND( y1, y6,y11,y12);
			CHACHA_QUARTERROUND( y2, y7, y8,y13);
			CHACHA_QUARTERROUND( y3, y4, y9,y14);
		}
		y0 = _mm_add_epi32(y0, x0);
		y1 = _mm_add_epi32(y1, x1);
		y2 = _mm_add_epi32(y2, x2);
		y3 = _mm_add_epi32(y3, x3);
		y4 = _mm_add_epi32(y4, x4);
		y5 = _mm_add_epi32(y5, x5);
		y6 = _mm_add_epi32(y6, x6);
		y7 = _mm_add_epi32(y7, x7);
		y8 = _mm_add_epi32(y8, x8);
		y9 = _mm_add_epi32(y9, x9);
		y10 = _mm_add_epi32(y10, x10);
		y11 = _mm_add_epi32(y11, x11);
		y12 = _mm_add_epi32(y12, x12);
		y13 = _mm_add_epi32(y13, x13);
		y14 = _mm_add_epi32(y14, x14);
		y15 = _mm_add_epi32(y15, x15);

		z0 = unpack0_epi32(y0, y1, y2, y3);
		z1 = unpack0_epi32(y4, y5, y6, y7);
		z2 = unpack0_epi32(y8, y9, y10, y11);
		z3 = unpack0_epi32(y12, y13, y14, y15);
		z4 = unpack1_epi32(y0, y1, y2, y3);
		z5 = unpack1_epi32(y4, y5, y6, y7);
		z6 = unpack1_epi32(y8, y9, y10, y11);
		z7 = unpack1_epi32(y12, y13, y14, y15);
		z8 = unpack2_epi32(y0, y1, y2, y3);
		z9 = unpack2_epi32(y4, y5, y6, y7);
		z10 = unpack2_epi32(y8, y9, y10, y11);
		z11 = unpack2_epi32(y12, y13, y14, y15);
		z12 = unpack3_epi32(y0, y1, y2, y3);
		z13 = unpack3_epi32(y4, y5, y6, y7);
		z14 = unpack3_epi32(y8, y9, y10, y11);
		z15 = unpack3_epi32(y12, y13, y14, y15);

		storeu_epi32(s + 16*0, z0);
		storeu_epi32(s + 16*1, z1);
		storeu_epi32(s + 16*2, z2);
		storeu_epi32(s + 16*3, z3);
		storeu_epi32(s + 16*4, z4);
		storeu_epi32(s + 16*5, z5);
		storeu_epi32(s + 16*6, z6);
		storeu_epi32(s + 16*7, z7);
		storeu_epi32(s + 16*8, z8);
		storeu_epi32(s + 16*9, z9);
		storeu_epi32(s + 16*10, z10);
		storeu_epi32(s + 16*11, z11);
		storeu_epi32(s + 16*12, z12);
		storeu_epi32(s + 16*13, z13);
		storeu_epi32(s + 16*14, z14);
		storeu_epi32(s + 16*15, z15);
	}

out:	if (n) {
		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
		__m128i in0, in1, in2, in3;
		__m128i r0, r1, r2, r3;

		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
		in1 = _mm_loadu_si128((const __m128i *)k);
		in2 = _mm_loadu_si128((const __m128i *)k + 1);
		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
		    le32dec(nonce), blkno);

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = _mm_add_epi32(r0, in0);
			r1 = _mm_add_epi32(r1, in1);
			r2 = _mm_add_epi32(r2, in2);
			r3 = _mm_add_epi32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				_mm_storeu_si128((__m128i *)buf + 0, r0);
				_mm_storeu_si128((__m128i *)buf + 1, r1);
				_mm_storeu_si128((__m128i *)buf + 2, r2);
				_mm_storeu_si128((__m128i *)buf + 3, r3);
				memcpy(s, buf, n);

				break;
			}

			_mm_storeu_si128((__m128i *)s + 0, r0);
			_mm_storeu_si128((__m128i *)s + 1, r1);
			_mm_storeu_si128((__m128i *)s + 2, r2);
			_mm_storeu_si128((__m128i *)s + 3, r3);
			in3 = _mm_add_epi32(in3, blkno_inc);
		}
	}
}
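
/*
 * Illustrative call, generating 1 KiB of ChaCha20 keystream starting
 * at block 0 (key32 and nonce12 are hypothetical caller buffers):
 *
 *	uint8_t keystream[1024];
 *
 *	chacha_stream_sse2(keystream, sizeof keystream, 0, nonce12,
 *	    key32, 20);
 *
 * Multiples of 256 bytes go through the four-block SIMD path above;
 * any remainder falls through to the single-block loop at "out",
 * which finishes a final partial block via the aligned stack buffer.
 */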

void
chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
	unsigned r;

	if (n < 256)
		goto out;

	x0 = load1_epi32(chacha_const32 + 0);
	x1 = load1_epi32(chacha_const32 + 4);
	x2 = load1_epi32(chacha_const32 + 8);
	x3 = load1_epi32(chacha_const32 + 12);
	x4 = load1_epi32(k + 0);
	x5 = load1_epi32(k + 4);
	x6 = load1_epi32(k + 8);
	x7 = load1_epi32(k + 12);
	x8 = load1_epi32(k + 16);
	x9 = load1_epi32(k + 20);
	x10 = load1_epi32(k + 24);
	x11 = load1_epi32(k + 28);
	/* x12 set in the loop */
	x13 = load1_epi32(nonce + 0);
	x14 = load1_epi32(nonce + 4);
	x15 = load1_epi32(nonce + 8);

	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) {
		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
		    _mm_set_epi32(3,2,1,0));
		y0 = x0;
		y1 = x1;
		y2 = x2;
		y3 = x3;
		y4 = x4;
		y5 = x5;
		y6 = x6;
		y7 = x7;
		y8 = x8;
		y9 = x9;
		y10 = x10;
		y11 = x11;
		y12 = x12;
		y13 = x13;
		y14 = x14;
		y15 = x15;
		for (r = nr; r > 0; r -= 2) {
			CHACHA_QUARTERROUND( y0, y4, y8,y12);
			CHACHA_QUARTERROUND( y1, y5, y9,y13);
			CHACHA_QUARTERROUND( y2, y6,y10,y14);
			CHACHA_QUARTERROUND( y3, y7,y11,y15);
			CHACHA_QUARTERROUND( y0, y5,y10,y15);
			CHACHA_QUARTERROUND( y1, y6,y11,y12);
			CHACHA_QUARTERROUND( y2, y7, y8,y13);
			CHACHA_QUARTERROUND( y3, y4, y9,y14);
		}
		y0 = _mm_add_epi32(y0, x0);
		y1 = _mm_add_epi32(y1, x1);
		y2 = _mm_add_epi32(y2, x2);
		y3 = _mm_add_epi32(y3, x3);
		y4 = _mm_add_epi32(y4, x4);
		y5 = _mm_add_epi32(y5, x5);
		y6 = _mm_add_epi32(y6, x6);
		y7 = _mm_add_epi32(y7, x7);
		y8 = _mm_add_epi32(y8, x8);
		y9 = _mm_add_epi32(y9, x9);
		y10 = _mm_add_epi32(y10, x10);
		y11 = _mm_add_epi32(y11, x11);
		y12 = _mm_add_epi32(y12, x12);
		y13 = _mm_add_epi32(y13, x13);
		y14 = _mm_add_epi32(y14, x14);
		y15 = _mm_add_epi32(y15, x15);

		z0 = unpack0_epi32(y0, y1, y2, y3);
		z1 = unpack0_epi32(y4, y5, y6, y7);
		z2 = unpack0_epi32(y8, y9, y10, y11);
		z3 = unpack0_epi32(y12, y13, y14, y15);
		z4 = unpack1_epi32(y0, y1, y2, y3);
		z5 = unpack1_epi32(y4, y5, y6, y7);
		z6 = unpack1_epi32(y8, y9, y10, y11);
		z7 = unpack1_epi32(y12, y13, y14, y15);
		z8 = unpack2_epi32(y0, y1, y2, y3);
		z9 = unpack2_epi32(y4, y5, y6, y7);
		z10 = unpack2_epi32(y8, y9, y10, y11);
		z11 = unpack2_epi32(y12, y13, y14, y15);
		z12 = unpack3_epi32(y0, y1, y2, y3);
		z13 = unpack3_epi32(y4, y5, y6, y7);
		z14 = unpack3_epi32(y8, y9, y10, y11);
		z15 = unpack3_epi32(y12, y13, y14, y15);

		storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0);
		storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1);
		storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2);
		storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3);
		storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4);
		storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5);
		storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6);
		storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7);
		storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8);
		storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9);
		storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10);
		storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11);
		storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12);
		storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13);
		storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14);
		storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15);
	}

out:	if (n) {
		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
		__m128i in0, in1, in2, in3;
		__m128i r0, r1, r2, r3;

		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
		in1 = _mm_loadu_si128((const __m128i *)k);
		in2 = _mm_loadu_si128((const __m128i *)k + 1);
		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
		    le32dec(nonce), blkno);

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = _mm_add_epi32(r0, in0);
			r1 = _mm_add_epi32(r1, in1);
			r2 = _mm_add_epi32(r2, in2);
			r3 = _mm_add_epi32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				_mm_storeu_si128((__m128i *)buf + 0, r0);
				_mm_storeu_si128((__m128i *)buf + 1, r1);
				_mm_storeu_si128((__m128i *)buf + 2, r2);
				_mm_storeu_si128((__m128i *)buf + 3, r3);

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
			r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
			r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
			r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
			_mm_storeu_si128((__m128i *)s + 0, r0);
			_mm_storeu_si128((__m128i *)s + 1, r1);
			_mm_storeu_si128((__m128i *)s + 2, r2);
			_mm_storeu_si128((__m128i *)s + 3, r3);
			in3 = _mm_add_epi32(in3, blkno_inc);
		}
	}
}
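
/*
 * The xor variant mirrors chacha_stream_sse2, folding the plaintext
 * in with PXOR as each block is produced.  For a final partial block
 * it expands a full 64-byte block into an aligned stack buffer and
 * then XORs only the n remaining bytes, a word at a time and then a
 * byte at a time, so it never stores past the end of the caller's
 * buffers.
 */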

void
xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr);
}
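
/*
 * Standard XChaCha construction: HChaCha over the first 16 nonce
 * bytes derives a one-time subkey, and the remaining 8 nonce bytes,
 * zero-padded to 12, become the nonce for an ordinary ChaCha stream
 * under that subkey.
 */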

void
xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr);
}