/* $NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include <crypto/arch/x86/immintrin.h>

#include "chacha_sse2.h"

/* Rotate each 32-bit lane of x left by n bits. */
static inline __m128i
rol32(__m128i x, uint8_t n)
{

        return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
}

/*
 * Run nr rounds (nr/2 double rounds) of the ChaCha permutation over
 * the four-row state (*p0,*p1,*p2,*p3).  Each quarterround line below
 * acts on all four columns at once; the shuffles between the two
 * halves of a double round rotate rows 1-3 so the same column code
 * then acts on the diagonals, and the closing shuffles rotate the
 * rows back.
 */
static inline void
chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
    unsigned nr)
{
        __m128i r0, r1, r2, r3;
        __m128i c0, c1, c2, c3;

        r0 = *p0;
        r1 = *p1;
        r2 = *p2;
        r3 = *p3;

        for (; nr > 0; nr -= 2) {
                r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
                r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
                r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
                r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);

                c0 = r0;
                c1 = _mm_shuffle_epi32(r1, 0x39);       /* lanes (1,2,3,0) */
                c2 = _mm_shuffle_epi32(r2, 0x4e);       /* lanes (2,3,0,1) */
                c3 = _mm_shuffle_epi32(r3, 0x93);       /* lanes (3,0,1,2) */

                c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
                c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
                c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
                c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);

                r0 = c0;
                r1 = _mm_shuffle_epi32(c1, 0x93);
                r2 = _mm_shuffle_epi32(c2, 0x4e);
                r3 = _mm_shuffle_epi32(c3, 0x39);
        }

        *p0 = r0;
        *p1 = r1;
        *p2 = r2;
        *p3 = r3;
}

/*
 * chacha_core_sse2: compute one 64-byte ChaCha block from the 16-byte
 * constant c, the 32-byte key k, and the 16-byte input (counter and
 * nonce) in, using nr rounds.
 */
void
chacha_core_sse2(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
        __m128i in0, in1, in2, in3;
        __m128i r0, r1, r2, r3;

        r0 = in0 = _mm_loadu_si128((const __m128i *)c);
        r1 = in1 = _mm_loadu_si128((const __m128i *)k);
        r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
        r3 = in3 = _mm_loadu_si128((const __m128i *)in);

        chacha_permute(&r0, &r1, &r2, &r3, nr);

        /* Feed the input forward into the permuted state. */
        _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
        _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
        _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
        _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
}
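
/*
 * Example (a sketch added here, not part of the original sources):
 * generating a single ChaCha20 block.  `key`, `ctr_nonce`, and `block`
 * are hypothetical caller-supplied arrays; chacha_const32 is the
 * "expand 32-byte k" constant used elsewhere in this file.
 *
 *      uint8_t block[64];
 *      chacha_core_sse2(block, ctr_nonce, key, chacha_const32, 20);
 */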

void
hchacha_sse2(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
        __m128i r0, r1, r2, r3;

        r0 = _mm_loadu_si128((const __m128i *)c);
        r1 = _mm_loadu_si128((const __m128i *)k);
        r2 = _mm_loadu_si128((const __m128i *)k + 1);
        r3 = _mm_loadu_si128((const __m128i *)in);

        chacha_permute(&r0, &r1, &r2, &r3, nr);

        /* HChaCha output is rows 0 and 3, with no feedforward. */
        _mm_storeu_si128((__m128i *)out + 0, r0);
        _mm_storeu_si128((__m128i *)out + 1, r3);
}

#define CHACHA_QUARTERROUND(a, b, c, d) do                                    \
{                                                                             \
        (a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 16);      \
        (c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 12);      \
        (a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 8);       \
        (c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 7);       \
} while (/*CONSTCOND*/0)

/* Broadcast the 32-bit word at p into all four lanes. */
static inline __m128i
load1_epi32(const void *p)
{
        return (__m128i)_mm_load1_ps(p);
}

static inline __m128i
loadu_epi32(const void *p)
{
        return _mm_loadu_si128(p);
}

static inline void
storeu_epi32(void *p, __m128i v)
{
        _mm_storeu_si128(p, v);
}

/*
 * unpackN_epi32(a,b,c,d) returns (a[N], b[N], c[N], d[N]); together the
 * four of them transpose a 4x4 matrix of 32-bit lanes.
 */
static inline __m128i
unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
        __m128 lo = (__m128)_mm_unpacklo_epi32(a, b);   /* (a[0], b[0], ...) */
        __m128 hi = (__m128)_mm_unpacklo_epi32(c, d);   /* (c[0], d[0], ...) */

        /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
        return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
        __m128 lo = (__m128)_mm_unpacklo_epi32(a, b);   /* (..., a[1], b[1]) */
        __m128 hi = (__m128)_mm_unpacklo_epi32(c, d);   /* (..., c[1], d[1]) */

        /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
        return (__m128i)_mm_movehl_ps(hi, lo);
}

static inline __m128i
unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
        __m128 lo = (__m128)_mm_unpackhi_epi32(a, b);   /* (a[2], b[2], ...) */
        __m128 hi = (__m128)_mm_unpackhi_epi32(c, d);   /* (c[2], d[2], ...) */

        /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
        return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
        __m128 lo = (__m128)_mm_unpackhi_epi32(a, b);   /* (..., a[3], b[3]) */
        __m128 hi = (__m128)_mm_unpackhi_epi32(c, d);   /* (..., c[3], d[3]) */

        /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
        return (__m128i)_mm_movehl_ps(hi, lo);
}
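
/*
 * Note on the layout (added explanation, not in the original sources):
 * in the four-block fast paths below, lane j of state register yN
 * holds word N of block blkno+j, so after the feedforward the words of
 * one block are scattered across all sixteen registers, e.g. block 0
 * is (y0[0], y1[0], ..., y15[0]).  The unpack0..unpack3 helpers
 * regroup them: z0..z3 collect block 0's sixteen words, z4..z7
 * block 1's, and so on, so each block goes out with four sequential
 * 16-byte stores.
 */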

/*
 * chacha_stream_sse2: generate n bytes of ChaCha keystream into s,
 * starting at 64-byte block number blkno.  Multiples of 256 bytes are
 * computed four blocks at a time, one block per SIMD lane; any
 * remainder falls back to one block at a time via chacha_permute.
 */
void
chacha_stream_sse2(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
        __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
        __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
        __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
        unsigned r;

        if (n < 256)
                goto out;

        x0 = load1_epi32(chacha_const32 + 0);
        x1 = load1_epi32(chacha_const32 + 4);
        x2 = load1_epi32(chacha_const32 + 8);
        x3 = load1_epi32(chacha_const32 + 12);
        x4 = load1_epi32(k + 0);
        x5 = load1_epi32(k + 4);
        x6 = load1_epi32(k + 8);
        x7 = load1_epi32(k + 12);
        x8 = load1_epi32(k + 16);
        x9 = load1_epi32(k + 20);
        x10 = load1_epi32(k + 24);
        x11 = load1_epi32(k + 28);
        /* x12 set in the loop */
        x13 = load1_epi32(nonce + 0);
        x14 = load1_epi32(nonce + 4);
        x15 = load1_epi32(nonce + 8);

        for (; n >= 256; s += 256, n -= 256, blkno += 4) {
                x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
                    _mm_set_epi32(3,2,1,0));
                y0 = x0;
                y1 = x1;
                y2 = x2;
                y3 = x3;
                y4 = x4;
                y5 = x5;
                y6 = x6;
                y7 = x7;
                y8 = x8;
                y9 = x9;
                y10 = x10;
                y11 = x11;
                y12 = x12;
                y13 = x13;
                y14 = x14;
                y15 = x15;
                for (r = nr; r > 0; r -= 2) {
                        CHACHA_QUARTERROUND( y0, y4, y8,y12);
                        CHACHA_QUARTERROUND( y1, y5, y9,y13);
                        CHACHA_QUARTERROUND( y2, y6,y10,y14);
                        CHACHA_QUARTERROUND( y3, y7,y11,y15);
                        CHACHA_QUARTERROUND( y0, y5,y10,y15);
                        CHACHA_QUARTERROUND( y1, y6,y11,y12);
                        CHACHA_QUARTERROUND( y2, y7, y8,y13);
                        CHACHA_QUARTERROUND( y3, y4, y9,y14);
                }
                y0 = _mm_add_epi32(y0, x0);
                y1 = _mm_add_epi32(y1, x1);
                y2 = _mm_add_epi32(y2, x2);
                y3 = _mm_add_epi32(y3, x3);
                y4 = _mm_add_epi32(y4, x4);
                y5 = _mm_add_epi32(y5, x5);
                y6 = _mm_add_epi32(y6, x6);
                y7 = _mm_add_epi32(y7, x7);
                y8 = _mm_add_epi32(y8, x8);
                y9 = _mm_add_epi32(y9, x9);
                y10 = _mm_add_epi32(y10, x10);
                y11 = _mm_add_epi32(y11, x11);
                y12 = _mm_add_epi32(y12, x12);
                y13 = _mm_add_epi32(y13, x13);
                y14 = _mm_add_epi32(y14, x14);
                y15 = _mm_add_epi32(y15, x15);

                z0 = unpack0_epi32(y0, y1, y2, y3);
                z1 = unpack0_epi32(y4, y5, y6, y7);
                z2 = unpack0_epi32(y8, y9, y10, y11);
                z3 = unpack0_epi32(y12, y13, y14, y15);
                z4 = unpack1_epi32(y0, y1, y2, y3);
                z5 = unpack1_epi32(y4, y5, y6, y7);
                z6 = unpack1_epi32(y8, y9, y10, y11);
                z7 = unpack1_epi32(y12, y13, y14, y15);
                z8 = unpack2_epi32(y0, y1, y2, y3);
                z9 = unpack2_epi32(y4, y5, y6, y7);
                z10 = unpack2_epi32(y8, y9, y10, y11);
                z11 = unpack2_epi32(y12, y13, y14, y15);
                z12 = unpack3_epi32(y0, y1, y2, y3);
                z13 = unpack3_epi32(y4, y5, y6, y7);
                z14 = unpack3_epi32(y8, y9, y10, y11);
                z15 = unpack3_epi32(y12, y13, y14, y15);

                storeu_epi32(s + 16*0, z0);
                storeu_epi32(s + 16*1, z1);
                storeu_epi32(s + 16*2, z2);
                storeu_epi32(s + 16*3, z3);
                storeu_epi32(s + 16*4, z4);
                storeu_epi32(s + 16*5, z5);
                storeu_epi32(s + 16*6, z6);
                storeu_epi32(s + 16*7, z7);
                storeu_epi32(s + 16*8, z8);
                storeu_epi32(s + 16*9, z9);
                storeu_epi32(s + 16*10, z10);
                storeu_epi32(s + 16*11, z11);
                storeu_epi32(s + 16*12, z12);
                storeu_epi32(s + 16*13, z13);
                storeu_epi32(s + 16*14, z14);
                storeu_epi32(s + 16*15, z15);
        }

out:    if (n) {
                const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
                __m128i in0, in1, in2, in3;
                __m128i r0, r1, r2, r3;

                in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
                in1 = _mm_loadu_si128((const __m128i *)k);
                in2 = _mm_loadu_si128((const __m128i *)k + 1);
                in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
                    le32dec(nonce), blkno);

                for (; n; s += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
                        r3 = in3;
                        chacha_permute(&r0, &r1, &r2, &r3, nr);
                        r0 = _mm_add_epi32(r0, in0);
                        r1 = _mm_add_epi32(r1, in1);
                        r2 = _mm_add_epi32(r2, in2);
                        r3 = _mm_add_epi32(r3, in3);

                        if (n < 64) {
                                uint8_t buf[64] __aligned(16);

                                /* Partial block: stage it, copy a prefix. */
                                _mm_storeu_si128((__m128i *)buf + 0, r0);
                                _mm_storeu_si128((__m128i *)buf + 1, r1);
                                _mm_storeu_si128((__m128i *)buf + 2, r2);
                                _mm_storeu_si128((__m128i *)buf + 3, r3);
                                memcpy(s, buf, n);

                                break;
                        }

                        _mm_storeu_si128((__m128i *)s + 0, r0);
                        _mm_storeu_si128((__m128i *)s + 1, r1);
                        _mm_storeu_si128((__m128i *)s + 2, r2);
                        _mm_storeu_si128((__m128i *)s + 3, r3);
                        in3 = _mm_add_epi32(in3, blkno_inc);
                }
        }
}
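
/*
 * Example (a sketch added here, not part of the original sources):
 * filling a buffer with ChaCha20 keystream starting at block 0; `key`
 * (32 bytes) and `nonce96` (12 bytes) are hypothetical caller arrays.
 *
 *      uint8_t ks[512];
 *      chacha_stream_sse2(ks, sizeof ks, 0, nonce96, key, 20);
 */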

/*
 * chacha_stream_xor_sse2: like chacha_stream_sse2, but XOR the n bytes
 * of keystream into the input at p, writing the result to s.
 */
void
chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
        __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
        __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
        __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
        unsigned r;

        if (n < 256)
                goto out;

        x0 = load1_epi32(chacha_const32 + 0);
        x1 = load1_epi32(chacha_const32 + 4);
        x2 = load1_epi32(chacha_const32 + 8);
        x3 = load1_epi32(chacha_const32 + 12);
        x4 = load1_epi32(k + 0);
        x5 = load1_epi32(k + 4);
        x6 = load1_epi32(k + 8);
        x7 = load1_epi32(k + 12);
        x8 = load1_epi32(k + 16);
        x9 = load1_epi32(k + 20);
        x10 = load1_epi32(k + 24);
        x11 = load1_epi32(k + 28);
        /* x12 set in the loop */
        x13 = load1_epi32(nonce + 0);
        x14 = load1_epi32(nonce + 4);
        x15 = load1_epi32(nonce + 8);

        for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) {
                x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
                    _mm_set_epi32(3,2,1,0));
                y0 = x0;
                y1 = x1;
                y2 = x2;
                y3 = x3;
                y4 = x4;
                y5 = x5;
                y6 = x6;
                y7 = x7;
                y8 = x8;
                y9 = x9;
                y10 = x10;
                y11 = x11;
                y12 = x12;
                y13 = x13;
                y14 = x14;
                y15 = x15;
                for (r = nr; r > 0; r -= 2) {
                        CHACHA_QUARTERROUND( y0, y4, y8,y12);
                        CHACHA_QUARTERROUND( y1, y5, y9,y13);
                        CHACHA_QUARTERROUND( y2, y6,y10,y14);
                        CHACHA_QUARTERROUND( y3, y7,y11,y15);
                        CHACHA_QUARTERROUND( y0, y5,y10,y15);
                        CHACHA_QUARTERROUND( y1, y6,y11,y12);
                        CHACHA_QUARTERROUND( y2, y7, y8,y13);
                        CHACHA_QUARTERROUND( y3, y4, y9,y14);
                }
                y0 = _mm_add_epi32(y0, x0);
                y1 = _mm_add_epi32(y1, x1);
                y2 = _mm_add_epi32(y2, x2);
                y3 = _mm_add_epi32(y3, x3);
                y4 = _mm_add_epi32(y4, x4);
                y5 = _mm_add_epi32(y5, x5);
                y6 = _mm_add_epi32(y6, x6);
                y7 = _mm_add_epi32(y7, x7);
                y8 = _mm_add_epi32(y8, x8);
                y9 = _mm_add_epi32(y9, x9);
                y10 = _mm_add_epi32(y10, x10);
                y11 = _mm_add_epi32(y11, x11);
                y12 = _mm_add_epi32(y12, x12);
                y13 = _mm_add_epi32(y13, x13);
                y14 = _mm_add_epi32(y14, x14);
                y15 = _mm_add_epi32(y15, x15);

                z0 = unpack0_epi32(y0, y1, y2, y3);
                z1 = unpack0_epi32(y4, y5, y6, y7);
                z2 = unpack0_epi32(y8, y9, y10, y11);
                z3 = unpack0_epi32(y12, y13, y14, y15);
                z4 = unpack1_epi32(y0, y1, y2, y3);
                z5 = unpack1_epi32(y4, y5, y6, y7);
                z6 = unpack1_epi32(y8, y9, y10, y11);
                z7 = unpack1_epi32(y12, y13, y14, y15);
                z8 = unpack2_epi32(y0, y1, y2, y3);
                z9 = unpack2_epi32(y4, y5, y6, y7);
                z10 = unpack2_epi32(y8, y9, y10, y11);
                z11 = unpack2_epi32(y12, y13, y14, y15);
                z12 = unpack3_epi32(y0, y1, y2, y3);
                z13 = unpack3_epi32(y4, y5, y6, y7);
                z14 = unpack3_epi32(y8, y9, y10, y11);
                z15 = unpack3_epi32(y12, y13, y14, y15);

                storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0);
                storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1);
                storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2);
                storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3);
                storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4);
                storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5);
                storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6);
                storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7);
                storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8);
                storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9);
                storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10);
                storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11);
                storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12);
                storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13);
                storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14);
                storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15);
        }

out:    if (n) {
                const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
                __m128i in0, in1, in2, in3;
                __m128i r0, r1, r2, r3;

                in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
                in1 = _mm_loadu_si128((const __m128i *)k);
                in2 = _mm_loadu_si128((const __m128i *)k + 1);
                in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
                    le32dec(nonce), blkno);

                for (; n; s += 64, p += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
                        r3 = in3;
                        chacha_permute(&r0, &r1, &r2, &r3, nr);
                        r0 = _mm_add_epi32(r0, in0);
                        r1 = _mm_add_epi32(r1, in1);
                        r2 = _mm_add_epi32(r2, in2);
                        r3 = _mm_add_epi32(r3, in3);

                        if (n < 64) {
                                uint8_t buf[64] __aligned(16);
                                unsigned i;

                                /*
                                 * Partial block: XOR word-by-word, then
                                 * byte-by-byte.
                                 */
                                _mm_storeu_si128((__m128i *)buf + 0, r0);
                                _mm_storeu_si128((__m128i *)buf + 1, r1);
                                _mm_storeu_si128((__m128i *)buf + 2, r2);
                                _mm_storeu_si128((__m128i *)buf + 3, r3);

                                for (i = 0; i < n - n%4; i += 4)
                                        le32enc(s + i,
                                            le32dec(p + i) ^ le32dec(buf + i));
                                for (; i < n; i++)
                                        s[i] = p[i] ^ buf[i];

                                break;
                        }

                        r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
                        r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
                        r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
                        r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
                        _mm_storeu_si128((__m128i *)s + 0, r0);
                        _mm_storeu_si128((__m128i *)s + 1, r1);
                        _mm_storeu_si128((__m128i *)s + 2, r2);
                        _mm_storeu_si128((__m128i *)s + 3, r3);
                        in3 = _mm_add_epi32(in3, blkno_inc);
                }
        }
}
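
/*
 * Usage note (added, not in the original sources): the XOR makes the
 * transform its own inverse, so the same call both encrypts and
 * decrypts, e.g. with hypothetical n-byte buffers `m` and `c`:
 *
 *      chacha_stream_xor_sse2(c, m, n, 0, nonce96, key, 20);  (encrypt)
 *      chacha_stream_xor_sse2(m, c, n, 0, nonce96, key, 20);  (decrypt)
 */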

/*
 * xchacha_stream_sse2: XChaCha keystream with a 24-byte nonce.  Derive
 * a subkey from the first 16 bytes of the nonce with HChaCha, then run
 * ordinary ChaCha under that subkey with the remaining 8 nonce bytes.
 */
void
xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
        uint8_t subkey[32];
        uint8_t subnonce[12];

        hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
        memset(subnonce, 0, 4);
        memcpy(subnonce + 4, nonce + 16, 8);
        chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr);
}
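
/*
 * Example (a sketch, not part of the original sources): XChaCha20
 * keystream under a hypothetical 24-byte `nonce192` and 32-byte `key`:
 *
 *      uint8_t ks[256];
 *      xchacha_stream_sse2(ks, sizeof ks, 0, nonce192, key, 20);
 */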

void
xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
        uint8_t subkey[32];
        uint8_t subnonce[12];

        hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
        memset(subnonce, 0, 4);
        memcpy(subnonce + 4, nonce + 16, 8);
        chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr);
}