1 1.3 rin /* $NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $ */ 2 1.1 riastrad 3 1.1 riastrad /*- 4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 1.1 riastrad * All rights reserved. 6 1.1 riastrad * 7 1.1 riastrad * Redistribution and use in source and binary forms, with or without 8 1.1 riastrad * modification, are permitted provided that the following conditions 9 1.1 riastrad * are met: 10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright 11 1.1 riastrad * notice, this list of conditions and the following disclaimer. 12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the 14 1.1 riastrad * documentation and/or other materials provided with the distribution. 15 1.1 riastrad * 16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE. 27 1.1 riastrad */ 28 1.1 riastrad 29 1.1 riastrad #include <sys/types.h> 30 1.1 riastrad #include <sys/endian.h> 31 1.1 riastrad 32 1.3 rin #include <crypto/arch/x86/immintrin.h> 33 1.1 riastrad 34 1.1 riastrad #include "chacha_sse2.h" 35 1.1 riastrad 36 1.1 riastrad static inline __m128i 37 1.1 riastrad rol32(__m128i x, uint8_t n) 38 1.1 riastrad { 39 1.1 riastrad 40 1.1 riastrad return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n); 41 1.1 riastrad } 42 1.1 riastrad 43 1.1 riastrad static inline void 45 1.1 riastrad chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3, 46 1.1 riastrad unsigned nr) 47 1.1 riastrad { 48 1.1 riastrad __m128i r0, r1, r2, r3; 49 1.1 riastrad __m128i c0, c1, c2, c3; 50 1.1 riastrad 51 1.1 riastrad r0 = *p0; 52 1.1 riastrad r1 = *p1; 53 1.1 riastrad r2 = *p2; 54 1.1 riastrad r3 = *p3; 55 1.1 riastrad 56 1.1 riastrad for (; nr > 0; nr -= 2) { 57 1.1 riastrad r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16); 58 1.1 riastrad r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12); 59 1.1 riastrad r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8); 60 1.1 riastrad r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7); 61 1.1 riastrad 62 1.1 riastrad c0 = r0; 63 1.1 riastrad c1 = _mm_shuffle_epi32(r1, 0x39); 64 1.1 riastrad c2 = _mm_shuffle_epi32(r2, 0x4e); 65 1.1 riastrad c3 = _mm_shuffle_epi32(r3, 0x93); 66 1.1 riastrad 67 1.1 riastrad c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16); 68 1.1 riastrad c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12); 69 1.1 riastrad c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8); 70 1.1 riastrad c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7); 71 1.1 riastrad 72 1.1 riastrad r0 = c0; 73 1.1 riastrad r1 = _mm_shuffle_epi32(c1, 0x93); 74 1.1 riastrad r2 = _mm_shuffle_epi32(c2, 0x4e); 75 1.1 riastrad r3 = _mm_shuffle_epi32(c3, 0x39); 76 1.1 riastrad } 77 1.1 riastrad 78 1.1 riastrad *p0 = r0; 79 1.1 riastrad *p1 = r1; 80 1.1 riastrad *p2 = r2; 81 1.1 riastrad *p3 = r3; 82 1.1 riastrad } 83 1.1 riastrad 84 1.1 riastrad void 86 1.1 riastrad chacha_core_sse2(uint8_t out[restrict static 64], 87 1.1 riastrad const uint8_t in[static 16], 88 1.1 riastrad const uint8_t k[static 32], 89 1.1 riastrad const uint8_t c[static 16], 90 1.1 riastrad unsigned nr) 91 1.1 riastrad { 92 1.1 riastrad __m128i in0, in1, in2, in3; 93 1.1 riastrad __m128i r0, r1, r2, r3; 94 1.1 riastrad 95 1.1 riastrad r0 = in0 = _mm_loadu_si128((const __m128i *)c); 96 1.1 riastrad r1 = in1 = _mm_loadu_si128((const __m128i *)k); 97 1.1 riastrad r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1); 98 1.1 riastrad r3 = in3 = _mm_loadu_si128((const __m128i *)in); 99 1.1 riastrad 100 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 101 1.1 riastrad 102 1.1 riastrad _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0)); 103 1.1 riastrad _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1)); 104 1.1 riastrad _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2)); 105 1.1 riastrad _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3)); 106 1.1 riastrad } 107 1.1 riastrad 108 1.1 riastrad void 109 1.1 riastrad hchacha_sse2(uint8_t out[restrict static 32], 110 1.1 riastrad const uint8_t in[static 16], 111 1.1 riastrad const uint8_t k[static 32], 112 1.1 riastrad const uint8_t c[static 16], 113 1.1 riastrad unsigned nr) 114 1.1 riastrad { 115 1.1 riastrad __m128i r0, r1, r2, r3; 116 1.1 riastrad 117 1.1 riastrad r0 = _mm_loadu_si128((const __m128i *)c); 118 1.1 riastrad r1 = _mm_loadu_si128((const __m128i *)k); 119 1.1 riastrad r2 = _mm_loadu_si128((const __m128i *)k + 1); 120 1.1 riastrad r3 = _mm_loadu_si128((const __m128i *)in); 121 1.1 riastrad 122 1.1 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 123 1.1 riastrad 124 1.1 riastrad _mm_storeu_si128((__m128i *)out + 0, r0); 125 1.1 riastrad _mm_storeu_si128((__m128i *)out + 1, r3); 126 1.1 riastrad } 127 1.1 riastrad 128 1.1 riastrad #define CHACHA_QUARTERROUND(a, b, c, d) do \ 130 1.1 riastrad { \ 131 1.1 riastrad (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \ 132 1.1 riastrad (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \ 133 1.1 riastrad (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \ 134 1.1 riastrad (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \ 135 1.1 riastrad } while (/*CONSTCOND*/0) 136 1.1 riastrad 137 1.1 riastrad static inline __m128i 138 1.1 riastrad load1_epi32(const void *p) 139 1.1 riastrad { 140 1.1 riastrad return (__m128i)_mm_load1_ps(p); 141 1.1 riastrad } 142 1.1 riastrad 143 1.1 riastrad static inline __m128i 144 1.1 riastrad loadu_epi32(const void *p) 145 1.1 riastrad { 146 1.1 riastrad return _mm_loadu_si128(p); 147 1.1 riastrad } 148 1.1 riastrad 149 1.1 riastrad static inline void 150 1.1 riastrad storeu_epi32(void *p, __m128i v) 151 1.1 riastrad { 152 1.1 riastrad return _mm_storeu_si128(p, v); 153 1.1 riastrad } 154 1.1 riastrad 155 1.1 riastrad static inline __m128i 156 1.1 riastrad unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 157 1.1 riastrad { 158 1.1 riastrad __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */ 159 1.1 riastrad __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */ 160 1.1 riastrad 161 1.1 riastrad /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */ 162 1.1 riastrad return (__m128i)_mm_movelh_ps(lo, hi); 163 1.1 riastrad } 164 1.1 riastrad 165 1.1 riastrad static inline __m128i 166 1.1 riastrad unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 167 1.1 riastrad { 168 1.1 riastrad __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */ 169 1.1 riastrad __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */ 170 1.1 riastrad 171 1.1 riastrad /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */ 172 1.1 riastrad return (__m128i)_mm_movehl_ps(hi, lo); 173 1.1 riastrad } 174 1.1 riastrad 175 1.1 riastrad static inline __m128i 176 1.1 riastrad unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 177 1.1 riastrad { 178 1.1 riastrad __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */ 179 1.1 riastrad __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */ 180 1.1 riastrad 181 1.1 riastrad /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */ 182 1.1 riastrad return (__m128i)_mm_movelh_ps(lo, hi); 183 1.1 riastrad } 184 1.1 riastrad 185 1.1 riastrad static inline __m128i 186 1.1 riastrad unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 187 1.1 riastrad { 188 1.1 riastrad __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */ 189 1.1 riastrad __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */ 190 1.1 riastrad 191 1.1 riastrad /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */ 192 1.1 riastrad return (__m128i)_mm_movehl_ps(hi, lo); 193 1.1 riastrad } 194 1.1 riastrad 195 1.1 riastrad void 197 1.1 riastrad chacha_stream_sse2(uint8_t *restrict s, size_t n, 198 1.1 riastrad uint32_t blkno, 199 1.1 riastrad const uint8_t nonce[static 12], 200 1.1 riastrad const uint8_t k[static 32], 201 1.1 riastrad unsigned nr) 202 1.1 riastrad { 203 1.1 riastrad __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 204 1.1 riastrad __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; 205 1.1 riastrad __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; 206 1.1 riastrad unsigned r; 207 1.1 riastrad 208 1.1 riastrad if (n < 256) 209 1.1 riastrad goto out; 210 1.1 riastrad 211 1.1 riastrad x0 = load1_epi32(chacha_const32 + 0); 212 1.1 riastrad x1 = load1_epi32(chacha_const32 + 4); 213 1.1 riastrad x2 = load1_epi32(chacha_const32 + 8); 214 1.1 riastrad x3 = load1_epi32(chacha_const32 + 12); 215 1.1 riastrad x4 = load1_epi32(k + 0); 216 1.1 riastrad x5 = load1_epi32(k + 4); 217 1.1 riastrad x6 = load1_epi32(k + 8); 218 1.1 riastrad x7 = load1_epi32(k + 12); 219 1.1 riastrad x8 = load1_epi32(k + 16); 220 1.1 riastrad x9 = load1_epi32(k + 20); 221 1.1 riastrad x10 = load1_epi32(k + 24); 222 1.1 riastrad x11 = load1_epi32(k + 28); 223 1.1 riastrad /* x12 set in the loop */ 224 1.1 riastrad x13 = load1_epi32(nonce + 0); 225 1.1 riastrad x14 = load1_epi32(nonce + 4); 226 1.1 riastrad x15 = load1_epi32(nonce + 8); 227 1.1 riastrad 228 1.1 riastrad for (; n >= 256; s += 256, n -= 256, blkno += 4) { 230 1.1 riastrad x12 = _mm_add_epi32(_mm_set1_epi32(blkno), 231 1.1 riastrad _mm_set_epi32(3,2,1,0)); 232 1.1 riastrad y0 = x0; 233 1.1 riastrad y1 = x1; 234 1.1 riastrad y2 = x2; 235 1.1 riastrad y3 = x3; 236 1.1 riastrad y4 = x4; 237 1.1 riastrad y5 = x5; 238 1.1 riastrad y6 = x6; 239 1.1 riastrad y7 = x7; 240 1.1 riastrad y8 = x8; 241 1.1 riastrad y9 = x9; 242 1.1 riastrad y10 = x10; 243 1.1 riastrad y11 = x11; 244 1.1 riastrad y12 = x12; 245 1.1 riastrad y13 = x13; 246 1.1 riastrad y14 = x14; 247 1.1 riastrad y15 = x15; 248 1.1 riastrad for (r = nr; r > 0; r -= 2) { 249 1.1 riastrad CHACHA_QUARTERROUND( y0, y4, y8,y12); 250 1.1 riastrad CHACHA_QUARTERROUND( y1, y5, y9,y13); 251 1.1 riastrad CHACHA_QUARTERROUND( y2, y6,y10,y14); 252 1.1 riastrad CHACHA_QUARTERROUND( y3, y7,y11,y15); 253 1.1 riastrad CHACHA_QUARTERROUND( y0, y5,y10,y15); 254 1.1 riastrad CHACHA_QUARTERROUND( y1, y6,y11,y12); 255 1.1 riastrad CHACHA_QUARTERROUND( y2, y7, y8,y13); 256 1.1 riastrad CHACHA_QUARTERROUND( y3, y4, y9,y14); 257 1.1 riastrad } 258 1.1 riastrad y0 = _mm_add_epi32(y0, x0); 259 1.1 riastrad y1 = _mm_add_epi32(y1, x1); 260 1.1 riastrad y2 = _mm_add_epi32(y2, x2); 261 1.1 riastrad y3 = _mm_add_epi32(y3, x3); 262 1.1 riastrad y4 = _mm_add_epi32(y4, x4); 263 1.1 riastrad y5 = _mm_add_epi32(y5, x5); 264 1.1 riastrad y6 = _mm_add_epi32(y6, x6); 265 1.1 riastrad y7 = _mm_add_epi32(y7, x7); 266 1.1 riastrad y8 = _mm_add_epi32(y8, x8); 267 1.1 riastrad y9 = _mm_add_epi32(y9, x9); 268 1.1 riastrad y10 = _mm_add_epi32(y10, x10); 269 1.1 riastrad y11 = _mm_add_epi32(y11, x11); 270 1.1 riastrad y12 = _mm_add_epi32(y12, x12); 271 1.1 riastrad y13 = _mm_add_epi32(y13, x13); 272 1.1 riastrad y14 = _mm_add_epi32(y14, x14); 273 1.1 riastrad y15 = _mm_add_epi32(y15, x15); 274 1.1 riastrad 275 1.1 riastrad z0 = unpack0_epi32(y0, y1, y2, y3); 277 1.1 riastrad z1 = unpack0_epi32(y4, y5, y6, y7); 278 1.1 riastrad z2 = unpack0_epi32(y8, y9, y10, y11); 279 1.1 riastrad z3 = unpack0_epi32(y12, y13, y14, y15); 280 1.1 riastrad z4 = unpack1_epi32(y0, y1, y2, y3); 281 1.1 riastrad z5 = unpack1_epi32(y4, y5, y6, y7); 282 1.1 riastrad z6 = unpack1_epi32(y8, y9, y10, y11); 283 1.1 riastrad z7 = unpack1_epi32(y12, y13, y14, y15); 284 1.1 riastrad z8 = unpack2_epi32(y0, y1, y2, y3); 285 1.1 riastrad z9 = unpack2_epi32(y4, y5, y6, y7); 286 1.1 riastrad z10 = unpack2_epi32(y8, y9, y10, y11); 287 1.1 riastrad z11 = unpack2_epi32(y12, y13, y14, y15); 288 1.1 riastrad z12 = unpack3_epi32(y0, y1, y2, y3); 289 1.1 riastrad z13 = unpack3_epi32(y4, y5, y6, y7); 290 1.1 riastrad z14 = unpack3_epi32(y8, y9, y10, y11); 291 1.1 riastrad z15 = unpack3_epi32(y12, y13, y14, y15); 292 1.1 riastrad 293 1.1 riastrad storeu_epi32(s + 16*0, z0); 294 1.1 riastrad storeu_epi32(s + 16*1, z1); 295 1.1 riastrad storeu_epi32(s + 16*2, z2); 296 1.1 riastrad storeu_epi32(s + 16*3, z3); 297 1.1 riastrad storeu_epi32(s + 16*4, z4); 298 1.1 riastrad storeu_epi32(s + 16*5, z5); 299 1.1 riastrad storeu_epi32(s + 16*6, z6); 300 1.1 riastrad storeu_epi32(s + 16*7, z7); 301 1.1 riastrad storeu_epi32(s + 16*8, z8); 302 1.1 riastrad storeu_epi32(s + 16*9, z9); 303 1.1 riastrad storeu_epi32(s + 16*10, z10); 304 1.1 riastrad storeu_epi32(s + 16*11, z11); 305 1.1 riastrad storeu_epi32(s + 16*12, z12); 306 1.1 riastrad storeu_epi32(s + 16*13, z13); 307 1.1 riastrad storeu_epi32(s + 16*14, z14); 308 1.1 riastrad storeu_epi32(s + 16*15, z15); 309 1.1 riastrad } 310 1.1 riastrad 311 1.1 riastrad out: if (n) { 313 1.1 riastrad const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); 314 1.1 riastrad __m128i in0, in1, in2, in3; 315 1.1 riastrad __m128i r0, r1, r2, r3; 316 1.2 riastrad 317 1.1 riastrad in0 = _mm_loadu_si128((const __m128i *)chacha_const32); 318 1.1 riastrad in1 = _mm_loadu_si128((const __m128i *)k); 319 1.1 riastrad in2 = _mm_loadu_si128((const __m128i *)k + 1); 320 1.1 riastrad in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), 321 1.1 riastrad le32dec(nonce), blkno); 322 1.1 riastrad 323 1.1 riastrad for (; n; s += 64, n -= 64) { 324 1.1 riastrad r0 = in0; 325 1.1 riastrad r1 = in1; 326 1.2 riastrad r2 = in2; 327 1.2 riastrad r3 = in3; 328 1.2 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 329 1.2 riastrad r0 = _mm_add_epi32(r0, in0); 330 1.2 riastrad r1 = _mm_add_epi32(r1, in1); 331 1.2 riastrad r2 = _mm_add_epi32(r2, in2); 332 1.2 riastrad r3 = _mm_add_epi32(r3, in3); 333 1.2 riastrad 334 1.2 riastrad if (n < 64) { 335 1.2 riastrad uint8_t buf[64] __aligned(16); 336 1.2 riastrad 337 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 0, r0); 338 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 1, r1); 339 1.1 riastrad _mm_storeu_si128((__m128i *)buf + 2, r2); 340 1.1 riastrad _mm_storeu_si128((__m128i *)buf + 3, r3); 341 1.1 riastrad memcpy(s, buf, n); 342 1.1 riastrad 343 1.1 riastrad break; 344 1.1 riastrad } 345 1.1 riastrad 346 1.1 riastrad _mm_storeu_si128((__m128i *)s + 0, r0); 347 1.1 riastrad _mm_storeu_si128((__m128i *)s + 1, r1); 348 1.1 riastrad _mm_storeu_si128((__m128i *)s + 2, r2); 349 1.1 riastrad _mm_storeu_si128((__m128i *)s + 3, r3); 350 1.1 riastrad in3 = _mm_add_epi32(in3, blkno_inc); 351 1.1 riastrad } 352 1.1 riastrad } 353 1.1 riastrad } 354 1.1 riastrad 355 1.1 riastrad void 357 1.1 riastrad chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n, 358 1.1 riastrad uint32_t blkno, 359 1.1 riastrad const uint8_t nonce[static 12], 360 1.1 riastrad const uint8_t k[static 32], 361 1.1 riastrad unsigned nr) 362 1.1 riastrad { 363 1.1 riastrad __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 364 1.1 riastrad __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; 365 1.1 riastrad __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; 366 1.1 riastrad unsigned r; 367 1.1 riastrad 368 1.1 riastrad if (n < 256) 369 1.1 riastrad goto out; 370 1.1 riastrad 371 1.1 riastrad x0 = load1_epi32(chacha_const32 + 0); 372 1.1 riastrad x1 = load1_epi32(chacha_const32 + 4); 373 1.1 riastrad x2 = load1_epi32(chacha_const32 + 8); 374 1.1 riastrad x3 = load1_epi32(chacha_const32 + 12); 375 1.1 riastrad x4 = load1_epi32(k + 0); 376 1.1 riastrad x5 = load1_epi32(k + 4); 377 1.1 riastrad x6 = load1_epi32(k + 8); 378 1.1 riastrad x7 = load1_epi32(k + 12); 379 1.1 riastrad x8 = load1_epi32(k + 16); 380 1.1 riastrad x9 = load1_epi32(k + 20); 381 1.1 riastrad x10 = load1_epi32(k + 24); 382 1.1 riastrad x11 = load1_epi32(k + 28); 383 1.1 riastrad /* x12 set in the loop */ 384 1.1 riastrad x13 = load1_epi32(nonce + 0); 385 1.1 riastrad x14 = load1_epi32(nonce + 4); 386 1.1 riastrad x15 = load1_epi32(nonce + 8); 387 1.1 riastrad 388 1.1 riastrad for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) { 389 1.1 riastrad x12 = _mm_add_epi32(_mm_set1_epi32(blkno), 390 1.1 riastrad _mm_set_epi32(3,2,1,0)); 391 1.1 riastrad y0 = x0; 392 1.1 riastrad y1 = x1; 393 1.1 riastrad y2 = x2; 394 1.1 riastrad y3 = x3; 395 1.1 riastrad y4 = x4; 396 1.1 riastrad y5 = x5; 397 1.1 riastrad y6 = x6; 398 1.1 riastrad y7 = x7; 399 1.1 riastrad y8 = x8; 400 1.1 riastrad y9 = x9; 401 1.1 riastrad y10 = x10; 402 1.1 riastrad y11 = x11; 403 1.1 riastrad y12 = x12; 404 1.1 riastrad y13 = x13; 405 1.1 riastrad y14 = x14; 406 1.1 riastrad y15 = x15; 407 1.1 riastrad for (r = nr; r > 0; r -= 2) { 408 1.1 riastrad CHACHA_QUARTERROUND( y0, y4, y8,y12); 409 1.1 riastrad CHACHA_QUARTERROUND( y1, y5, y9,y13); 410 1.1 riastrad CHACHA_QUARTERROUND( y2, y6,y10,y14); 411 1.1 riastrad CHACHA_QUARTERROUND( y3, y7,y11,y15); 412 1.1 riastrad CHACHA_QUARTERROUND( y0, y5,y10,y15); 413 1.1 riastrad CHACHA_QUARTERROUND( y1, y6,y11,y12); 414 1.1 riastrad CHACHA_QUARTERROUND( y2, y7, y8,y13); 415 1.1 riastrad CHACHA_QUARTERROUND( y3, y4, y9,y14); 416 1.1 riastrad } 417 1.1 riastrad y0 = _mm_add_epi32(y0, x0); 418 1.1 riastrad y1 = _mm_add_epi32(y1, x1); 419 1.1 riastrad y2 = _mm_add_epi32(y2, x2); 420 1.1 riastrad y3 = _mm_add_epi32(y3, x3); 421 1.1 riastrad y4 = _mm_add_epi32(y4, x4); 422 1.1 riastrad y5 = _mm_add_epi32(y5, x5); 423 1.1 riastrad y6 = _mm_add_epi32(y6, x6); 424 1.1 riastrad y7 = _mm_add_epi32(y7, x7); 425 1.1 riastrad y8 = _mm_add_epi32(y8, x8); 426 1.1 riastrad y9 = _mm_add_epi32(y9, x9); 427 1.1 riastrad y10 = _mm_add_epi32(y10, x10); 428 1.1 riastrad y11 = _mm_add_epi32(y11, x11); 429 1.1 riastrad y12 = _mm_add_epi32(y12, x12); 430 1.1 riastrad y13 = _mm_add_epi32(y13, x13); 431 1.1 riastrad y14 = _mm_add_epi32(y14, x14); 432 1.1 riastrad y15 = _mm_add_epi32(y15, x15); 433 1.1 riastrad 434 1.1 riastrad z0 = unpack0_epi32(y0, y1, y2, y3); 436 1.1 riastrad z1 = unpack0_epi32(y4, y5, y6, y7); 437 1.1 riastrad z2 = unpack0_epi32(y8, y9, y10, y11); 438 1.1 riastrad z3 = unpack0_epi32(y12, y13, y14, y15); 439 1.1 riastrad z4 = unpack1_epi32(y0, y1, y2, y3); 440 1.1 riastrad z5 = unpack1_epi32(y4, y5, y6, y7); 441 1.1 riastrad z6 = unpack1_epi32(y8, y9, y10, y11); 442 1.1 riastrad z7 = unpack1_epi32(y12, y13, y14, y15); 443 1.1 riastrad z8 = unpack2_epi32(y0, y1, y2, y3); 444 1.1 riastrad z9 = unpack2_epi32(y4, y5, y6, y7); 445 1.1 riastrad z10 = unpack2_epi32(y8, y9, y10, y11); 446 1.1 riastrad z11 = unpack2_epi32(y12, y13, y14, y15); 447 1.1 riastrad z12 = unpack3_epi32(y0, y1, y2, y3); 448 1.1 riastrad z13 = unpack3_epi32(y4, y5, y6, y7); 449 1.1 riastrad z14 = unpack3_epi32(y8, y9, y10, y11); 450 1.1 riastrad z15 = unpack3_epi32(y12, y13, y14, y15); 451 1.1 riastrad 452 1.1 riastrad storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0); 453 1.1 riastrad storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1); 454 1.1 riastrad storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2); 455 1.1 riastrad storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3); 456 1.1 riastrad storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4); 457 1.1 riastrad storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5); 458 1.1 riastrad storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6); 459 1.1 riastrad storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7); 460 1.1 riastrad storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8); 461 1.1 riastrad storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9); 462 1.1 riastrad storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10); 463 1.1 riastrad storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11); 464 1.1 riastrad storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12); 465 1.1 riastrad storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13); 466 1.1 riastrad storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14); 467 1.1 riastrad storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15); 468 1.1 riastrad } 469 1.1 riastrad 470 1.1 riastrad out: if (n) { 472 1.2 riastrad const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); 473 1.1 riastrad __m128i in0, in1, in2, in3; 474 1.1 riastrad __m128i r0, r1, r2, r3; 475 1.1 riastrad 476 1.1 riastrad in0 = _mm_loadu_si128((const __m128i *)chacha_const32); 477 1.1 riastrad in1 = _mm_loadu_si128((const __m128i *)k); 478 1.1 riastrad in2 = _mm_loadu_si128((const __m128i *)k + 1); 479 1.1 riastrad in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), 480 1.1 riastrad le32dec(nonce), blkno); 481 1.1 riastrad 482 1.2 riastrad for (; n; s += 64, p += 64, n -= 64) { 483 1.2 riastrad r0 = in0; 484 1.2 riastrad r1 = in1; 485 1.2 riastrad r2 = in2; 486 1.2 riastrad r3 = in3; 487 1.2 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 488 1.2 riastrad r0 = _mm_add_epi32(r0, in0); 489 1.2 riastrad r1 = _mm_add_epi32(r1, in1); 490 1.2 riastrad r2 = _mm_add_epi32(r2, in2); 491 1.2 riastrad r3 = _mm_add_epi32(r3, in3); 492 1.2 riastrad 493 1.2 riastrad if (n < 64) { 494 1.2 riastrad uint8_t buf[64] __aligned(16); 495 1.2 riastrad unsigned i; 496 1.2 riastrad 497 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 0, r0); 498 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 1, r1); 499 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 2, r2); 500 1.2 riastrad _mm_storeu_si128((__m128i *)buf + 3, r3); 501 1.1 riastrad 502 1.1 riastrad for (i = 0; i < n - n%4; i += 4) 503 1.1 riastrad le32enc(s + i, 504 1.1 riastrad le32dec(p + i) ^ le32dec(buf + i)); 505 1.1 riastrad for (; i < n; i++) 506 1.1 riastrad s[i] = p[i] ^ buf[i]; 507 1.1 riastrad 508 1.1 riastrad break; 509 1.1 riastrad } 510 1.1 riastrad 511 1.1 riastrad r0 ^= _mm_loadu_si128((const __m128i *)p + 0); 512 1.1 riastrad r1 ^= _mm_loadu_si128((const __m128i *)p + 1); 513 1.1 riastrad r2 ^= _mm_loadu_si128((const __m128i *)p + 2); 514 1.1 riastrad r3 ^= _mm_loadu_si128((const __m128i *)p + 3); 515 1.1 riastrad _mm_storeu_si128((__m128i *)s + 0, r0); 516 1.1 riastrad _mm_storeu_si128((__m128i *)s + 1, r1); 517 1.1 riastrad _mm_storeu_si128((__m128i *)s + 2, r2); 518 1.1 riastrad _mm_storeu_si128((__m128i *)s + 3, r3); 519 1.1 riastrad in3 = _mm_add_epi32(in3, blkno_inc); 520 1.1 riastrad } 521 1.1 riastrad } 522 1.1 riastrad } 523 1.1 riastrad 524 1.1 riastrad void 526 1.1 riastrad xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes, 527 1.1 riastrad uint32_t blkno, 528 1.1 riastrad const uint8_t nonce[static 24], 529 1.1 riastrad const uint8_t k[static 32], 530 1.1 riastrad unsigned nr) 531 1.1 riastrad { 532 1.1 riastrad uint8_t subkey[32]; 533 1.1 riastrad uint8_t subnonce[12]; 534 1.1 riastrad 535 1.1 riastrad hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 536 1.1 riastrad memset(subnonce, 0, 4); 537 1.1 riastrad memcpy(subnonce + 4, nonce + 16, 8); 538 1.1 riastrad chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr); 539 1.1 riastrad } 540 1.1 riastrad 541 1.1 riastrad void 542 1.1 riastrad xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 543 1.1 riastrad uint32_t blkno, 544 1.1 riastrad const uint8_t nonce[static 24], 545 const uint8_t k[static 32], 546 unsigned nr) 547 { 548 uint8_t subkey[32]; 549 uint8_t subnonce[12]; 550 551 hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 552 memset(subnonce, 0, 4); 553 memcpy(subnonce + 4, nonce + 16, 8); 554 chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr); 555 } 556