1 1.2 riastrad /* $NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $ */ 2 1.1 riastrad 3 1.1 riastrad /*- 4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 1.1 riastrad * All rights reserved. 6 1.1 riastrad * 7 1.1 riastrad * Redistribution and use in source and binary forms, with or without 8 1.1 riastrad * modification, are permitted provided that the following conditions 9 1.1 riastrad * are met: 10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright 11 1.1 riastrad * notice, this list of conditions and the following disclaimer. 12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the 14 1.1 riastrad * documentation and/or other materials provided with the distribution. 15 1.1 riastrad * 16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE. 27 1.1 riastrad */ 28 1.1 riastrad 29 1.1 riastrad /* 30 1.1 riastrad * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES 31 1.1 riastrad * software, at <https://crypto.stanford.edu/vpaes/>, described in 32 1.1 riastrad * 33 1.1 riastrad * Mike Hamburg, `Accelerating AES with Vector Permute 34 1.1 riastrad * Instructions', in Christophe Clavier and Kris Gaj (eds.), 35 1.1 riastrad * Cryptographic Hardware and Embedded Systems -- CHES 2009, 36 1.1 riastrad * Springer LNCS 5747, pp. 18-32. 37 1.1 riastrad * 38 1.1 riastrad * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 39 1.1 riastrad */ 40 1.1 riastrad 41 1.1 riastrad #include <sys/cdefs.h> 42 1.2 riastrad __KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $"); 43 1.1 riastrad 44 1.1 riastrad #include <sys/types.h> 45 1.1 riastrad 46 1.2 riastrad #ifdef _KERNEL 47 1.1 riastrad #include <sys/systm.h> 48 1.2 riastrad #else 49 1.2 riastrad #include <err.h> 50 1.2 riastrad #define panic(fmt, args...) err(1, fmt, ##args) 51 1.2 riastrad #endif 52 1.1 riastrad 53 1.1 riastrad #include "aes_ssse3_impl.h" 54 1.1 riastrad 55 1.1 riastrad static const union m128const { 56 1.1 riastrad uint64_t u64[2]; 57 1.1 riastrad __m128i m; 58 1.1 riastrad } 59 1.1 riastrad mc_forward[4] = { 60 1.1 riastrad {.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}}, 61 1.1 riastrad {.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}}, 62 1.1 riastrad {.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}}, 63 1.1 riastrad {.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}}, 64 1.1 riastrad }, 65 1.1 riastrad mc_backward[4] = { 66 1.1 riastrad {.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}}, 67 1.1 riastrad {.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}}, 68 1.1 riastrad {.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}}, 69 1.1 riastrad {.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}}, 70 1.1 riastrad }, 71 1.1 riastrad ipt[2] = { 72 1.1 riastrad {.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}}, 73 1.1 riastrad {.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}}, 74 1.1 riastrad }, 75 1.1 riastrad opt[2] = { 76 1.1 riastrad {.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}}, 77 1.1 riastrad {.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}}, 78 1.1 riastrad }, 79 1.1 riastrad dipt[2] = { 80 1.1 riastrad {.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}}, 81 1.1 riastrad {.u64 = {0x86E383E660056500, 0x12771772F491F194}}, 82 1.1 riastrad }, 83 1.1 riastrad sb1[2] = { 84 1.1 riastrad {.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}}, 85 1.1 riastrad {.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}}, 86 1.1 riastrad }, 87 1.1 riastrad sb2[2] = { 88 1.1 riastrad {.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}}, 89 1.1 riastrad {.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}}, 90 1.1 riastrad }, 91 1.1 riastrad sbo[2] = { 92 1.1 riastrad {.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}}, 93 1.1 riastrad {.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}}, 94 1.1 riastrad }, 95 1.1 riastrad dsb9[2] = { 96 1.1 riastrad {.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}}, 97 1.1 riastrad {.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}}, 98 1.1 riastrad }, 99 1.1 riastrad dsbd[2] = { 100 1.1 riastrad {.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}}, 101 1.1 riastrad {.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}}, 102 1.1 riastrad }, 103 1.1 riastrad dsbb[2] = { 104 1.1 riastrad {.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}}, 105 1.1 riastrad {.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}}, 106 1.1 riastrad }, 107 1.1 riastrad dsbe[2] = { 108 1.1 riastrad {.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}}, 109 1.1 riastrad {.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}}, 110 1.1 riastrad }, 111 1.1 riastrad dsbo[2] = { 112 1.1 riastrad {.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}}, 113 1.1 riastrad {.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}}, 114 1.1 riastrad }, 115 1.1 riastrad dks1[2] = { 116 1.1 riastrad {.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}}, 117 1.1 riastrad {.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}}, 118 1.1 riastrad }, 119 1.1 riastrad dks2[2] = { 120 1.1 riastrad {.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}}, 121 1.1 riastrad {.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}}, 122 1.1 riastrad }, 123 1.1 riastrad dks3[2] = { 124 1.1 riastrad {.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}}, 125 1.1 riastrad {.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}}, 126 1.1 riastrad }, 127 1.1 riastrad dks4[2] = { 128 1.1 riastrad {.u64 = {0xE3C390B053732000, 0xA080D3F310306343}}, 129 1.1 riastrad {.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}}, 130 1.1 riastrad }, 131 1.1 riastrad deskew[2] = { 132 1.1 riastrad {.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}}, 133 1.1 riastrad {.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}}, 134 1.1 riastrad }, 135 1.1 riastrad sr[4] = { 136 1.1 riastrad {.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}}, 137 1.1 riastrad {.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}}, 138 1.1 riastrad {.u64 = {0x0F060D040B020900, 0x070E050C030A0108}}, 139 1.1 riastrad {.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}}, 140 1.1 riastrad }, 141 1.1 riastrad rcon = {.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}}, 142 1.1 riastrad s63 = {.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}}, 143 1.1 riastrad of = {.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}}, 144 1.1 riastrad inv = {.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}}, 145 1.1 riastrad inva = {.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}}; 146 1.1 riastrad 147 1.1 riastrad static inline __m128i 148 1.1 riastrad loadroundkey(const uint32_t *rk32) 149 1.1 riastrad { 150 1.1 riastrad return _mm_load_si128((const void *)rk32); 151 1.1 riastrad } 152 1.1 riastrad 153 1.1 riastrad static inline void 154 1.1 riastrad storeroundkey(uint32_t *rk32, __m128i rk) 155 1.1 riastrad { 156 1.1 riastrad _mm_store_si128((void *)rk32, rk); 157 1.1 riastrad } 158 1.1 riastrad 159 1.1 riastrad /* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */ 160 1.1 riastrad static inline void 161 1.1 riastrad bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x) 162 1.1 riastrad { 163 1.1 riastrad 164 1.1 riastrad *lo = x & of.m; 165 1.1 riastrad *hi = _mm_srli_epi32(x & ~of.m, 4); 166 1.1 riastrad } 167 1.1 riastrad 168 1.1 riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c. */ 169 1.1 riastrad static inline __m128i 170 1.1 riastrad gf16_inva(__m128i x) 171 1.1 riastrad { 172 1.1 riastrad return _mm_shuffle_epi8(inva.m, x); 173 1.1 riastrad } 174 1.1 riastrad 175 1.1 riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c. */ 176 1.1 riastrad static inline __m128i 177 1.1 riastrad gf16_inv(__m128i x) 178 1.1 riastrad { 179 1.1 riastrad return _mm_shuffle_epi8(inv.m, x); 180 1.1 riastrad } 181 1.1 riastrad 182 1.1 riastrad /* 183 1.1 riastrad * t is a pair of maps respectively from low and high nybbles to bytes. 184 1.1 riastrad * Apply t the nybbles, and add the results in GF(2). 185 1.1 riastrad */ 186 1.1 riastrad static __m128i 187 1.1 riastrad aes_schedule_transform(__m128i x, const union m128const t[static 2]) 188 1.1 riastrad { 189 1.1 riastrad __m128i lo, hi; 190 1.1 riastrad 191 1.1 riastrad bytes2nybbles(&lo, &hi, x); 192 1.1 riastrad return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi); 193 1.1 riastrad } 194 1.1 riastrad 195 1.1 riastrad static inline void 196 1.1 riastrad subbytes(__m128i *io, __m128i *jo, __m128i x) 197 1.1 riastrad { 198 1.1 riastrad __m128i k, i, ak, j; 199 1.1 riastrad 200 1.1 riastrad bytes2nybbles(&k, &i, x); 201 1.1 riastrad ak = gf16_inva(k); 202 1.1 riastrad j = i ^ k; 203 1.1 riastrad *io = j ^ gf16_inv(ak ^ gf16_inv(i)); 204 1.1 riastrad *jo = i ^ gf16_inv(ak ^ gf16_inv(j)); 205 1.1 riastrad } 206 1.1 riastrad 207 1.1 riastrad static __m128i 208 1.1 riastrad aes_schedule_low_round(__m128i rk, __m128i prk) 209 1.1 riastrad { 210 1.1 riastrad __m128i io, jo; 211 1.1 riastrad 212 1.1 riastrad /* smear prk */ 213 1.1 riastrad prk ^= _mm_slli_si128(prk, 4); 214 1.1 riastrad prk ^= _mm_slli_si128(prk, 8); 215 1.1 riastrad prk ^= s63.m; 216 1.1 riastrad 217 1.1 riastrad /* subbytes */ 218 1.1 riastrad subbytes(&io, &jo, rk); 219 1.1 riastrad rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo); 220 1.1 riastrad 221 1.1 riastrad /* add in smeared stuff */ 222 1.1 riastrad return rk ^ prk; 223 1.1 riastrad } 224 1.1 riastrad 225 1.1 riastrad static __m128i 226 1.1 riastrad aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot) 227 1.1 riastrad { 228 1.1 riastrad 229 1.1 riastrad /* extract rcon from rcon_rot */ 230 1.1 riastrad prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15); 231 1.1 riastrad *rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15); 232 1.1 riastrad 233 1.1 riastrad /* rotate */ 234 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff); 235 1.1 riastrad rk = _mm_alignr_epi8(rk, rk, 1); 236 1.1 riastrad 237 1.1 riastrad return aes_schedule_low_round(rk, prk); 238 1.1 riastrad } 239 1.1 riastrad 240 1.1 riastrad static __m128i 241 1.1 riastrad aes_schedule_mangle_enc(__m128i x, __m128i sr_i) 242 1.1 riastrad { 243 1.1 riastrad __m128i y = _mm_setzero_si128(); 244 1.1 riastrad 245 1.1 riastrad x ^= s63.m; 246 1.1 riastrad 247 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m); 248 1.1 riastrad y ^= x; 249 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m); 250 1.1 riastrad y ^= x; 251 1.1 riastrad x = _mm_shuffle_epi8(x, mc_forward[0].m); 252 1.1 riastrad y ^= x; 253 1.1 riastrad 254 1.1 riastrad return _mm_shuffle_epi8(y, sr_i); 255 1.1 riastrad } 256 1.1 riastrad 257 1.1 riastrad static __m128i 258 1.1 riastrad aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i) 259 1.1 riastrad { 260 1.1 riastrad 261 1.1 riastrad return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt); 262 1.1 riastrad } 263 1.1 riastrad 264 1.1 riastrad static __m128i 265 1.1 riastrad aes_schedule_mangle_dec(__m128i x, __m128i sr_i) 266 1.1 riastrad { 267 1.1 riastrad __m128i y = _mm_setzero_si128(); 268 1.1 riastrad 269 1.1 riastrad x = aes_schedule_transform(x, dks1); 270 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m); 271 1.1 riastrad x = aes_schedule_transform(x, dks2); 272 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m); 273 1.1 riastrad x = aes_schedule_transform(x, dks3); 274 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m); 275 1.1 riastrad x = aes_schedule_transform(x, dks4); 276 1.1 riastrad y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m); 277 1.1 riastrad 278 1.1 riastrad return _mm_shuffle_epi8(y, sr_i); 279 1.1 riastrad } 280 1.1 riastrad 281 1.1 riastrad static __m128i 282 1.1 riastrad aes_schedule_mangle_last_dec(__m128i x) 283 1.1 riastrad { 284 1.1 riastrad 285 1.1 riastrad return aes_schedule_transform(x ^ s63.m, deskew); 286 1.1 riastrad } 287 1.1 riastrad 288 1.1 riastrad static __m128i 289 1.1 riastrad aes_schedule_192_smear(__m128i prkhi, __m128i prk) 290 1.1 riastrad { 291 1.1 riastrad __m128i rk; 292 1.1 riastrad 293 1.1 riastrad rk = prkhi; 294 1.1 riastrad rk ^= _mm_shuffle_epi32(prkhi, 0x80); 295 1.1 riastrad rk ^= _mm_shuffle_epi32(prk, 0xfe); 296 1.1 riastrad 297 1.1 riastrad return rk; 298 1.1 riastrad } 299 1.1 riastrad 300 1.1 riastrad static __m128i 301 1.1 riastrad aes_schedule_192_smearhi(__m128i rk) 302 1.1 riastrad { 303 1.1 riastrad return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps()); 304 1.1 riastrad } 305 1.1 riastrad 306 1.1 riastrad void 307 1.1 riastrad aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds) 308 1.1 riastrad { 309 1.1 riastrad uint32_t *rk32 = enc->aese_aes.aes_rk; 310 1.1 riastrad __m128i mrk; /* mangled round key */ 311 1.1 riastrad __m128i rk; /* round key */ 312 1.1 riastrad __m128i prk; /* previous round key */ 313 1.1 riastrad __m128i rcon_rot = rcon.m; 314 1.1 riastrad uint64_t i = 3; 315 1.1 riastrad 316 1.1 riastrad /* input transform */ 317 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt); 318 1.1 riastrad storeroundkey(rk32, rk); 319 1.1 riastrad rk32 += 4; 320 1.1 riastrad 321 1.1 riastrad switch (nrounds) { 322 1.1 riastrad case 10: 323 1.1 riastrad for (;;) { 324 1.1 riastrad rk = aes_schedule_round(rk, rk, &rcon_rot); 325 1.1 riastrad if (--nrounds == 0) 326 1.1 riastrad break; 327 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 328 1.1 riastrad storeroundkey(rk32, mrk); 329 1.1 riastrad rk32 += 4; 330 1.1 riastrad } 331 1.1 riastrad break; 332 1.1 riastrad case 12: { 333 1.1 riastrad __m128i prkhi; /* high half of previous round key */ 334 1.1 riastrad 335 1.1 riastrad prk = rk; 336 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt); 337 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 338 1.1 riastrad for (;;) { 339 1.1 riastrad prk = aes_schedule_round(rk, prk, &rcon_rot); 340 1.1 riastrad rk = _mm_alignr_epi8(prk, prkhi, 8); 341 1.1 riastrad 342 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 343 1.1 riastrad storeroundkey(rk32, mrk); 344 1.1 riastrad rk32 += 4; 345 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk); 346 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 347 1.1 riastrad 348 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 349 1.1 riastrad storeroundkey(rk32, mrk); 350 1.1 riastrad rk32 += 4; 351 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 352 1.1 riastrad if ((nrounds -= 3) == 0) 353 1.1 riastrad break; 354 1.1 riastrad 355 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 356 1.1 riastrad storeroundkey(rk32, mrk); 357 1.1 riastrad rk32 += 4; 358 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk); 359 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 360 1.1 riastrad } 361 1.1 riastrad break; 362 1.1 riastrad } 363 1.1 riastrad case 14: { 364 1.1 riastrad __m128i pprk; /* previous previous round key */ 365 1.1 riastrad 366 1.1 riastrad prk = rk; 367 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt); 368 1.1 riastrad for (;;) { 369 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 370 1.1 riastrad storeroundkey(rk32, mrk); 371 1.1 riastrad rk32 += 4; 372 1.1 riastrad pprk = rk; 373 1.1 riastrad 374 1.1 riastrad /* high round */ 375 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 376 1.1 riastrad if ((nrounds -= 2) == 0) 377 1.1 riastrad break; 378 1.1 riastrad mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m); 379 1.1 riastrad storeroundkey(rk32, mrk); 380 1.1 riastrad rk32 += 4; 381 1.1 riastrad 382 1.1 riastrad /* low round */ 383 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff); 384 1.1 riastrad rk = aes_schedule_low_round(rk, pprk); 385 1.1 riastrad } 386 1.1 riastrad break; 387 1.1 riastrad } 388 1.1 riastrad default: 389 1.1 riastrad panic("invalid number of AES rounds: %u", nrounds); 390 1.1 riastrad } 391 1.1 riastrad storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m)); 392 1.1 riastrad } 393 1.1 riastrad 394 1.1 riastrad void 395 1.1 riastrad aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds) 396 1.1 riastrad { 397 1.1 riastrad uint32_t *rk32 = dec->aesd_aes.aes_rk; 398 1.1 riastrad __m128i mrk; /* mangled round key */ 399 1.1 riastrad __m128i ork; /* original round key */ 400 1.1 riastrad __m128i rk; /* round key */ 401 1.1 riastrad __m128i prk; /* previous round key */ 402 1.1 riastrad __m128i rcon_rot = rcon.m; 403 1.1 riastrad unsigned i = nrounds == 12 ? 0 : 2; 404 1.1 riastrad 405 1.1 riastrad ork = _mm_loadu_epi8(key); 406 1.1 riastrad 407 1.1 riastrad /* input transform */ 408 1.1 riastrad rk = aes_schedule_transform(ork, ipt); 409 1.1 riastrad 410 1.1 riastrad /* go from end */ 411 1.1 riastrad rk32 += 4*nrounds; 412 1.1 riastrad storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m)); 413 1.1 riastrad rk32 -= 4; 414 1.1 riastrad i ^= 3; 415 1.1 riastrad 416 1.1 riastrad switch (nrounds) { 417 1.1 riastrad case 10: 418 1.1 riastrad for (;;) { 419 1.1 riastrad rk = aes_schedule_round(rk, rk, &rcon_rot); 420 1.1 riastrad if (--nrounds == 0) 421 1.1 riastrad break; 422 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 423 1.1 riastrad storeroundkey(rk32, mrk); 424 1.1 riastrad rk32 -= 4; 425 1.1 riastrad } 426 1.1 riastrad break; 427 1.1 riastrad case 12: { 428 1.1 riastrad __m128i prkhi; /* high half of previous round key */ 429 1.1 riastrad 430 1.1 riastrad prk = rk; 431 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt); 432 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 433 1.1 riastrad for (;;) { 434 1.1 riastrad prk = aes_schedule_round(rk, prk, &rcon_rot); 435 1.1 riastrad rk = _mm_alignr_epi8(prk, prkhi, 8); 436 1.1 riastrad 437 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 438 1.1 riastrad storeroundkey(rk32, mrk); 439 1.1 riastrad rk32 -= 4; 440 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk); 441 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 442 1.1 riastrad 443 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 444 1.1 riastrad storeroundkey(rk32, mrk); 445 1.1 riastrad rk32 -= 4; 446 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 447 1.1 riastrad if ((nrounds -= 3) == 0) 448 1.1 riastrad break; 449 1.1 riastrad 450 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 451 1.1 riastrad storeroundkey(rk32, mrk); 452 1.1 riastrad rk32 -= 4; 453 1.1 riastrad rk = aes_schedule_192_smear(prkhi, prk); 454 1.1 riastrad prkhi = aes_schedule_192_smearhi(rk); 455 1.1 riastrad } 456 1.1 riastrad break; 457 1.1 riastrad } 458 1.1 riastrad case 14: { 459 1.1 riastrad __m128i pprk; /* previous previous round key */ 460 1.1 riastrad 461 1.1 riastrad prk = rk; 462 1.1 riastrad rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt); 463 1.1 riastrad for (;;) { 464 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 465 1.1 riastrad storeroundkey(rk32, mrk); 466 1.1 riastrad rk32 -= 4; 467 1.1 riastrad pprk = rk; 468 1.1 riastrad 469 1.1 riastrad /* high round */ 470 1.1 riastrad rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 471 1.1 riastrad if ((nrounds -= 2) == 0) 472 1.1 riastrad break; 473 1.1 riastrad mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m); 474 1.1 riastrad storeroundkey(rk32, mrk); 475 1.1 riastrad rk32 -= 4; 476 1.1 riastrad 477 1.1 riastrad /* low round */ 478 1.1 riastrad rk = _mm_shuffle_epi32(rk, 0xff); 479 1.1 riastrad rk = aes_schedule_low_round(rk, pprk); 480 1.1 riastrad } 481 1.1 riastrad break; 482 1.1 riastrad } 483 1.1 riastrad default: 484 1.1 riastrad panic("invalid number of AES rounds: %u", nrounds); 485 1.1 riastrad } 486 1.1 riastrad storeroundkey(rk32, aes_schedule_mangle_last_dec(rk)); 487 1.1 riastrad } 488 1.1 riastrad 489 1.1 riastrad __m128i 490 1.1 riastrad aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds) 491 1.1 riastrad { 492 1.1 riastrad const uint32_t *rk32 = enc->aese_aes.aes_rk; 493 1.1 riastrad __m128i io, jo; 494 1.1 riastrad unsigned rmod4 = 0; 495 1.1 riastrad 496 1.1 riastrad x = aes_schedule_transform(x, ipt); 497 1.1 riastrad x ^= loadroundkey(rk32); 498 1.1 riastrad for (;;) { 499 1.1 riastrad __m128i A, A2, A2_B, A2_B_D; 500 1.1 riastrad 501 1.1 riastrad subbytes(&io, &jo, x); 502 1.1 riastrad 503 1.1 riastrad rk32 += 4; 504 1.1 riastrad rmod4 = (rmod4 + 1) % 4; 505 1.1 riastrad if (--nrounds == 0) 506 1.1 riastrad break; 507 1.1 riastrad 508 1.1 riastrad A = _mm_shuffle_epi8(sb1[0].m, io) ^ 509 1.1 riastrad _mm_shuffle_epi8(sb1[1].m, jo); 510 1.1 riastrad A ^= loadroundkey(rk32); 511 1.1 riastrad A2 = _mm_shuffle_epi8(sb2[0].m, io) ^ 512 1.1 riastrad _mm_shuffle_epi8(sb2[1].m, jo); 513 1.1 riastrad A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m); 514 1.1 riastrad A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m); 515 1.1 riastrad x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m); 516 1.1 riastrad } 517 1.1 riastrad x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo); 518 1.1 riastrad x ^= loadroundkey(rk32); 519 1.1 riastrad return _mm_shuffle_epi8(x, sr[rmod4].m); 520 1.1 riastrad } 521 1.1 riastrad 522 1.1 riastrad __m128i 523 1.1 riastrad aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds) 524 1.1 riastrad { 525 1.1 riastrad const uint32_t *rk32 = dec->aesd_aes.aes_rk; 526 1.1 riastrad unsigned i = 3 & ~(nrounds - 1); 527 1.1 riastrad __m128i io, jo, mc; 528 1.1 riastrad 529 1.1 riastrad x = aes_schedule_transform(x, dipt); 530 1.1 riastrad x ^= loadroundkey(rk32); 531 1.1 riastrad rk32 += 4; 532 1.1 riastrad 533 1.1 riastrad mc = mc_forward[3].m; 534 1.1 riastrad for (;;) { 535 1.1 riastrad subbytes(&io, &jo, x); 536 1.1 riastrad if (--nrounds == 0) 537 1.1 riastrad break; 538 1.1 riastrad 539 1.1 riastrad x = _mm_shuffle_epi8(dsb9[0].m, io) ^ 540 1.1 riastrad _mm_shuffle_epi8(dsb9[1].m, jo); 541 1.1 riastrad x ^= loadroundkey(rk32); 542 1.1 riastrad rk32 += 4; /* next round key */ 543 1.1 riastrad 544 1.1 riastrad x = _mm_shuffle_epi8(x, mc); 545 1.1 riastrad x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^ 546 1.1 riastrad _mm_shuffle_epi8(dsbd[1].m, jo); 547 1.1 riastrad 548 1.1 riastrad x = _mm_shuffle_epi8(x, mc); 549 1.1 riastrad x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^ 550 1.1 riastrad _mm_shuffle_epi8(dsbb[1].m, jo); 551 1.1 riastrad 552 1.1 riastrad x = _mm_shuffle_epi8(x, mc); 553 1.1 riastrad x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^ 554 1.1 riastrad _mm_shuffle_epi8(dsbe[1].m, jo); 555 1.1 riastrad 556 1.1 riastrad mc = _mm_alignr_epi8(mc, mc, 12); 557 1.1 riastrad } 558 1.1 riastrad x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo); 559 1.1 riastrad x ^= loadroundkey(rk32); 560 1.1 riastrad return _mm_shuffle_epi8(x, sr[i].m); 561 1.1 riastrad } 562