/*	$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include "aes_ssse3_impl.h"

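/*
 * The entire implementation hangs off one SSSE3 instruction: pshufb
 * (_mm_shuffle_epi8) treats its first operand as a 16-byte table and
 * performs sixteen 4-bit table lookups in parallel -- each output
 * byte is the table byte selected by the low four bits of the
 * corresponding index byte, or zero if the index byte's high bit is
 * set.  A minimal sketch of the idiom follows; the guard macro and
 * function are illustrative only and not part of this file's
 * interface.
 */
#ifdef AES_SSSE3_EXAMPLE	/* hypothetical guard; never defined */
static inline __m128i
example_nybble_lookup(__m128i table, __m128i x)
{

	/* Keep only the low nybble of each byte so no index exceeds 15.  */
	x &= _mm_set1_epi8(0x0f);

	/* Replace each of the sixteen nybbles by its table entry.  */
	return _mm_shuffle_epi8(table, x);
}
#endif

/*
 * Lookup tables, from Hamburg's vpaes software.  Each pair t[2] is a
 * 16-entry table of output bytes for the low nybbles (t[0]) and for
 * the high nybbles (t[1]) of each input byte, combined by
 * pshufb-and-XOR as in aes_schedule_transform below; sr[] holds the
 * ShiftRows byte permutations, and mc_forward/mc_backward the lane
 * rotations used to synthesize MixColumns.
 */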
static const union m128const {
	uint64_t u64[2];
	__m128i m;
}
mc_forward[4] = {
	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
},
mc_backward[4] = {
	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
},
ipt[2] = {
	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
},
opt[2] = {
	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
},
dipt[2] = {
	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
},
sb1[2] = {
	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
},
sb2[2] = {
	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
},
sbo[2] = {
	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
},
dsb9[2] = {
	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
},
dsbd[2] = {
	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
},
dsbb[2] = {
	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
},
dsbe[2] = {
	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
},
dsbo[2] = {
	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
},
dks1[2] = {
	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
},
dks2[2] = {
	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
},
dks3[2] = {
	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
},
dks4[2] = {
	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
},
deskew[2] = {
	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
},
sr[4] = {
	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
},
rcon =	{.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},
s63 =	{.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},
of =	{.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},
inv =	{.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},
inva =	{.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};

static inline __m128i
loadroundkey(const uint32_t *rk32)
{
	return _mm_load_si128((const void *)rk32);
}

static inline void
storeroundkey(uint32_t *rk32, __m128i rk)
{
	_mm_store_si128((void *)rk32, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
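/* E.g., the byte 0xab splits into low nybble 0x0b and high nybble 0x0a.  */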
static inline void
bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
{

	*lo = x & of.m;
	*hi = _mm_srli_epi32(x & ~of.m, 4);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c.  */
static inline __m128i
gf16_inva(__m128i x)
{
	return _mm_shuffle_epi8(inva.m, x);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c.  */
static inline __m128i
gf16_inv(__m128i x)
{
	return _mm_shuffle_epi8(inv.m, x);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static __m128i
aes_schedule_transform(__m128i x, const union m128const t[static 2])
{
	__m128i lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
}

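/*
 * For reference, a scalar sketch of the same transform (the helper is
 * illustrative only): each byte b of the input is replaced by
 * t[0][b & 0xf] ^ t[1][b >> 4].
 */
#ifdef AES_SSSE3_EXAMPLE	/* hypothetical guard; never defined */
static void
example_transform_scalar(uint8_t out[16], const uint8_t in[16],
    const union m128const t[static 2])
{
	const uint8_t *tlo = (const uint8_t *)&t[0].m;
	const uint8_t *thi = (const uint8_t *)&t[1].m;
	unsigned i;

	/* Look each nybble up separately and add the results in GF(2).  */
	for (i = 0; i < 16; i++)
		out[i] = tlo[in[i] & 0xf] ^ thi[in[i] >> 4];
}
#endif

/*
 * Compute the S-box inversion nybble-wise, as in Hamburg's paper:
 * split the input into nybbles, invert via the 16-entry inverse
 * tables above (GF(2^8) viewed as a degree-2 extension of GF(2^4)),
 * and return two nybble vectors io/jo that index the split output
 * tables (sb1/sb2, dsb*, sbo/dsbo).  A gloss of the structure; see
 * the CHES 2009 paper for the algebra.
 */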
static inline void
subbytes(__m128i *io, __m128i *jo, __m128i x)
{
	__m128i k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = gf16_inva(k);
	j = i ^ k;
	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
}

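/*
 * One round of the key schedule, minus the word rotation and round
 * constant (aes_schedule_round applies those).  The two byte shifts
 * turn prk into its running XOR -- word j of the result is the XOR of
 * words 0 through j of prk -- which is how FIPS-197's recurrence
 * w[i] = w[i-1] ^ w[i-4] unrolls within one 128-bit block.  The XOR
 * with s63 (0x5b in every byte) appears to account for the S-box's
 * affine constant in the transformed basis; see Hamburg's paper.
 */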
static __m128i
aes_schedule_low_round(__m128i rk, __m128i prk)
{
	__m128i io, jo;

	/* smear prk */
	prk ^= _mm_slli_si128(prk, 4);
	prk ^= _mm_slli_si128(prk, 8);
	prk ^= s63.m;

	/* subbytes */
	subbytes(&io, &jo, rk);
	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

static __m128i
aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
{

	/* extract rcon from rcon_rot */
	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk = _mm_shuffle_epi32(rk, 0xff);
	rk = _mm_alignr_epi8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}
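
/*
 * For comparison, the conventional FIPS-197 formulation of the same
 * step on one 32-bit word -- RotWord, SubWord, then XOR with the
 * round constant.  A sketch only: the sbox argument is a hypothetical
 * caller-supplied table, and the vector code above also works in
 * vpaes's transformed basis rather than on raw bytes.
 */
#ifdef AES_SSSE3_EXAMPLE	/* hypothetical guard; never defined */
static uint32_t
example_keysched_core(uint32_t w, uint8_t rconbyte, const uint8_t sbox[256])
{

	/* RotWord: rotate the (little-endian) word by one byte.  */
	w = (w >> 8) | (w << 24);

	/* SubWord: apply the S-box to each byte.  */
	w = (uint32_t)sbox[w & 0xff] |
	    ((uint32_t)sbox[(w >> 8) & 0xff] << 8) |
	    ((uint32_t)sbox[(w >> 16) & 0xff] << 16) |
	    ((uint32_t)sbox[(w >> 24) & 0xff] << 24);

	/* XOR the round constant into the first byte.  */
	return w ^ rconbyte;
}
#endif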

static __m128i
aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x ^= s63.m;

	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
{

	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
}

static __m128i
aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x = aes_schedule_transform(x, dks1);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks2);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks3);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks4);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_dec(__m128i x)
{

	return aes_schedule_transform(x ^ s63.m, deskew);
}

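/*
 * Helpers for the 192-bit schedule: AES-192 generates six 32-bit words
 * per key-schedule step -- one and a half vectors -- so round keys
 * straddle vector boundaries, and the smear/smearhi pair stitches the
 * halves of consecutive vectors together.  (A gloss of the loop
 * structure in the setkey routines below.)
 */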
static __m128i
aes_schedule_192_smear(__m128i prkhi, __m128i prk)
{
	__m128i rk;

	rk = prkhi;
	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
	rk ^= _mm_shuffle_epi32(prk, 0xfe);

	return rk;
}

static __m128i
aes_schedule_192_smearhi(__m128i rk)
{
	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
}

void
aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
}

void
aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i ork;		/* original round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = _mm_loadu_epi8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

__m128i
aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		__m128i A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = _mm_shuffle_epi8(sb1[0].m, io) ^
		    _mm_shuffle_epi8(sb1[1].m, jo);
		A ^= loadroundkey(rk32);
		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
		    _mm_shuffle_epi8(sb2[1].m, jo);
		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
	}
	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[rmod4].m);
}
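
/*
 * Minimal usage sketch, assuming SSSE3 has been verified and, in the
 * kernel, that the caller has entered an FPU section around the
 * calls; the unaligned load/store intrinsics are assumed available
 * from the immintrin shim.
 */
#ifdef AES_SSSE3_EXAMPLE	/* hypothetical guard; never defined */
static void
example_encrypt_block(const uint8_t key[static 16],
    const uint8_t in[static 16], uint8_t out[static 16])
{
	struct aesenc enc;
	__m128i x;

	aes_ssse3_setenckey(&enc, key, 10);	/* AES-128: 10 rounds */
	x = _mm_loadu_epi8(in);
	x = aes_ssse3_enc1(&enc, x, 10);
	_mm_storeu_si128((void *)out, x);
}
#endif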

__m128i
aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	__m128i io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3].m;
	for (;;) {
		subbytes(&io, &jo, x);
		if (--nrounds == 0)
			break;

		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
		    _mm_shuffle_epi8(dsb9[1].m, jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
		    _mm_shuffle_epi8(dsbd[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
		    _mm_shuffle_epi8(dsbb[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
		    _mm_shuffle_epi8(dsbe[1].m, jo);

		mc = _mm_alignr_epi8(mc, mc, 12);
	}
	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[i].m);
}
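
/*
 * Matching decryption sketch, under the same assumptions as the
 * encryption example above.
 */
#ifdef AES_SSSE3_EXAMPLE	/* hypothetical guard; never defined */
static void
example_decrypt_block(const uint8_t key[static 16],
    const uint8_t in[static 16], uint8_t out[static 16])
{
	struct aesdec dec;
	__m128i x;

	aes_ssse3_setdeckey(&dec, key, 10);	/* AES-128: 10 rounds */
	x = _mm_loadu_epi8(in);
	x = aes_ssse3_dec1(&dec, x, 10);
	_mm_storeu_si128((void *)out, x);
}
#endif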
    562