/*	$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

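/*
 * The core trick (editor's sketch, not from the original sources):
 * PSHUFB performs sixteen parallel lookups into a 16-byte table, so
 * any map from 4-bit values to bytes costs one instruction.
 * Splitting each byte into nybbles lets two such lookups and a XOR
 * evaluate any byte-wise GF(2)-linear map, roughly:
 *
 *	static __m128i
 *	nybble_lookup(__m128i lo_tab, __m128i hi_tab, __m128i x)
 *	{
 *		__m128i mask = _mm_set1_epi8(0x0f);
 *		__m128i lo = x & mask;
 *		__m128i hi = _mm_srli_epi32(x & ~mask, 4);
 *
 *		return _mm_shuffle_epi8(lo_tab, lo) ^
 *		    _mm_shuffle_epi8(hi_tab, hi);
 *	}
 *
 * bytes2nybbles and aes_schedule_transform below are instances of
 * this pattern; the constants that follow are the 16-byte tables.
 */
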
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $");

#include <sys/types.h>

#include <sys/systm.h>

#include "aes_ssse3_impl.h"

/*
 * 16-byte PSHUFB tables and masks from VPAES (see Hamburg's paper
 * above); the per-table notes are editorial.
 */
static const union m128const {
	uint64_t u64[2];
	__m128i m;
}
mc_forward[4] = {	/* forward MixColumns rotations, by round mod 4 */
	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
},
mc_backward[4] = {	/* backward MixColumns rotations */
	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
},
ipt[2] = {		/* input transform, into the permuted basis */
	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
},
opt[2] = {		/* output transform, back to the AES basis */
	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
},
dipt[2] = {		/* decryption input transform */
	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
},
sb1[2] = {		/* S-box */
	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
},
sb2[2] = {		/* S-box times x, for MixColumns */
	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
},
sbo[2] = {		/* S-box for the last round */
	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
},
dsb9[2] = {		/* inverse S-box times 0x9 */
	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
},
dsbd[2] = {		/* inverse S-box times 0xD */
	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
},
dsbb[2] = {		/* inverse S-box times 0xB */
	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
},
dsbe[2] = {		/* inverse S-box times 0xE */
	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
},
dsbo[2] = {		/* inverse S-box for the last round */
	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
},
dks1[2] = {		/* decryption key schedule transforms */
	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
},
dks2[2] = {
	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
},
dks3[2] = {
	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
},
dks4[2] = {
	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
},
deskew[2] = {		/* undo skew of last decryption round key */
	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
},
sr[4] = {		/* ShiftRows permutations, by round mod 4 */
	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
},
rcon =	{.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},	/* round constants */
s63 =	{.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},	/* 0x63, transformed */
of =	{.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},	/* low-nybble mask */
inv =	{.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},	/* 1/x in GF(16) */
inva =	{.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};	/* a/x in GF(16) */

static inline __m128i
loadroundkey(const uint32_t *rk32)
{
	return _mm_load_si128((const void *)rk32);
}

static inline void
storeroundkey(uint32_t *rk32, __m128i rk)
{
	_mm_store_si128((void *)rk32, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
static inline void
bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
{

	*lo = x & of.m;
	*hi = _mm_srli_epi32(x & ~of.m, 4);
}
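
/*
 * For example (editor's note): a byte 0xab of x yields 0x0b in *lo
 * and 0x0a in *hi.  The 32-bit shift cannot smear bits across byte
 * boundaries because the ~of mask has already cleared the low
 * nybbles that would otherwise shift in from the neighbouring byte.
 */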

/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c.  */
static inline __m128i
gf16_inva(__m128i x)
{
	return _mm_shuffle_epi8(inva.m, x);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c.  */
static inline __m128i
gf16_inv(__m128i x)
{
	return _mm_shuffle_epi8(inv.m, x);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static __m128i
aes_schedule_transform(__m128i x, const union m128const t[static 2])
{
	__m128i lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
}

static inline void
subbytes(__m128i *io, __m128i *jo, __m128i x)
{
	__m128i k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = gf16_inva(k);
	j = i ^ k;
	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
}
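
/*
 * Interpretive note (after Hamburg's paper, not the original
 * sources): subbytes computes the AES S-box's inversion step with
 * GF(2^8) represented as a tower over GF(16), so every nonlinear
 * piece fits in a 16-entry PSHUFB table.  The S-box's affine output
 * transform is not applied here; it is folded into the sb1/sb2/sbo
 * tables that callers apply to *io and *jo.
 */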

static __m128i
aes_schedule_low_round(__m128i rk, __m128i prk)
{
	__m128i io, jo;

	/* smear prk */
	prk ^= _mm_slli_si128(prk, 4);
	prk ^= _mm_slli_si128(prk, 8);
	prk ^= s63.m;

	/* subbytes */
	subbytes(&io, &jo, rk);
	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

static __m128i
aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
{

	/* extract rcon from rcon_rot */
	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk = _mm_shuffle_epi32(rk, 0xff);
	rk = _mm_alignr_epi8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}
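
/*
 * For comparison (editor's note): when i % Nk == 0, one step of the
 * standard FIPS-197 key expansion computes
 *
 *	w[i] = w[i-Nk] ^ SubWord(RotWord(w[i-1])) ^ Rcon[i/Nk];
 *
 * Here the shuffle/alignr pair is RotWord on the last word of rk,
 * the alignr on *rcon_rot peels off one round-constant byte into
 * prk, and aes_schedule_low_round supplies SubWord plus the xor
 * with the earlier words (the `smear') -- all in the permuted vpaes
 * basis rather than on raw AES bytes.
 */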

static __m128i
aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x ^= s63.m;

	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
{

	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
}

static __m128i
aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x = aes_schedule_transform(x, dks1);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks2);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks3);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks4);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);

	return _mm_shuffle_epi8(y, sr_i);
}
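
/*
 * Interpretive note: the dks1..dks4 transforms take an encryption
 * round key to the matching decryption round key, playing the role
 * of InvMixColumns in the `equivalent inverse cipher' key schedule
 * of FIPS-197, carried out in the vpaes basis.
 */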

static __m128i
aes_schedule_mangle_last_dec(__m128i x)
{

	return aes_schedule_transform(x ^ s63.m, deskew);
}

static __m128i
aes_schedule_192_smear(__m128i prkhi, __m128i prk)
{
	__m128i rk;

	rk = prkhi;
	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
	rk ^= _mm_shuffle_epi32(prk, 0xfe);

	return rk;
}

static __m128i
aes_schedule_192_smearhi(__m128i rk)
{
	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
}
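
/*
 * Editor's note: with a zero source operand, MOVHLPS replaces the
 * low 64 bits of rk with zeros and keeps the high 64 bits in place;
 * in 32-bit lanes, {a, b, c, d} becomes {0, 0, c, d}.
 */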

void
aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
}

void
aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i ork;		/* original round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = _mm_loadu_epi8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

__m128i
aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		__m128i A, A2, A2_B, A2_B_D;

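		/*
		 * Editor's note: A is SubBytes(x) and A2 is
		 * 2*SubBytes(x) in GF(2^8); xoring rotated copies via
		 * mc_forward/mc_backward realizes the MixColumns/
		 * ShiftRows combination, with the residual byte
		 * rotation deferred to the final sr[rmod4] shuffle
		 * after the loop.
		 */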
		subbytes(&io, &jo, x);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = _mm_shuffle_epi8(sb1[0].m, io) ^
		    _mm_shuffle_epi8(sb1[1].m, jo);
		A ^= loadroundkey(rk32);
		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
		    _mm_shuffle_epi8(sb2[1].m, jo);
		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
	}
	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[rmod4].m);
}

__m128i
aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	__m128i io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3].m;
	for (;;) {
		subbytes(&io, &jo, x);
		if (--nrounds == 0)
			break;

		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
		    _mm_shuffle_epi8(dsb9[1].m, jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
		    _mm_shuffle_epi8(dsbd[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
		    _mm_shuffle_epi8(dsbb[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
		    _mm_shuffle_epi8(dsbe[1].m, jo);

		mc = _mm_alignr_epi8(mc, mc, 12);
	}
	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[i].m);
}
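
/*
 * Usage sketch (editor's illustration, not part of the original
 * file; struct aesenc/struct aesdec come from the surrounding aes
 * framework, and kernel callers must bracket any SSE use with the
 * framework's FPU-section enter/exit):
 *
 *	struct aesenc enc;
 *	struct aesdec dec;
 *	uint8_t key[16];
 *	__m128i pt, ct;
 *
 *	aes_ssse3_setenckey(&enc, key, 10);
 *	aes_ssse3_setdeckey(&dec, key, 10);
 *	ct = aes_ssse3_enc1(&enc, pt, 10);
 *	pt = aes_ssse3_dec1(&dec, ct, 10);
 *
 * with nrounds = 10/12/14 for AES-128/-192/-256.
 */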