Home | History | Annotate | Line # | Download | only in x86
      1  1.2  riastrad /*	$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $	*/
      2  1.1  riastrad 
      3  1.1  riastrad /*-
      4  1.1  riastrad  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  1.1  riastrad  * All rights reserved.
      6  1.1  riastrad  *
      7  1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8  1.1  riastrad  * modification, are permitted provided that the following conditions
      9  1.1  riastrad  * are met:
     10  1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11  1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12  1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14  1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15  1.1  riastrad  *
     16  1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1  riastrad  */
     28  1.1  riastrad 
     29  1.1  riastrad /*
     30  1.1  riastrad  * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
     31  1.1  riastrad  * software, at <https://crypto.stanford.edu/vpaes/>, described in
     32  1.1  riastrad  *
     33  1.1  riastrad  *	Mike Hamburg, `Accelerating AES with Vector Permute
     34  1.1  riastrad  *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
     35  1.1  riastrad  *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
     36  1.1  riastrad  *	Springer LNCS 5747, pp. 18-32.
     37  1.1  riastrad  *
     38  1.1  riastrad  *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
     39  1.1  riastrad  */
     40  1.1  riastrad 
     41  1.1  riastrad #include <sys/cdefs.h>
     42  1.2  riastrad __KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");
     43  1.1  riastrad 
     44  1.1  riastrad #include <sys/types.h>
     45  1.1  riastrad 
     46  1.2  riastrad #ifdef _KERNEL
     47  1.1  riastrad #include <sys/systm.h>
     48  1.2  riastrad #else
     49  1.2  riastrad #include <err.h>
     50  1.2  riastrad #define	panic(fmt, args...)	err(1, fmt, ##args)
     51  1.2  riastrad #endif
     52  1.1  riastrad 
     53  1.1  riastrad #include "aes_ssse3_impl.h"
     54  1.1  riastrad 
     /*
      * Constant tables.  Each entry is a union m128const so that it can be
      * written as two 64-bit literals (.u64) and read as one SSE register
      * (.m).  On little-endian x86, u64[0] holds bytes 0-7 of the vector
      * and u64[1] holds bytes 8-15.
      *
      * Where each table is used in this file:
      *
      *	mc_forward, mc_backward
      *		byte permutations applied in the encryption round
      *		(aes_ssse3_enc1); mc_forward[0] is also used by the
      *		key-schedule manglers, and mc_forward[3] seeds the
      *		permutation in aes_ssse3_dec1
      *	ipt, dipt
      *		input transforms: ipt for the key schedule and for
      *		encryption, dipt for decryption
      *	opt, deskew
      *		transforms applied to the last encryption round key and
      *		the last decryption round key respectively
      *	sb1, sb2, sbo
      *		lookup pairs indexed by the two outputs of subbytes() when
      *		encrypting (sbo in the last round); sb1 is also used by
      *		the key schedule
      *	dsb9, dsbd, dsbb, dsbe, dsbo
      *		the same for decryption (dsbo in the last round)
      *	dks1 .. dks4
      *		transforms chained by aes_schedule_mangle_dec
      *	sr
      *		byte permutations selected by an index in 0..3
      *	rcon
      *		initial value of the rotating round-constant vector
      *	s63
      *		0x5B in every byte, XORed in by the key schedule
      *	of
      *		0x0F in every byte (low-nybble mask)
      *	inv, inva
      *		16-entry lookup tables behind gf16_inv and gf16_inva
      *
      * NOTE(review): the values presumably come from the VPAES reference
      * cited at the top of the file -- verify against upstream before
      * changing any of them.
      */
     55  1.1  riastrad static const union m128const {
     56  1.1  riastrad 	uint64_t u64[2];
     57  1.1  riastrad 	__m128i m;
     58  1.1  riastrad }
     59  1.1  riastrad mc_forward[4] = {
     60  1.1  riastrad 	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
     61  1.1  riastrad 	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
     62  1.1  riastrad 	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
     63  1.1  riastrad 	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
     64  1.1  riastrad },
     65  1.1  riastrad mc_backward[4] = {
     66  1.1  riastrad 	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
     67  1.1  riastrad 	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
     68  1.1  riastrad 	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
     69  1.1  riastrad 	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
     70  1.1  riastrad },
     71  1.1  riastrad ipt[2] = {
     72  1.1  riastrad 	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
     73  1.1  riastrad 	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
     74  1.1  riastrad },
     75  1.1  riastrad opt[2] = {
     76  1.1  riastrad 	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
     77  1.1  riastrad 	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
     78  1.1  riastrad },
     79  1.1  riastrad dipt[2] = {
     80  1.1  riastrad 	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
     81  1.1  riastrad 	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
     82  1.1  riastrad },
     83  1.1  riastrad sb1[2] = {
     84  1.1  riastrad 	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
     85  1.1  riastrad 	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
     86  1.1  riastrad },
     87  1.1  riastrad sb2[2] = {
     88  1.1  riastrad 	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
     89  1.1  riastrad 	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
     90  1.1  riastrad },
     91  1.1  riastrad sbo[2] = {
     92  1.1  riastrad 	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
     93  1.1  riastrad 	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
     94  1.1  riastrad },
     95  1.1  riastrad dsb9[2] = {
     96  1.1  riastrad 	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
     97  1.1  riastrad 	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
     98  1.1  riastrad },
     99  1.1  riastrad dsbd[2] = {
    100  1.1  riastrad 	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
    101  1.1  riastrad 	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
    102  1.1  riastrad },
    103  1.1  riastrad dsbb[2] = {
    104  1.1  riastrad 	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
    105  1.1  riastrad 	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
    106  1.1  riastrad },
    107  1.1  riastrad dsbe[2] = {
    108  1.1  riastrad 	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
    109  1.1  riastrad 	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
    110  1.1  riastrad },
    111  1.1  riastrad dsbo[2] = {
    112  1.1  riastrad 	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
    113  1.1  riastrad 	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
    114  1.1  riastrad },
    115  1.1  riastrad dks1[2] = {
    116  1.1  riastrad 	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
    117  1.1  riastrad 	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
    118  1.1  riastrad },
    119  1.1  riastrad dks2[2] = {
    120  1.1  riastrad 	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
    121  1.1  riastrad 	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
    122  1.1  riastrad },
    123  1.1  riastrad dks3[2] = {
    124  1.1  riastrad 	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
    125  1.1  riastrad 	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
    126  1.1  riastrad },
    127  1.1  riastrad dks4[2] = {
    128  1.1  riastrad 	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
    129  1.1  riastrad 	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
    130  1.1  riastrad },
    131  1.1  riastrad deskew[2] = {
    132  1.1  riastrad 	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
    133  1.1  riastrad 	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
    134  1.1  riastrad },
    135  1.1  riastrad sr[4] = {
    136  1.1  riastrad 	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
    137  1.1  riastrad 	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
    138  1.1  riastrad 	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
    139  1.1  riastrad 	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
    140  1.1  riastrad },
    141  1.1  riastrad rcon =	{.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},
    142  1.1  riastrad s63 =	{.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},
    143  1.1  riastrad of =	{.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},
    144  1.1  riastrad inv =	{.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},
    145  1.1  riastrad inva =	{.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};
    146  1.1  riastrad 
    /*
     * Load the 128-bit round key stored at rk32 into a register.
     *
     * The load is done with _mm_load_si128, the aligned form, so rk32
     * must point to 16-byte-aligned storage.
     */
    147  1.1  riastrad static inline __m128i
    148  1.1  riastrad loadroundkey(const uint32_t *rk32)
    149  1.1  riastrad {
    150  1.1  riastrad 	return _mm_load_si128((const void *)rk32);
    151  1.1  riastrad }
    152  1.1  riastrad 
    /*
     * Store the 128-bit round key rk at rk32.
     *
     * The store is done with _mm_store_si128, the aligned form, so rk32
     * must point to 16-byte-aligned storage with room for 16 bytes.
     */
    153  1.1  riastrad static inline void
    154  1.1  riastrad storeroundkey(uint32_t *rk32, __m128i rk)
    155  1.1  riastrad {
    156  1.1  riastrad 	_mm_store_si128((void *)rk32, rk);
    157  1.1  riastrad }
    158  1.1  riastrad 
    159  1.1  riastrad /* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
    /*
     * In other words: for every byte of x, *lo receives its low four bits
     * and *hi receives its high four bits moved down into the low half of
     * the byte, so every output byte is in the range 0..15.
     *
     * of.m is 0x0F in every byte.  The low nybbles are masked off before
     * the shift because _mm_srli_epi32 shifts whole 32-bit lanes; without
     * the mask the low nybble of each byte would spill into the byte
     * below it.
     *
     * lo and hi are restrict-qualified, so they must not point to the
     * same object.
     */
    160  1.1  riastrad static inline void
    161  1.1  riastrad bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
    162  1.1  riastrad {
    163  1.1  riastrad 
    164  1.1  riastrad 	*lo = x & of.m;
    165  1.1  riastrad 	*hi = _mm_srli_epi32(x & ~of.m, 4);
    166  1.1  riastrad }
    167  1.1  riastrad 
    168  1.1  riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c.  */
    /*
     * Implemented as a per-byte table lookup: _mm_shuffle_epi8 returns,
     * for each byte of x, the byte of inva.m selected by the low four
     * bits of that byte, or 0 if bit 7 of that byte is set.
     *
     * Entry 0 of the table is 0x80.
     *
     * NOTE(review): the constant a in the comment above is not defined
     * anywhere in this file; see the VPAES reference in the file header.
     */
    169  1.1  riastrad static inline __m128i
    170  1.1  riastrad gf16_inva(__m128i x)
    171  1.1  riastrad {
    172  1.1  riastrad 	return _mm_shuffle_epi8(inva.m, x);
    173  1.1  riastrad }
    174  1.1  riastrad 
    175  1.1  riastrad /* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c.  */
    /*
     * Implemented as a per-byte table lookup: _mm_shuffle_epi8 returns,
     * for each byte of x, the byte of inv.m selected by the low four bits
     * of that byte, or 0 if bit 7 of that byte is set.
     *
     * Entry 0 of the table is 0x80.
     */
    176  1.1  riastrad static inline __m128i
    177  1.1  riastrad gf16_inv(__m128i x)
    178  1.1  riastrad {
    179  1.1  riastrad 	return _mm_shuffle_epi8(inv.m, x);
    180  1.1  riastrad }
    181  1.1  riastrad 
    182  1.1  riastrad /*
    183  1.1  riastrad  * t is a pair of maps respectively from low and high nybbles to bytes.
    184  1.1  riastrad  * Apply t to the nybbles, and add the results in GF(2).
     *
     * Concretely, for every byte b of x the result byte is
     * t[0][b & 0x0F] ^ t[1][b >> 4], with both lookups done for all 16
     * bytes at once by _mm_shuffle_epi8.
     *
     * x	vector to transform
     * t	two-entry table; t[0] is indexed by the low nybble and t[1]
     *	by the high nybble of each byte
    185  1.1  riastrad  */
    186  1.1  riastrad static __m128i
    187  1.1  riastrad aes_schedule_transform(__m128i x, const union m128const t[static 2])
    188  1.1  riastrad {
    189  1.1  riastrad 	__m128i lo, hi;
    190  1.1  riastrad 
    191  1.1  riastrad 	bytes2nybbles(&lo, &hi, x);
    192  1.1  riastrad 	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
    193  1.1  riastrad }
    194  1.1  riastrad 
    /*
     * Common front end of the table-driven S-box step.
     *
     * Splits every byte of x into its low nybble k and high nybble i,
     * then combines them with the gf16_inv and gf16_inva lookups into two
     * vectors, *io and *jo.  Callers use *io and *jo as the indices for a
     * pair of lookup tables (sb1, sb2, sbo, dsb9, ...) and XOR the two
     * lookups together.
     *
     * NOTE(review): presumably this is the nybble-wise field inversion
     * described in the VPAES paper cited in the file header -- confirm
     * there before changing the order of operations.
     */
    195  1.1  riastrad static inline void
    196  1.1  riastrad subbytes(__m128i *io, __m128i *jo, __m128i x)
    197  1.1  riastrad {
    198  1.1  riastrad 	__m128i k, i, ak, j;
    199  1.1  riastrad 
    200  1.1  riastrad 	bytes2nybbles(&k, &i, x);
    201  1.1  riastrad 	ak = gf16_inva(k);
    202  1.1  riastrad 	j = i ^ k;
    203  1.1  riastrad 	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
    204  1.1  riastrad 	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
    205  1.1  riastrad }
    206  1.1  riastrad 
    /*
     * One key-schedule step without the round constant and without the
     * byte rotation (aes_schedule_round adds both before calling this).
     *
     * rk	vector that is sent through the S-box lookup (subbytes plus
     *	the sb1 table pair)
     * prk	previous round key; it is smeared and XORed with s63 and
     *	then XORed into the S-box output
     *
     * The smear turns the four 32-bit words p0..p3 of prk into their
     * running XOR: p0, p0^p1, p0^p1^p2, p0^p1^p2^p3.  It does so with two
     * whole-register byte shifts (by 4 and then by 8 bytes).
     *
     * Returns the S-box output XORed with the smeared prk.
     */
    207  1.1  riastrad static __m128i
    208  1.1  riastrad aes_schedule_low_round(__m128i rk, __m128i prk)
    209  1.1  riastrad {
    210  1.1  riastrad 	__m128i io, jo;
    211  1.1  riastrad 
    212  1.1  riastrad 	/* smear prk */
    213  1.1  riastrad 	prk ^= _mm_slli_si128(prk, 4);
    214  1.1  riastrad 	prk ^= _mm_slli_si128(prk, 8);
    215  1.1  riastrad 	prk ^= s63.m;
    216  1.1  riastrad 
    217  1.1  riastrad 	/* subbytes */
    218  1.1  riastrad 	subbytes(&io, &jo, rk);
    219  1.1  riastrad 	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);
    220  1.1  riastrad 
    221  1.1  riastrad 	/* add in smeared stuff */
    222  1.1  riastrad 	return rk ^ prk;
    223  1.1  riastrad }
    224  1.1  riastrad 
    /*
     * One full key-schedule step: add the round constant, rotate, then
     * finish with aes_schedule_low_round.
     *
     * rk		vector whose last 32-bit word feeds the S-box
     * prk		previous round key
     * rcon_rot	in/out: rotating vector of round constants
     *
     * The round constant is byte 15 of *rcon_rot; it is XORed into byte 0
     * of prk (all other bytes of the extracted vector are zero).
     * *rcon_rot is then rotated by one byte position so that the next
     * call picks up the next constant.
     *
     * The rotate step broadcasts the last 32-bit word of rk to all four
     * lanes and rotates the register by one byte; because all lanes are
     * equal, that rotates each word by one byte.
     */
    225  1.1  riastrad static __m128i
    226  1.1  riastrad aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
    227  1.1  riastrad {
    228  1.1  riastrad 
    229  1.1  riastrad 	/* extract rcon from rcon_rot */
    230  1.1  riastrad 	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
    231  1.1  riastrad 	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);
    232  1.1  riastrad 
    233  1.1  riastrad 	/* rotate */
    234  1.1  riastrad 	rk = _mm_shuffle_epi32(rk, 0xff);
    235  1.1  riastrad 	rk = _mm_alignr_epi8(rk, rk, 1);
    236  1.1  riastrad 
    237  1.1  riastrad 	return aes_schedule_low_round(rk, prk);
    238  1.1  riastrad }
    239  1.1  riastrad 
    /*
     * Convert a schedule value into the form stored for an intermediate
     * encryption round key.
     *
     * x	schedule value
     * sr_i	byte permutation (one of the sr[] entries) applied last
     *
     * With P the byte permutation mc_forward[0], the result is
     * P(x') ^ P(P(x')) ^ P(P(P(x'))) where x' = x ^ s63, finally permuted
     * by sr_i.
     */
    240  1.1  riastrad static __m128i
    241  1.1  riastrad aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
    242  1.1  riastrad {
    243  1.1  riastrad 	__m128i y = _mm_setzero_si128();
    244  1.1  riastrad 
    245  1.1  riastrad 	x ^= s63.m;
    246  1.1  riastrad 
    247  1.1  riastrad 	x = _mm_shuffle_epi8(x, mc_forward[0].m);
    248  1.1  riastrad 	y ^= x;
    249  1.1  riastrad 	x = _mm_shuffle_epi8(x, mc_forward[0].m);
    250  1.1  riastrad 	y ^= x;
    251  1.1  riastrad 	x = _mm_shuffle_epi8(x, mc_forward[0].m);
    252  1.1  riastrad 	y ^= x;
    253  1.1  riastrad 
    254  1.1  riastrad 	return _mm_shuffle_epi8(y, sr_i);
    255  1.1  riastrad }
    256  1.1  riastrad 
    /*
     * Convert the final schedule value into the form stored for the last
     * encryption round key: permute x by sr_i, XOR with s63, and pass the
     * result through the opt transform table.
     */
    257  1.1  riastrad static __m128i
    258  1.1  riastrad aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
    259  1.1  riastrad {
    260  1.1  riastrad 
    261  1.1  riastrad 	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
    262  1.1  riastrad }
    263  1.1  riastrad 
    /*
     * Convert a schedule value into the form stored for an intermediate
     * decryption round key.
     *
     * x	schedule value
     * sr_i	byte permutation (one of the sr[] entries) applied last
     *
     * x is passed through the dks1, dks2, dks3 and dks4 transforms in
     * sequence, each one applied to the output of the previous one.
     * After every transform the accumulator y is XORed with the current x
     * and permuted by mc_forward[0].  The result is y permuted by sr_i.
     */
    264  1.1  riastrad static __m128i
    265  1.1  riastrad aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
    266  1.1  riastrad {
    267  1.1  riastrad 	__m128i y = _mm_setzero_si128();
    268  1.1  riastrad 
    269  1.1  riastrad 	x = aes_schedule_transform(x, dks1);
    270  1.1  riastrad 	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
    271  1.1  riastrad 	x = aes_schedule_transform(x, dks2);
    272  1.1  riastrad 	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
    273  1.1  riastrad 	x = aes_schedule_transform(x, dks3);
    274  1.1  riastrad 	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
    275  1.1  riastrad 	x = aes_schedule_transform(x, dks4);
    276  1.1  riastrad 	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
    277  1.1  riastrad 
    278  1.1  riastrad 	return _mm_shuffle_epi8(y, sr_i);
    279  1.1  riastrad }
    280  1.1  riastrad 
    /*
     * Convert the final schedule value into the form stored for the last
     * decryption round key: XOR x with s63 and pass the result through
     * the deskew transform table.  Unlike the other manglers, no sr
     * permutation is applied.
     */
    281  1.1  riastrad static __m128i
    282  1.1  riastrad aes_schedule_mangle_last_dec(__m128i x)
    283  1.1  riastrad {
    284  1.1  riastrad 
    285  1.1  riastrad 	return aes_schedule_transform(x ^ s63.m, deskew);
    286  1.1  riastrad }
    287  1.1  riastrad 
    /*
     * Word-smearing step used only by the 12-round (192-bit key) branch
     * of the key schedule.
     *
     * Writing the 32-bit words of prkhi as h0..h3 and those of prk as
     * p0..p3, the result is
     *
     *	word 0:	h0 ^ h0 ^ p2
     *	word 1:	h1 ^ h0 ^ p3
     *	word 2:	h2 ^ h0 ^ p3
     *	word 3:	h3 ^ h2 ^ p3
     *
     * (shuffle immediate 0x80 selects words 0,0,0,2 of prkhi and 0xfe
     * selects words 2,3,3,3 of prk).
     */
    288  1.1  riastrad static __m128i
    289  1.1  riastrad aes_schedule_192_smear(__m128i prkhi, __m128i prk)
    290  1.1  riastrad {
    291  1.1  riastrad 	__m128i rk;
    292  1.1  riastrad 
    293  1.1  riastrad 	rk = prkhi;
    294  1.1  riastrad 	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
    295  1.1  riastrad 	rk ^= _mm_shuffle_epi32(prk, 0xfe);
    296  1.1  riastrad 
    297  1.1  riastrad 	return rk;
    298  1.1  riastrad }
    299  1.1  riastrad 
    /*
     * Return rk with its low 64 bits cleared and its high 64 bits kept.
     *
     * _mm_movehl_ps(a, b) puts the high half of b in the low half of the
     * result and keeps the high half of a; here b is zero.  The casts
     * only reinterpret the register between the integer and float vector
     * types.
     */
    300  1.1  riastrad static __m128i
    301  1.1  riastrad aes_schedule_192_smearhi(__m128i rk)
    302  1.1  riastrad {
    303  1.1  riastrad 	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
    304  1.1  riastrad }
    305  1.1  riastrad 
    /*
     * aes_ssse3_setenckey(enc, key, nrounds)
     *
     *	Expand key into the encryption round keys stored at
     *	enc->aese_aes.aes_rk, which is the array aes_ssse3_enc1 reads.
     *
     *	nrounds selects the key size by round count: 10 reads 16 key
     *	bytes, 12 reads 24 (a second 16-byte load at key + 8) and 14
     *	reads 32 (a second 16-byte load at key + 16).  Any other value
     *	panics, after the first round key has been stored.
     *
     *	nrounds + 1 round keys of 16 bytes each are written, in ascending
     *	order.  They are written with storeroundkey, an aligned store, so
     *	aes_rk must be 16-byte aligned.
     *
     *	The first round key is only passed through the ipt input
     *	transform; intermediate keys go through aes_schedule_mangle_enc
     *	and the last one through aes_schedule_mangle_last_enc.
     *
     *	i counts down and is reduced mod 4 to pick the sr permutation for
     *	each stored key.  It is unsigned, so when it wraps below zero the
     *	3,2,1,0 cycle simply continues.
     *
     *	nrounds is reused as the loop counter inside each case.
     *
     *	NOTE(review): _mm_loadu_epi8 is not defined in this file;
     *	presumably it is an unaligned 16-byte load provided by
     *	aes_ssse3_impl.h -- confirm.
     */
    306  1.1  riastrad void
    307  1.1  riastrad aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
    308  1.1  riastrad {
    309  1.1  riastrad 	uint32_t *rk32 = enc->aese_aes.aes_rk;
    310  1.1  riastrad 	__m128i mrk;		/* mangled round key */
    311  1.1  riastrad 	__m128i rk;		/* round key */
    312  1.1  riastrad 	__m128i prk;		/* previous round key */
    313  1.1  riastrad 	__m128i rcon_rot = rcon.m;
    314  1.1  riastrad 	uint64_t i = 3;
    315  1.1  riastrad 
    316  1.1  riastrad 	/* input transform */
    317  1.1  riastrad 	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
    318  1.1  riastrad 	storeroundkey(rk32, rk);
    319  1.1  riastrad 	rk32 += 4;
    320  1.1  riastrad 
    321  1.1  riastrad 	switch (nrounds) {
    322  1.1  riastrad 	case 10:
                       		/* 10 schedule rounds; the first 9 results are stored here */
    323  1.1  riastrad 		for (;;) {
    324  1.1  riastrad 			rk = aes_schedule_round(rk, rk, &rcon_rot);
    325  1.1  riastrad 			if (--nrounds == 0)
    326  1.1  riastrad 				break;
    327  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    328  1.1  riastrad 			storeroundkey(rk32, mrk);
    329  1.1  riastrad 			rk32 += 4;
    330  1.1  riastrad 		}
    331  1.1  riastrad 		break;
    332  1.1  riastrad 	case 12: {
    333  1.1  riastrad 		__m128i prkhi;		/* high half of previous round key */
    334  1.1  riastrad 
    335  1.1  riastrad 		prk = rk;
    336  1.1  riastrad 		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
    337  1.1  riastrad 		prkhi = aes_schedule_192_smearhi(rk);
                       		/*
                       		 * Four passes (nrounds drops by 3 each time); every
                       		 * pass stores two keys and every pass but the last
                       		 * stores a third, 11 in total.
                       		 */
    338  1.1  riastrad 		for (;;) {
    339  1.1  riastrad 			prk = aes_schedule_round(rk, prk, &rcon_rot);
    340  1.1  riastrad 			rk = _mm_alignr_epi8(prk, prkhi, 8);
    341  1.1  riastrad 
    342  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    343  1.1  riastrad 			storeroundkey(rk32, mrk);
    344  1.1  riastrad 			rk32 += 4;
    345  1.1  riastrad 			rk = aes_schedule_192_smear(prkhi, prk);
    346  1.1  riastrad 			prkhi = aes_schedule_192_smearhi(rk);
    347  1.1  riastrad 
    348  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    349  1.1  riastrad 			storeroundkey(rk32, mrk);
    350  1.1  riastrad 			rk32 += 4;
    351  1.1  riastrad 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
    352  1.1  riastrad 			if ((nrounds -= 3) == 0)
    353  1.1  riastrad 				break;
    354  1.1  riastrad 
    355  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    356  1.1  riastrad 			storeroundkey(rk32, mrk);
    357  1.1  riastrad 			rk32 += 4;
    358  1.1  riastrad 			rk = aes_schedule_192_smear(prkhi, prk);
    359  1.1  riastrad 			prkhi = aes_schedule_192_smearhi(rk);
    360  1.1  riastrad 		}
    361  1.1  riastrad 		break;
    362  1.1  riastrad 	}
    363  1.1  riastrad 	case 14: {
    364  1.1  riastrad 		__m128i pprk;		/* previous previous round key */
    365  1.1  riastrad 
    366  1.1  riastrad 		prk = rk;
    367  1.1  riastrad 		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
                       		/*
                       		 * Seven passes (nrounds drops by 2 each time); every
                       		 * pass stores one key and every pass but the last
                       		 * stores a second, 13 in total.
                       		 */
    368  1.1  riastrad 		for (;;) {
    369  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    370  1.1  riastrad 			storeroundkey(rk32, mrk);
    371  1.1  riastrad 			rk32 += 4;
    372  1.1  riastrad 			pprk = rk;
    373  1.1  riastrad 
    374  1.1  riastrad 			/* high round */
    375  1.1  riastrad 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
    376  1.1  riastrad 			if ((nrounds -= 2) == 0)
    377  1.1  riastrad 				break;
    378  1.1  riastrad 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
    379  1.1  riastrad 			storeroundkey(rk32, mrk);
    380  1.1  riastrad 			rk32 += 4;
    381  1.1  riastrad 
    382  1.1  riastrad 			/* low round */
    383  1.1  riastrad 			rk = _mm_shuffle_epi32(rk, 0xff);
    384  1.1  riastrad 			rk = aes_schedule_low_round(rk, pprk);
    385  1.1  riastrad 		}
    386  1.1  riastrad 		break;
    387  1.1  riastrad 	}
    388  1.1  riastrad 	default:
    389  1.1  riastrad 		panic("invalid number of AES rounds: %u", nrounds);
    390  1.1  riastrad 	}
    391  1.1  riastrad 	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
    392  1.1  riastrad }
    393  1.1  riastrad 
    394  1.1  riastrad void
    395  1.1  riastrad aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
    396  1.1  riastrad {
    397  1.1  riastrad 	uint32_t *rk32 = dec->aesd_aes.aes_rk;
    398  1.1  riastrad 	__m128i mrk;		/* mangled round key */
    399  1.1  riastrad 	__m128i ork;		/* original round key */
    400  1.1  riastrad 	__m128i rk;		/* round key */
    401  1.1  riastrad 	__m128i prk;		/* previous round key */
    402  1.1  riastrad 	__m128i rcon_rot = rcon.m;
    403  1.1  riastrad 	unsigned i = nrounds == 12 ? 0 : 2;
    404  1.1  riastrad 
    405  1.1  riastrad 	ork = _mm_loadu_epi8(key);
    406  1.1  riastrad 
    407  1.1  riastrad 	/* input transform */
    408  1.1  riastrad 	rk = aes_schedule_transform(ork, ipt);
    409  1.1  riastrad 
    410  1.1  riastrad 	/* go from end */
    411  1.1  riastrad 	rk32 += 4*nrounds;
    412  1.1  riastrad 	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
    413  1.1  riastrad 	rk32 -= 4;
    414  1.1  riastrad 	i ^= 3;
    415  1.1  riastrad 
    416  1.1  riastrad 	switch (nrounds) {
    417  1.1  riastrad 	case 10:
    418  1.1  riastrad 		for (;;) {
    419  1.1  riastrad 			rk = aes_schedule_round(rk, rk, &rcon_rot);
    420  1.1  riastrad 			if (--nrounds == 0)
    421  1.1  riastrad 				break;
    422  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    423  1.1  riastrad 			storeroundkey(rk32, mrk);
    424  1.1  riastrad 			rk32 -= 4;
    425  1.1  riastrad 		}
    426  1.1  riastrad 		break;
    427  1.1  riastrad 	case 12: {
    428  1.1  riastrad 		__m128i prkhi;		/* high half of previous round key */
    429  1.1  riastrad 
    430  1.1  riastrad 		prk = rk;
    431  1.1  riastrad 		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
    432  1.1  riastrad 		prkhi = aes_schedule_192_smearhi(rk);
    433  1.1  riastrad 		for (;;) {
    434  1.1  riastrad 			prk = aes_schedule_round(rk, prk, &rcon_rot);
    435  1.1  riastrad 			rk = _mm_alignr_epi8(prk, prkhi, 8);
    436  1.1  riastrad 
    437  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    438  1.1  riastrad 			storeroundkey(rk32, mrk);
    439  1.1  riastrad 			rk32 -= 4;
    440  1.1  riastrad 			rk = aes_schedule_192_smear(prkhi, prk);
    441  1.1  riastrad 			prkhi = aes_schedule_192_smearhi(rk);
    442  1.1  riastrad 
    443  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    444  1.1  riastrad 			storeroundkey(rk32, mrk);
    445  1.1  riastrad 			rk32 -= 4;
    446  1.1  riastrad 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
    447  1.1  riastrad 			if ((nrounds -= 3) == 0)
    448  1.1  riastrad 				break;
    449  1.1  riastrad 
    450  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    451  1.1  riastrad 			storeroundkey(rk32, mrk);
    452  1.1  riastrad 			rk32 -= 4;
    453  1.1  riastrad 			rk = aes_schedule_192_smear(prkhi, prk);
    454  1.1  riastrad 			prkhi = aes_schedule_192_smearhi(rk);
    455  1.1  riastrad 		}
    456  1.1  riastrad 		break;
    457  1.1  riastrad 	}
    458  1.1  riastrad 	case 14: {
    459  1.1  riastrad 		__m128i pprk;		/* previous previous round key */
    460  1.1  riastrad 
    461  1.1  riastrad 		prk = rk;
    462  1.1  riastrad 		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
    463  1.1  riastrad 		for (;;) {
    464  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    465  1.1  riastrad 			storeroundkey(rk32, mrk);
    466  1.1  riastrad 			rk32 -= 4;
    467  1.1  riastrad 			pprk = rk;
    468  1.1  riastrad 
    469  1.1  riastrad 			/* high round */
    470  1.1  riastrad 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
    471  1.1  riastrad 			if ((nrounds -= 2) == 0)
    472  1.1  riastrad 				break;
    473  1.1  riastrad 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
    474  1.1  riastrad 			storeroundkey(rk32, mrk);
    475  1.1  riastrad 			rk32 -= 4;
    476  1.1  riastrad 
    477  1.1  riastrad 			/* low round */
    478  1.1  riastrad 			rk = _mm_shuffle_epi32(rk, 0xff);
    479  1.1  riastrad 			rk = aes_schedule_low_round(rk, pprk);
    480  1.1  riastrad 		}
    481  1.1  riastrad 		break;
    482  1.1  riastrad 	}
    483  1.1  riastrad 	default:
    484  1.1  riastrad 		panic("invalid number of AES rounds: %u", nrounds);
    485  1.1  riastrad 	}
    486  1.1  riastrad 	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
    487  1.1  riastrad }
    488  1.1  riastrad 
    /*
     * aes_ssse3_enc1(enc, x, nrounds)
     *
     *	Encrypt the single 16-byte block x with the round keys at
     *	enc->aese_aes.aes_rk and return the result.
     *
     *	x is passed through the ipt input transform and XORed with round
     *	key 0.  Every pass of the loop runs subbytes and advances to the
     *	next round key.  All passes but the last then combine the sb1 and
     *	sb2 lookups with the mc_forward and mc_backward permutations
     *	selected by the round number mod 4.  The last pass leaves the
     *	loop early; it uses the sbo lookup, the last round key and a
     *	final sr permutation instead.
     *
     *	nrounds + 1 round keys are read with aligned loads.  nrounds is
     *	decremented before it is compared with zero, so it must be at
     *	least 1; aes_ssse3_setenckey only produces schedules for 10, 12
     *	and 14 rounds.
     */
    489  1.1  riastrad __m128i
    490  1.1  riastrad aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
    491  1.1  riastrad {
    492  1.1  riastrad 	const uint32_t *rk32 = enc->aese_aes.aes_rk;
    493  1.1  riastrad 	__m128i io, jo;
    494  1.1  riastrad 	unsigned rmod4 = 0;
    495  1.1  riastrad 
    496  1.1  riastrad 	x = aes_schedule_transform(x, ipt);
    497  1.1  riastrad 	x ^= loadroundkey(rk32);
    498  1.1  riastrad 	for (;;) {
    499  1.1  riastrad 		__m128i A, A2, A2_B, A2_B_D;
    500  1.1  riastrad 
    501  1.1  riastrad 		subbytes(&io, &jo, x);
    502  1.1  riastrad 
    503  1.1  riastrad 		rk32 += 4;
    504  1.1  riastrad 		rmod4 = (rmod4 + 1) % 4;
    505  1.1  riastrad 		if (--nrounds == 0)
    506  1.1  riastrad 			break;
    507  1.1  riastrad 
    508  1.1  riastrad 		A = _mm_shuffle_epi8(sb1[0].m, io) ^
    509  1.1  riastrad 		    _mm_shuffle_epi8(sb1[1].m, jo);
    510  1.1  riastrad 		A ^= loadroundkey(rk32);
    511  1.1  riastrad 		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
    512  1.1  riastrad 		    _mm_shuffle_epi8(sb2[1].m, jo);
    513  1.1  riastrad 		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
    514  1.1  riastrad 		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
    515  1.1  riastrad 		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
    516  1.1  riastrad 	}
                       	/* last round: io and jo come from the final subbytes call */
    517  1.1  riastrad 	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
    518  1.1  riastrad 	x ^= loadroundkey(rk32);
    519  1.1  riastrad 	return _mm_shuffle_epi8(x, sr[rmod4].m);
    520  1.1  riastrad }
    521  1.1  riastrad 
    /*
     * aes_ssse3_dec1(dec, x, nrounds)
     *
     *	Decrypt the single 16-byte block x with the round keys at
     *	dec->aesd_aes.aes_rk and return the result.
     *
     *	x is passed through the dipt input transform and XORed with round
     *	key 0.  Every pass of the loop runs subbytes.  All passes but the
     *	last then accumulate the dsb9, dsbd, dsbb and dsbe lookups, with
     *	the round key added after the dsb9 lookup and a permutation by mc
     *	before each of the other three lookups.  mc starts as
     *	mc_forward[3] and is rotated by 12 byte positions after every
     *	round.  The last pass leaves the loop early; it uses the dsbo
     *	lookup, the last round key and a final sr permutation instead.
     *
     *	The sr index i = 3 & ~(nrounds - 1) is 2 for 10 and 14 rounds and
     *	0 for 12 rounds, the same index aes_ssse3_setdeckey uses for the
     *	key it stores in the last slot.
     *
     *	nrounds + 1 round keys are read with aligned loads.  nrounds is
     *	decremented before it is compared with zero, so it must be at
     *	least 1.
     */
    522  1.1  riastrad __m128i
    523  1.1  riastrad aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
    524  1.1  riastrad {
    525  1.1  riastrad 	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
    526  1.1  riastrad 	unsigned i = 3 & ~(nrounds - 1);
    527  1.1  riastrad 	__m128i io, jo, mc;
    528  1.1  riastrad 
    529  1.1  riastrad 	x = aes_schedule_transform(x, dipt);
    530  1.1  riastrad 	x ^= loadroundkey(rk32);
    531  1.1  riastrad 	rk32 += 4;
    532  1.1  riastrad 
    533  1.1  riastrad 	mc = mc_forward[3].m;
    534  1.1  riastrad 	for (;;) {
    535  1.1  riastrad 		subbytes(&io, &jo, x);
    536  1.1  riastrad 		if (--nrounds == 0)
    537  1.1  riastrad 			break;
    538  1.1  riastrad 
    539  1.1  riastrad 		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
    540  1.1  riastrad 		    _mm_shuffle_epi8(dsb9[1].m, jo);
    541  1.1  riastrad 		x ^= loadroundkey(rk32);
    542  1.1  riastrad 		rk32 += 4;				/* next round key */
    543  1.1  riastrad 
    544  1.1  riastrad 		x = _mm_shuffle_epi8(x, mc);
    545  1.1  riastrad 		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
    546  1.1  riastrad 		    _mm_shuffle_epi8(dsbd[1].m, jo);
    547  1.1  riastrad 
    548  1.1  riastrad 		x = _mm_shuffle_epi8(x, mc);
    549  1.1  riastrad 		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
    550  1.1  riastrad 		    _mm_shuffle_epi8(dsbb[1].m, jo);
    551  1.1  riastrad 
    552  1.1  riastrad 		x = _mm_shuffle_epi8(x, mc);
    553  1.1  riastrad 		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
    554  1.1  riastrad 		    _mm_shuffle_epi8(dsbe[1].m, jo);
    555  1.1  riastrad 
    556  1.1  riastrad 		mc = _mm_alignr_epi8(mc, mc, 12);
    557  1.1  riastrad 	}
                       	/* last round: io and jo come from the final subbytes call */
    558  1.1  riastrad 	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
    559  1.1  riastrad 	x ^= loadroundkey(rk32);
    560  1.1  riastrad 	return _mm_shuffle_epi8(x, sr[i].m);
    561  1.1  riastrad }
    562