/*	$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $");

#include <sys/types.h>

#include <sys/systm.h>

#include "aes_neon_impl.h"

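/*
 * Illustrative sketch (disabled), relating to the VPAES reference above:
 * vqtbl1q_u8 applies a 16-entry byte table to every lane of a vector at
 * once, so any function of a 4-bit nybble can be evaluated across all 16
 * bytes with a single permute.  The identity-plus-one table below is a
 * made-up example for exposition only, not one of the constants used by
 * this file.
 */
#if 0
static uint8x16_t
nybble_lookup_example(uint8x16_t x)
{
	/* Maps nybble n to (n + 1) mod 16; any 16-byte table works. */
	static const uint8x16_t tab = {
		0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,
		0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,0x00,
	};

	/* Mask to the low nybble so every index is in range [0,15]. */
	return vqtbl1q_u8(tab, x & vdupq_n_u8(0x0f));
}
#endif
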
static const uint8x16_t
mc_forward[4] = {
	{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
	 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
	{0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
	 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
	{0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
	 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
	{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
	 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
},
mc_backward[4] = {
	{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
	 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
	{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
	 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
	{0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
	 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
	{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
	 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
},
ipt[2] = {
	{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
	 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
	{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
	 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
},
opt[2] = {
	{0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
	 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
	{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
	 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
},
dipt[2] = {
	{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
	 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
	{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
	 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
},
sb1[2] = {
	{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
	 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
	{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
	 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
},
sb2[2] = {
	{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
	 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
	{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
	 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
},
sbo[2] = {
	{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
	 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
	{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
	 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
},
dsb9[2] = {
	{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
	 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
	{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
	 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
},
dsbd[2] = {
	{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
	 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
	{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
	 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
},
dsbb[2] = {
	{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
	 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
	{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
	 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
},
dsbe[2] = {
	{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
	 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
	{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
	 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
},
dsbo[2] = {
	{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
	 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
	{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
	 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
},
dks1[2] = {
	{0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
	 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
	{0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
	 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
},
dks2[2] = {
	{0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
	 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
	{0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
	 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
},
dks3[2] = {
	{0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
	 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
	{0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
	 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
},
dks4[2] = {
	{0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
	 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
	{0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
	 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
},
deskew[2] = {
	{0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
	 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
	{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
	 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
},
sr[4] = {
	{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
	 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
	{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
	 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
	{0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
	 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
	{0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
	 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
},
rcon =	{0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
	0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
s63 =	{0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
	0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
of =	{0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
	0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
inv =	{0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
inva =	{0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};

static inline uint8x16_t
loadroundkey(const void *rkp)
{
	return vld1q_u8(rkp);
}

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
	vst1q_u8(rkp, rk);
}

/*
 * Given bytes whose nybbles are ab, cd, ef, gh, ..., set *lo to the low
 * nybbles 0b0d0f0h... and *hi to the high nybbles 0a0c0e0g....
 */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

	*lo = of & x;
	*hi = of & vshrq_n_u8(x, 4);
}
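
/*
 * Illustrative sketch (disabled): a minimal demonstration of
 * bytes2nybbles on one concrete byte value.  For an input lane 0xAB the
 * corresponding lanes of lo and hi come out as 0x0B and 0x0A.
 */
#if 0
static void
bytes2nybbles_example(void)
{
	uint8x16_t x = vdupq_n_u8(0xAB);	/* every byte is 0xAB */
	uint8x16_t lo, hi;

	bytes2nybbles(&lo, &hi, x);
	/* Now every lane of lo is 0x0B and every lane of hi is 0x0A. */
}
#endif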

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
	uint8x16_t lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}
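
/*
 * Illustrative sketch (disabled): the same transform expressed one byte
 * at a time, to make the vqtbl1q_u8 lookups above concrete.  Each output
 * byte is t[0][low nybble] ^ t[1][high nybble]; t0 and t1 here stand for
 * the two halves of whichever table pair is passed in.
 */
#if 0
static void
schedule_transform_scalar(uint8_t out[16], const uint8_t in[16],
    const uint8_t t0[16], const uint8_t t1[16])
{
	unsigned i;

	for (i = 0; i < 16; i++)
		out[i] = t0[in[i] & 0x0f] ^ t1[in[i] >> 4];
}
#endif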

/*
 * Split x into nybbles and run the shared table-lookup network from
 * Hamburg's VPAES construction over inv_/inva_, yielding the two
 * intermediate vectors *io and *jo that the sb1/sb2/sbo (and dsb)
 * output tables consume.
 */
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
	uint8x16_t k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = vqtbl1q_u8(inva_, k);
	j = i ^ k;
	*io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
	*jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
	uint8x16_t io, jo;

	/* smear prk */
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
	prk ^= s63;

	/* subbytes */
	subbytes(&io, &jo, rk, inv, inva);
	rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
	uint32x4_t rk32;

	/* extract rcon from rcon_rot */
	prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
	*rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk32 = vreinterpretq_u32_u8(rk);
	rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
	rk = vreinterpretq_u8_u32(rk32);
	rk = vextq_u8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}
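
/*
 * Illustrative sketch (disabled): the textbook AES-128 round-key
 * recurrence that aes_schedule_round and aes_schedule_low_round carry
 * out on a transformed representation of the key.  The scalar S-box
 * table `sbox' used here is a hypothetical 256-byte table for the sake
 * of the example, not one of the constants in this file.
 */
#if 0
static void
keysched_round_scalar(uint8_t next[16], const uint8_t prev[16],
    uint8_t rcon_byte, const uint8_t sbox[256])
{
	uint8_t t[4];
	unsigned i;

	/* RotWord of the last word of the previous round key. */
	t[0] = prev[13]; t[1] = prev[14]; t[2] = prev[15]; t[3] = prev[12];
	/* SubWord: apply the S-box to each byte. */
	for (i = 0; i < 4; i++)
		t[i] = sbox[t[i]];
	/* The round constant is added to the first byte only. */
	t[0] ^= rcon_byte;

	/* w[i] = w[i-4] ^ temp; each later word xors in its predecessor. */
	for (i = 0; i < 16; i++) {
		next[i] = prev[i] ^ t[i % 4];
		t[i % 4] = next[i];
	}
}
#endif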

static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x ^= s63;

	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

	return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x = aes_schedule_transform(x, dks1);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks2);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks3);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks4);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

	return aes_schedule_transform(x ^ s63, deskew);
}

static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
	uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
	uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
	uint32x4_t rk32;

	rk32 = prkhi32;
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
	    3);
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
	    0);

	return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
	uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

	rk64 = vsetq_lane_u64(0, rk64, 0);

	return vreinterpretq_u8_u64(rk64);
}

void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(vld1q_u8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t ork;		/* original round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = vld1q_u8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
		A ^= loadroundkey(rk32);
		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
	}
	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[rmod4]);
}
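
/*
 * Illustrative usage sketch (disabled): expand a 128-bit key and encrypt
 * one block with the routines above.  The key/plaintext pair is assumed
 * to be the familiar FIPS-197 Appendix C.1 example, and 10 is the AES-128
 * round count; the struct aesenc layout comes from the shared AES headers
 * pulled in by aes_neon_impl.h.
 */
#if 0
static void
aes_neon_enc1_example(void)
{
	static const uint8_t key[16] = {
		0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
		0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
	};
	static const uint8_t pt[16] = {
		0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
		0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,
	};
	struct aesenc enc;
	uint8_t ct[16];

	aes_neon_setenckey(&enc, key, 10);	/* 10 rounds for AES-128 */
	vst1q_u8(ct, aes_neon_enc1(&enc, vld1q_u8(pt), 10));
	/* ct should now hold the FIPS-197 C.1 ciphertext. */
}
#endif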

uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io, &jo, x, inv_, inva_);
		if (--nrounds == 0)
			break;

		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

		mc = vextq_u8(mc, mc, 12);
	}
	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[i]);
}