/*	$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */

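/*
 * The construction, roughly: each byte-wise AES step is evaluated
 * with 16-entry table lookups (vqtbl1q_u8) on the low and high
 * nybbles of every byte, arranged so that the two partial results
 * combine by XOR.  The state is kept in a transformed representation
 * of GF(2^8) (entered via ipt, left via opt/deskew) in which the
 * S-box factors through 4-bit tables, and ShiftRows/MixColumns are
 * reduced to byte permutations (sr, mc_forward, mc_backward).
 */
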
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define	panic(fmt, args...)		err(1, fmt, ##args)
#endif

#include "aes_neon_impl.h"

#ifdef __aarch64__
#define	__aarch64_used
#else
#define	__aarch64_used	__unused
#endif

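/*
 * Lookup tables from Hamburg's VPAES construction; the names follow
 * the original.  Roughly: mc_forward/mc_backward rotate bytes within
 * each column for MixColumns; ipt/opt and dipt convert to and from
 * the internal representation; sb1/sb2/sbo and dsb9/dsbd/dsbb/dsbe/
 * dsbo are the split S-box and inverse S-box tables; dks1-4 and
 * deskew transform round keys for decryption; sr holds the ShiftRows
 * permutations; rcon holds the key schedule round constants; inv and
 * inva implement inversion in GF(2^4); of is the low-nybble mask and
 * s63 is the S-box constant 0x63 in the internal representation.
 */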
static const uint8x16_t
mc_forward[4] = {
	VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
	    0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
	VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
	    0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
	VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
	    0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
	VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
	    0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
	VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
	    0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
	VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
	    0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
	VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
	    0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
	VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
	    0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
	    0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
	VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
	    0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
	VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
	    0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
	VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
	    0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
	    0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
	VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
	    0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
	VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
	    0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
	VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
	    0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
	VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
	    0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
	VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
	    0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
	    0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
	VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
	    0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
	    0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
	VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
	    0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
	VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
	    0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
	VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
	    0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
	VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
	    0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
	VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
	    0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
	    0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
	VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
	    0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
	    0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
	VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
	    0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
	VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
	    0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
	VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
	    0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
	VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
	    0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
	VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
	    0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
	VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
	    0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
	VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
	    0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
	VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
	    0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
	VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
	    0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
deskew[2] = {
	VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
	    0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
	VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
	    0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
sr[4] __aarch64_used = {
	VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
	    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
	VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
	    0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
	VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
	    0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
	VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
	    0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
rcon	= VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
	    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
of	= VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
	    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
s63	= VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
	    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
inv	= VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
	    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva	= VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
	    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);

static inline uint8x16_t
loadroundkey(const void *rkp)
{
	return vld1q_u8(rkp);
}

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
	vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

	*lo = of & x;
	*hi = of & vshrq_n_u8(x, 4);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
	uint8x16_t lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}

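/*
 * Compute the two intermediate nybble vectors io and jo of the VPAES
 * S-box for each byte of x, using the GF(2^4) inversion tables inv_
 * and inva_.  Callers finish SubBytes (or its key schedule and
 * decryption variants) by looking io/jo up in an output table pair
 * such as sb1, sbo, or dsb9.
 */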
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
	uint8x16_t k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = vqtbl1q_u8(inva_, k);
	j = i ^ k;
	*io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
	*jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

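/*
 * One `low' round of the key schedule: smear the previous round key
 * prk across the vector, apply the S-box to rk via subbytes/sb1, and
 * combine the two by XOR.  aes_schedule_round below adds the word
 * rotation and round constant injection of a full schedule round on
 * top of this.
 */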
static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
	uint8x16_t io, jo;

	/* smear prk */
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
	prk ^= s63;

	/* subbytes */
	subbytes(&io, &jo, rk, inv, inva);
	rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
	uint32x4_t rk32;

	/* extract rcon from rcon_rot */
	prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
	*rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk32 = vreinterpretq_u32_u8(rk);
	rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
	rk = vreinterpretq_u8_u32(rk32);
	rk = vextq_u8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}

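/*
 * Convert a freshly computed round key into the `mangled' form the
 * encryption rounds consume: bias by s63, accumulate three
 * applications of the MixColumns byte rotation, and apply the
 * ShiftRows-compensating permutation sr_i.  aes_schedule_mangle_dec
 * below is the analogue for decryption round keys, using the dks1-4
 * transforms instead.
 */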
static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x ^= s63;

	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

	return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x = aes_schedule_transform(x, dks1);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks2);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks3);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks4);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

	return aes_schedule_transform(x ^ s63, deskew);
}

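/*
 * Helpers for the 192-bit key schedule, which carries half a vector
 * of key material from one iteration to the next:
 * aes_schedule_192_smear smears the short low side of the partial
 * round key into a full vector, and aes_schedule_192_smearhi zeroes
 * the low 64 bits so the high half can be recombined next time
 * around.
 */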
static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
	uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
	uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
	uint32x4_t rk32;

	rk32 = prkhi32;
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
	    3);
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
	    0);

	return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
	uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

	rk64 = vsetq_lane_u64(0, rk64, 0);

	return vreinterpretq_u8_u64(rk64);
}

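/*
 * Expand a 128-, 192-, or 256-bit AES key (nrounds = 10, 12, or 14)
 * into enc's round key schedule, stored in the transformed, mangled
 * representation that the encryption routines expect.
 */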
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(vld1q_u8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

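/*
 * Expand an AES key into dec's round key schedule for decryption,
 * stored from the last round backward, in the mangled form that the
 * decryption routines expect.
 */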
void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t ork;		/* original round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = vld1q_u8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

#ifdef __aarch64__

/*
 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
 * do the performance-critical parts -- encryption and decryption -- in
 * hand-written assembly on arm32.
 */

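/*
 * Encrypt a single 16-byte block x with the expanded key in enc,
 * using nrounds rounds (10, 12, or 14).
 */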
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
		A ^= loadroundkey(rk32);
		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
	}
	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[rmod4]);
}

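/*
 * Encrypt two blocks at once with the same key schedule; the two
 * streams are interleaved to give the CPU more independent work in
 * each round.
 */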
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1;
	unsigned rmod4 = 0;

	x0 = aes_schedule_transform(x0, ipt);
	x1 = aes_schedule_transform(x1, ipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
		A_0 ^= loadroundkey(rk32);
		A_1 ^= loadroundkey(rk32);
		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
	}
	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[rmod4]),
		[1] = vqtbl1q_u8(x1, sr[rmod4]),
	} };
}

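/*
 * Decrypt a single 16-byte block x with the expanded key in dec,
 * using nrounds rounds (10, 12, or 14).
 */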
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io, &jo, x, inv_, inva_);
		if (--nrounds == 0)
			break;

		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

		mc = vextq_u8(mc, mc, 12);
	}
	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[i]);
}

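/*
 * Decrypt two blocks at once with the same key schedule, interleaved
 * like aes_neon_enc2.
 */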
uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1, mc;

	x0 = aes_schedule_transform(x0, dipt);
	x1 = aes_schedule_transform(x1, dipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);
		if (--nrounds == 0)
			break;

		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
		x0 ^= loadroundkey(rk32);
		x1 ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

		mc = vextq_u8(mc, mc, 12);
	}
	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[i]),
		[1] = vqtbl1q_u8(x1, sr[i]),
	} };
}

#endif
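
#ifdef AES_NEON_EXAMPLE	/* illustrative sketch only, never built */
/*
 * Minimal usage sketch, under an assumed AES_NEON_EXAMPLE guard that
 * is not part of the real build: expand a 128-bit key and encrypt one
 * block.  In-kernel callers would typically also bracket NEON use
 * with fpu_kern_enter()/fpu_kern_leave() and select this backend only
 * when NEON is actually present.
 */
static void
aes_neon_example(uint8_t *out, const uint8_t *in, const uint8_t *key)
{
	struct aesenc enc;

	aes_neon_setenckey(&enc, key, 10);	/* AES-128 => 10 rounds */
	vst1q_u8(out, aes_neon_enc1(&enc, vld1q_u8(in), 10));
}
#endif	/* AES_NEON_EXAMPLE */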