/*	$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */
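/*
 * The idea in brief (a sketch; the paper has the real story): NEON
 * lacks AES instructions, but vqtbl1q_u8 can look up sixteen bytes in
 * a 16-entry table at once.  A 256-entry byte table like the AES
 * S-box does not fit in one lookup, so each byte is split into its
 * two nybbles, each nybble indexes its own 16-byte table, and the two
 * results are combined by XOR, which is addition in GF(2).  The
 * algebra in the paper arranges the S-box and the linear layers so
 * that nybble-wise lookups of this shape suffice.
 */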

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define	panic(fmt, args...)		err(1, fmt, ##args)
#endif

#include "aes_neon_impl.h"

#ifdef __aarch64__
#define	__aarch64_used
#else
#define	__aarch64_used	__unused
#endif

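/*
 * Constant tables from the VPAES sources.  A rough glossary, after
 * the names in Hamburg's code and paper -- reading aids, not a
 * specification:
 *
 *	mc_forward, mc_backward	byte rotations within each 4-byte
 *				column, used to build MixColumns
 *	sr			ShiftRows permutation for each round
 *				number mod 4
 *	ipt, opt		input/output transforms between the AES
 *				basis and the basis the lookups work in
 *	dipt			input transform for decryption
 *	sb1, sb2, sbo		S-box output tables (middle rounds and
 *				last round), split into low- and
 *				high-nybble halves
 *	dsb9, dsbd, dsbb, dsbe	inverse S-box fused with the 0x9, 0xD,
 *				0xB, 0xE multiples of InvMixColumns
 *	dsbo			inverse S-box for the last round
 *	dks1..dks4, deskew	transforms for mangling the decryption
 *				key schedule
 *	rcon			key-schedule round constants, in the
 *				transformed basis
 *	of			the 0x0F nybble mask
 *	s63			the byte 0x5B replicated, a bias folded
 *				through the key schedule
 *	inv, inva		GF(2^4) inversion tables used by
 *				subbytes
 */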
static const uint8x16_t
mc_forward[4] = {
	VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
	    0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
	VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
	    0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
	VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
	    0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
	VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
	    0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
	VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
	    0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
	VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
	    0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
	VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
	    0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
	VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
	    0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
	    0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
	VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
	    0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
	VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
	    0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
	VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
	    0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
	    0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
	VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
	    0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
	VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
	    0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
	VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
	    0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
	VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
	    0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
	VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
	    0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
	    0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
	VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
	    0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
	    0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
	VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
	    0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
	VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
	    0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
	VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
	    0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
	VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
	    0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
	VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
	    0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
	    0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
	VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
	    0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
	    0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
	VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
	    0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
	VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
	    0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
	VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
	    0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
	VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
	    0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
	VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
	    0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
	VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
	    0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
	VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
	    0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
	VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
	    0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
	VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
	    0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
deskew[2] = {
	VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
	    0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
	VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
	    0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
sr[4] __aarch64_used = {
	VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
	    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
	VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
	    0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
	VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
	    0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
	VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
	    0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
rcon	= VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
	    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
of	= VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
	    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
s63	= VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
	    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
inv	= VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
	    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva	= VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
	    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);

#ifdef __aarch64__
static inline uint8x16_t
loadroundkey(const void *rkp)
{
	return vld1q_u8(rkp);
}
#endif

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
	vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

	*lo = of & x;
	*hi = of & vshrq_n_u8(x, 4);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
	uint8x16_t lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}
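/*
 * Since addition in GF(2) is XOR, aes_schedule_transform(x, t)
 * computes t[0][x & 0xf] ^ t[1][x >> 4] in each lane; e.g., the call
 * aes_schedule_transform(vld1q_u8(key), ipt) below applies the VPAES
 * input transform to a freshly loaded key block with one table lookup
 * per nybble.
 */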

static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
	uint8x16_t k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = vqtbl1q_u8(inva_, k);
	j = i ^ k;
	*io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
	*jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}
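/*
 * subbytes above is the tower-field trick from the paper: inversion
 * in GF(2^8) is reduced to a few GF(2^4) operations on the two
 * nybbles, each small enough for a vqtbl1q_u8 lookup in inv and inva.
 * The outputs io and jo then index tables such as sb1 and dsb9, which
 * fold in the rest of the S-box computation.  (A gloss after the
 * VPAES design, not an independent re-derivation.)
 */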

static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
	uint8x16_t io, jo;

	/* smear prk */
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
	prk ^= s63;

	/* subbytes */
	subbytes(&io, &jo, rk, inv, inva);
	rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

	/* add in smeared stuff */
	return rk ^ prk;
}
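/*
 * In aes_schedule_low_round above, vextq_u8 with a zero first operand
 * shifts prk up by 4 and then 8 bytes, so each 32-bit word of prk is
 * XORed with every word below it -- the same word chaining that FIPS
 * 197's key expansion performs one w[i] at a time.  The s63 term
 * folds in a constant bias that the transformed representation calls
 * for (per the VPAES sources).
 */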

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
	uint32x4_t rk32;

	/* extract rcon from rcon_rot */
	prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
	*rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk32 = vreinterpretq_u32_u8(rk);
	rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
	rk = vreinterpretq_u8_u32(rk32);
	rk = vextq_u8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}
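/*
 * In aes_schedule_round above, broadcasting lane 3 and rotating each
 * word by one byte is RotWord from the standard key schedule, applied
 * to the last word of rk and replicated across the vector;
 * aes_schedule_low_round then supplies SubWord via subbytes.
 * rcon_rot yields one round-constant byte per call.
 */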

static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x ^= s63;

	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

	return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x = aes_schedule_transform(x, dks1);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks2);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks3);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks4);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

	return aes_schedule_transform(x ^ s63, deskew);
}

static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
	uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
	uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
	uint32x4_t rk32;

	rk32 = prkhi32;
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
	    3);
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
	    0);

	return vreinterpretq_u8_u32(rk32);
}
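/*
 * In aes_schedule_192_smear above, writing h0..h3 for the 32-bit
 * words of prkhi and p0..p3 for those of prk, the XORs yield
 * {p2, h1^h0^p3, h2^h0^p3, h3^h2^p3}; since aes_schedule_192_smearhi
 * zeroes h0 and h1, that is {p2, p3, h2^p3, h3^h2^p3} -- the word
 * chaining the 192-bit schedule needs, where key material advances 24
 * bytes per iteration while each round key consumes only 16.
 */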

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
	uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

	rk64 = vsetq_lane_u64(0, rk64, 0);

	return vreinterpretq_u8_u64(rk64);
}

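/*
 * Illustrative use of the entry points below -- a sketch only.  In
 * the kernel these are normally reached through the aes_neon_impl
 * dispatch rather than called directly, and aes_neon_enc1 is a C
 * function only on aarch64 (arm32 substitutes hand-written assembly):
 *
 *	struct aesenc enc;
 *	uint8_t key[16], in[16], out[16];
 *
 *	aes_neon_setenckey(&enc, key, 10);	// 10 rounds = AES-128
 *	vst1q_u8(out, aes_neon_enc1(&enc, vld1q_u8(in), 10));
 */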
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(vld1q_u8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}

void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t ork;		/* original round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = vld1q_u8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}
    548  1.1  riastrad 
    549  1.2  riastrad #ifdef __aarch64__
    550  1.2  riastrad 
    551  1.2  riastrad /*
    552  1.2  riastrad  * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
    553  1.2  riastrad  * do the performance-critical parts -- encryption and decryption -- in
    554  1.2  riastrad  * hand-written assembly on arm32.
    555  1.2  riastrad  */
    556  1.2  riastrad 
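/*
 * The *(const volatile uint8x16_t *)& loads in the functions below
 * force each constant table into a register once, up front; without
 * the volatile qualifier the compiler is free to rematerialize the
 * constants at each use inside the loop.  (This reading of the idiom
 * is an inference; the motivation is not recorded in the source.)
 */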
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
		A ^= loadroundkey(rk32);
		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
	}
	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[rmod4]);
}

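/*
 * Two-block variant: the same round structure as aes_neon_enc1, with
 * two independent dependency chains interleaved.  The point,
 * presumably, is instruction-level parallelism -- each tbl/eor chain
 * is serial, so keeping two blocks in flight uses the NEON pipeline
 * better than one block would.
 */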
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1;
	unsigned rmod4 = 0;

	x0 = aes_schedule_transform(x0, ipt);
	x1 = aes_schedule_transform(x1, ipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
		A_0 ^= loadroundkey(rk32);
		A_1 ^= loadroundkey(rk32);
		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
	}
	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[rmod4]),
		[1] = vqtbl1q_u8(x1, sr[rmod4]),
	} };
}

uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io, &jo, x, inv_, inva_);
		if (--nrounds == 0)
			break;

		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

		mc = vextq_u8(mc, mc, 12);
	}
	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[i]);
}

uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1, mc;

	x0 = aes_schedule_transform(x0, dipt);
	x1 = aes_schedule_transform(x1, dipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);
		if (--nrounds == 0)
			break;

		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
		x0 ^= loadroundkey(rk32);
		x1 ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

		mc = vextq_u8(mc, mc, 12);
	}
	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[i]),
		[1] = vqtbl1q_u8(x1, sr[i]),
	} };
}

#endif