/*	$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */
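/*
 * Overview, paraphrasing the paper above: instead of a 256-byte S-box
 * lookup, which is neither constant-time nor vectorizable, each byte
 * is split into its low and high nybbles, which index 16-byte tables
 * via the NEON tbl instruction (vqtbl1q_u8).  Inversion in GF(2^8) is
 * reduced to arithmetic in GF(2^4) done by such lookups, and the
 * surrounding affine transforms, ShiftRows, and MixColumns steps all
 * become byte permutations or lookups too, so every step runs in
 * constant time on sixteen bytes at once.
 */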
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include "aes_neon_impl.h"

#ifdef __aarch64__
#define	__aarch64_used
#else
#define	__aarch64_used	__unused
#endif

static const uint8x16_t
mc_forward[4] = {
	VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
	    0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
	VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
	    0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
	VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
	    0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
	VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
	    0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
	VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
	    0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
	VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
	    0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
	VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
	    0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
	VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
	    0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
	    0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
	VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
	    0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
	VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
	    0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
	VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
	    0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
	    0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
	VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
	    0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
	VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
	    0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
	VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
	    0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
	VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
	    0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
	VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
	    0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
	    0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
	VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
	    0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
	    0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
	VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
	    0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
	VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
	    0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
	VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
	    0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
	VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
	    0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
	VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
	    0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
	    0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
	VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
	    0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
	    0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
	VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
	    0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
	VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
	    0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
	VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
	    0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
	VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
	    0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
	VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
	    0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
	VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
	    0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
	VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
	    0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
	VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
	    0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
	VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
	    0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
deskew[2] = {
	VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
	    0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
	VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
	    0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
sr[4] __aarch64_used = {
	VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
	    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
	VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
	    0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
	VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
	    0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
	VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
	    0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
rcon = VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
	    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
of = VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
	    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
s63 = VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
	    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
inv = VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
	    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva = VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
	    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);
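/*
 * Table roles, following the naming in Hamburg's VPAES code (each [2]
 * pair gives the low-nybble and high-nybble lookup tables): ipt/opt
 * transform blocks into and out of the basis VPAES computes in, and
 * dipt is the corresponding input transform for decryption; sb1/sb2
 * and sbo provide the S-box for middle and final rounds, while
 * dsb9/dsbd/dsbb/dsbe pair the inverse S-box with the InvMixColumns
 * multipliers 0x9, 0xD, 0xB, 0xE and dsbo finishes the last round;
 * dks1..dks4 and deskew mangle round keys for the decryption
 * schedule; mc_forward/mc_backward and sr are the byte permutations
 * for MixColumns and ShiftRows; rcon holds the key schedule round
 * constants; of is the 0x0F nybble mask, s63 the S-box's 0x63 affine
 * constant in this basis, and inv/inva the GF(2^4) inversion tables.
 */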
#ifdef __aarch64__
static inline uint8x16_t
loadroundkey(const void *rkp)
{
	return vld1q_u8(rkp);
}
#endif

static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
	vst1q_u8(rkp, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{

	*lo = of & x;
	*hi = of & vshrq_n_u8(x, 4);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
	uint8x16_t lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
}
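/*
 * In scalar terms, aes_schedule_transform computes, per byte,
 *
 *	y = t[0][x & 0x0f] ^ t[1][x >> 4],
 *
 * which can realize any GF(2)-linear map on bytes, since the low- and
 * high-nybble contributions are looked up independently and XORed.
 * vqtbl1q_u8 does all sixteen lookups in one instruction; an index
 * byte outside 0..15 yields zero, which the GF(2^4) inversion in
 * subbytes below exploits via the 0x80 entries in inv/inva.
 */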
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
	uint8x16_t k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = vqtbl1q_u8(inva_, k);
	j = i ^ k;
	*io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
	*jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
}

static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
	uint8x16_t io, jo;

	/* smear prk */
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
	prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
	prk ^= s63;

	/* subbytes */
	subbytes(&io, &jo, rk, inv, inva);
	rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

	/* add in smeared stuff */
	return rk ^ prk;
}

static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
	uint32x4_t rk32;

	/* extract rcon from rcon_rot */
	prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
	*rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk32 = vreinterpretq_u32_u8(rk);
	rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
	rk = vreinterpretq_u8_u32(rk32);
	rk = vextq_u8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}

static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x ^= s63;

	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;
	x = vqtbl1q_u8(x, mc_forward[0]);
	y ^= x;

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{

	return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
}

static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t y = vdupq_n_u8(0);

	x = aes_schedule_transform(x, dks1);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks2);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks3);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
	x = aes_schedule_transform(x, dks4);
	y = vqtbl1q_u8(y ^ x, mc_forward[0]);

	return vqtbl1q_u8(y, sr_i);
}

static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{

	return aes_schedule_transform(x ^ s63, deskew);
}

static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
	uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
	uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
	uint32x4_t rk32;

	rk32 = prkhi32;
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
	    3);
	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
	    vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
	    0);

	return vreinterpretq_u8_u32(rk32);
}

static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
	uint64x2_t rk64 = vreinterpretq_u64_u8(rk);

	rk64 = vsetq_lane_u64(0, rk64, 0);

	return vreinterpretq_u8_u64(rk64);
}
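/*
 * Reading aes_schedule_round against the standard AES key schedule:
 * the round constant is peeled off the top byte of a rotating rcon
 * vector rather than indexed from a table; the `rotate' block
 * broadcasts the last 32-bit word and byte-rotates it (RotWord); and
 * aes_schedule_low_round applies SubWord via subbytes/sb1 while its
 * two vextq_u8 shifts implement the running XOR of each word with
 * the previous round key's words (the `smear').
 */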
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(vld1q_u8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
			    vdupq_n_u32(
				vgetq_lane_u32(vreinterpretq_u32_u8(rk),
				    3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}
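/*
 * Typical usage, sketched under the assumption that the caller holds
 * a 16-byte AES-128 key (nrounds is 10 for AES-128, 12 for AES-192,
 * 14 for AES-256) and is on aarch64, where aes_neon_enc1 below is
 * compiled from this file:
 *
 *	struct aesenc enc;
 *	uint8x16_t pt = vld1q_u8(plaintext), ct;
 *
 *	aes_neon_setenckey(&enc, key, 10);
 *	ct = aes_neon_enc1(&enc, pt, 10);
 */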
void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t ork;		/* original round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = vld1q_u8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = vreinterpretq_u8_u32(
			    vdupq_n_u32(
				vgetq_lane_u32(vreinterpretq_u32_u8(rk),
				    3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}
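/*
 * Note the decryption schedule is generated forward but stored from
 * the end (`go from end' above), so aes_neon_dec1/aes_neon_dec2 can
 * walk rk32 in increasing order; aes_schedule_mangle_dec also bakes
 * the basis change for the inverse round tables (dks1..dks4, and
 * deskew for the last key) into each stored round key.
 */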
#ifdef __aarch64__

/*
 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
 * do the performance-critical parts -- encryption and decryption -- in
 * hand-written assembly on arm32.
 */

uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
		A ^= loadroundkey(rk32);
		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
	}
	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[rmod4]);
}
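/*
 * Each round above takes the SubBytes results (io/jo), forms A and A2
 * by the sb1/sb2 lookups, and folds in byte rotations of A via
 * mc_forward/mc_backward to assemble MixColumns; the rotating rmod4
 * index defers ShiftRows, which the final sr[rmod4] permutation
 * settles, per the paper.  The loads through const volatile casts
 * are, presumably, to make the compiler hoist the tables into
 * registers once instead of re-deriving them inside the loop.
 */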
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1;
	unsigned rmod4 = 0;

	x0 = aes_schedule_transform(x0, ipt);
	x1 = aes_schedule_transform(x1, ipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
		A_0 ^= loadroundkey(rk32);
		A_1 ^= loadroundkey(rk32);
		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
	}
	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[rmod4]),
		[1] = vqtbl1q_u8(x1, sr[rmod4]),
	} };
}

uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io, &jo, x, inv_, inva_);
		if (--nrounds == 0)
			break;

		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;	/* next round key */

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

		mc = vextq_u8(mc, mc, 12);
	}
	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[i]);
}
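/*
 * The 2-block variants interleave two independent blocks through one
 * pass over the round keys: the second block's table lookups can
 * issue while the first block's dependency chain is still in flight,
 * so two blocks cost less than two sequential 1-block calls.
 */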
uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1, mc;

	x0 = aes_schedule_transform(x0, dipt);
	x1 = aes_schedule_transform(x1, dipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);
		if (--nrounds == 0)
			break;

		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
		x0 ^= loadroundkey(rk32);
		x1 ^= loadroundkey(rk32);
		rk32 += 4;	/* next round key */

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

		mc = vextq_u8(mc, mc, 12);
	}
	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[i]),
		[1] = vqtbl1q_u8(x1, sr[i]),
	} };
}

#endif