/*
 * aes_neon_subr.c, revision 1.3
 * (cross-reference page header converted to a comment)
 */
      1 /*	$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
     31 
     32 #include <sys/endian.h>
     33 
     34 #ifdef _KERNEL
     35 #include <sys/systm.h>
     36 #include <lib/libkern/libkern.h>
     37 #else
     38 #include <assert.h>
     39 #include <inttypes.h>
     40 #include <stdio.h>
     41 #define	KASSERT			assert
     42 #endif
     43 
     44 #include <crypto/aes/arch/arm/aes_neon.h>
     45 
     46 #include "aes_neon_impl.h"
     47 
/*
 * loadblock(in)
 *
 *	Load a 16-byte AES block from in into a NEON q register.
 */
static inline uint8x16_t
loadblock(const void *in)
{
	const uint8_t *p = in;

	return vld1q_u8(p);
}
     53 
/*
 * storeblock(out, block)
 *
 *	Store the 16-byte AES block in a NEON q register to out.
 */
static inline void
storeblock(void *out, uint8x16_t block)
{
	uint8_t *p = out;

	vst1q_u8(p, block);
}
     59 
/*
 * aes_neon_enc(enc, in, out, nrounds)
 *
 *	Encrypt a single 16-byte block from in to out with the
 *	expanded key schedule enc, using nrounds rounds.
 */
void
aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{

	storeblock(out, aes_neon_enc1(enc, loadblock(in), nrounds));
}
     70 
/*
 * aes_neon_dec(dec, in, out, nrounds)
 *
 *	Decrypt a single 16-byte block from in to out with the
 *	expanded key schedule dec, using nrounds rounds.
 */
void
aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{

	storeblock(out, aes_neon_dec1(dec, loadblock(in), nrounds));
}
     81 
/*
 * aes_neon_cbc_enc(enc, in, out, nbytes, iv, nrounds)
 *
 *	Encrypt nbytes (a nonzero multiple of 16) from in to out in
 *	CBC mode.  Reads the IV from iv and writes the updated IV
 *	(i.e., the last ciphertext block) back to iv.
 */
void
aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint8x16_t chain;

	KASSERT(nbytes);

	chain = loadblock(iv);
	while (nbytes) {
		/* Chain: c[i] = E(p[i] ^ c[i-1]).  */
		chain = aes_neon_enc1(enc, chain ^ loadblock(in), nrounds);
		storeblock(out, chain);
		in += 16;
		out += 16;
		nbytes -= 16;
	}
	storeblock(iv, chain);
}
     99 
/*
 * aes_neon_cbc_dec(dec, in, out, nbytes, iv, nrounds)
 *
 *	Decrypt nbytes (a nonzero multiple of 16) from in to out in
 *	CBC mode.  Works from the last block backward so the operation
 *	is safe when out aliases in; updates iv to the last ciphertext
 *	block before any output is written.
 */
void
aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint8x16_t iv0, cur, prev;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Save the incoming IV; the new IV is the last ciphertext block. */
	iv0 = loadblock(iv);
	cur = loadblock(in + nbytes - 16);
	storeblock(iv, cur);

	/* Decrypt blocks n-1 down to 1, chaining with the prior block. */
	while (nbytes > 16) {
		nbytes -= 16;
		prev = loadblock(in + nbytes - 16);
		storeblock(out + nbytes,
		    prev ^ aes_neon_dec1(dec, cur, nrounds));
		cur = prev;
	}

	/* The first block chains with the original IV.  */
	storeblock(out, iv0 ^ aes_neon_dec1(dec, cur, nrounds));
}
    123 
/*
 * aes_neon_xts_update(t8)
 *
 *	Multiply the 128-bit XTS tweak t8 by x in GF(2^128) modulo
 *	x^128 + x^7 + x^2 + x + 1, i.e. shift left one bit and, if the
 *	top bit shifted out, fold in 0x87 at the bottom.
 *
 *	The tweak is handled as four 32-bit lanes with lane 0 least
 *	significant (confirmed by the selftest vectors below: the high
 *	bit of lane 3 wraps to 0x87 in lane 0).
 */
static inline uint8x16_t
aes_neon_xts_update(uint8x16_t t8)
{
	const int32x4_t zero = vdupq_n_s32(0);
	/* Bit to fold into each lane if its predecessor overflowed:
	 * lane 0 takes 0x87 (the reduction polynomial) from lane 3's
	 * carry; lanes 1-3 take the carry bit from the lane below.  */
	const int32x4_t carry = {0x87, 1, 1, 1};
	int32x4_t t, t_;
	uint32x4_t mask;

	t = vreinterpretq_s32_u8(t8);
	mask = vcltq_s32(t, zero);		/* -1 if high bit set else 0 */
	mask = vextq_u32(mask, mask, 3);	/* rotate quarters */
	t_ = vsliq_n_s32(zero, t, 1);		/* shift */
	t_ ^= carry & mask;

	return vreinterpretq_u8_s32(t_);
}
    140 
/*
 * aes_neon_xts_update_selftest()
 *
 *	Verify aes_neon_xts_update against known input/output pairs,
 *	including carry propagation between lanes and the 0x87
 *	wraparound from the top lane.  Returns 0 on success, -1 on
 *	failure (after printing the mismatching result).
 */
static int
aes_neon_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	uint32_t buf[4];
	unsigned i, j;
	int rv = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		for (j = 0; j < 4; j++)
			buf[j] = cases[i].in[j];
		storeblock(buf, aes_neon_xts_update(loadblock(buf)));
		for (j = 0; j < 4; j++) {
			if (buf[j] == cases[i].out[j])
				continue;
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, buf[0], buf[1], buf[2], buf[3]);
			rv = -1;
			break;
		}
	}

	return rv;
}
    177 
/*
 * aes_neon_xts_enc(enc, in, out, nbytes, tweak, nrounds)
 *
 *	Encrypt nbytes (a nonzero multiple of 16) from in to out in
 *	XTS mode: whiten each block with the tweak before and after
 *	the cipher, multiplying the tweak by x per block.  Writes the
 *	updated tweak back to tweak.
 */
void
aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint8x16_t tw, x;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	tw = loadblock(tweak);
	while (nbytes) {
		x = aes_neon_enc1(enc, tw ^ loadblock(in), nrounds);
		storeblock(out, tw ^ x);
		tw = aes_neon_xts_update(tw);
		in += 16;
		out += 16;
		nbytes -= 16;
	}
	storeblock(tweak, tw);
}
    197 
/*
 * aes_neon_xts_dec(dec, in, out, nbytes, tweak, nrounds)
 *
 *	Decrypt nbytes (a nonzero multiple of 16) from in to out in
 *	XTS mode: whiten each block with the tweak before and after
 *	the cipher, multiplying the tweak by x per block.  Writes the
 *	updated tweak back to tweak.
 */
void
aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint8x16_t tw, x;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	tw = loadblock(tweak);
	while (nbytes) {
		x = aes_neon_dec1(dec, tw ^ loadblock(in), nrounds);
		storeblock(out, tw ^ x);
		tw = aes_neon_xts_update(tw);
		in += 16;
		out += 16;
		nbytes -= 16;
	}
	storeblock(tweak, tw);
}
    217 
/*
 * aes_neon_cbcmac_update1(enc, in, nbytes, auth0, nrounds)
 *
 *	Fold nbytes (a nonzero multiple of 16) from in into the
 *	CBC-MAC state at auth0, updating it in place.
 */
void
aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
{
	uint8x16_t mac;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	mac = loadblock(auth0);
	while (nbytes) {
		/* mac = E(mac ^ m[i]) for each block.  */
		mac ^= loadblock(in);
		mac = aes_neon_enc1(enc, mac, nrounds);
		in += 16;
		nbytes -= 16;
	}
	storeblock(auth0, mac);
}
    232 
    233 /*
    234  * XXX On aarch64, we have enough registers that we should be able to
    235  * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
    236  * function, which should substantially improve CCM throughput.
    237  */
    238 
    239 #if _BYTE_ORDER == _LITTLE_ENDIAN
    240 #define	vbetoh32q_u8	vrev32q_u8
    241 #define	vhtobe32q_u8	vrev32q_u8
    242 #elif _BYTE_ORDER == _BIG_ENDIAN
    243 #define	vbetoh32q_u8(x)	(x)
    244 #define	vhtobe32q_u8(x)	(x)
    245 #else
    246 #error what kind of endian are you anyway
    247 #endif
    248 
/*
 * aes_neon_ccm_enc1(enc, in, out, nbytes, authctr, nrounds)
 *
 *	CCM encryption: update the CBC-MAC over the plaintext and
 *	encrypt nbytes (a nonzero multiple of 16) from in to out in
 *	CTR mode.  The 32-byte authctr buffer holds the CBC-MAC state
 *	in its first 16 bytes and the big-endian counter block in its
 *	last 16 bytes; both are updated in place.
 *
 *	NOTE(review): the increment {0,0,0,1} touches only one 32-bit
 *	lane, with no carry into the other lanes -- presumably CCM's
 *	32-bit counter field; callers must bound the block count
 *	accordingly.
 */
void
aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
	uint8x16_t auth, ptxt, ctr_be;
	uint32x4_t ctr;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	auth = loadblock(authctr);
	ctr_be = loadblock(authctr + 16);
	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));	/* 32-bit words in host order */
	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		ptxt = loadblock(in);
		/* CBC-MAC: absorb the plaintext block.  */
		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
		/* CTR: pre-increment the counter, then mask the block.  */
		ctr = vaddq_u32(ctr, ctr32_inc);
		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
		storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds));
	}
	storeblock(authctr, auth);
	storeblock(authctr + 16, ctr_be);
}
    274 
/*
 * aes_neon_ccm_dec1(enc, in, out, nbytes, authctr, nrounds)
 *
 *	CCM decryption: decrypt nbytes (a nonzero multiple of 16) from
 *	in to out in CTR mode and update the CBC-MAC over the
 *	recovered plaintext.  The 32-byte authctr buffer holds the
 *	CBC-MAC state in its first 16 bytes and the big-endian counter
 *	block in its last 16 bytes; both are updated in place.
 *
 *	Note CCM uses the encryption schedule for both directions
 *	(CTR mode only ever encrypts counter blocks), hence struct
 *	aesenc here.
 */
void
aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
	uint8x16_t auth, ctr_be, ptxt;
	uint32x4_t ctr;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	auth = loadblock(authctr);
	ctr_be = loadblock(authctr + 16);
	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));	/* 32-bit words in host order */
	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* CTR: pre-increment the counter and recover plaintext.  */
		ctr = vaddq_u32(ctr, ctr32_inc);
		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
		ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds);
		storeblock(out, ptxt);
		/* CBC-MAC: absorb the recovered plaintext block.  */
		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
	}
	storeblock(authctr, auth);
	storeblock(authctr + 16, ctr_be);
}
    300 
/*
 * aes_neon_selftest()
 *
 *	Run the NEON AES self-tests.  Returns 0 on success, -1 on
 *	failure.
 */
int
aes_neon_selftest(void)
{

	return aes_neon_xts_update_selftest() ? -1 : 0;
}
    310