/*	$NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include "arm_neon.h"
#include "chacha_neon.h"

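/*
 * vrolq_n_u32(x, n)
 *
 *	Rotate each 32-bit lane of x left by n bits.
 */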
static inline uint32x4_t
vrolq_n_u32(uint32x4_t x, uint8_t n)
{

	/*
	 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
	 * practice it hurts performance at least on Cortex-A8.
	 */
#if 1
	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
#else
	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
#endif
}

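/*
 * vhtole_u32(x), vletoh_u32(x)
 *
 *	Convert each 32-bit lane of x between host and little-endian
 *	byte order.  These are no-ops on little-endian hosts.
 */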
static inline uint32x4_t
vhtole_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
	return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

static inline uint32x4_t
vletoh_u32(uint32x4_t x)
{
#if _BYTE_ORDER == _LITTLE_ENDIAN
	return x;
#elif _BYTE_ORDER == _BIG_ENDIAN
	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
#endif
}

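/*
 * rol16(x)
 *
 *	Rotate each 32-bit lane of x left by 16, by swapping the
 *	16-bit halves of each lane (VREV32.16).
 */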
static inline uint32x4_t
rol16(uint32x4_t x)
{
	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

	y16 = vrev32q_u16(x16);

	return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

	return vrolq_n_u32(x, 12);
}

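/*
 * rol8(x)
 *
 *	Rotate each 32-bit lane of x left by 8.  On AArch64 this is a
 *	single TBL byte permutation; elsewhere, fall back to shifts.
 */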
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	static const uint8x16_t rol8_tab = {
		 3, 0, 1, 2,  7, 4, 5, 6,
		11, 8, 9,10, 15,12,13,14,
	};
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = {
		 3, 0, 1, 2,  7, 4, 5, 6,
	};

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

	return vrolq_n_u32(x, 7);
}

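/*
 * chacha_permute(p0, p1, p2, p3, nr)
 *
 *	Run nr rounds of the ChaCha permutation over the state rows
 *	*p0..*p3.  Each loop iteration does one column round and one
 *	diagonal round: the quarter-round acts on all four columns at
 *	once, then the rows are rotated with VEXT so the diagonals
 *	line up as columns, and rotated back afterwards.
 */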
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

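/*
 * chacha_core_neon(out, in, k, c, nr)
 *
 *	Compute one 64-byte ChaCha block: load the state rows
 *	(constant c, key k, and counter/nonce in), run nr rounds of
 *	the permutation, add the input state back in, and store the
 *	result little-endian at out.
 */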
void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t in0, in1, in2, in3;
	uint32x4_t r0, r1, r2, r3;

	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
}

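/*
 * hchacha_neon(out, in, k, c, nr)
 *
 *	HChaCha: like the ChaCha core but with no feedforward -- only
 *	the first and last rows of the permuted state are returned.
 *	Used to derive the XChaCha subkey.
 */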
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;

	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u32((uint32_t *)out + 0, vhtole_u32(r0));
	vst1q_u32((uint32_t *)out + 4, vhtole_u32(r3));
}

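/*
 * chacha_stream_neon(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha keystream at s, starting at block
 *	number blkno.  On AArch64, handle four blocks at a time first;
 *	then do one block at a time, using a bounce buffer for a
 *	partial final block.
 */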
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
#endif

	if (n) {
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				vst1q_u32((uint32_t *)buf + 4*0, r0);
				vst1q_u32((uint32_t *)buf + 4*1, r1);
				vst1q_u32((uint32_t *)buf + 4*2, r2);
				vst1q_u32((uint32_t *)buf + 4*3, r3);
				memcpy(s, buf, n);

				break;
			}

			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

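/*
 * chacha_stream_xor_neon(s, p, n, blkno, nonce, k, nr)
 *
 *	Same as chacha_stream_neon, but XOR the keystream into the n
 *	bytes at p and store the result at s.
 */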
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

#ifdef __aarch64__
	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);
#endif

	if (n) {
		const uint32x4_t blkno_inc = {1,0,0,0};
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
		in3 = (uint32x4_t) {
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		};

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vhtole_u32(vaddq_u32(r0, in0));
			r1 = vhtole_u32(vaddq_u32(r1, in1));
			r2 = vhtole_u32(vaddq_u32(r2, in2));
			r3 = vhtole_u32(vaddq_u32(r3, in3));

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u32((uint32_t *)buf + 4*0, r0);
				vst1q_u32((uint32_t *)buf + 4*1, r1);
				vst1q_u32((uint32_t *)buf + 4*2, r2);
				vst1q_u32((uint32_t *)buf + 4*3, r3);

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
			r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
			vst1q_u32((uint32_t *)s + 4*0, r0);
			vst1q_u32((uint32_t *)s + 4*1, r1);
			vst1q_u32((uint32_t *)s + 4*2, r2);
			vst1q_u32((uint32_t *)s + 4*3, r3);
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

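/*
 * xchacha_stream_neon(s, nbytes, blkno, nonce, k, nr)
 *
 *	XChaCha: derive a subkey with HChaCha from the first 16 bytes
 *	of the 24-byte nonce, then run ChaCha with the subkey and a
 *	12-byte nonce consisting of four zero bytes followed by the
 *	remaining 8 nonce bytes.
 */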
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

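/*
 * xchacha_stream_xor_neon(c, p, nbytes, blkno, nonce, k, nr)
 *
 *	Same as xchacha_stream_neon, but XOR the keystream into the
 *	nbytes at p and store the result at c.
 */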
void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}