Home | History | Annotate | Line # | Download | only in arm
      1 /*	$NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/types.h>
     30 #include <sys/endian.h>
     31 
     32 #include <crypto/arch/arm/arm_neon.h>
     33 #include <crypto/arch/arm/arm_neon_imm.h>
     34 #include "chacha_neon.h"
     35 
     36 /*
     37  * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice
     38  * it hurts performance at least on Cortex-A8.
     39  */
     40 #if 1
     41 #define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
     42 #else
     43 #define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
     44 #endif
     45 
     46 static inline uint32x4_t
     48 rol16(uint32x4_t x)
     49 {
     50 	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
     51 
     52 	y16 = vrev32q_u16(x16);
     53 
     54 	return vreinterpretq_u32_u16(y16);
     55 }
     56 
     57 static inline uint32x4_t
     58 rol12(uint32x4_t x)
     59 {
     60 
     61 	return vrolq_n_u32(x, 12);
     62 }
     63 
/*
 * rol8(x)
 *
 *	Rotate each 32-bit lane of x left by 8 bits.
 */
static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	/*
	 * On AArch64, a single TBL byte shuffle rotates every lane:
	 * each output byte i takes input byte rol8_tab[i].
	 */
	static const uint8x16_t rol8_tab = VQ_N_U8(
		  3, 0, 1, 2,  7, 4, 5, 6,
		 11, 8, 9,10, 15,12,13,14
	);
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * Disabled 32-bit VTBL variant, kept for reference.
	 *
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = V_N_U8(
		 3, 0, 1, 2,  7, 4, 5, 6
	);

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	/* Shuffle each 64-bit half separately; VTBL is 64-bit only. */
	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	/* Fallback: plain shift/shift/OR rotate. */
	return vrolq_n_u32(x, 8);
#endif
}
    102 
    103 static inline uint32x4_t
    104 rol7(uint32x4_t x)
    105 {
    106 
    107 	return vrolq_n_u32(x, 7);
    108 }
    109 
/*
 * chacha_permute(p0, p1, p2, p3, nr)
 *
 *	Apply nr rounds (nr/2 double-rounds; nr must be even) of the
 *	ChaCha permutation to the 4x4 32-bit state held row-wise in
 *	*p0..*p3, one row per 128-bit vector, updating it in place.
 *	Callers in this file load row 0 with the constant, rows 1-2
 *	with the key, and row 3 with the counter/nonce words.
 */
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		/*
		 * Column round: one quarter-round on each of the four
		 * columns, computed in parallel across vector lanes.
		 */
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		/*
		 * Rotate row i left by i lanes so each diagonal of the
		 * state lines up in a column.
		 */
		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		/* Diagonal round: quarter-rounds on the diagonals. */
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		/* Undo the lane rotation to restore row order. */
		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}
    150 
    151 void
    153 chacha_core_neon(uint8_t out[restrict static 64],
    154     const uint8_t in[static 16],
    155     const uint8_t k[static 32],
    156     const uint8_t c[static 16],
    157     unsigned nr)
    158 {
    159 	uint32x4_t in0, in1, in2, in3;
    160 	uint32x4_t r0, r1, r2, r3;
    161 
    162 	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
    163 	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
    164 	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
    165 	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));
    166 
    167 	chacha_permute(&r0, &r1, &r2, &r3, nr);
    168 
    169 	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
    170 	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
    171 	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
    172 	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
    173 }
    174 
    175 void
    176 hchacha_neon(uint8_t out[restrict static 32],
    177     const uint8_t in[static 16],
    178     const uint8_t k[static 32],
    179     const uint8_t c[static 16],
    180     unsigned nr)
    181 {
    182 	uint32x4_t r0, r1, r2, r3;
    183 
    184 	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
    185 	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
    186 	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
    187 	r3 = vreinterpretq_u32_u8(vld1q_u8(in));
    188 
    189 	chacha_permute(&r0, &r1, &r2, &r3, nr);
    190 
    191 	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
    192 	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
    193 }
    194 
/*
 * chacha_stream_neon(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha key stream into s, starting at
 *	32-bit block counter blkno, with the 96-bit nonce and 256-bit
 *	key k, using nr rounds per block.
 */
void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	/* Fast path: generate four blocks (256 bytes) at a time. */
	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);

	if (n) {
		/* Adds 1 to the counter lane, leaving the nonce lanes 0. */
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;	/* initial state rows */
		uint32x4_t r0, r1, r2, r3;	/* working state rows */

		/* Initial state: constant, key, key, counter||nonce. */
		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		/* One 64-byte block per iteration. */
		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			/* Feedforward: add the initial state back in. */
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				/*
				 * Partial final block: stage the full
				 * 64 bytes in an aligned buffer and
				 * copy out just n.  The break also
				 * prevents n -= 64 from underflowing.
				 */
				uint8_t buf[64] __aligned(16);

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
				memcpy(s, buf, n);

				break;
			}

			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			/* Advance the block counter for the next block. */
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}
    254 
/*
 * chacha_stream_xor_neon(s, p, n, blkno, nonce, k, nr)
 *
 *	XOR n bytes at p with the ChaCha key stream and store the
 *	result at s, starting at 32-bit block counter blkno, with the
 *	96-bit nonce and 256-bit key k, using nr rounds per block.
 */
void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	/* Fast path: process four blocks (256 bytes) at a time. */
	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);

	if (n) {
		/* Adds 1 to the counter lane, leaving the nonce lanes 0. */
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;	/* initial state rows */
		uint32x4_t r0, r1, r2, r3;	/* working state rows */

		/* Initial state: constant, key, key, counter||nonce. */
		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		/* One 64-byte block per iteration. */
		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			/* Feedforward: add the initial state back in. */
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				/*
				 * Partial final block: stage the key
				 * stream in an aligned buffer, then
				 * XOR word-at-a-time and finish the
				 * remaining 0-3 bytes one at a time.
				 * The break also prevents n -= 64
				 * from underflowing.
				 */
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			/* XOR the plaintext into the key stream and store. */
			r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
			r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
			r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
			r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			/* Advance the block counter for the next block. */
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}
    325 
    326 void
    328 xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    329     uint32_t blkno,
    330     const uint8_t nonce[static 24],
    331     const uint8_t k[static 32],
    332     unsigned nr)
    333 {
    334 	uint8_t subkey[32];
    335 	uint8_t subnonce[12];
    336 
    337 	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
    338 	memset(subnonce, 0, 4);
    339 	memcpy(subnonce + 4, nonce + 16, 8);
    340 	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
    341 }
    342 
    343 void
    344 xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    345     uint32_t blkno,
    346     const uint8_t nonce[static 24],
    347     const uint8_t k[static 32],
    348     unsigned nr)
    349 {
    350 	uint8_t subkey[32];
    351 	uint8_t subnonce[12];
    352 
    353 	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
    354 	memset(subnonce, 0, 4);
    355 	memcpy(subnonce + 4, nonce + 16, 8);
    356 	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
    357 }
    358