1 1.9 rin /* $NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $ */ 2 1.1 riastrad 3 1.1 riastrad /*- 4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 1.1 riastrad * All rights reserved. 6 1.1 riastrad * 7 1.1 riastrad * Redistribution and use in source and binary forms, with or without 8 1.1 riastrad * modification, are permitted provided that the following conditions 9 1.1 riastrad * are met: 10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright 11 1.1 riastrad * notice, this list of conditions and the following disclaimer. 12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the 14 1.1 riastrad * documentation and/or other materials provided with the distribution. 15 1.1 riastrad * 16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE. 27 1.1 riastrad */ 28 1.1 riastrad 29 1.1 riastrad #include <sys/types.h> 30 1.1 riastrad #include <sys/endian.h> 31 1.1 riastrad 32 1.9 rin #include <crypto/arch/arm/arm_neon.h> 33 1.9 rin #include <crypto/arch/arm/arm_neon_imm.h> 34 1.1 riastrad #include "chacha_neon.h" 35 1.1 riastrad 36 1.8 riastrad /* 37 1.8 riastrad * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice 38 1.8 riastrad * it hurts performance at least on Cortex-A8. 39 1.8 riastrad */ 40 1.5 riastrad #if 1 41 1.8 riastrad #define vrolq_n_u32(x, n) (vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n))) 42 1.5 riastrad #else 43 1.8 riastrad #define vrolq_n_u32(x, n) vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n)) 44 1.5 riastrad #endif 45 1.1 riastrad 46 1.4 riastrad static inline uint32x4_t 48 1.4 riastrad rol16(uint32x4_t x) 49 1.4 riastrad { 50 1.4 riastrad uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); 51 1.4 riastrad 52 1.4 riastrad y16 = vrev32q_u16(x16); 53 1.4 riastrad 54 1.4 riastrad return vreinterpretq_u32_u16(y16); 55 1.4 riastrad } 56 1.4 riastrad 57 1.4 riastrad static inline uint32x4_t 58 1.4 riastrad rol12(uint32x4_t x) 59 1.4 riastrad { 60 1.4 riastrad 61 1.4 riastrad return vrolq_n_u32(x, 12); 62 1.4 riastrad } 63 1.4 riastrad 64 1.4 riastrad static inline uint32x4_t 65 1.4 riastrad rol8(uint32x4_t x) 66 1.4 riastrad { 67 1.8 riastrad #if defined(__aarch64__) 68 1.4 riastrad static const uint8x16_t rol8_tab = VQ_N_U8( 69 1.8 riastrad 3, 0, 1, 2, 7, 4, 5, 6, 70 1.8 riastrad 11, 8, 9,10, 15,12,13,14 71 1.4 riastrad ); 72 1.4 riastrad uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); 73 1.4 riastrad 74 1.4 riastrad y8 = vqtbl1q_u8(x8, rol8_tab); 75 1.4 riastrad 76 1.4 riastrad return vreinterpretq_u32_u8(y8); 77 1.4 riastrad #elif 0 78 1.4 riastrad /* 79 1.4 riastrad * GCC does a lousy job with this, spilling two 64-bit vector 80 1.4 riastrad * registers to the stack every time. There should be plenty 81 1.4 riastrad * of vector registers free, requiring no spills at all, and 82 1.4 riastrad * GCC should be able to hoist the load of rol8_tab out of any 83 1.4 riastrad * loops, but it doesn't and so attempting to use VTBL hurts 84 1.4 riastrad * more than it helps. 85 1.8 riastrad */ 86 1.8 riastrad static const uint8x8_t rol8_tab = V_N_U8( 87 1.8 riastrad 3, 0, 1, 2, 7, 4, 5, 6 88 1.4 riastrad ); 89 1.4 riastrad 90 1.4 riastrad uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); 91 1.4 riastrad 92 1.4 riastrad y64 = (uint64x2_t) { 93 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), 94 1.4 riastrad (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), 95 1.4 riastrad }; 96 1.4 riastrad 97 1.4 riastrad return vreinterpretq_u32_u64(y64); 98 1.4 riastrad #else 99 1.4 riastrad return vrolq_n_u32(x, 8); 100 1.4 riastrad #endif 101 1.4 riastrad } 102 1.4 riastrad 103 1.4 riastrad static inline uint32x4_t 104 1.4 riastrad rol7(uint32x4_t x) 105 1.4 riastrad { 106 1.4 riastrad 107 1.4 riastrad return vrolq_n_u32(x, 7); 108 1.4 riastrad } 109 1.1 riastrad 110 1.1 riastrad static inline void 112 1.1 riastrad chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, 113 1.1 riastrad unsigned nr) 114 1.1 riastrad { 115 1.1 riastrad uint32x4_t r0, r1, r2, r3; 116 1.1 riastrad uint32x4_t c0, c1, c2, c3; 117 1.1 riastrad 118 1.1 riastrad r0 = *p0; 119 1.1 riastrad r1 = *p1; 120 1.1 riastrad r2 = *p2; 121 1.1 riastrad r3 = *p3; 122 1.4 riastrad 123 1.4 riastrad for (; nr > 0; nr -= 2) { 124 1.4 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); 125 1.4 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); 126 1.1 riastrad r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); 127 1.1 riastrad r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); 128 1.1 riastrad 129 1.1 riastrad c0 = r0; 130 1.1 riastrad c1 = vextq_u32(r1, r1, 1); 131 1.1 riastrad c2 = vextq_u32(r2, r2, 2); 132 1.4 riastrad c3 = vextq_u32(r3, r3, 3); 133 1.4 riastrad 134 1.4 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); 135 1.4 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); 136 1.1 riastrad c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); 137 1.1 riastrad c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); 138 1.1 riastrad 139 1.1 riastrad r0 = c0; 140 1.1 riastrad r1 = vextq_u32(c1, c1, 3); 141 1.1 riastrad r2 = vextq_u32(c2, c2, 2); 142 1.1 riastrad r3 = vextq_u32(c3, c3, 1); 143 1.1 riastrad } 144 1.1 riastrad 145 1.1 riastrad *p0 = r0; 146 1.1 riastrad *p1 = r1; 147 1.1 riastrad *p2 = r2; 148 1.1 riastrad *p3 = r3; 149 1.1 riastrad } 150 1.1 riastrad 151 1.1 riastrad void 153 1.1 riastrad chacha_core_neon(uint8_t out[restrict static 64], 154 1.1 riastrad const uint8_t in[static 16], 155 1.1 riastrad const uint8_t k[static 32], 156 1.1 riastrad const uint8_t c[static 16], 157 1.1 riastrad unsigned nr) 158 1.1 riastrad { 159 1.8 riastrad uint32x4_t in0, in1, in2, in3; 160 1.8 riastrad uint32x4_t r0, r1, r2, r3; 161 1.8 riastrad 162 1.8 riastrad r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c)); 163 1.1 riastrad r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); 164 1.1 riastrad r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); 165 1.1 riastrad r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in)); 166 1.8 riastrad 167 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 168 1.8 riastrad 169 1.8 riastrad vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0))); 170 1.1 riastrad vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1))); 171 1.1 riastrad vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2))); 172 1.1 riastrad vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3))); 173 1.1 riastrad } 174 1.1 riastrad 175 1.1 riastrad void 176 1.1 riastrad hchacha_neon(uint8_t out[restrict static 32], 177 1.1 riastrad const uint8_t in[static 16], 178 1.1 riastrad const uint8_t k[static 32], 179 1.1 riastrad const uint8_t c[static 16], 180 1.1 riastrad unsigned nr) 181 1.8 riastrad { 182 1.8 riastrad uint32x4_t r0, r1, r2, r3; 183 1.8 riastrad 184 1.8 riastrad r0 = vreinterpretq_u32_u8(vld1q_u8(c)); 185 1.1 riastrad r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); 186 1.1 riastrad r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); 187 1.1 riastrad r3 = vreinterpretq_u32_u8(vld1q_u8(in)); 188 1.8 riastrad 189 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 190 1.1 riastrad 191 1.1 riastrad vst1q_u8(out + 0, vreinterpretq_u8_u32(r0)); 192 1.1 riastrad vst1q_u8(out + 16, vreinterpretq_u8_u32(r3)); 193 1.1 riastrad } 194 1.1 riastrad 195 1.1 riastrad void 197 1.1 riastrad chacha_stream_neon(uint8_t *restrict s, size_t n, 198 1.1 riastrad uint32_t blkno, 199 1.1 riastrad const uint8_t nonce[static 12], 200 1.1 riastrad const uint8_t k[static 32], 201 1.1 riastrad unsigned nr) 202 1.1 riastrad { 203 1.1 riastrad 204 1.8 riastrad for (; n >= 256; s += 256, n -= 256, blkno += 4) 205 1.8 riastrad chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); 206 1.1 riastrad 207 1.1 riastrad if (n) { 208 1.1 riastrad const uint32x4_t blkno_inc = /* (1,0,0,0) */ 209 1.8 riastrad vsetq_lane_u32(1, vdupq_n_u32(0), 0); 210 1.8 riastrad uint32x4_t in0, in1, in2, in3; 211 1.8 riastrad uint32x4_t r0, r1, r2, r3; 212 1.8 riastrad 213 1.1 riastrad in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32)); 214 1.1 riastrad in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); 215 1.1 riastrad in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); 216 1.1 riastrad in3 = (uint32x4_t) VQ_N_U32( 217 1.8 riastrad blkno, 218 1.1 riastrad le32dec(nonce), 219 1.2 riastrad le32dec(nonce + 4), 220 1.1 riastrad le32dec(nonce + 8) 221 1.1 riastrad ); 222 1.1 riastrad 223 1.1 riastrad for (; n; s += 64, n -= 64) { 224 1.1 riastrad r0 = in0; 225 1.8 riastrad r1 = in1; 226 1.8 riastrad r2 = in2; 227 1.8 riastrad r3 = in3; 228 1.8 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 229 1.2 riastrad r0 = vaddq_u32(r0, in0); 230 1.2 riastrad r1 = vaddq_u32(r1, in1); 231 1.2 riastrad r2 = vaddq_u32(r2, in2); 232 1.2 riastrad r3 = vaddq_u32(r3, in3); 233 1.8 riastrad 234 1.8 riastrad if (n < 64) { 235 1.8 riastrad uint8_t buf[64] __aligned(16); 236 1.8 riastrad 237 1.2 riastrad vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0)); 238 1.2 riastrad vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1)); 239 1.2 riastrad vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2)); 240 1.2 riastrad vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3)); 241 1.2 riastrad memcpy(s, buf, n); 242 1.8 riastrad 243 1.8 riastrad break; 244 1.8 riastrad } 245 1.8 riastrad 246 1.1 riastrad vst1q_u8(s + 0, vreinterpretq_u8_u32(r0)); 247 1.1 riastrad vst1q_u8(s + 16, vreinterpretq_u8_u32(r1)); 248 1.1 riastrad vst1q_u8(s + 32, vreinterpretq_u8_u32(r2)); 249 1.1 riastrad vst1q_u8(s + 48, vreinterpretq_u8_u32(r3)); 250 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc); 251 1.1 riastrad } 252 1.1 riastrad } 253 1.1 riastrad } 254 1.1 riastrad 255 1.1 riastrad void 257 1.1 riastrad chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, 258 1.1 riastrad uint32_t blkno, 259 1.1 riastrad const uint8_t nonce[static 12], 260 1.1 riastrad const uint8_t k[static 32], 261 1.1 riastrad unsigned nr) 262 1.1 riastrad { 263 1.1 riastrad 264 1.8 riastrad for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) 265 1.8 riastrad chacha_stream_xor256_neon(s, p, blkno, nonce, k, 266 1.1 riastrad chacha_const32, nr); 267 1.1 riastrad 268 1.1 riastrad if (n) { 269 1.8 riastrad const uint32x4_t blkno_inc = /* (1,0,0,0) */ 270 1.8 riastrad vsetq_lane_u32(1, vdupq_n_u32(0), 0); 271 1.8 riastrad uint32x4_t in0, in1, in2, in3; 272 1.8 riastrad uint32x4_t r0, r1, r2, r3; 273 1.1 riastrad 274 1.1 riastrad in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32)); 275 1.1 riastrad in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); 276 1.1 riastrad in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); 277 1.8 riastrad in3 = (uint32x4_t) VQ_N_U32( 278 1.1 riastrad blkno, 279 1.2 riastrad le32dec(nonce), 280 1.1 riastrad le32dec(nonce + 4), 281 1.1 riastrad le32dec(nonce + 8) 282 1.1 riastrad ); 283 1.1 riastrad 284 1.1 riastrad for (; n; s += 64, p += 64, n -= 64) { 285 1.8 riastrad r0 = in0; 286 1.8 riastrad r1 = in1; 287 1.8 riastrad r2 = in2; 288 1.8 riastrad r3 = in3; 289 1.2 riastrad chacha_permute(&r0, &r1, &r2, &r3, nr); 290 1.2 riastrad r0 = vaddq_u32(r0, in0); 291 1.2 riastrad r1 = vaddq_u32(r1, in1); 292 1.2 riastrad r2 = vaddq_u32(r2, in2); 293 1.2 riastrad r3 = vaddq_u32(r3, in3); 294 1.8 riastrad 295 1.8 riastrad if (n < 64) { 296 1.8 riastrad uint8_t buf[64] __aligned(16); 297 1.8 riastrad unsigned i; 298 1.2 riastrad 299 1.2 riastrad vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0)); 300 1.2 riastrad vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1)); 301 1.2 riastrad vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2)); 302 1.2 riastrad vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3)); 303 1.2 riastrad 304 1.2 riastrad for (i = 0; i < n - n%4; i += 4) 305 1.2 riastrad le32enc(s + i, 306 1.2 riastrad le32dec(p + i) ^ le32dec(buf + i)); 307 1.2 riastrad for (; i < n; i++) 308 1.8 riastrad s[i] = p[i] ^ buf[i]; 309 1.8 riastrad 310 1.8 riastrad break; 311 1.8 riastrad } 312 1.8 riastrad 313 1.8 riastrad r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0)); 314 1.8 riastrad r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16)); 315 1.8 riastrad r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32)); 316 1.1 riastrad r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48)); 317 1.1 riastrad vst1q_u8(s + 0, vreinterpretq_u8_u32(r0)); 318 1.1 riastrad vst1q_u8(s + 16, vreinterpretq_u8_u32(r1)); 319 1.1 riastrad vst1q_u8(s + 32, vreinterpretq_u8_u32(r2)); 320 1.1 riastrad vst1q_u8(s + 48, vreinterpretq_u8_u32(r3)); 321 1.1 riastrad in3 = vaddq_u32(in3, blkno_inc); 322 1.1 riastrad } 323 1.1 riastrad } 324 1.1 riastrad } 325 1.1 riastrad 326 1.1 riastrad void 328 1.1 riastrad xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, 329 1.1 riastrad uint32_t blkno, 330 1.1 riastrad const uint8_t nonce[static 24], 331 1.1 riastrad const uint8_t k[static 32], 332 1.1 riastrad unsigned nr) 333 1.1 riastrad { 334 1.1 riastrad uint8_t subkey[32]; 335 1.1 riastrad uint8_t subnonce[12]; 336 1.1 riastrad 337 1.1 riastrad hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 338 1.1 riastrad memset(subnonce, 0, 4); 339 1.1 riastrad memcpy(subnonce + 4, nonce + 16, 8); 340 1.1 riastrad chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); 341 1.1 riastrad } 342 1.1 riastrad 343 1.1 riastrad void 344 1.1 riastrad xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 345 1.1 riastrad uint32_t blkno, 346 1.1 riastrad const uint8_t nonce[static 24], 347 1.1 riastrad const uint8_t k[static 32], 348 1.1 riastrad unsigned nr) 349 1.1 riastrad { 350 1.1 riastrad uint8_t subkey[32]; 351 1.1 riastrad uint8_t subnonce[12]; 352 353 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 354 memset(subnonce, 0, 4); 355 memcpy(subnonce + 4, nonce + 16, 8); 356 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); 357 } 358