/*	$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $")

	.fpu	neon

/*
 * ChaCha round, split up so we can interleave the quarterrounds on
 * independent rows/diagonals to maximize pipeline efficiency, with
 * spills to deal with the scarcity of registers.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		VREV32.16 for 16,
 *	<<< 12		VSHL/VSRI/VORR (shift left, shift right and insert, OR)
 *	<<< 8		TBL (general permutation; rot8 below stored in r)
 *	<<< 7		VSHL/VSRI/VORR
 */

/*
 * ROUNDLD reloads the c2/c3 rows that the tail of the preceding ROUND
 * spilled to the 32-byte stack slot, so consecutive ROUNDs can be
 * interleaved without waiting on the spill.
 */
	.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
	vld1.8	{\c2-\c3}, [sp, :256]
	.endm

/*
 * One ChaCha round over four vectorized quarterrounds.  Each q register
 * holds one state word of four parallel blocks.  Requires:
 *	- r7 points at the 8-byte rot8 table (loaded into \c0l below),
 *	- sp points at a 32-byte-aligned 32-byte spill slot,
 *	- \c2-\c3 live in the spill slot on entry when chained via ROUNDLD.
 * On exit, \c2-\c3 have been spilled to the stack slot again.
 * The d-register aliases (\c0l, \d0l/\d0h, ...) must name the halves of
 * the corresponding q registers; VTBL operates on 64-bit d registers.
 */
	.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
	/* a += b; d ^= a; d <<<= 16 */
	vadd.u32 \a0, \a0, \b0
	vadd.u32 \a1, \a1, \b1
	vadd.u32 \a2, \a2, \b2
	vadd.u32 \a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vrev32.16 \d0, \d0		/* 16-bit halfword swap = <<< 16 */
	vrev32.16 \d1, \d1
	vrev32.16 \d2, \d2
	vrev32.16 \d3, \d3

	/* c += d; b ^= c; b <<<= 12 */
	vadd.u32 \c0, \c0, \d0
	vadd.u32 \c1, \c1, \d1
	vadd.u32 \c2, \c2, \d2
	vadd.u32 \c3, \c3, \d3

	vst1.8	{\c0-\c1}, [sp, :256]	/* free c0 and c1 as temps */

	veor	\c0, \b0, \c0
	veor	\c1, \b1, \c1
	vshl.u32 \b0, \c0, #12
	vshl.u32 \b1, \c1, #12
	vsri.u32 \b0, \c0, #(32 - 12)
	vsri.u32 \b1, \c1, #(32 - 12)

	veor	\c0, \b2, \c2
	veor	\c1, \b3, \c3
	vshl.u32 \b2, \c0, #12
	vshl.u32 \b3, \c1, #12
	vsri.u32 \b2, \c0, #(32 - 12)
	vsri.u32 \b3, \c1, #(32 - 12)

	vld1.8	{\c0l}, [r7, :64]	/* load rot8 table */

	/* a += b; d ^= a; d <<<= 8 */
	vadd.u32 \a0, \a0, \b0
	vadd.u32 \a1, \a1, \b1
	vadd.u32 \a2, \a2, \b2
	vadd.u32 \a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vtbl.8	\d0l, {\d0l}, \c0l	/* <<< 8 */
	vtbl.8	\d0h, {\d0h}, \c0l
	vtbl.8	\d1l, {\d1l}, \c0l
	vtbl.8	\d1h, {\d1h}, \c0l
	vtbl.8	\d2l, {\d2l}, \c0l
	vtbl.8	\d2h, {\d2h}, \c0l
	vtbl.8	\d3l, {\d3l}, \c0l
	vtbl.8	\d3h, {\d3h}, \c0l

	vld1.8	{\c0-\c1}, [sp, :256]	/* restore c0 and c1 */

	/* c += d; b ^= c; b <<<= 7 */
	vadd.u32 \c2, \c2, \d2
	vadd.u32 \c3, \c3, \d3
	vadd.u32 \c0, \c0, \d0
	vadd.u32 \c1, \c1, \d1

	vst1.8	{\c2-\c3}, [sp, :256]	/* free c2 and c3 as temps */

	veor	\c2, \b2, \c2
	veor	\c3, \b3, \c3
	vshl.u32 \b2, \c2, #7
	vshl.u32 \b3, \c3, #7
	vsri.u32 \b2, \c2, #(32 - 7)
	vsri.u32 \b3, \c3, #(32 - 7)

	veor	\c2, \b0, \c0
	veor	\c3, \b1, \c1
	vshl.u32 \b0, \c2, #7
	vshl.u32 \b1, \c3, #7
	vsri.u32 \b0, \c2, #(32 - 7)
	vsri.u32 \b1, \c3, #(32 - 7)
	.endm

	.text
	.p2align 2
.Lconstants_addr:
	/* PC-relative offset to .Lconstants, for position-independent code */
	.long	.Lconstants - .

/*
 * chacha_stream256_neon(uint8_t s[256]@r0,
 *     uint32_t blkno@r1,
 *     const uint8_t nonce[12]@r2,
 *     const uint8_t key[32]@r3,
 *     const uint8_t const[16]@sp[0],
 *     unsigned nr@sp[4])
 *
 *	Generate four consecutive 64-byte ChaCha blocks (counters
 *	blkno..blkno+3) into s, computed in parallel with one state word
 *	of all four blocks per q register.
 */
ENTRY(chacha_stream256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}
	mov	fp, sp

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* reserve space for two 128-bit/16-byte q registers */
	sub	sp, sp, #0x20
	bic	sp, sp, #0x1f		/* align */

	/* get parameters; fp+96 = caller sp (8 GPRs + d8-d15 = 96 bytes pushed) */
	add	ip, fp, #96
	add	r7, r7, r6		/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5}		/* r4 := const, r5 := nr */
	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.8	{q12}, [r4]		/* q12 := constant */
	vld1.8	{q13-q14}, [r3]		/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

#ifdef __ARM_BIG_ENDIAN
	/* nonce is a byte string; make the word values little-endian */
	rev	r6, r6
	rev	r8, r8
	rev	r10, r10
#endif

	/* broadcast each state word across one q register (4 blocks each) */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r1		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	b	2f

	/*
	 * Two rounds per iteration: the first ROUND works on the rows,
	 * the second (with b/c/d register lists rotated) on the
	 * diagonals.  nr counts single rounds and is assumed even.
	 */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	r5, r5, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  We have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10
	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.8	{q9}, [r4]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.8	{q8}, [r3]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
	 * enables us to issue all stores in consecutive pairs:
	 *	x0 in q0-q1
	 *	x1 in q8-q9
	 *	x2 in q2-q3
	 *	x3 in q10-q11
	 *	x4 in q4-q5
	 *	x5 in q12-q13
	 *	x6 in q6-q7
	 *	x7 in q14-q15
	 */

	vswp	q1, q4
	vswp	q3, q6

	/* add the original input words: constant row ... */
	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q6, q6, q9

	/* ... and key[0:16) row */
	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q3, q3, q8
	vadd.u32 q7, q7, q8

	vld1.8	{q8-q9}, [sp, :256]	/* restore q8-q9 */

	vst1.8	{q0-q1}, [r0]!
	vld1.8	{q0}, [r3]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transposition for the second half of the state */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vswp	q9, q12
	vswp	q11, q14

	/* add key[16:32) row ... */
	vadd.u32 q8, q8, q0
	vadd.u32 q12, q12, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q14, q14, q0

	/* ... and (block counter, nonce) row; q12-q15 already hold the
	 * per-block counters from the vadd above */
	vadd.u32 q9, q9, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q11, q11, q1
	vadd.u32 q15, q15, q1

	/* vst1.8	{q0-q1}, [r0]! */
	vst1.8	{q8-q9}, [r0]!
	vst1.8	{q2-q3}, [r0]!
	vst1.8	{q10-q11}, [r0]!
	vst1.8	{q4-q5}, [r0]!
	vst1.8	{q12-q13}, [r0]!
	vst1.8	{q6-q7}, [r0]!
	vst1.8	{q14-q15}, [r0]

	/* zero temporary space on the stack (it held key-derived state) */
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vst1.8	{q0-q1}, [sp, :256]

	/* restore callee-saves registers and stack */
	mov	sp, fp
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
 *     uint32_t blkno@r2,
 *     const uint8_t nonce[12]@r3,
 *     const uint8_t key[32]@sp[0],
 *     const uint8_t const[16]@sp[4],
 *     unsigned nr@sp[8])
 *
 *	Like chacha_stream256_neon, but XOR the four keystream blocks
 *	into the 256-byte plaintext p, writing the ciphertext to s.
 */
ENTRY(chacha_stream_xor256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}
	mov	fp, sp

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* reserve space for two 128-bit/16-byte q registers */
	sub	sp, sp, #0x20
	bic	sp, sp, #0x1f		/* align */

	/* get parameters; fp+96 = caller sp (8 GPRs + d8-d15 = 96 bytes pushed) */
	add	ip, fp, #96
	add	r7, r7, r6		/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.8	{q12}, [r5]		/* q12 := constant */
	vld1.8	{q13-q14}, [r4]		/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

#ifdef __ARM_BIG_ENDIAN
	/* nonce is a byte string; make the word values little-endian */
	rev	r6, r6
	rev	r8, r8
	rev	r10, r10
#endif

	/* broadcast each state word across one q register (4 blocks each) */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r2		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	b	2f

	/*
	 * Two rounds per iteration, as in chacha_stream256_neon; here
	 * the round counter lives in ip instead of r5.
	 */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	ip, ip, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  Now for the real fun:
	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
	 * transposed from one another, and the x[i] are in general
	 * registers and memory.  See comments in chacha_stream256_neon
	 * for the layout with swaps.
	 */

	sub	r7, r7, #0x10
	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.8	{q9}, [r5]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.8	{q8}, [r4]!	/* q8 := key[0:16) */

	vswp	d3, d6
	vswp	d9, d12
	vswp	d1, d4
	vswp	d11, d14

	vswp	q1, q4
	vswp	q3, q6

	/* add the original input words: constant row ... */
	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q6, q6, q9

	/* ... and key[0:16) row */
	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q3, q3, q8
	vadd.u32 q7, q7, q8

	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
	veor	q1, q1, q9

	vld1.8	{q8-q9}, [sp, :256]	/* restore q8-q9 */

	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
	vld1.8	{q0}, [r4]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transposition for the second half of the state */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d19, d22
	vswp	d25, d28
	vswp	d17, d20
	vswp	d27, d30

	vswp	q9, q12		/* free up q9 earlier for consecutive q8-q9 */
	vswp	q11, q14

	/* add key[16:32) row ... */
	vadd.u32 q8, q8, q0
	vadd.u32 q12, q12, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q14, q14, q0

	/* ... and (block counter, nonce) row */
	vadd.u32 q9, q9, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q11, q11, q1
	vadd.u32 q15, q15, q1

	/*
	 * From here on, loads of the next plaintext chunk are
	 * interleaved with stores of the previous ciphertext chunk to
	 * keep the memory pipeline busy.
	 */
	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
	veor	q1, q1, q9

	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
	veor	q3, q3, q9

	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
	vst1.8	{q2-q3}, [r0]!	/* store ciphertext bytes [64:96) */

	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
	veor	q11, q11, q1

	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
	vst1.8	{q10-q11}, [r0]!	/* store ciphertext bytes [96:128) */

	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
	veor	q5, q5, q9

	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
	vst1.8	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */

	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
	veor	q13, q13, q1

	vld1.8	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
	vst1.8	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */

	veor	q6, q6, q8	/* compute ciphertext bytes [192:224) */
	veor	q7, q7, q9

	vst1.8	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */

	veor	q14, q14, q0	/* compute ciphertext bytes [224:256) */
	veor	q15, q15, q1

	vst1.8	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */

	/* zero temporary space on the stack (it held key-derived state) */
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vst1.8	{q0-q1}, [sp, :256]

	/* restore callee-saves registers and stack */
	mov	sp, fp
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4
.Lconstants:

	/* lane offsets added to blkno to get per-block counters */
	.type	v0123,%object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/* VTBL index table: byte permutation rotating each 32-bit lane
	 * left by 8 bits (little-endian), used for <<< 8 in ROUND */
	.type	rot8,%object
rot8:
	.byte	3,0,1,2, 7,4,5,6
END(rot8)