/* $NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

/*
 * ROUND(a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, t0,t1,t2,t3, r)
 *
 *	One ChaCha round = four quarterrounds, on (a0,b0,c0,d0),
 *	(a1,b1,c1,d1), (a2,b2,c2,d2), (a3,b3,c3,d3).  Each of the
 *	twenty STEPn pieces of the quarterround is emitted for all
 *	four quarterrounds before moving on to the next piece, so
 *	that consecutive instructions are independent of one another.
 *
 *	a*, b*, c*, d* are the vector registers holding the four rows
 *	of the state; t0..t3 are scratch vector registers (one per
 *	quarterround); r is the register holding the rot8 TBL indices.
 *
 *	Arguments are forwarded positionally to STEP, so the parameter
 *	order here must stay the same as in STEP.
 */
#define	ROUND(a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP1,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP2,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP3,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP4,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP5,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP6,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP7,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP8,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP9,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);   \
STEP(STEP10,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP11,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP12,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP13,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP14,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP15,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP16,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP17,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP18,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
STEP(STEP19,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3, r);  \
/* end ROUND */

/*
 * STEP(f, a0..a3, b0..b3, c0..c3, d0..d3, t0..t3, r)
 *
 *	Apply the single quarterround piece f (one of STEP0..STEP19)
 *	to each of the four quarterrounds in turn, giving quarterround
 *	i its own scratch register ti.
 */
#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16	REV32 Vn.8h (swap the 16-bit halves of each 32-bit lane)
 *	<<< 12	SHL/SRI (shift left into b, then shift right and insert;
 *		the xor result is kept in the temporary t)
 *	<<<  8	TBL (general byte permutation; rot8 below, held in r)
 *	<<<  7	SHL/SRI
 *
 * The `#if 0' branches are the straightforward SHL/USHR/ORR
 * formulations of the same rotations, kept for reference.  Steps that
 * the shorter sequences do not need expand to nothing so that the
 * STEPn numbering used by ROUND stays fixed.
 */
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

/*
 * HTOLE32(x), LE32TOH(x)
 *
 *	Convert every 32-bit word of vector x (written with a .16b
 *	arrangement) between host and little-endian byte order.  On
 *	big-endian builds this reverses the bytes within each word; on
 *	little-endian builds both macros expand to nothing.
 */
#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 *
 *	Write 256 bytes of ChaCha output -- the four 64-byte blocks
 *	numbered blkno, blkno+1, blkno+2, blkno+3 -- to s.
 *
 *	The round loop subtracts 2 from nr per iteration and exits
 *	only when the count reaches exactly zero, so nr must be even
 *	and nonzero.
 *
 *	Register use:
 *	  v0-v15   the sixteen state words; lane i of each register
 *		   belongs to block blkno+i
 *	  v16-v26  copies of the initial v0-v9 and v12, for the final
 *		   addition
 *	  w8-w12   lane 0 of the initial v10, v11, v13, v14, v15 (all
 *		   lanes of those are equal, so one scalar suffices)
 *	  v27      rot8 TBL indices during the rounds
 *	  v28-v31  temporaries during the rounds
 *	Clobbers x0, w5, x8-x13, v0-v7, v16-v31; d8-d15 are saved and
 *	restored.
 */
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push frame: fp/lr + uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := const */
	mov	x11, x3			/* x11 := key */
	add	x12, x3, #16		/* x12 := key + 16 bytes */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	/* Save the initial state for the final addition. */
	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w5, w5, #2		/* two rounds per iteration */
	/* column round: quarterrounds on (v0,v4,v8,v12), ... */
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	/* diagonal round: quarterrounds on (v0,v5,v10,v15), ... */
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b			/* flags still from the subs above */

	/* Rebuild the initial v10, v11, v13, v14, v15 from scalars. */
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	/* Add the initial state back in. */
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	/* Store lane i of every state word as block i: 16 x 16 bytes. */
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop frame: fp/lr + uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 *
 *	Compute the same 256 bytes of ChaCha output as
 *	chacha_stream256_neon (blocks blkno..blkno+3), xor them with
 *	the 256 bytes at p, and write the result to s.
 *
 *	The round loop subtracts 2 from nr per iteration and exits
 *	only when the count reaches exactly zero, so nr must be even
 *	and nonzero.
 *
 *	Register use during the rounds is the same as in
 *	chacha_stream256_neon; afterwards v16-v31 are reused to hold
 *	the 256 input bytes.
 *	Clobbers x0, x1, w6, x8-x13, v0-v7, v16-v31; d8-d15 are saved
 *	and restored.
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push frame: fp/lr + uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := const */
	mov	x11, x4			/* x11 := key */
	add	x12, x4, #16		/* x12 := key + 16 bytes */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	/* Save the initial state for the final addition. */
	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2		/* two rounds per iteration */
	/* column round: quarterrounds on (v0,v4,v8,v12), ... */
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	/* diagonal round: quarterrounds on (v0,v5,v10,v15), ... */
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b			/* flags still from the subs above */

	/* Rebuild the initial v10, v11, v13, v14, v15 from scalars. */
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	/* Add the initial state back in. */
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could do these sixteen LD4-into-lane instructions instead
	 * by four LD1-into-register instructions, but we would need to
	 * permute the elements in v0-v15 to put them in the right
	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings in cost from four LD1 instructions rather
	 * than sixteen LD4 instructions, even if we interleave the LD1
	 * instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	/* v16-v31 := input xor ChaCha output */
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	/* Store with the same lane layout the input was loaded with. */
	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop frame: fp/lr + uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	/* Per-lane block number offsets, added to the splatted blkno. */
	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * TBL byte indices rotating each 32-bit lane left by 8 bits:
	 * within every lane the result bytes, least significant first,
	 * are source bytes 3, 0, 1, 2 of that lane.
	 *
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)