/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
 *	<<< 12		SHL/SRI (shift left, then shift right and insert)
 *	<<< 8		TBL (general byte permutation; rot8 below stored in r)
 *	<<< 7		SHL/SRI
 */
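/*
 * For reference, with the definitions enabled below, one quarterround
 * on (a,b,c,d) expands to the following sequence (t is a scratch
 * register, r holds the rot8 table):
 *
 *	add	a.4s, a.4s, b.4s	// a += b	(STEP0)
 *	eor	d.16b, d.16b, a.16b	// d ^= a	(STEP1)
 *	rev32	d.8h, d.8h		// d <<<= 16	(STEP2)
 *	add	c.4s, c.4s, d.4s	// c += d	(STEP5)
 *	eor	t.16b, b.16b, c.16b	// t := b ^ c	(STEP6)
 *	shl	b.4s, t.4s, #12		// b := t << 12	(STEP7)
 *	sri	b.4s, t.4s, #20		// b |= t >> 20	(STEP8)
 *	add	a.4s, a.4s, b.4s	// a += b	(STEP10)
 *	eor	d.16b, d.16b, a.16b	// d ^= a	(STEP11)
 *	tbl	d.16b, {d.16b}, r.16b	// d <<<= 8	(STEP12)
 *	add	c.4s, c.4s, d.4s	// c += d	(STEP15)
 *	eor	t.16b, b.16b, c.16b	// t := b ^ c	(STEP16)
 *	shl	b.4s, t.4s, #7		// b := t << 7	(STEP17)
 *	sri	b.4s, t.4s, #25		// b |= t >> 25	(STEP18)
 *
 * ROUND issues each step for four independent (a,b,c,d) quadruples in
 * turn, so consecutive instructions never depend on one another.
 */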
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

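/*
 * ChaCha is defined in terms of little-endian 32-bit words, so on
 * big-endian builds (__AARCH64EB__) we byte-swap each 32-bit lane with
 * REV32; on little-endian builds these macros are no-ops.
 */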
#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
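/*
 * Generates four consecutive 64-byte ChaCha blocks, for block numbers
 * blkno through blkno + 3, into s.  nr is the round count and must be
 * even, since the loop below retires two rounds per iteration; the
 * standard ChaCha20 uses nr = 20.
 */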
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x4		/* x10 := c */
	mov	x11, x3		/* x11 := k */
	add	x12, x3, #16	/* x12 := k + 16 */
	mov	x13, x2		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
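	/*
	 * The four blocks are processed in transposed form: each of
	 * v0-v15 holds one of the sixteen ChaCha state words, with
	 * lane j belonging to block j.  The ld4r/ld3r structure loads
	 * replicate each input word across all four lanes, so after
	 * the add above only v12, the block counter, differs per lane.
	 */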

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

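	/*
	 * Save a copy of the initial state to add back in after the
	 * rounds.  There are not enough vector registers to hold all
	 * sixteen saved words plus four temporaries and the rot8
	 * table, so the five words that are identical in every lane
	 * (v10, v11, v13, v14, v15) are stashed in GPRs and
	 * re-broadcast with dup afterwards; v12, the block counter,
	 * differs per lane and keeps a full vector copy in v26.
	 */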
	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

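	/*
	 * nr/2 double-rounds: the first ROUND works on the columns
	 * (v0,v4,v8,v12), ..., (v3,v7,v11,v15); the second works on
	 * the diagonals (v0,v5,v10,v15), ..., (v3,v4,v9,v14), which is
	 * why its b/c/d register groups are rotated by one.
	 */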
	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

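	/*
	 * Transpose back on the way out: each st4 gathers lane j of
	 * four consecutive state words, so the sixteen stores write
	 * the four 64-byte blocks out in order.
	 */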
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
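/*
 * Same as chacha_stream256_neon, except the 256 bytes of keystream are
 * combined with the input p by XOR before being stored to s.
 */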
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x5		/* x10 := c */
	mov	x11, x4		/* x11 := k */
	add	x12, x4, #16	/* x12 := k + 16 */
	mov	x13, x3		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
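	/*
	 * From here on the structure mirrors chacha_stream256_neon
	 * above; see the comments there for the transposed state
	 * layout, the GPR spills, and the column/diagonal rounds.
	 */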

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but we would then
	 * need to permute the elements in v0-v15 to put them in the
	 * right order.  That can be done with a series of ZIP1/ZIP2 on
	 * 4s-sized elements followed by ZIP1/ZIP2 on 2d-sized
	 * elements, but the net cost of the thirty-two ZIP1/ZIP2
	 * instructions seems to exceed the savings from issuing four
	 * LD1 instructions instead of sixteen LD4 instructions, even
	 * if we interleave the LD1 instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
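	/*
	 * Byte permutation for TBL that rotates each 32-bit lane left
	 * by 8: loaded little-endian, the table reads (3,0,1,2,
	 * 7,4,5,6, ...), i.e. output byte i of each word comes from
	 * input byte (i + 3) mod 4 of the same word.
	 */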
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)