/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * adrl: load the address of a symbol into a register, PC-relative,
 * via ADRP for the 4 KiB page and ADD for the low 12 bits.
 */
.macro	adrl	reg, addr
	adrp	\reg, \addr
	add	\reg, \reg, #:lo12:\addr
.endm

#define	_ALIGN_TEXT							      \
	.p2align 4

#define	ENTRY(x)							      \
	.text;								      \
	_ALIGN_TEXT;							      \
	.global	x;							      \
	.type	x,@function;						      \
x:

#define	END(x)								      \
	.size x, . - x

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
 *	<<< 12		SHL/SRI (shift left, then shift right and insert)
 *	<<< 8		TBL (general permutation; rot8 below stored in r)
 *	<<< 7		SHL/SRI (shift left, then shift right and insert)
 */
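/*
 * For reference, one ChaCha quarterround written out in plain C --
 * an illustrative sketch only, not assembled into this file.  The
 * STEP0..STEP19 macros below perform the same four add/xor/rotate
 * groups, applied to four independent quarterrounds at once:
 *
 *	static inline void
 *	chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c,
 *	    uint32_t *d)
 *	{
 *		*a += *b; *d ^= *a; *d = (*d << 16) | (*d >> 16);
 *		*c += *d; *b ^= *c; *b = (*b << 12) | (*b >> 20);
 *		*a += *b; *d ^= *a; *d = (*d <<  8) | (*d >> 24);
 *		*c += *d; *b ^= *c; *b = (*b <<  7) | (*b >> 25);
 *	}
 */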
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#endif
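
/*
 * On big-endian kernels HTOLE32/LE32TOH byte-swap each 32-bit lane,
 * so the vector arithmetic below always operates on the
 * little-endian words that ChaCha is defined over.
 */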

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
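/*
 * State layout: this routine computes four 64-byte ChaCha blocks at
 * once.  Register vN (N = 0..15) holds state word N, with lane i
 * carrying the copy for block number blkno+i; only v12, the block
 * counter word, differs across lanes.  Each ROUND therefore advances
 * all four blocks in parallel, and the ST4-into-lane stores at the
 * end transpose the result back into four consecutive 64-byte blocks
 * of output.
 */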
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x4		/* x10 := c */
	mov	x11, x3		/* x11 := k */
	add	x12, x3, #16	/* x12 := k + 16 (second half of key) */
	mov	x13, x2		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* save v12 whole: unlike the rest it isn't a dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

1:	subs	w5, w5, #2	/* two rounds (column + diagonal) per iteration */
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8	/* restore the spilled initial state words */
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
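/*
 * Same state layout and round structure as chacha_stream256_neon
 * above; the only difference is that the 256 bytes of keystream are
 * XORed with the input p before being stored to s.
 */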
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x5		/* x10 := c */
	mov	x11, x4		/* x11 := k */
	add	x12, x4, #16	/* x12 := k + 16 (second half of key) */
	mov	x13, x3		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* save v12 whole: unlike the rest it isn't a dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

1:	subs	w6, w6, #2	/* two rounds (column + diagonal) per iteration */
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8	/* restore the spilled initial state words */
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could do these sixteen LD4-into-lane instructions with
	 * four LD1-into-register instructions instead, but we would
	 * need to permute the elements in v0-v15 to put them in the
	 * right order.  We can do that by a series of ZIP1/ZIP2 on
	 * 4s-sized elements, and then ZIP1/ZIP2 on 2d-sized elements,
	 * but the net cost of the thirty-two ZIP1/ZIP2 instructions
	 * seems to exceed the savings from issuing four LD1
	 * instructions rather than sixteen LD4 instructions, even if
	 * we interleave the LD1 instructions with the ZIPs.
	 */
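	/*
	 * Concretely, the permutation the ZIP1/ZIP2 sequence would
	 * have to perform is a 4x4 transpose of 32-bit words for each
	 * group of four registers (v0-v3, v4-v7, v8-v11, v12-v15) --
	 * sketched here in C for illustration only:
	 *
	 *	static void
	 *	transpose4x4(uint32_t m[4][4])
	 *	{
	 *		for (unsigned i = 0; i < 4; i++)
	 *			for (unsigned j = 0; j < i; j++) {
	 *				uint32_t t = m[i][j];
	 *				m[i][j] = m[j][i];
	 *				m[j][i] = t;
	 *			}
	 *	}
	 *
	 * Four such transposes at eight ZIP1/ZIP2 instructions each
	 * account for the thirty-two ZIPs mentioned above.
	 */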
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
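	/*
	 * rot8 is the TBL index vector used by STEP12 for the <<< 8
	 * rotation: destination bytes 0-3 of each word take source
	 * bytes (3,0,1,2), i.e. a rotate left by 8 bits of every
	 * little-endian 32-bit word.  For example, 0xddccbbaa becomes
	 * 0xccbbaadd.
	 */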
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)