/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

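/*
 * adrl: form the full PC-relative address of a symbol with ADRP (the
 * 4 KiB page) plus ADD of the #:lo12: page offset; used below to reach
 * the v0123/rot8 tables in .rodata.
 */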
.macro	adrl	reg, addr
	adrp	\reg, \addr
	add	\reg, \reg, #:lo12:\addr
.endm

#define	_ALIGN_TEXT							      \
	.p2align 4

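/*
 * Minimal ENTRY/END macros: ENTRY emits the section, alignment,
 * visibility, and type directives for a global function and opens its
 * label; END records the symbol's size.
 */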
#define	ENTRY(x)							      \
	.text;								      \
	_ALIGN_TEXT;							      \
	.global	x;							      \
	.type	x,@function;						      \
x:

#define	END(x)								      \
	.size x, . - x

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent columns/diagonals to maximize
 * pipeline efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the halfwords of each 32-bit word)
 *	<<< 12		SHL/SRI (shift left; shift right and insert)
 *	<<< 8		TBL (general byte permutation; rot8 table below,
 *			passed in r)
 *	<<< 7		SHL/SRI (shift left; shift right and insert)
 */
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif
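
/*
 * The enabled alternatives above need fewer instructions than the
 * disabled #if 0 versions: REV32 and TBL each replace a
 * three-instruction shift/or sequence, and SHL plus SRI replaces
 * SHL/USHR/ORR because SRI merges the OR into the insert.
 */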

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x4		/* x10 := c */
	mov	x11, x3		/* x11 := k */
	add	x12, x3, #16	/* x12 := k + 16 */
	mov	x13, x2		/* x13 := nonce */
	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
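
	/*
	 * The state is kept transposed across four blocks: register vN
	 * holds word N of the ChaCha state for blocks blkno+0..blkno+3,
	 * one block per 32-bit lane.  The ld4r/ld3r loads above
	 * replicate each input word into all four lanes; only v12, the
	 * block counter, differs per lane.
	 */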

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

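	/*
	 * One iteration is a ChaCha double-round on all four blocks at
	 * once: the first ROUND does the column quarterrounds
	 * (v0,v4,v8,v12), (v1,v5,v9,v13), (v2,v6,v10,v14),
	 * (v3,v7,v11,v15), and the second the diagonal quarterrounds
	 * (v0,v5,v10,v15), (v1,v6,v11,v12), (v2,v7,v8,v13),
	 * (v3,v4,v9,v14), which is why nr is consumed two at a time.
	 */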
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

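	/*
	 * Rebuild the saved initial values of v10, v11, v13, v14, and
	 * v15 from w8-w12; there were not enough spare vector registers
	 * to keep full copies, since v27 holds rot8 and v28-v31 serve
	 * as round temporaries.  A dup of the saved word suffices
	 * because every state word except the block counter v12 is the
	 * same in all four lanes.
	 */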
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

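	/*
	 * The sixteen ST4-by-lane stores below undo the transposition:
	 * lane j of v0-v15 is block j, and each ST4 writes four
	 * consecutive state words of one block, so the output comes
	 * out as four contiguous 64-byte blocks.
	 */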
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x5		/* x10 := c */
	mov	x11, x4		/* x11 := k */
	add	x12, x4, #16	/* x12 := k + 16 */
	mov	x13, x3		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
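
	/*
	 * As in chacha_stream256_neon: register vN holds word N of the
	 * state for four consecutive blocks, one block per lane, and
	 * only the block counter v12 differs across lanes.
	 */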

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but then we would
	 * need to permute the elements of v0-v15 into the right order,
	 * which we could do with a series of ZIP1/ZIP2 on 4s-sized
	 * elements followed by ZIP1/ZIP2 on 2d-sized elements.  The net
	 * cost of those thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed what the four LD1 instructions save over sixteen LD4
	 * instructions, even if the LD1s are interleaved with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
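
	/*
	 * The plaintext just loaded has the same word-transposed layout
	 * as the keystream: lane j of v16-v31 is block j, with v(16+N)
	 * holding word N.  A plain EOR per register therefore yields
	 * the ciphertext in the layout the ST4 stores below expect.
	 */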

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	LE32TOH(v12.16b)
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)
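
	/*
	 * rot8 above is the byte-index table used by TBL to implement
	 * the <<< 8 rotation: output byte n of each 32-bit lane is
	 * taken from the input byte named by index n, rotating every
	 * lane left by eight bits.
	 */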