Home | History | Annotate | Line # | Download | only in arm
chacha_neon_64.S revision 1.2
      1  1.2  riastrad /*	$NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $	*/
      2  1.1  riastrad 
      3  1.1  riastrad /*-
      4  1.1  riastrad  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  1.1  riastrad  * All rights reserved.
      6  1.1  riastrad  *
      7  1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8  1.1  riastrad  * modification, are permitted provided that the following conditions
      9  1.1  riastrad  * are met:
     10  1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11  1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12  1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14  1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15  1.1  riastrad  *
     16  1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1  riastrad  */
     28  1.1  riastrad 
     29  1.2  riastrad #include <aarch64/asm.h>
     30  1.1  riastrad 
     31  1.1  riastrad #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
     32  1.1  riastrad STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     33  1.1  riastrad STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     34  1.1  riastrad STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     35  1.1  riastrad STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     36  1.1  riastrad STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     37  1.1  riastrad STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     38  1.1  riastrad STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     39  1.1  riastrad STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     40  1.1  riastrad STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     41  1.1  riastrad STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     42  1.1  riastrad STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     43  1.1  riastrad STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     44  1.1  riastrad STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     45  1.1  riastrad STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     46  1.1  riastrad STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     47  1.1  riastrad STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     48  1.1  riastrad STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     49  1.1  riastrad STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     50  1.1  riastrad STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     51  1.1  riastrad STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     52  1.1  riastrad /* end ROUND */
     53  1.1  riastrad 
     54  1.1  riastrad #define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
     55  1.1  riastrad 	f(a0,b0,c0,d0, t0, r);						      \
     56  1.1  riastrad 	f(a1,b1,c1,d1, t1, r);						      \
     57  1.1  riastrad 	f(a2,b2,c2,d2, t2, r);						      \
     58  1.1  riastrad 	f(a3,b3,c3,d3, t3, r);						      \
     59  1.1  riastrad 	/* end of STEP */
     60  1.1  riastrad 
     61  1.1  riastrad /*
     62  1.1  riastrad  * Each step of the ChaCha quarterround, split up so we can interleave
     63  1.1  riastrad  * the quarterrounds on independent rows/diagonals to maximize pipeline
     64  1.1  riastrad  * efficiency.  Reference:
     65  1.1  riastrad  *
     66  1.1  riastrad  *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
     67  1.1  riastrad  *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
     68  1.1  riastrad  *	https://cr.yp.to/papers.html#chacha
     69  1.1  riastrad  *
     70  1.1  riastrad  *	a += b; d ^= a; d <<<= 16;
     71  1.1  riastrad  *	c += d; b ^= c; b <<<= 12;
     72  1.1  riastrad  *	a += b; d ^= a; d <<<= 8;
     73  1.1  riastrad  *	c += d; b ^= c; b <<<= 7;
     74  1.1  riastrad  *
     75  1.1  riastrad  * The rotations are implemented with:
      76  1.1  riastrad  *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each 32-bit word)
      77  1.1  riastrad  *	<<< 12		SHL/SRI (shift left, then shift right and insert)
      78  1.1  riastrad  *	<<< 8		TBL (general permutation; rot8 below stored in r)
      79  1.1  riastrad  *	<<< 7		SHL/SRI
     80  1.1  riastrad  */
      81  1.1  riastrad #define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s	/* a += b */
      82  1.1  riastrad #define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b	/* d ^= a */
      83  1.1  riastrad #if 0	/* disabled: generic SHL/USHR/ORR rotate by 16 through t */
      84  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
      85  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
      86  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
      87  1.1  riastrad #else	/* enabled: a single REV32 on halfwords does d <<<= 16 */
      88  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h	/* d <<<= 16 */
      89  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	/* nothing */
      90  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	/* nothing */
      91  1.1  riastrad #endif
      92  1.1  riastrad 
      93  1.1  riastrad #define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s	/* c += d */
      94  1.1  riastrad #if 0	/* disabled: generic SHL/USHR/ORR rotate by 12 through t */
      95  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
      96  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
      97  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
      98  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
      99  1.1  riastrad #else	/* enabled: XOR into t, then SHL/SRI from t build the rotation in b */
     100  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b	/* t = b ^ c */
     101  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12	/* b = t << 12 */
     102  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)	/* b |= t >> 20 */
     103  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	/* nothing */
     104  1.1  riastrad #endif
     105  1.1  riastrad 
     106  1.1  riastrad #define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s	/* a += b */
     107  1.1  riastrad #define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b	/* d ^= a */
     108  1.1  riastrad #if 0	/* disabled: generic SHL/USHR/ORR rotate by 8 through t */
     109  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
     110  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
     111  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
     112  1.1  riastrad #else	/* enabled: byte permutation by the table held in r */
     113  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b	/* d <<<= 8 */
     114  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	/* nothing */
     115  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	/* nothing */
     116  1.1  riastrad #endif
     117  1.1  riastrad 
     118  1.1  riastrad #define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s	/* c += d */
     119  1.1  riastrad #if 0	/* disabled: generic SHL/USHR/ORR rotate by 7 through t */
     120  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
     121  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
     122  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
     123  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
     124  1.1  riastrad #else	/* enabled: XOR into t, then SHL/SRI from t build the rotation in b */
     125  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b	/* t = b ^ c */
     126  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7	/* b = t << 7 */
     127  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)	/* b |= t >> 25 */
     128  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	/* nothing */
     129  1.1  riastrad #endif
    130  1.1  riastrad 
     131  1.1  riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN	/* both conversions expand to nothing */
     132  1.1  riastrad #define	HTOLE32(x)
     133  1.1  riastrad #define	LE32TOH(x)
     134  1.1  riastrad #elif _BYTE_ORDER == _BIG_ENDIAN	/* both conversions byte-swap each 32-bit word */
     135  1.1  riastrad #define	HTOLE32(x)	rev32	x, x	/* x is passed with its arrangement, e.g. v0.16b */
     136  1.1  riastrad #define	LE32TOH(x)	rev32	x, x
     137  1.1  riastrad #endif
    138  1.1  riastrad 
    139  1.1  riastrad /*
    140  1.1  riastrad  * chacha_stream256_neon(uint8_t s[256]@x0,
    141  1.1  riastrad  *     uint32_t blkno@w1,
    142  1.1  riastrad  *     const uint8_t nonce[12]@x2,
    143  1.1  riastrad  *     const uint8_t key[12]@x3,
    144  1.1  riastrad  *     const uint8_t const[16]@x4,
    145  1.1  riastrad  *     unsigned nr@w5)
    146  1.1  riastrad  */
    147  1.1  riastrad ENTRY(chacha_stream256_neon)
    148  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
    149  1.1  riastrad 	mov	fp, sp
    150  1.1  riastrad 
    151  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
    152  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    153  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    154  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    155  1.1  riastrad 
    156  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    157  1.1  riastrad 	mov	x10, x4		/* r10 := c */
    158  1.1  riastrad 	mov	x11, x3		/* r11 := k */
    159  1.1  riastrad 	add	x12, x3, #16	/* r12 := k+4 */
    160  1.1  riastrad 	mov	x13, x2		/* r13 := nonce */
    161  1.1  riastrad 
    162  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    163  1.1  riastrad 	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
    164  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    165  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    166  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    167  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    168  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    169  1.1  riastrad 
    170  1.1  riastrad 	HTOLE32(v0.16b)
    171  1.1  riastrad 	HTOLE32(v1.16b)
    172  1.1  riastrad 	HTOLE32(v2.16b)
    173  1.1  riastrad 	HTOLE32(v3.16b)
    174  1.1  riastrad 	HTOLE32(v4.16b)
    175  1.1  riastrad 	HTOLE32(v5.16b)
    176  1.1  riastrad 	HTOLE32(v6.16b)
    177  1.1  riastrad 	HTOLE32(v7.16b)
    178  1.1  riastrad 	HTOLE32(v8.16b)
    179  1.1  riastrad 	HTOLE32(v9.16b)
    180  1.1  riastrad 	HTOLE32(v10.16b)
    181  1.1  riastrad 	HTOLE32(v11.16b)
    182  1.1  riastrad 	HTOLE32(v12.16b)
    183  1.1  riastrad 	HTOLE32(v13.16b)
    184  1.1  riastrad 	HTOLE32(v14.16b)
    185  1.1  riastrad 	HTOLE32(v15.16b)
    186  1.1  riastrad 
    187  1.1  riastrad 	mov	v16.16b, v0.16b
    188  1.1  riastrad 	mov	v17.16b, v1.16b
    189  1.1  riastrad 	mov	v18.16b, v2.16b
    190  1.1  riastrad 	mov	v19.16b, v3.16b
    191  1.1  riastrad 	mov	v20.16b, v4.16b
    192  1.1  riastrad 	mov	v21.16b, v5.16b
    193  1.1  riastrad 	mov	v22.16b, v6.16b
    194  1.1  riastrad 	mov	v23.16b, v7.16b
    195  1.1  riastrad 	mov	v24.16b, v8.16b
    196  1.1  riastrad 	mov	v25.16b, v9.16b
    197  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    198  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    199  1.1  riastrad 	mov	w9, v11.s[0]
    200  1.1  riastrad 	mov	w10, v13.s[0]
    201  1.1  riastrad 	mov	w11, v14.s[0]
    202  1.1  riastrad 	mov	w12, v15.s[0]
    203  1.1  riastrad 
    204  1.1  riastrad 1:	subs	w5, w5, #2
    205  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    206  1.1  riastrad 	    v28,v29,v30,v31, v27)
    207  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    208  1.1  riastrad 	    v28,v29,v30,v31, v27)
    209  1.1  riastrad 	b.ne	1b
    210  1.1  riastrad 
    211  1.1  riastrad 	dup	v27.4s, w8
    212  1.1  riastrad 	dup	v28.4s, w9
    213  1.1  riastrad 	dup	v29.4s, w10
    214  1.1  riastrad 	dup	v30.4s, w11
    215  1.1  riastrad 	dup	v31.4s, w12
    216  1.1  riastrad 
    217  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    218  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    219  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    220  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    221  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    222  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    223  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    224  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    225  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    226  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    227  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    228  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    229  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    230  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    231  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    232  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    233  1.1  riastrad 
    234  1.1  riastrad 	LE32TOH(v0.16b)
    235  1.1  riastrad 	LE32TOH(v1.16b)
    236  1.1  riastrad 	LE32TOH(v2.16b)
    237  1.1  riastrad 	LE32TOH(v3.16b)
    238  1.1  riastrad 	LE32TOH(v4.16b)
    239  1.1  riastrad 	LE32TOH(v5.16b)
    240  1.1  riastrad 	LE32TOH(v6.16b)
    241  1.1  riastrad 	LE32TOH(v7.16b)
    242  1.1  riastrad 	LE32TOH(v8.16b)
    243  1.1  riastrad 	LE32TOH(v9.16b)
    244  1.1  riastrad 	LE32TOH(v10.16b)
    245  1.1  riastrad 	LE32TOH(v11.16b)
    246  1.1  riastrad 	LE32TOH(v12.16b)
    247  1.1  riastrad 	LE32TOH(v13.16b)
    248  1.1  riastrad 	LE32TOH(v14.16b)
    249  1.1  riastrad 	LE32TOH(v15.16b)
    250  1.1  riastrad 
    251  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
    252  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
    253  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
    254  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
    255  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
    256  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
    257  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
    258  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
    259  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
    260  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
    261  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
    262  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
    263  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
    264  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
    265  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
    266  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16
    267  1.1  riastrad 
    268  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
    269  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    270  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    271  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    272  1.1  riastrad 
    273  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
    274  1.1  riastrad 	ret
    275  1.1  riastrad END(chacha_stream256_neon)
    276  1.1  riastrad 
    277  1.1  riastrad /*
    278  1.1  riastrad  * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
    279  1.1  riastrad  *     uint32_t blkno@w2,
    280  1.1  riastrad  *     const uint8_t nonce[12]@x3,
    281  1.1  riastrad  *     const uint8_t key[32]@x4,
    282  1.1  riastrad  *     const uint8_t const[16]@x5,
    283  1.1  riastrad  *     unsigned nr@w6)
    284  1.1  riastrad  */
    285  1.1  riastrad ENTRY(chacha_stream_xor256_neon)
    286  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
    287  1.1  riastrad 	mov	fp, sp
    288  1.1  riastrad 
    289  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
    290  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    291  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    292  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    293  1.1  riastrad 
    294  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    295  1.1  riastrad 	mov	x10, x5		/* r10 := c */
    296  1.1  riastrad 	mov	x11, x4		/* r11 := k */
    297  1.1  riastrad 	add	x12, x4, #16	/* r12 := k+4 */
    298  1.1  riastrad 	mov	x13, x3		/* r13 := nonce */
    299  1.1  riastrad 
    300  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    301  1.1  riastrad 	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
    302  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    303  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    304  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    305  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    306  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    307  1.1  riastrad 
    308  1.1  riastrad 	HTOLE32(v0.16b)
    309  1.1  riastrad 	HTOLE32(v1.16b)
    310  1.1  riastrad 	HTOLE32(v2.16b)
    311  1.1  riastrad 	HTOLE32(v3.16b)
    312  1.1  riastrad 	HTOLE32(v4.16b)
    313  1.1  riastrad 	HTOLE32(v5.16b)
    314  1.1  riastrad 	HTOLE32(v6.16b)
    315  1.1  riastrad 	HTOLE32(v7.16b)
    316  1.1  riastrad 	HTOLE32(v8.16b)
    317  1.1  riastrad 	HTOLE32(v9.16b)
    318  1.1  riastrad 	HTOLE32(v10.16b)
    319  1.1  riastrad 	HTOLE32(v11.16b)
    320  1.1  riastrad 	HTOLE32(v12.16b)
    321  1.1  riastrad 	HTOLE32(v13.16b)
    322  1.1  riastrad 	HTOLE32(v14.16b)
    323  1.1  riastrad 	HTOLE32(v15.16b)
    324  1.1  riastrad 
    325  1.1  riastrad 	mov	v16.16b, v0.16b
    326  1.1  riastrad 	mov	v17.16b, v1.16b
    327  1.1  riastrad 	mov	v18.16b, v2.16b
    328  1.1  riastrad 	mov	v19.16b, v3.16b
    329  1.1  riastrad 	mov	v20.16b, v4.16b
    330  1.1  riastrad 	mov	v21.16b, v5.16b
    331  1.1  riastrad 	mov	v22.16b, v6.16b
    332  1.1  riastrad 	mov	v23.16b, v7.16b
    333  1.1  riastrad 	mov	v24.16b, v8.16b
    334  1.1  riastrad 	mov	v25.16b, v9.16b
    335  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    336  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    337  1.1  riastrad 	mov	w9, v11.s[0]
    338  1.1  riastrad 	mov	w10, v13.s[0]
    339  1.1  riastrad 	mov	w11, v14.s[0]
    340  1.1  riastrad 	mov	w12, v15.s[0]
    341  1.1  riastrad 
    342  1.1  riastrad 1:	subs	w6, w6, #2
    343  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    344  1.1  riastrad 	    v28,v29,v30,v31, v27)
    345  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    346  1.1  riastrad 	    v28,v29,v30,v31, v27)
    347  1.1  riastrad 	b.ne	1b
    348  1.1  riastrad 
    349  1.1  riastrad 	dup	v27.4s, w8
    350  1.1  riastrad 	dup	v28.4s, w9
    351  1.1  riastrad 	dup	v29.4s, w10
    352  1.1  riastrad 	dup	v30.4s, w11
    353  1.1  riastrad 	dup	v31.4s, w12
    354  1.1  riastrad 
    355  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    356  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    357  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    358  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    359  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    360  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    361  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    362  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    363  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    364  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    365  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    366  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    367  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    368  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    369  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    370  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    371  1.1  riastrad 
    372  1.1  riastrad 	/*
    373  1.1  riastrad 	 * We could do these sixteen LD4-into-lane instructions instead
    374  1.1  riastrad 	 * by four LD1-into-register instructions, but we would need to
    375  1.1  riastrad 	 * permute the elements in v0-v15 to put them in the right
    376  1.1  riastrad 	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
    377  1.1  riastrad 	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
    378  1.1  riastrad 	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
    379  1.1  riastrad 	 * exceed the savings in cost from four LD1 instructions rather
    380  1.1  riastrad 	 * than sixteen LD4 instructions, even if we interleave the LD1
    381  1.1  riastrad 	 * instructions with the ZIPs.
    382  1.1  riastrad 	 */
    383  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
    384  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
    385  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
    386  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
    387  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
    388  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
    389  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
    390  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
    391  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
    392  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
    393  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
    394  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
    395  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
    396  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
    397  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
    398  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
    399  1.1  riastrad 
    400  1.1  riastrad 	LE32TOH(v0.16b)
    401  1.1  riastrad 	LE32TOH(v1.16b)
    402  1.1  riastrad 	LE32TOH(v2.16b)
    403  1.1  riastrad 	LE32TOH(v3.16b)
    404  1.1  riastrad 	LE32TOH(v4.16b)
    405  1.1  riastrad 	LE32TOH(v5.16b)
    406  1.1  riastrad 	LE32TOH(v6.16b)
    407  1.1  riastrad 	LE32TOH(v7.16b)
    408  1.1  riastrad 	LE32TOH(v8.16b)
    409  1.1  riastrad 	LE32TOH(v9.16b)
    410  1.1  riastrad 	LE32TOH(v10.16b)
    411  1.1  riastrad 	LE32TOH(v11.16b)
    412  1.1  riastrad 	LE32TOH(v12.16b)
    413  1.1  riastrad 	LE32TOH(v13.16b)
    414  1.1  riastrad 	LE32TOH(v14.16b)
    415  1.1  riastrad 	LE32TOH(v15.16b)
    416  1.1  riastrad 
    417  1.1  riastrad 	eor	v16.16b, v16.16b, v0.16b
    418  1.1  riastrad 	eor	v17.16b, v17.16b, v1.16b
    419  1.1  riastrad 	eor	v18.16b, v18.16b, v2.16b
    420  1.1  riastrad 	eor	v19.16b, v19.16b, v3.16b
    421  1.1  riastrad 	eor	v20.16b, v20.16b, v4.16b
    422  1.1  riastrad 	eor	v21.16b, v21.16b, v5.16b
    423  1.1  riastrad 	eor	v22.16b, v22.16b, v6.16b
    424  1.1  riastrad 	eor	v23.16b, v23.16b, v7.16b
    425  1.1  riastrad 	eor	v24.16b, v24.16b, v8.16b
    426  1.1  riastrad 	eor	v25.16b, v25.16b, v9.16b
    427  1.1  riastrad 	eor	v26.16b, v26.16b, v10.16b
    428  1.1  riastrad 	eor	v27.16b, v27.16b, v11.16b
    429  1.1  riastrad 	eor	v28.16b, v28.16b, v12.16b
    430  1.1  riastrad 	eor	v29.16b, v29.16b, v13.16b
    431  1.1  riastrad 	eor	v30.16b, v30.16b, v14.16b
    432  1.1  riastrad 	eor	v31.16b, v31.16b, v15.16b
    433  1.1  riastrad 
    434  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
    435  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
    436  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
    437  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
    438  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
    439  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
    440  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
    441  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
    442  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
    443  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
    444  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
    445  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
    446  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
    447  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
    448  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
    449  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16
    450  1.1  riastrad 
    451  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
    452  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    453  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    454  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    455  1.1  riastrad 
    456  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
    457  1.1  riastrad 	ret
    458  1.1  riastrad END(chacha_stream_xor256_neon)
    459  1.1  riastrad 
     460  1.1  riastrad 	.section .rodata	/* constant tables read by both functions */
     461  1.1  riastrad 	.p2align 4		/* 2^4 = 16-byte alignment */
     462  1.1  riastrad 
     463  1.1  riastrad 	.type	v0123,@object
     464  1.1  riastrad v0123:
     465  1.1  riastrad 	.long	0, 1, 2, 3	/* per-lane offsets added to blkno */
     466  1.1  riastrad END(v0123)
     467  1.1  riastrad 
     468  1.1  riastrad 	/*
     469  1.1  riastrad 	 * Must be immediately after v0123 -- we load them in a single
     470  1.1  riastrad 	 * ld1 instruction.
     471  1.1  riastrad 	 */
     472  1.1  riastrad 	.type	rot8,@object
     473  1.1  riastrad rot8:
     474  1.1  riastrad 	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f	/* TBL indices used by STEP12 for <<< 8 */
     475  1.1  riastrad END(rot8)
    476