Home | History | Annotate | Line # | Download | only in arm
      1  1.7  jakllsch /*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/
      2  1.1  riastrad 
      3  1.1  riastrad /*-
      4  1.1  riastrad  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  1.1  riastrad  * All rights reserved.
      6  1.1  riastrad  *
      7  1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8  1.1  riastrad  * modification, are permitted provided that the following conditions
      9  1.1  riastrad  * are met:
     10  1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11  1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12  1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14  1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15  1.1  riastrad  *
     16  1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1  riastrad  */
     28  1.1  riastrad 
     29  1.2  riastrad #include <aarch64/asm.h>
     30  1.1  riastrad 
     31  1.7  jakllsch RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")
     32  1.4  riastrad 
                       /*
                        * ROUND expands STEP0 through STEP19 in order.  Each step is routed
                        * through STEP, so it is issued for all four quarterrounds before
                        * the next step begins.  The sixteen state registers, the four
                        * temporaries and the table register r are forwarded to STEP
                        * unchanged and in the same order.
                        *
                        * Note on names: the parameters are purely positional.  STEP names
                        * the same positions (a0,a1,a2,a3, b0,...,d3), so the first four
                        * arguments are the four `a' registers, the next four the `b'
                        * registers, and so on -- not the (a,b,c,d) of one quarterround as
                        * the names used here might suggest.
                        */
     33  1.1  riastrad #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
     34  1.1  riastrad STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     35  1.1  riastrad STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     36  1.1  riastrad STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     37  1.1  riastrad STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     38  1.1  riastrad STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     39  1.1  riastrad STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     40  1.1  riastrad STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     41  1.1  riastrad STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     42  1.1  riastrad STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     43  1.1  riastrad STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     44  1.1  riastrad STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     45  1.1  riastrad STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     46  1.1  riastrad STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     47  1.1  riastrad STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     48  1.1  riastrad STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     49  1.1  riastrad STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     50  1.1  riastrad STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     51  1.1  riastrad STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     52  1.1  riastrad STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     53  1.1  riastrad STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     54  1.1  riastrad /* end ROUND */
     55  1.1  riastrad 
                       /*
                        * STEP(f, ...) issues the one-instruction step f four times, once
                        * per quarterround: quarterround i operates on (ai, bi, ci, di) and
                        * gets its own temporary ti, while the table register r is shared
                        * by all four.  The four instances of f touch disjoint registers
                        * apart from r.
                        */
     56  1.1  riastrad #define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
     57  1.1  riastrad 	f(a0,b0,c0,d0, t0, r);						      \
     58  1.1  riastrad 	f(a1,b1,c1,d1, t1, r);						      \
     59  1.1  riastrad 	f(a2,b2,c2,d2, t2, r);						      \
     60  1.1  riastrad 	f(a3,b3,c3,d3, t3, r);						      \
     61  1.1  riastrad 	/* end of STEP */
     62  1.1  riastrad 
     63  1.1  riastrad /*
     64  1.1  riastrad  * Each step of the ChaCha quarterround, split up so we can interleave
     65  1.1  riastrad  * the quarterrounds on independent rows/diagonals to maximize pipeline
     66  1.1  riastrad  * efficiency.  Reference:
     67  1.1  riastrad  *
     68  1.1  riastrad  *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
     69  1.1  riastrad  *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
     70  1.1  riastrad  *	https://cr.yp.to/papers.html#chacha
     71  1.1  riastrad  *
     72  1.1  riastrad  *	a += b; d ^= a; d <<<= 16;
     73  1.1  riastrad  *	c += d; b ^= c; b <<<= 12;
     74  1.1  riastrad  *	a += b; d ^= a; d <<<= 8;
     75  1.1  riastrad  *	c += d; b ^= c; b <<<= 7;
     76  1.1  riastrad  *
     77  1.1  riastrad  * The rotations are implemented with:
     78  1.1  riastrad  *	<<< 16		REV32 Vn.8h (swaps the 16-bit halves of each word)
     79  1.1  riastrad  *	<<< 12		SHL/SRI (shift left, shift right and insert)
     80  1.1  riastrad  *	<<< 8		TBL (general permutation; rot8 below stored in r)
     81  1.1  riastrad  *	<<< 7		SHL/SRI
                        *
                        * For the rotations by 12 and 7 the XOR result is written to the
                        * temporary t rather than to b, so that SHL can write t << n into b
                        * and SRI can then insert t >> (32 - n) into the low bits of b; no
                        * separate ORR is needed.
                        *
                        * The `#if 0' branches keep the plain SHL/USHR/ORR form of each
                        * rotation for reference; the `#else' branches are the ones that
                        * are assembled.  Steps that a shorter sequence makes unnecessary
                        * are defined as empty so that ROUND can always expand all twenty.
     82  1.1  riastrad  */
     83  1.1  riastrad #define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
     84  1.1  riastrad #define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
     85  1.1  riastrad #if 0
     86  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
     87  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
     88  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
     89  1.1  riastrad #else
     90  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
     91  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	/* nothing */
     92  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	/* nothing */
     93  1.1  riastrad #endif
     94  1.1  riastrad 
                       /* c += d; b ^= c; b <<<= 12 */
     95  1.1  riastrad #define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
     96  1.1  riastrad #if 0
     97  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
     98  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
     99  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
    100  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
    101  1.1  riastrad #else
    102  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
    103  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
    104  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
    105  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	/* nothing */
    106  1.1  riastrad #endif
    107  1.1  riastrad 
                       /* a += b; d ^= a; d <<<= 8 */
    108  1.1  riastrad #define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
    109  1.1  riastrad #define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
    110  1.1  riastrad #if 0
    111  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
    112  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
    113  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
    114  1.1  riastrad #else
    115  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
    116  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	/* nothing */
    117  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	/* nothing */
    118  1.1  riastrad #endif
    119  1.1  riastrad 
                       /* c += d; b ^= c; b <<<= 7 */
    120  1.1  riastrad #define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
    121  1.1  riastrad #if 0
    122  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
    123  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
    124  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
    125  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
    126  1.1  riastrad #else
    127  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
    128  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
    129  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
    130  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	/* nothing */
    131  1.1  riastrad #endif
    132  1.1  riastrad 
    133  1.7  jakllsch #if defined(__AARCH64EB__)
                       /*
                        * Byte-order conversion between little-endian 32-bit words and host
                        * order.  On big-endian builds both directions are the same REV32
                        * of the operand onto itself; the callers in this file pass vN.16b,
                        * i.e. a byte reversal within each 32-bit word.  On little-endian
                        * builds both macros expand to nothing.
                        */
    134  1.1  riastrad #define	HTOLE32(x)	rev32	x, x
    135  1.1  riastrad #define	LE32TOH(x)	rev32	x, x
    136  1.7  jakllsch #else
    137  1.7  jakllsch #define	LE32TOH(x)
    138  1.7  jakllsch #define	HTOLE32(x)
    139  1.1  riastrad #endif
    140  1.1  riastrad 
    141  1.1  riastrad /*
    142  1.1  riastrad  * chacha_stream256_neon(uint8_t s[256]@x0,
    143  1.1  riastrad  *     uint32_t blkno@w1,
    144  1.1  riastrad  *     const uint8_t nonce[12]@x2,
    145  1.5  riastrad  *     const uint8_t key[32]@x3,
    146  1.1  riastrad  *     const uint8_t const[16]@x4,
    147  1.1  riastrad  *     unsigned nr@w5)
                        *
                        *	Write 256 bytes of ChaCha keystream to s: four 64-byte blocks
                        *	computed in parallel, one per 32-bit vector lane, using block
                        *	counters blkno, blkno+1, blkno+2 and blkno+3.
                        *
                        *	Register vN holds state word N of all four blocks (lane i
                        *	belongs to block i).
                        *
                        *	nr is the number of rounds.  The loop performs two rounds per
                        *	iteration and exits only when the count reaches exactly zero,
                        *	so nr must be even and nonzero.
    148  1.1  riastrad  */
    149  1.1  riastrad ENTRY(chacha_stream256_neon)
    150  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push 0x50-byte frame: fp, lr, uint64[8] */
    151  1.1  riastrad 	mov	fp, sp
    152  1.1  riastrad 
    153  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saved d8-d15 in uint64[8] */
    154  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    155  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    156  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    157  1.1  riastrad 
    158  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    159  1.1  riastrad 	mov	x10, x4		/* x10 := const */
    160  1.1  riastrad 	mov	x11, x3		/* x11 := key */
    161  1.1  riastrad 	add	x12, x3, #16	/* x12 := key + 16 bytes */
    162  1.1  riastrad 	mov	x13, x2		/* x13 := nonce */
    163  1.1  riastrad 
                       	/*
                       	 * LD4R/LD3R load one 32-bit word per register and replicate it
                       	 * to all four lanes, so every block starts from the same
                       	 * constant, key and nonce words; only v12 differs per lane.
                       	 */
    164  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    165  1.1  riastrad 	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
    166  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    167  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    168  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    169  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    170  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    171  1.1  riastrad 
    172  1.6  riastrad 	LE32TOH(v0.16b)
    173  1.6  riastrad 	LE32TOH(v1.16b)
    174  1.6  riastrad 	LE32TOH(v2.16b)
    175  1.6  riastrad 	LE32TOH(v3.16b)
    176  1.6  riastrad 	LE32TOH(v4.16b)
    177  1.6  riastrad 	LE32TOH(v5.16b)
    178  1.6  riastrad 	LE32TOH(v6.16b)
    179  1.6  riastrad 	LE32TOH(v7.16b)
    180  1.6  riastrad 	LE32TOH(v8.16b)
    181  1.6  riastrad 	LE32TOH(v9.16b)
    182  1.6  riastrad 	LE32TOH(v10.16b)
    183  1.6  riastrad 	LE32TOH(v11.16b)
    184  1.6  riastrad 	/* LE32TOH(v12.16b) -- blkno, already host order */
    185  1.6  riastrad 	LE32TOH(v13.16b)
    186  1.6  riastrad 	LE32TOH(v14.16b)
    187  1.6  riastrad 	LE32TOH(v15.16b)
    188  1.1  riastrad 
                       	/*
                       	 * Save the initial state for the feed-forward addition after
                       	 * the rounds.  Only v16-v26 are free for this: v27 holds the
                       	 * rot8 table and v28-v31 are the temporaries during the rounds.
                       	 * v12 has a different value in each lane, so it takes the last
                       	 * free vector register (v26; v0123 is no longer needed).  The
                       	 * remaining five words have identical lanes, so lane 0 of each
                       	 * is kept in a general register and re-broadcast afterwards.
                       	 */
    189  1.1  riastrad 	mov	v16.16b, v0.16b
    190  1.1  riastrad 	mov	v17.16b, v1.16b
    191  1.1  riastrad 	mov	v18.16b, v2.16b
    192  1.1  riastrad 	mov	v19.16b, v3.16b
    193  1.1  riastrad 	mov	v20.16b, v4.16b
    194  1.1  riastrad 	mov	v21.16b, v5.16b
    195  1.1  riastrad 	mov	v22.16b, v6.16b
    196  1.1  riastrad 	mov	v23.16b, v7.16b
    197  1.1  riastrad 	mov	v24.16b, v8.16b
    198  1.1  riastrad 	mov	v25.16b, v9.16b
    199  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    200  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    201  1.1  riastrad 	mov	w9, v11.s[0]
    202  1.1  riastrad 	mov	w10, v13.s[0]
    203  1.1  riastrad 	mov	w11, v14.s[0]
    204  1.1  riastrad 	mov	w12, v15.s[0]
    205  1.1  riastrad 
                       	/*
                       	 * Two rounds per iteration.  The first ROUND works on the
                       	 * columns (v0,v4,v8,v12), (v1,v5,v9,v13), ...; the second on
                       	 * the diagonals (v0,v5,v10,v15), (v1,v6,v11,v12), ....  The
                       	 * vector instructions in ROUND do not modify the condition
                       	 * flags, so b.ne still tests the result of the subs.
                       	 */
    206  1.3  riastrad 	_ALIGN_TEXT
    207  1.1  riastrad 1:	subs	w5, w5, #2
    208  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    209  1.1  riastrad 	    v28,v29,v30,v31, v27)
    210  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    211  1.1  riastrad 	    v28,v29,v30,v31, v27)
    212  1.1  riastrad 	b.ne	1b
    213  1.1  riastrad 
                       	/* Rebuild the initial words that were parked in w8-w12. */
    214  1.1  riastrad 	dup	v27.4s, w8
    215  1.1  riastrad 	dup	v28.4s, w9
    216  1.1  riastrad 	dup	v29.4s, w10
    217  1.1  riastrad 	dup	v30.4s, w11
    218  1.1  riastrad 	dup	v31.4s, w12
    219  1.1  riastrad 
                       	/* Feed-forward: add the initial state to the permuted state. */
    220  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    221  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    222  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    223  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    224  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    225  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    226  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    227  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    228  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    229  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    230  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    231  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    232  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    233  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    234  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    235  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    236  1.1  riastrad 
    237  1.6  riastrad 	HTOLE32(v0.16b)
    238  1.6  riastrad 	HTOLE32(v1.16b)
    239  1.6  riastrad 	HTOLE32(v2.16b)
    240  1.6  riastrad 	HTOLE32(v3.16b)
    241  1.6  riastrad 	HTOLE32(v4.16b)
    242  1.6  riastrad 	HTOLE32(v5.16b)
    243  1.6  riastrad 	HTOLE32(v6.16b)
    244  1.6  riastrad 	HTOLE32(v7.16b)
    245  1.6  riastrad 	HTOLE32(v8.16b)
    246  1.6  riastrad 	HTOLE32(v9.16b)
    247  1.6  riastrad 	HTOLE32(v10.16b)
    248  1.6  riastrad 	HTOLE32(v11.16b)
    249  1.6  riastrad 	HTOLE32(v12.16b)
    250  1.6  riastrad 	HTOLE32(v13.16b)
    251  1.6  riastrad 	HTOLE32(v14.16b)
    252  1.6  riastrad 	HTOLE32(v15.16b)
    253  1.1  riastrad 
                       	/*
                       	 * Each single-lane ST4 writes lane i of four consecutive state
                       	 * words (16 bytes) and advances x0, so four stores emit one
                       	 * 64-byte block and the sixteen stores emit blocks 0-3 in order.
                       	 */
    254  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
    255  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
    256  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
    257  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
    258  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
    259  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
    260  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
    261  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
    262  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
    263  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
    264  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
    265  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
    266  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
    267  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
    268  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
    269  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16
    270  1.1  riastrad 
    271  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saved d8-d15 */
    272  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    273  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    274  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    275  1.1  riastrad 
    276  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop the 0x50-byte frame */
    277  1.1  riastrad 	ret
    278  1.1  riastrad END(chacha_stream256_neon)
    279  1.1  riastrad 
    280  1.1  riastrad /*
    281  1.1  riastrad  * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
    282  1.1  riastrad  *     uint32_t blkno@w2,
    283  1.1  riastrad  *     const uint8_t nonce[12]@x3,
    284  1.1  riastrad  *     const uint8_t key[32]@x4,
    285  1.1  riastrad  *     const uint8_t const[16]@x5,
    286  1.1  riastrad  *     unsigned nr@w6)
                        *
                        *	Compute 256 bytes of ChaCha keystream (four 64-byte blocks in
                        *	parallel, one per 32-bit vector lane, block counters blkno
                        *	through blkno+3), XOR it with the 256 bytes at p and store
                        *	the result at s.
                        *
                        *	Register vN holds state word N of all four blocks (lane i
                        *	belongs to block i).
                        *
                        *	nr is the number of rounds.  The loop performs two rounds per
                        *	iteration and exits only when the count reaches exactly zero,
                        *	so nr must be even and nonzero.
    287  1.1  riastrad  */
    288  1.1  riastrad ENTRY(chacha_stream_xor256_neon)
    289  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push 0x50-byte frame: fp, lr, uint64[8] */
    290  1.1  riastrad 	mov	fp, sp
    291  1.1  riastrad 
    292  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saved d8-d15 in uint64[8] */
    293  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    294  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    295  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    296  1.1  riastrad 
    297  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    298  1.1  riastrad 	mov	x10, x5		/* x10 := const */
    299  1.1  riastrad 	mov	x11, x4		/* x11 := key */
    300  1.1  riastrad 	add	x12, x4, #16	/* x12 := key + 16 bytes */
    301  1.1  riastrad 	mov	x13, x3		/* x13 := nonce */
    302  1.1  riastrad 
                       	/*
                       	 * LD4R/LD3R load one 32-bit word per register and replicate it
                       	 * to all four lanes, so every block starts from the same
                       	 * constant, key and nonce words; only v12 differs per lane.
                       	 */
    303  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    304  1.1  riastrad 	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
    305  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    306  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    307  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    308  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    309  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    310  1.1  riastrad 
    311  1.6  riastrad 	LE32TOH(v0.16b)
    312  1.6  riastrad 	LE32TOH(v1.16b)
    313  1.6  riastrad 	LE32TOH(v2.16b)
    314  1.6  riastrad 	LE32TOH(v3.16b)
    315  1.6  riastrad 	LE32TOH(v4.16b)
    316  1.6  riastrad 	LE32TOH(v5.16b)
    317  1.6  riastrad 	LE32TOH(v6.16b)
    318  1.6  riastrad 	LE32TOH(v7.16b)
    319  1.6  riastrad 	LE32TOH(v8.16b)
    320  1.6  riastrad 	LE32TOH(v9.16b)
    321  1.6  riastrad 	LE32TOH(v10.16b)
    322  1.6  riastrad 	LE32TOH(v11.16b)
    323  1.6  riastrad 	/* LE32TOH(v12.16b) -- blkno, already host order */
    324  1.6  riastrad 	LE32TOH(v13.16b)
    325  1.6  riastrad 	LE32TOH(v14.16b)
    326  1.6  riastrad 	LE32TOH(v15.16b)
    327  1.1  riastrad 
                       	/*
                       	 * Save the initial state for the feed-forward addition after
                       	 * the rounds.  Only v16-v26 are free for this: v27 holds the
                       	 * rot8 table and v28-v31 are the temporaries during the rounds.
                       	 * v12 has a different value in each lane, so it takes the last
                       	 * free vector register (v26; v0123 is no longer needed).  The
                       	 * remaining five words have identical lanes, so lane 0 of each
                       	 * is kept in a general register and re-broadcast afterwards.
                       	 */
    328  1.1  riastrad 	mov	v16.16b, v0.16b
    329  1.1  riastrad 	mov	v17.16b, v1.16b
    330  1.1  riastrad 	mov	v18.16b, v2.16b
    331  1.1  riastrad 	mov	v19.16b, v3.16b
    332  1.1  riastrad 	mov	v20.16b, v4.16b
    333  1.1  riastrad 	mov	v21.16b, v5.16b
    334  1.1  riastrad 	mov	v22.16b, v6.16b
    335  1.1  riastrad 	mov	v23.16b, v7.16b
    336  1.1  riastrad 	mov	v24.16b, v8.16b
    337  1.1  riastrad 	mov	v25.16b, v9.16b
    338  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    339  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    340  1.1  riastrad 	mov	w9, v11.s[0]
    341  1.1  riastrad 	mov	w10, v13.s[0]
    342  1.1  riastrad 	mov	w11, v14.s[0]
    343  1.1  riastrad 	mov	w12, v15.s[0]
    344  1.1  riastrad 
                       	/*
                       	 * Two rounds per iteration.  The first ROUND works on the
                       	 * columns (v0,v4,v8,v12), (v1,v5,v9,v13), ...; the second on
                       	 * the diagonals (v0,v5,v10,v15), (v1,v6,v11,v12), ....  The
                       	 * vector instructions in ROUND do not modify the condition
                       	 * flags, so b.ne still tests the result of the subs.
                       	 */
    345  1.3  riastrad         _ALIGN_TEXT
    346  1.1  riastrad 1:	subs	w6, w6, #2
    347  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    348  1.1  riastrad 	    v28,v29,v30,v31, v27)
    349  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    350  1.1  riastrad 	    v28,v29,v30,v31, v27)
    351  1.1  riastrad 	b.ne	1b
    352  1.1  riastrad 
                       	/* Rebuild the initial words that were parked in w8-w12. */
    353  1.1  riastrad 	dup	v27.4s, w8
    354  1.1  riastrad 	dup	v28.4s, w9
    355  1.1  riastrad 	dup	v29.4s, w10
    356  1.1  riastrad 	dup	v30.4s, w11
    357  1.1  riastrad 	dup	v31.4s, w12
    358  1.1  riastrad 
                       	/* Feed-forward: add the initial state to the permuted state. */
    359  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    360  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    361  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    362  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    363  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    364  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    365  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    366  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    367  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    368  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    369  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    370  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    371  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    372  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    373  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    374  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    375  1.1  riastrad 
    376  1.1  riastrad 	/*
                       	 * Load the input into v16-v31 (the saved initial state is no
                       	 * longer needed) in the same lane layout as the keystream:
                       	 * lane i of v(16+N) receives word N of input block i.
                       	 *
    377  1.1  riastrad 	 * We could replace these sixteen LD4-into-lane instructions
    378  1.1  riastrad 	 * with four LD1-into-register instructions, but we would need to
    379  1.1  riastrad 	 * permute the elements in v0-v15 to put them in the right
    380  1.1  riastrad 	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
    381  1.1  riastrad 	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
    382  1.1  riastrad 	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
    383  1.1  riastrad 	 * exceed the savings in cost from four LD1 instructions rather
    384  1.1  riastrad 	 * than sixteen LD4 instructions, even if we interleave the LD1
    385  1.1  riastrad 	 * instructions with the ZIPs.
    386  1.1  riastrad 	 */
    387  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
    388  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
    389  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
    390  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
    391  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
    392  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
    393  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
    394  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
    395  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
    396  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
    397  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
    398  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
    399  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
    400  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
    401  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
    402  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
    403  1.1  riastrad 
    404  1.6  riastrad 	HTOLE32(v0.16b)
    405  1.6  riastrad 	HTOLE32(v1.16b)
    406  1.6  riastrad 	HTOLE32(v2.16b)
    407  1.6  riastrad 	HTOLE32(v3.16b)
    408  1.6  riastrad 	HTOLE32(v4.16b)
    409  1.6  riastrad 	HTOLE32(v5.16b)
    410  1.6  riastrad 	HTOLE32(v6.16b)
    411  1.6  riastrad 	HTOLE32(v7.16b)
    412  1.6  riastrad 	HTOLE32(v8.16b)
    413  1.6  riastrad 	HTOLE32(v9.16b)
    414  1.6  riastrad 	HTOLE32(v10.16b)
    415  1.6  riastrad 	HTOLE32(v11.16b)
    416  1.6  riastrad 	HTOLE32(v12.16b)
    417  1.6  riastrad 	HTOLE32(v13.16b)
    418  1.6  riastrad 	HTOLE32(v14.16b)
    419  1.6  riastrad 	HTOLE32(v15.16b)
    420  1.1  riastrad 
                       	/* output := input ^ keystream, word N of all blocks at once */
    421  1.1  riastrad 	eor	v16.16b, v16.16b, v0.16b
    422  1.1  riastrad 	eor	v17.16b, v17.16b, v1.16b
    423  1.1  riastrad 	eor	v18.16b, v18.16b, v2.16b
    424  1.1  riastrad 	eor	v19.16b, v19.16b, v3.16b
    425  1.1  riastrad 	eor	v20.16b, v20.16b, v4.16b
    426  1.1  riastrad 	eor	v21.16b, v21.16b, v5.16b
    427  1.1  riastrad 	eor	v22.16b, v22.16b, v6.16b
    428  1.1  riastrad 	eor	v23.16b, v23.16b, v7.16b
    429  1.1  riastrad 	eor	v24.16b, v24.16b, v8.16b
    430  1.1  riastrad 	eor	v25.16b, v25.16b, v9.16b
    431  1.1  riastrad 	eor	v26.16b, v26.16b, v10.16b
    432  1.1  riastrad 	eor	v27.16b, v27.16b, v11.16b
    433  1.1  riastrad 	eor	v28.16b, v28.16b, v12.16b
    434  1.1  riastrad 	eor	v29.16b, v29.16b, v13.16b
    435  1.1  riastrad 	eor	v30.16b, v30.16b, v14.16b
    436  1.1  riastrad 	eor	v31.16b, v31.16b, v15.16b
    437  1.1  riastrad 
                       	/*
                       	 * Each single-lane ST4 writes lane i of four consecutive words
                       	 * (16 bytes) and advances x0, so four stores emit one 64-byte
                       	 * block and the sixteen stores emit blocks 0-3 in order.
                       	 */
    438  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
    439  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
    440  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
    441  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
    442  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
    443  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
    444  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
    445  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
    446  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
    447  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
    448  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
    449  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
    450  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
    451  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
    452  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
    453  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16
    454  1.1  riastrad 
    455  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saved d8-d15 */
    456  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    457  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    458  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    459  1.1  riastrad 
    460  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop the 0x50-byte frame */
    461  1.1  riastrad 	ret
    462  1.1  riastrad END(chacha_stream_xor256_neon)
    463  1.1  riastrad 
    464  1.1  riastrad 	.section .rodata
    465  1.1  riastrad 	.p2align 4
    466  1.1  riastrad 
                       	/*
                       	 * Per-lane block counter offsets.  Both functions above add
                       	 * this vector to the broadcast blkno so that lane i computes
                       	 * block blkno + i.
                       	 */
    467  1.1  riastrad 	.type	v0123,@object
    468  1.1  riastrad v0123:
    469  1.1  riastrad 	.long	0, 1, 2, 3
    470  1.1  riastrad END(v0123)
    471  1.1  riastrad 
    472  1.1  riastrad 	/*
                       	 * Byte index table for the TBL instruction that implements
                       	 * the rotation by 8.  Stored little-endian, the first word is
                       	 * the index bytes 3,0,1,2: output byte 0 of each 32-bit lane
                       	 * takes input byte 3 and output bytes 1-3 take input bytes
                       	 * 0-2, which rotates the lane left by 8 bits.  The other three
                       	 * words repeat the pattern for lanes 1-3.
                       	 *
    473  1.1  riastrad 	 * Must be immediately after v0123 -- we load them in a single
    474  1.1  riastrad 	 * ld1 instruction.
    475  1.1  riastrad 	 */
    476  1.1  riastrad 	.type	rot8,@object
    477  1.1  riastrad rot8:
    478  1.1  riastrad 	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
    479  1.1  riastrad END(rot8)
    480