Home | History | Annotate | Line # | Download | only in arm
chacha_neon_64.S revision 1.3
      1  1.3  riastrad /*	$NetBSD: chacha_neon_64.S,v 1.3 2020/07/27 20:53:23 riastradh Exp $	*/
      2  1.1  riastrad 
      3  1.1  riastrad /*-
      4  1.1  riastrad  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  1.1  riastrad  * All rights reserved.
      6  1.1  riastrad  *
      7  1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8  1.1  riastrad  * modification, are permitted provided that the following conditions
      9  1.1  riastrad  * are met:
     10  1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11  1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12  1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14  1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15  1.1  riastrad  *
     16  1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1  riastrad  */
     28  1.1  riastrad 
     29  1.2  riastrad #include <aarch64/asm.h>
     30  1.1  riastrad /* ROUND: emit STEP0..STEP19 in order, each across four quarterrounds; args pass positionally to STEP, whose parameter names give their roles. */
     31  1.1  riastrad #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
     32  1.1  riastrad STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     33  1.1  riastrad STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     34  1.1  riastrad STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     35  1.1  riastrad STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     36  1.1  riastrad STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     37  1.1  riastrad STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     38  1.1  riastrad STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     39  1.1  riastrad STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     40  1.1  riastrad STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     41  1.1  riastrad STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
     42  1.1  riastrad STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     43  1.1  riastrad STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     44  1.1  riastrad STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     45  1.1  riastrad STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     46  1.1  riastrad STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     47  1.1  riastrad STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     48  1.1  riastrad STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     49  1.1  riastrad STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     50  1.1  riastrad STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     51  1.1  riastrad STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
     52  1.1  riastrad /* end ROUND */
     53  1.1  riastrad /* STEP: apply one quarterround step f to four (a,b,c,d) register groups, each with its own temporary t; r is shared by all four. */
     54  1.1  riastrad #define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
     55  1.1  riastrad 	f(a0,b0,c0,d0, t0, r);						      \
     56  1.1  riastrad 	f(a1,b1,c1,d1, t1, r);						      \
     57  1.1  riastrad 	f(a2,b2,c2,d2, t2, r);						      \
     58  1.1  riastrad 	f(a3,b3,c3,d3, t3, r);						      \
     59  1.1  riastrad 	/* end of STEP */
     60  1.1  riastrad 
     61  1.1  riastrad /*
     62  1.1  riastrad  * Each step of the ChaCha quarterround, split up so we can interleave
     63  1.1  riastrad  * the quarterrounds on independent columns/diagonals to maximize
     64  1.1  riastrad  * pipeline efficiency.  Reference:
     65  1.1  riastrad  *
     66  1.1  riastrad  *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
     67  1.1  riastrad  *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
     68  1.1  riastrad  *	https://cr.yp.to/papers.html#chacha
     69  1.1  riastrad  *
     70  1.1  riastrad  *	a += b; d ^= a; d <<<= 16;	(STEP0-STEP4)
     71  1.1  riastrad  *	c += d; b ^= c; b <<<= 12;	(STEP5-STEP9)
     72  1.1  riastrad  *	a += b; d ^= a; d <<<= 8;	(STEP10-STEP14)
     73  1.1  riastrad  *	c += d; b ^= c; b <<<= 7;	(STEP15-STEP19)
     74  1.1  riastrad  *
     75  1.1  riastrad  * The rotations are implemented with (#if 0 branches: SHL/USHR/ORR):
     76  1.1  riastrad  *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
     77  1.1  riastrad  *	<<< 12		EOR into t, then SHL/SRI (shift left, shift right and insert)
     78  1.1  riastrad  *	<<< 8		TBL (general permutation; rot8 below stored in r)
     79  1.1  riastrad  *	<<< 7		EOR into t, then SHL/SRI
     80  1.1  riastrad  */
     81  1.1  riastrad #define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
     82  1.1  riastrad #define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
     83  1.1  riastrad #if 0
     84  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
     85  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
     86  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
     87  1.1  riastrad #else
     88  1.1  riastrad #define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
     89  1.1  riastrad #define	STEP3(a,b,c,d, t, r)	/* nothing */
     90  1.1  riastrad #define	STEP4(a,b,c,d, t, r)	/* nothing */
     91  1.1  riastrad #endif
     92  1.1  riastrad /* STEP5-STEP9: c += d; b ^= c; b <<<= 12 (the XOR result lands in t, the rotated value in b) */
     93  1.1  riastrad #define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
     94  1.1  riastrad #if 0
     95  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
     96  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
     97  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
     98  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
     99  1.1  riastrad #else
    100  1.1  riastrad #define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
    101  1.1  riastrad #define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
    102  1.1  riastrad #define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
    103  1.1  riastrad #define	STEP9(a,b,c,d, t, r)	/* nothing */
    104  1.1  riastrad #endif
    105  1.1  riastrad /* STEP10-STEP14: a += b; d ^= a; d <<<= 8 (r must hold the rot8 byte-index table) */
    106  1.1  riastrad #define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
    107  1.1  riastrad #define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
    108  1.1  riastrad #if 0
    109  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
    110  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
    111  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
    112  1.1  riastrad #else
    113  1.1  riastrad #define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
    114  1.1  riastrad #define	STEP13(a,b,c,d, t, r)	/* nothing */
    115  1.1  riastrad #define	STEP14(a,b,c,d, t, r)	/* nothing */
    116  1.1  riastrad #endif
    117  1.1  riastrad /* STEP15-STEP19: c += d; b ^= c; b <<<= 7 (the XOR result lands in t, the rotated value in b) */
    118  1.1  riastrad #define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
    119  1.1  riastrad #if 0
    120  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
    121  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
    122  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
    123  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
    124  1.1  riastrad #else
    125  1.1  riastrad #define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
    126  1.1  riastrad #define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
    127  1.1  riastrad #define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
    128  1.1  riastrad #define	STEP19(a,b,c,d, t, r)	/* nothing */
    129  1.1  riastrad #endif
    130  1.1  riastrad 
    131  1.1  riastrad #if _BYTE_ORDER == _LITTLE_ENDIAN
    132  1.1  riastrad #define	HTOLE32(x)
    133  1.1  riastrad #define	LE32TOH(x)
    134  1.1  riastrad #elif _BYTE_ORDER == _BIG_ENDIAN
    135  1.1  riastrad #define	HTOLE32(x)	rev32	x, x
    136  1.1  riastrad #define	LE32TOH(x)	rev32	x, x
    137  1.1  riastrad #endif
    138  1.1  riastrad 
    139  1.1  riastrad /*
    140  1.1  riastrad  * chacha_stream256_neon(uint8_t s[256]@x0,
    141  1.1  riastrad  *     uint32_t blkno@w1,
    142  1.1  riastrad  *     const uint8_t nonce[12]@x2,
    143  1.1  riastrad  *     const uint8_t key[12]@x3,
    144  1.1  riastrad  *     const uint8_t const[16]@x4,
    145  1.1  riastrad  *     unsigned nr@w5)
    146  1.1  riastrad  */
    147  1.1  riastrad ENTRY(chacha_stream256_neon)
    148  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
    149  1.1  riastrad 	mov	fp, sp
    150  1.1  riastrad 
    151  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
    152  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    153  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    154  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    155  1.1  riastrad 
    156  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    157  1.1  riastrad 	mov	x10, x4		/* r10 := c */
    158  1.1  riastrad 	mov	x11, x3		/* r11 := k */
    159  1.1  riastrad 	add	x12, x3, #16	/* r12 := k+4 */
    160  1.1  riastrad 	mov	x13, x2		/* r13 := nonce */
    161  1.1  riastrad 
    162  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    163  1.1  riastrad 	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
    164  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    165  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    166  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    167  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    168  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    169  1.1  riastrad 
    170  1.1  riastrad 	HTOLE32(v0.16b)
    171  1.1  riastrad 	HTOLE32(v1.16b)
    172  1.1  riastrad 	HTOLE32(v2.16b)
    173  1.1  riastrad 	HTOLE32(v3.16b)
    174  1.1  riastrad 	HTOLE32(v4.16b)
    175  1.1  riastrad 	HTOLE32(v5.16b)
    176  1.1  riastrad 	HTOLE32(v6.16b)
    177  1.1  riastrad 	HTOLE32(v7.16b)
    178  1.1  riastrad 	HTOLE32(v8.16b)
    179  1.1  riastrad 	HTOLE32(v9.16b)
    180  1.1  riastrad 	HTOLE32(v10.16b)
    181  1.1  riastrad 	HTOLE32(v11.16b)
    182  1.1  riastrad 	HTOLE32(v12.16b)
    183  1.1  riastrad 	HTOLE32(v13.16b)
    184  1.1  riastrad 	HTOLE32(v14.16b)
    185  1.1  riastrad 	HTOLE32(v15.16b)
    186  1.1  riastrad 
    187  1.1  riastrad 	mov	v16.16b, v0.16b
    188  1.1  riastrad 	mov	v17.16b, v1.16b
    189  1.1  riastrad 	mov	v18.16b, v2.16b
    190  1.1  riastrad 	mov	v19.16b, v3.16b
    191  1.1  riastrad 	mov	v20.16b, v4.16b
    192  1.1  riastrad 	mov	v21.16b, v5.16b
    193  1.1  riastrad 	mov	v22.16b, v6.16b
    194  1.1  riastrad 	mov	v23.16b, v7.16b
    195  1.1  riastrad 	mov	v24.16b, v8.16b
    196  1.1  riastrad 	mov	v25.16b, v9.16b
    197  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    198  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    199  1.1  riastrad 	mov	w9, v11.s[0]
    200  1.1  riastrad 	mov	w10, v13.s[0]
    201  1.1  riastrad 	mov	w11, v14.s[0]
    202  1.1  riastrad 	mov	w12, v15.s[0]
    203  1.1  riastrad 
    204  1.3  riastrad 	_ALIGN_TEXT
    205  1.1  riastrad 1:	subs	w5, w5, #2
    206  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    207  1.1  riastrad 	    v28,v29,v30,v31, v27)
    208  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    209  1.1  riastrad 	    v28,v29,v30,v31, v27)
    210  1.1  riastrad 	b.ne	1b
    211  1.1  riastrad 
    212  1.1  riastrad 	dup	v27.4s, w8
    213  1.1  riastrad 	dup	v28.4s, w9
    214  1.1  riastrad 	dup	v29.4s, w10
    215  1.1  riastrad 	dup	v30.4s, w11
    216  1.1  riastrad 	dup	v31.4s, w12
    217  1.1  riastrad 
    218  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    219  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    220  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    221  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    222  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    223  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    224  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    225  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    226  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    227  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    228  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    229  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    230  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    231  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    232  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    233  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    234  1.1  riastrad 
    235  1.1  riastrad 	LE32TOH(v0.16b)
    236  1.1  riastrad 	LE32TOH(v1.16b)
    237  1.1  riastrad 	LE32TOH(v2.16b)
    238  1.1  riastrad 	LE32TOH(v3.16b)
    239  1.1  riastrad 	LE32TOH(v4.16b)
    240  1.1  riastrad 	LE32TOH(v5.16b)
    241  1.1  riastrad 	LE32TOH(v6.16b)
    242  1.1  riastrad 	LE32TOH(v7.16b)
    243  1.1  riastrad 	LE32TOH(v8.16b)
    244  1.1  riastrad 	LE32TOH(v9.16b)
    245  1.1  riastrad 	LE32TOH(v10.16b)
    246  1.1  riastrad 	LE32TOH(v11.16b)
    247  1.1  riastrad 	LE32TOH(v12.16b)
    248  1.1  riastrad 	LE32TOH(v13.16b)
    249  1.1  riastrad 	LE32TOH(v14.16b)
    250  1.1  riastrad 	LE32TOH(v15.16b)
    251  1.1  riastrad 
    252  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
    253  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
    254  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
    255  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
    256  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
    257  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
    258  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
    259  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
    260  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
    261  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
    262  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
    263  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
    264  1.1  riastrad 	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
    265  1.1  riastrad 	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
    266  1.1  riastrad 	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
    267  1.1  riastrad 	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16
    268  1.1  riastrad 
    269  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
    270  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    271  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    272  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    273  1.1  riastrad 
    274  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
    275  1.1  riastrad 	ret
    276  1.1  riastrad END(chacha_stream256_neon)
    277  1.1  riastrad 
    278  1.1  riastrad /*
    279  1.1  riastrad  * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
    280  1.1  riastrad  *     uint32_t blkno@w2,
    281  1.1  riastrad  *     const uint8_t nonce[12]@x3,
    282  1.1  riastrad  *     const uint8_t key[32]@x4,
    283  1.1  riastrad  *     const uint8_t const[16]@x5,
    284  1.1  riastrad  *     unsigned nr@w6)
    285  1.1  riastrad  */
    286  1.1  riastrad ENTRY(chacha_stream_xor256_neon)
    287  1.1  riastrad 	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
    288  1.1  riastrad 	mov	fp, sp
    289  1.1  riastrad 
    290  1.1  riastrad 	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
    291  1.1  riastrad 	stp	d10, d11, [sp, #0x20]
    292  1.1  riastrad 	stp	d12, d13, [sp, #0x30]
    293  1.1  riastrad 	stp	d14, d15, [sp, #0x40]
    294  1.1  riastrad 
    295  1.1  riastrad 	adrl	x9, v0123	/* x9 := &v0123 */
    296  1.1  riastrad 	mov	x10, x5		/* r10 := c */
    297  1.1  riastrad 	mov	x11, x4		/* r11 := k */
    298  1.1  riastrad 	add	x12, x4, #16	/* r12 := k+4 */
    299  1.1  riastrad 	mov	x13, x3		/* r13 := nonce */
    300  1.1  riastrad 
    301  1.1  riastrad 	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
    302  1.1  riastrad 	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
    303  1.1  riastrad 	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
    304  1.1  riastrad 	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
    305  1.1  riastrad 	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
    306  1.1  riastrad 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
    307  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
    308  1.1  riastrad 
    309  1.1  riastrad 	HTOLE32(v0.16b)
    310  1.1  riastrad 	HTOLE32(v1.16b)
    311  1.1  riastrad 	HTOLE32(v2.16b)
    312  1.1  riastrad 	HTOLE32(v3.16b)
    313  1.1  riastrad 	HTOLE32(v4.16b)
    314  1.1  riastrad 	HTOLE32(v5.16b)
    315  1.1  riastrad 	HTOLE32(v6.16b)
    316  1.1  riastrad 	HTOLE32(v7.16b)
    317  1.1  riastrad 	HTOLE32(v8.16b)
    318  1.1  riastrad 	HTOLE32(v9.16b)
    319  1.1  riastrad 	HTOLE32(v10.16b)
    320  1.1  riastrad 	HTOLE32(v11.16b)
    321  1.1  riastrad 	HTOLE32(v12.16b)
    322  1.1  riastrad 	HTOLE32(v13.16b)
    323  1.1  riastrad 	HTOLE32(v14.16b)
    324  1.1  riastrad 	HTOLE32(v15.16b)
    325  1.1  riastrad 
    326  1.1  riastrad 	mov	v16.16b, v0.16b
    327  1.1  riastrad 	mov	v17.16b, v1.16b
    328  1.1  riastrad 	mov	v18.16b, v2.16b
    329  1.1  riastrad 	mov	v19.16b, v3.16b
    330  1.1  riastrad 	mov	v20.16b, v4.16b
    331  1.1  riastrad 	mov	v21.16b, v5.16b
    332  1.1  riastrad 	mov	v22.16b, v6.16b
    333  1.1  riastrad 	mov	v23.16b, v7.16b
    334  1.1  riastrad 	mov	v24.16b, v8.16b
    335  1.1  riastrad 	mov	v25.16b, v9.16b
    336  1.1  riastrad 	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
    337  1.1  riastrad 	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
    338  1.1  riastrad 	mov	w9, v11.s[0]
    339  1.1  riastrad 	mov	w10, v13.s[0]
    340  1.1  riastrad 	mov	w11, v14.s[0]
    341  1.1  riastrad 	mov	w12, v15.s[0]
    342  1.1  riastrad 
    343  1.3  riastrad         _ALIGN_TEXT
    344  1.1  riastrad 1:	subs	w6, w6, #2
    345  1.1  riastrad 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
    346  1.1  riastrad 	    v28,v29,v30,v31, v27)
    347  1.1  riastrad 	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
    348  1.1  riastrad 	    v28,v29,v30,v31, v27)
    349  1.1  riastrad 	b.ne	1b
    350  1.1  riastrad 
    351  1.1  riastrad 	dup	v27.4s, w8
    352  1.1  riastrad 	dup	v28.4s, w9
    353  1.1  riastrad 	dup	v29.4s, w10
    354  1.1  riastrad 	dup	v30.4s, w11
    355  1.1  riastrad 	dup	v31.4s, w12
    356  1.1  riastrad 
    357  1.1  riastrad 	add	v0.4s, v0.4s, v16.4s
    358  1.1  riastrad 	add	v1.4s, v1.4s, v17.4s
    359  1.1  riastrad 	add	v2.4s, v2.4s, v18.4s
    360  1.1  riastrad 	add	v3.4s, v3.4s, v19.4s
    361  1.1  riastrad 	add	v4.4s, v4.4s, v20.4s
    362  1.1  riastrad 	add	v5.4s, v5.4s, v21.4s
    363  1.1  riastrad 	add	v6.4s, v6.4s, v22.4s
    364  1.1  riastrad 	add	v7.4s, v7.4s, v23.4s
    365  1.1  riastrad 	add	v8.4s, v8.4s, v24.4s
    366  1.1  riastrad 	add	v9.4s, v9.4s, v25.4s
    367  1.1  riastrad 	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
    368  1.1  riastrad 	add	v11.4s, v11.4s, v28.4s
    369  1.1  riastrad 	add	v12.4s, v12.4s, v26.4s
    370  1.1  riastrad 	add	v13.4s, v13.4s, v29.4s
    371  1.1  riastrad 	add	v14.4s, v14.4s, v30.4s
    372  1.1  riastrad 	add	v15.4s, v15.4s, v31.4s
    373  1.1  riastrad 
    374  1.1  riastrad 	/*
    375  1.1  riastrad 	 * We could do these sixteen LD4-into-lane instructions instead
    376  1.1  riastrad 	 * by four LD1-into-register instructions, but we would need to
    377  1.1  riastrad 	 * permute the elements in v0-v15 to put them in the right
    378  1.1  riastrad 	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
    379  1.1  riastrad 	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
    380  1.1  riastrad 	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
    381  1.1  riastrad 	 * exceed the savings in cost from four LD1 instructions rather
    382  1.1  riastrad 	 * than sixteen LD4 instructions, even if we interleave the LD1
    383  1.1  riastrad 	 * instructions with the ZIPs.
    384  1.1  riastrad 	 */
    385  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
    386  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
    387  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
    388  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
    389  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
    390  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
    391  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
    392  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
    393  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
    394  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
    395  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
    396  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
    397  1.1  riastrad 	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
    398  1.1  riastrad 	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
    399  1.1  riastrad 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
    400  1.1  riastrad 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
    401  1.1  riastrad 
    402  1.1  riastrad 	LE32TOH(v0.16b)
    403  1.1  riastrad 	LE32TOH(v1.16b)
    404  1.1  riastrad 	LE32TOH(v2.16b)
    405  1.1  riastrad 	LE32TOH(v3.16b)
    406  1.1  riastrad 	LE32TOH(v4.16b)
    407  1.1  riastrad 	LE32TOH(v5.16b)
    408  1.1  riastrad 	LE32TOH(v6.16b)
    409  1.1  riastrad 	LE32TOH(v7.16b)
    410  1.1  riastrad 	LE32TOH(v8.16b)
    411  1.1  riastrad 	LE32TOH(v9.16b)
    412  1.1  riastrad 	LE32TOH(v10.16b)
    413  1.1  riastrad 	LE32TOH(v11.16b)
    414  1.1  riastrad 	LE32TOH(v12.16b)
    415  1.1  riastrad 	LE32TOH(v13.16b)
    416  1.1  riastrad 	LE32TOH(v14.16b)
    417  1.1  riastrad 	LE32TOH(v15.16b)
    418  1.1  riastrad 
    419  1.1  riastrad 	eor	v16.16b, v16.16b, v0.16b
    420  1.1  riastrad 	eor	v17.16b, v17.16b, v1.16b
    421  1.1  riastrad 	eor	v18.16b, v18.16b, v2.16b
    422  1.1  riastrad 	eor	v19.16b, v19.16b, v3.16b
    423  1.1  riastrad 	eor	v20.16b, v20.16b, v4.16b
    424  1.1  riastrad 	eor	v21.16b, v21.16b, v5.16b
    425  1.1  riastrad 	eor	v22.16b, v22.16b, v6.16b
    426  1.1  riastrad 	eor	v23.16b, v23.16b, v7.16b
    427  1.1  riastrad 	eor	v24.16b, v24.16b, v8.16b
    428  1.1  riastrad 	eor	v25.16b, v25.16b, v9.16b
    429  1.1  riastrad 	eor	v26.16b, v26.16b, v10.16b
    430  1.1  riastrad 	eor	v27.16b, v27.16b, v11.16b
    431  1.1  riastrad 	eor	v28.16b, v28.16b, v12.16b
    432  1.1  riastrad 	eor	v29.16b, v29.16b, v13.16b
    433  1.1  riastrad 	eor	v30.16b, v30.16b, v14.16b
    434  1.1  riastrad 	eor	v31.16b, v31.16b, v15.16b
    435  1.1  riastrad 
    436  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
    437  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
    438  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
    439  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
    440  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
    441  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
    442  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
    443  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
    444  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
    445  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
    446  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
    447  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
    448  1.1  riastrad 	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
    449  1.1  riastrad 	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
    450  1.1  riastrad 	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
    451  1.1  riastrad 	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16
    452  1.1  riastrad 
    453  1.1  riastrad 	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
    454  1.1  riastrad 	ldp	d10, d11, [sp, #0x20]
    455  1.1  riastrad 	ldp	d12, d13, [sp, #0x30]
    456  1.1  riastrad 	ldp	d14, d15, [sp, #0x40]
    457  1.1  riastrad 
    458  1.1  riastrad 	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
    459  1.1  riastrad 	ret
    460  1.1  riastrad END(chacha_stream_xor256_neon)
    461  1.1  riastrad 
    462  1.1  riastrad 	.section .rodata
    463  1.1  riastrad 	.p2align 4
    464  1.1  riastrad 	/* Per-lane offsets added to the broadcast blkno to give four consecutive block counters. */
    465  1.1  riastrad 	.type	v0123,@object
    466  1.1  riastrad v0123:
    467  1.1  riastrad 	.long	0, 1, 2, 3
    468  1.1  riastrad END(v0123)
    469  1.1  riastrad 
    470  1.1  riastrad 	/*
    471  1.1  riastrad 	 * TBL byte indices used for the <<< 8 rotation of each 32-bit lane.
    472  1.1  riastrad 	 * Must be immediately after v0123 -- both are loaded by a single ld1.
    473  1.1  riastrad 	 */
    474  1.1  riastrad 	.type	rot8,@object
    475  1.1  riastrad rot8:
    476  1.1  riastrad 	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
    477  1.1  riastrad END(rot8)
    478