/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
28 1.1 riastrad
#include <machine/asm.h>

RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")

	.fpu	neon

/*
 * One ChaCha round, split into pieces so the quarterrounds on the
 * four independent rows/diagonals can be interleaved for pipeline
 * efficiency, with spills to cope with the shortage of q registers.
 * Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented as follows:
 *	<<< 16	VREV32.16 (swap 16-bit halves of each 32-bit lane)
 *	<<< 12	VSHL/VSRI (shift left; shift right and insert)
 *	<<< 8	VTBL (byte permutation; rot8 table below, loaded via r7)
 *	<<< 7	VSHL/VSRI
 */
55 1.1 riastrad
/*
 * ROUNDLD -- reload the pair of c registers that the preceding ROUND
 * spilled to the 32-byte aligned scratch area at fp.  Split out from
 * ROUND so the load can be scheduled across the loop back-edge.
 */
.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
	vld1.32	{\c2-\c3}, [fp, :256]
.endm
59 1.1 riastrad
/*
 * ROUND -- four parallel ChaCha quarterrounds on the rows or
 * diagonals (a0,b0,c0,d0)..(a3,b3,c3,d3), operating on four
 * interleaved 16-word states at once.  c0l is a 64-bit scratch
 * d register (low half of c0) used to hold the rot8 table; d0l/d0h..
 * d3l/d3h are the d-register halves of d0..d3 for VTBL.  Uses the
 * scratch area at fp to spill first c0/c1, then c2/c3; the caller
 * (or a following ROUNDLD) must reload c2/c3.  Clobbers r7-relative
 * nothing; reads the rot8 table through r7.
 */
.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
	/* a += b; d ^= a; d <<<= 16 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vrev32.16	\d0, \d0	/* <<< 16 */
	vrev32.16	\d1, \d1
	vrev32.16	\d2, \d2
	vrev32.16	\d3, \d3

	/* c += d; b ^= c; b <<<= 12 */
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3

	vst1.32	{\c0-\c1}, [fp, :256]	/* spill c0/c1 to use as temps */

	veor	\c0, \b0, \c0
	veor	\c1, \b1, \c1
	vshl.u32	\b0, \c0, #12
	vshl.u32	\b1, \c1, #12
	vsri.u32	\b0, \c0, #(32 - 12)
	vsri.u32	\b1, \c1, #(32 - 12)

	veor	\c0, \b2, \c2
	veor	\c1, \b3, \c3
	vshl.u32	\b2, \c0, #12
	vshl.u32	\b3, \c1, #12
	vsri.u32	\b2, \c0, #(32 - 12)
	vsri.u32	\b3, \c1, #(32 - 12)

	vld1.8	{\c0l}, [r7, :64]	/* load rot8 table */

	/* a += b; d ^= a; d <<<= 8 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor	\d0, \d0, \a0
	veor	\d1, \d1, \a1
	veor	\d2, \d2, \a2
	veor	\d3, \d3, \a3

	vtbl.8	\d0l, {\d0l}, \c0l	/* <<< 8, one 64-bit half at a time */
	vtbl.8	\d0h, {\d0h}, \c0l
	vtbl.8	\d1l, {\d1l}, \c0l
	vtbl.8	\d1h, {\d1h}, \c0l
	vtbl.8	\d2l, {\d2l}, \c0l
	vtbl.8	\d2h, {\d2h}, \c0l
	vtbl.8	\d3l, {\d3l}, \c0l
	vtbl.8	\d3h, {\d3h}, \c0l

	vld1.32	{\c0-\c1}, [fp, :256]	/* restore c0/c1 */

	/* c += d; b ^= c; b <<<= 7 */
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1

	vst1.32	{\c2-\c3}, [fp, :256]	/* spill c2/c3 to use as temps */

	veor	\c2, \b2, \c2
	veor	\c3, \b3, \c3
	vshl.u32	\b2, \c2, #7
	vshl.u32	\b3, \c3, #7
	vsri.u32	\b2, \c2, #(32 - 7)
	vsri.u32	\b3, \c3, #(32 - 7)

	veor	\c2, \b0, \c0
	veor	\c3, \b1, \c1
	vshl.u32	\b0, \c2, #7
	vshl.u32	\b1, \c3, #7
	vsri.u32	\b0, \c2, #(32 - 7)
	vsri.u32	\b1, \c3, #(32 - 7)
.endm
145 1.1 riastrad
/* Byte-order fixups: no-ops on little-endian, lane byte-swaps on big. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	vrev32.8	x, x
#define	LE32TOH(x)	vrev32.8	x, x
#endif

	.text
	.p2align 2
.Lconstants_addr:
	/* PC-relative offset to .Lconstants, for PIC-safe address formation */
	.long	.Lconstants - .
158 1.1 riastrad
/*
 * chacha_stream256_neon(uint8_t s[256]@r0,
 *     uint32_t blkno@r1,
 *     const uint8_t nonce[12]@r2,
 *     const uint8_t key[32]@r3,
 *     const uint8_t const[16]@sp[0],
 *     unsigned nr@sp[4])
 *
 *	Generate four consecutive 64-byte ChaCha blocks (counter
 *	blkno..blkno+3) into s, computed four-way parallel with the
 *	states transposed across q0-q15.
 */
ENTRY(chacha_stream256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := 32-byte-aligned scratch for two q-register spills */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters (96 = 8 GPRs + 8 d-registers saved above) */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r4]	/* q12 := constant */
	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3); r7 -> rot8 */

	/* broadcast each state word across a whole q register */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r1	/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32	q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6	/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	/* main loop: one column round + one diagonal round per iteration */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	r5, r5, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  We have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again */
	vdup.32	q8, r1	/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r4]	/* q9 := constant */
	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the feed-forward: constant into row 0 of each block */
	vadd.u32	q0, q0, q9
	vadd.u32	q4, q4, q9
	vadd.u32	q2, q2, q9
	vadd.u32	q3, q3, q9

	/* key[0:16) into row 1 of each block */
	vadd.u32	q1, q1, q8
	vadd.u32	q5, q5, q8
	vadd.u32	q6, q6, q8
	vadd.u32	q7, q7, q8

	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q3)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q6)
	LE32TOH(q7)

	vst1.32	{q0-q1}, [r0]!
	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transpose dance for the bottom half of the states */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	/* key[16:32) into row 2 of each block */
	vadd.u32	q8, q8, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	/* (counter, nonce) into row 3 of each block */
	vadd.u32	q12, q12, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q15, q15, q1

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q11)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q14)
	LE32TOH(q15)

	/* prepare to zero temporary space on stack */
	vmov.i32	q0, #0
	vmov.i32	q1, #0

	/* write the blocks out in memory order (q0-q1 already stored) */
	/* vst1.32	{q0}, [r0]! */
	/* vst1.32	{q1}, [r0]! */	/* (was q4 before vswp) */
	vst1.32	{q8}, [r0]!
	vst1.32	{q12}, [r0]!
	vst1.32	{q2}, [r0]!
	vst1.32	{q6}, [r0]!
	vst1.32	{q10}, [r0]!
	vst1.32	{q14}, [r0]!
	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
	vst1.32	{q5}, [r0]!
	vst1.32	{q9}, [r0]!
	vst1.32	{q13}, [r0]!
	vst1.32	{q3}, [r0]!
	vst1.32	{q7}, [r0]!
	vst1.32	{q11}, [r0]!
	vst1.32	{q15}, [r0]

	/* zero temporary space on the stack (don't leak key material) */
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream256_neon)
400 1.1 riastrad
/*
 * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
 *     uint32_t blkno@r2,
 *     const uint8_t nonce[12]@r3,
 *     const uint8_t key[32]@sp[0],
 *     const uint8_t const[16]@sp[4],
 *     unsigned nr@sp[8])
 *
 *	Same four-block keystream generation as chacha_stream256_neon,
 *	but XOR it into the 256 plaintext bytes at p, writing the
 *	ciphertext to s.
 */
ENTRY(chacha_stream_xor256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := 32-byte-aligned scratch for two q-register spills */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters (96 = 8 GPRs + 8 d-registers saved above) */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r5]	/* q12 := constant */
	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3); r7 -> rot8 */

	/* broadcast each state word across a whole q register */
	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r2	/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32	q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6	/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	/* main loop: one column round + one diagonal round per iteration */
	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	ip, ip, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  Now for the real fun:
	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
	 * transposed from one another, and the x[i] are in general
	 * registers and memory.  So we have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again */
	vdup.32	q8, r2	/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r5]	/* q9 := constant */
	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the feed-forward: constant into row 0 of each block */
	vadd.u32	q0, q0, q9
	vadd.u32	q4, q4, q9
	vadd.u32	q2, q2, q9
	vadd.u32	q3, q3, q9

	/* key[0:16) into row 1 of each block */
	vadd.u32	q1, q1, q8
	vadd.u32	q5, q5, q8
	vadd.u32	q6, q6, q8
	vadd.u32	q7, q7, q8

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q6)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q3)
	LE32TOH(q7)

	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */

	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	/* same transpose dance for the bottom half of the states */
	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vswp	q9, q12	/* free up q9 earlier for consecutive q8-q9 */

	/* key[16:32) into row 2 of each block */
	vadd.u32	q8, q8, q0
	vadd.u32	q12, q12, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	/* (counter, nonce) into row 3 of each block */
	vadd.u32	q9, q9, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q15, q15, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q14)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q11)
	LE32TOH(q15)

	/* interleave plaintext loads, XORs, and ciphertext stores */
	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
	veor	q6, q6, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
	vst1.32	{q2}, [r0]!	/* store ciphertext bytes [64:80) */

	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
	veor	q14, q14, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
	vst1.32	{q6}, [r0]!	/* store ciphertext bytes [80:96) */

	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
	veor	q5, q5, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
	vst1.32	{q10}, [r0]!	/* store ciphertext bytes [96:112) */

	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
	veor	q13, q13, q1

	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
	vst1.32	{q14}, [r0]!	/* store ciphertext bytes [112:128) */

	veor	q8, q3, q8	/* compute ciphertext bytes [192:224) */
	veor	q9, q7, q9

	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */

	veor	q0, q11, q0	/* compute ciphertext bytes [224:256) */
	veor	q1, q15, q1

	vst1.32	{q8-q9}, [r0]!	/* store ciphertext bytes [192:224) */
	vst1.32	{q0-q1}, [r0]	/* store ciphertext bytes [224:256) */

	/* zero temporary space on the stack (don't leak key material) */
	vmov.i32	q0, #0
	vmov.i32	q1, #0
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream_xor256_neon)
679 1.1 riastrad
680 1.1 riastrad .section .rodata
681 1.1 riastrad .p2align 4
682 1.1 riastrad .Lconstants:
683 1.1 riastrad
684 1.1 riastrad .type v0123,%object
685 1.1 riastrad v0123:
686 1.1 riastrad .long 0, 1, 2, 3
687 1.1 riastrad END(v0123)
688 1.1 riastrad
689 1.1 riastrad .type rot8,%object
690 1.1 riastrad rot8:
691 1.1 riastrad .long 0x02010003, 0x06050407
692 1.1 riastrad END(rot8)
693