Home | History | Annotate | Line # | Download | only in string
memcpy_neon.S revision 1.1.8.2
      1  1.1.8.2  tls /*-
      2  1.1.8.2  tls  * Copyright (c) 2013 The NetBSD Foundation, Inc.
      3  1.1.8.2  tls  * All rights reserved.
      4  1.1.8.2  tls  *
      5  1.1.8.2  tls  * This code is derived from software contributed to The NetBSD Foundation
      6  1.1.8.2  tls  * by Matt Thomas of 3am Software Foundry.
      7  1.1.8.2  tls  *
      8  1.1.8.2  tls  * Redistribution and use in source and binary forms, with or without
      9  1.1.8.2  tls  * modification, are permitted provided that the following conditions
     10  1.1.8.2  tls  * are met:
     11  1.1.8.2  tls  * 1. Redistributions of source code must retain the above copyright
     12  1.1.8.2  tls  *    notice, this list of conditions and the following disclaimer.
     13  1.1.8.2  tls  * 2. Redistributions in binary form must reproduce the above copyright
     14  1.1.8.2  tls  *    notice, this list of conditions and the following disclaimer in the
     15  1.1.8.2  tls  *    documentation and/or other materials provided with the distribution.
     16  1.1.8.2  tls  *
     17  1.1.8.2  tls  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     18  1.1.8.2  tls  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     19  1.1.8.2  tls  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     20  1.1.8.2  tls  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     21  1.1.8.2  tls  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     22  1.1.8.2  tls  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     23  1.1.8.2  tls  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     24  1.1.8.2  tls  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     25  1.1.8.2  tls  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     26  1.1.8.2  tls  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27  1.1.8.2  tls  * POSSIBILITY OF SUCH DAMAGE.
     28  1.1.8.2  tls  */
     29  1.1.8.2  tls 
     30  1.1.8.2  tls #include <machine/asm.h>
     31  1.1.8.2  tls 
     32  1.1.8.2  tls RCSID("$NetBSD: memcpy_neon.S,v 1.1.8.2 2013/02/25 00:23:56 tls Exp $")
     33  1.1.8.2  tls 
     34  1.1.8.2  tls 	.text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * NEON implementation for 32-bit ARM.  The computed jump in
 * .Lincongruent_dword relies on ARM-state PC semantics (PC reads as the
 * address of the current instruction + 8, every instruction is 4 bytes).
 *
 * In:    r0 = dst, r1 = src, r2 = len
 * Out:   r0 = dst (r0 is never written)
 * Stack: 8 bytes (r4-r5), only once dst is 8-byte aligned and len != 0
 * Uses:  r3 (dst cursor), ip, r4, r5, flags,
 *        d0-d7 and d16-d23.  d8-d15 are deliberately not used: the
 *        procedure call standard requires callees to preserve them.
 *
 * Strategy:
 *  1. Copy single bytes until dst is 8-byte aligned.
 *  2. "Congruent" case (src is then 8-byte aligned too): copy dwords
 *     until dst is 64-byte aligned, then 128/64-byte bursts with
 *     vldm/vstm, then a dword-by-dword tail.
 *  3. "Incongruent" case: src is rounded down to an 8-byte boundary and
 *     only aligned dwords are loaded.  d0 holds the byte indices
 *     (table value + misalignment) that make vtbl.8 pick the 8 wanted
 *     bytes out of two neighbouring source dwords.
 *     Invariants on this path: d1 = most recently loaded, not yet fully
 *     consumed source dword; r5 = 8 - (src & 7) = number of unconsumed
 *     source bytes in d1; r1 = address of the next aligned dword to load.
 *     A new dword is only loaded while r2 > r5, so no dword lying
 *     entirely beyond the end of src is ever touched.
 *  4. .Lfinish stores the final 1..7 bytes out of d0 through r4/r5.
 *
 * All length comparisons are unsigned because len is a size_t.
 */
ENTRY(memcpy)
	teq	r2, #0			/* zero length? */
	cmpne	r0, r1			/*   if not, is dst == src? */
	RETc(eq)			/*   either way nothing to copy */

	mov	r3, r0			/* r3 = dst cursor, r0 = return value */

.Ldst_aligner:
	tst	r3, #7			/* is dst 8-byte (dword) aligned? */
	beq	.Lsrc_aligner		/*   yes, go look at src */
	/*
	 * Until the dst pointer is dword aligned, copy byte by byte until
	 * it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, we're done! */

.Lsrc_aligner:
	push	{r4-r5}			/* r4/r5 are used below and by .Lfinish */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	vdup.8	d1, r5			/* misalignment replicated to 8 lanes */
	rsb	r5, r5, #8		/* r5 = useful bytes in first dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load identity byte indices */
	vadd.u8	d0, d0, d1		/* d0 = indices shifted by misalignment */

	vld1.64 {d1}, [r1:64]!		/* load first (partial) dword from src */

	cmp	r2, r5			/* does it already hold everything? */
	bhi	.Lincongruent		/*   no, so read more */

	/*
	 * Entered with r2 < 8.  Either r2 <= r5 (everything needed is in
	 * d1; the d2 lanes selected by vtbl are never stored) or d2 has
	 * just been loaded with the dword that follows d1.
	 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

	/* Dword-at-a-time loop; always entered with r2 > r5. */
.Lincongruent:
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	cmp	r2, r5			/* is the rest already in d1? */
	bls	.Lincongruent_finish	/*   yes, don't read past src's end */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * dst is now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64 {d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blo	.Lincongruent_4dword	/*   no, handle it */

	/*
	 * 64 bytes per iteration.  On entry d1 = leftovers and d2-d5 hold
	 * the next 4 raw dwords.  d16-d20 are the second working set.
	 */
1:	vld1.64 {d17-d20}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d16, d5			/* keep leftovers clear of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/*   no more data, skip the load */
	vld1.64 {d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d16, {d16-d17}, d0	/* reorder */
	vtbl.8	d17, {d17-d18}, d0	/* reorder */
	vtbl.8	d18, {d18-d19}, d0	/* reorder */
	vtbl.8	d19, {d19-d20}, d0	/* reorder */
	vst1.64 {d16-d19}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */
	vmov	d1, d20			/* new leftovers */
	cmp	r2, #64			/* another 8 dwords to write? */
	bhs	1b			/*   yes, d2-d5 were preloaded above */

	/*
	 * We have leftovers in d1 and, if r2 >= 32, new untranslated data
	 * in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32
	blo	.Lincongruent_dword

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32
	beq	.Ldone

	/*
	 * Fewer than 32 bytes remain.  Jump into an unrolled run of
	 * load/reorder/store/move groups (4 instructions = 16 bytes each)
	 * so that exactly r2 / 8 groups execute: the target is
	 * (address of "add pc") + 8 + 2 * (32 - (r2 & ~7)).
	 */
.Lincongruent_dword:
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	bic	ip, r2, #7		/* remaining count truncated to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8; Z is tested after the run */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop				/* pads so that PC+8 is the first group */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* count mod 8 was 0 (flags from ands) */
	cmp	r2, r5			/* is the rest already in d1? */
	bls	.Lincongruent_finish	/*   yes, don't read past src's end */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */

.Lcongruent_main:
	vld1.32 {d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/*   no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/*   no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte blocks per iteration,
	 * interleaving stores and loads.  d0-d7 is preloaded on entry.
	 */
1:	vldm	r1!, {d16-d23}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d16-d23}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bhs	1b			/*   yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/*   no, proceed to do it dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

.Lcongruent_loop:
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32 {d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/*   yes, so do it */

	/*
	 * Store the final r2 (< 8) bytes held in d0, using bits 2, 1 and 0
	 * of r2 to select a word, a halfword and a byte store in turn.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/* Identity byte-index vector for vtbl.8; the src misalignment is added. */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
    278