memcpy_neon.S revision 1.1.38.2
/*-
 * Copyright (c) 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: memcpy_neon.S,v 1.1.38.2 2020/04/21 19:37:42 martin Exp $")

	.text
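/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * On entry r0 = dst, r1 = src, r2 = len; r3 is used as the working
 * destination pointer so r0 can be returned unchanged.  dst is first
 * aligned to a dword; the copy then takes the fast "congruent" path
 * when src shares that alignment, or the "incongruent" path, which
 * merges misaligned source dwords with vtbl.
 */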
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/*   if not, does src == dst? */
	RETc(eq)			/*   yes, (to either) return */

	mov	r3, r0			/* keep r0 unchanged */
#if 0
	cmp	r2, #16			/* copy less than 16 bytes? */
	bge	.Ldst_aligner		/*   nope, do it the long way */

1:	ldrb	ip, [r1], #1		/* load a byte from src */
	subs	r2, r2, #1		/* any more to transfer? */
	strb	ip, [r3], #1		/* save it to dst */
	bne	1b			/*   yes, do next byte */
	RET				/* return */
#endif

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/*   yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy byte by byte from
	 * src to dst, stopping once it is aligned or we've copied
	 * everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, we're done! */

.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	add	r4, r2, r3		/* keep a pointer to the end of dst */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* r5 = bytes available in first dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */
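	/*
	 * d0 now holds the byte indices {m, m+1, ..., m+7}, where m is
	 * the src misalignment.  Applied with vtbl.8 to the 16 bytes in
	 * the register pair {d1,d2}, it extracts the 8 source bytes that
	 * form the next aligned destination dword.
	 */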

	vld1.64 {d1}, [r1:64]!		/* load a dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bgt	.Lincongruent		/*   no, so read more */

.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blt	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

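	/*
	 * Incongruent case: each destination dword is assembled from the
	 * tail of the previous source dword (in d1) and the head of the
	 * next one (in d2) via vtbl, dword by dword, until dst reaches a
	 * 64-byte boundary or the copy completes.
	 */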
.Lincongruent:
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * We are now 64-byte aligned, so all writes should fill one or more
	 * cachelines.  Even if d1 already holds 7 bytes, to write 32 bytes
	 * we still need to read 4 dwords (3 full dwords and 1 dword for
	 * that last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blt	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64 {d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 dwords? */
	blt	.Lincongruent_4dword	/*   no, handle what we have */

1:	vld1.64 {d7-d10}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d6, d5			/* move d5 out of the way of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blt	2f			/*   no more data, skip the load */
	vld1.64 {d2-d5}, [r1:64]!	/*   more data, load 4 dwords */
2:	vtbl.8	d6, {d6-d7}, d0		/* reorder */
	vtbl.8	d7, {d7-d8}, d0		/* reorder */
	vtbl.8	d8, {d8-d9}, d0		/* reorder */
	vtbl.8	d9, {d9-d10}, d0	/* reorder */
	vst1.64 {d6-d9}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we just wrote 8 dwords */
	beq	.Ldone			/*   nothing left? we're done! */
	vmov	d1, d10			/* move leftovers into place */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bge	1b			/*   yes, loop again */

	/*
	 * We have leftover bytes in d1 and new, not yet reordered data
	 * in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* can we write 4 dwords? */
	blt	.Lincongruent_dword	/*   no, handle dword by dword */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* we wrote 4 dwords */
	beq	.Ldone			/*   nothing left? we're done! */

.Lincongruent_dword:
#if 0
	cmp	r2, r5			/* enough in leftovers? */
	ble	.Lincongruent_finish	/*   yes, finish it. */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	b	.Lincongruent_dword	/* and go get it */
#else
	cmp	r2, r5			/* are the leftover bytes enough? */
	ble	.Lincongruent_finish	/*   yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword multiple */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
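	/*
	 * Each block below is 4 instructions (16 bytes) and copies one
	 * dword.  ip was scaled so that the add to pc above skips one
	 * block for every dword that does not need to be copied, landing
	 * on the block that copies exactly the remaining whole dwords
	 * (in the style of a Duff's device).
	 */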
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* no leftover bytes? we're done! */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */
#endif

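/*
 * Congruent case: src and dst share the same dword alignment, so data
 * is copied dword (and, once 64-byte aligned, cacheline) at a time
 * with no byte reordering.
 */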
.Lcongruent_main:
	vld1.32 {d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* is there a full dword left? */
	blt	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract the dword from the length */
	beq	.Ldone			/*   if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, copy next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blt	.Lcongruent_loop	/*   no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blt	3f			/*   no, then deal with 8 dwords */

	/*
	 * The following loop interleaves 64-byte loads and stores so that
	 * a load is always issued ahead of the store that consumes it.
	 */
1:	vldm	r1!, {d8-d15}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blt	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d8-d15}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bge	1b			/*   yes, do it again */
	cmp	r2, #64			/* do we have 8 loaded dwords to store? */
	blt	.Lcongruent_loop	/*   no, proceed dword by dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

.Lcongruent_loop:
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32 {d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bge	.Lcongruent_loop_start	/*   yes, so do it */

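	/*
	 * At most 7 bytes remain and they are already in d0.  Move them
	 * to core registers and store the tail as a word, halfword,
	 * and/or byte as the low bits of the remaining count dictate.
	 */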
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

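	/*
	 * Identity byte-index table {0,1,...,7}; the src misalignment is
	 * added to it above to form the vtbl selector.
	 */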
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)