Home | History | Annotate | Line # | Download | only in string
memcpy_neon.S revision 1.3
      1 /*-
      2  * Copyright (c) 2013 The NetBSD Foundation, Inc.
      3  * All rights reserved.
      4  *
      5  * This code is derived from software contributed to The NetBSD Foundation
      6  * by Matt Thomas of 3am Software Foundry.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27  * POSSIBILITY OF SUCH DAMAGE.
     28  */
     29 
     30 #include <machine/asm.h>
     31 
     32 RCSID("$NetBSD: memcpy_neon.S,v 1.3 2025/02/27 08:39:53 andvar Exp $")
     33 
     34 	.text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = dst (r0 is never written by this routine)
 *
 * Working registers after the entry checks:
 *	r1 = src cursor (dword aligned once .Lsrc_aligner has run)
 *	r2 = number of bytes that still have to be stored
 *	r3 = dst cursor (dword aligned once .Ldst_aligner has finished)
 *	r5 = incongruent path only: number of not yet consumed src bytes
 *	     held in d1 (8 - src misalignment)
 *	d0 = incongruent path only: vtbl index vector that extracts 8
 *	     consecutive src bytes from a pair of adjacent aligned dwords
 *	r4, r5, ip = scratch
 *
 * Save/restore invariant: r4-r5 and d8-d15 are pushed at .Lsrc_aligner
 * and popped at .Ldone.  Every path that passes .Lsrc_aligner must leave
 * through .Ldone; the two returns before it must not.
 *
 * The computed jump in .Lincongruent_dword relies on 4-byte instructions
 * and on pc reading as "address of the current instruction + 8", i.e.
 * on this code being assembled as ARM (not Thumb) code.
 */
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/*   if not, does src == dst? */
	RETc(eq)			/*   yes, (to either) return */

	mov	r3, r0			/* keep r0 unchanged */
#if 0
	cmp	r2, #16			/* copy less than 16 bytes? */
	bhs	.Ldst_aligner		/*   nope, do it the long way */

1:	ldrb	ip, [r1], #1		/* load a byte from src */
	subs	r2, r2, #1		/* and more to transfer? */
	strb	ip, [r3], #1		/* save it to dst */
	bne	1b			/*   yes, do next byte */
	RET				/* return */
#endif

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/*   yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy from src to dst byte
	 * by byte until it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, we're done! */

.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	/*
	 * d8-d15 are callee-saved but are used by both bulk loops below.
	 * Saving them once here keeps a single restore point (.Ldone) at
	 * the price of one 64-byte push/pop per call that gets this far.
	 */
	vpush	{d8-d15}
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* valid src bytes in each leftover */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */

	vld1.64 {d1}, [r1:64]!		/* load a dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/*   no, so read more */

	/*
	 * Everything still to be written (r2 <= r5 bytes, or r2 < 8 bytes
	 * spread over d1 and d2) is already in registers.  Only the first
	 * r2 bytes of the vtbl result are stored, so table lanes that pick
	 * from a stale or never loaded d2 are harmless.
	 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

.Lincongruent:
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	/*
	 * If the r5 leftover bytes now in d1 already cover the rest of the
	 * copy, the next aligned dword lies entirely beyond the end of src
	 * and must not be fetched (it may be on an unmapped page).
	 */
	cmp	r2, r5			/* is the tail already in d1? */
	bls	.Lincongruent_finish	/*   yes, finish without loading */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64 {d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blo	.Lincongruent_4dword	/*   no, handle it */

	/*
	 * On entry to each iteration: d1 = leftover, d2-d5 = the next four
	 * src dwords, r2 >= 64.
	 */
1:	vld1.64 {d7-d10}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d6, d5			/* move out of the way the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/*   no more data, skip the load */
	vld1.64 {d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d6, {d6-d7}, d0		/* reorder */
	vtbl.8	d7, {d7-d8}, d0		/* reorder */
	vtbl.8	d8, {d8-d9}, d0		/* reorder */
	vtbl.8	d9, {d9-d10}, d0	/* reorder */
	vst1.64 {d6-d9}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we just stored 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */
	vmov	d1, d10			/* move leftovers */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bhs	1b			/*   yes, do it again */

	/*
	 * We have leftovers in d1 and, if r2 >= 32, new untranslated data
	 * in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32
	blo	.Lincongruent_dword

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32
	beq	.Ldone

	/*
	 * Fewer than 32 bytes remain; d1 holds the leftover.
	 */
.Lincongruent_dword:
#if 0
	cmp	r2, r5			/* enough in leftovers? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	b	.Lincongruent_dword	/* and go get it */
#else
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	/*
	 * Computed jump into the unrolled sequence below.  Each full dword
	 * still to be written needs one 4-instruction (16-byte) group, so
	 * skip (32 - (r2 & ~7)) * 2 bytes of code, counted from pc, which
	 * reads as the address of the first vld1 after the nop.  With no
	 * full dword left the jump lands on the "beq .Ldone" after the
	 * last group.  Do not add or remove instructions between the
	 * "add pc" and that beq without adjusting the arithmetic.
	 *
	 * The flags set by the ands survive to the beq because none of the
	 * NEON instructions in between write them.
	 */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* no partial dword left? done! */
	cmp	r2, r5			/* is the tail already in d1? */
	bls	.Lincongruent_finish	/*   yes, don't read past src end */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */
#endif

	/*
	 * src and dst are both dword aligned.  Copy dword by dword until
	 * dst reaches a 64-byte boundary.
	 */
.Lcongruent_main:
	vld1.32 {d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/*   no, this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords */
	blo	3f			/*   no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte interleaving stores and loads.
	 * On entry to each iteration d0-d7 hold 8 loaded dwords and
	 * r2 >= 128.
	 */
1:	vldm	r1!, {d8-d15}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d8-d15}		/* store 8 more dwords */
	/*
	 * This must be the flag-setting form: with a plain sub the beq
	 * below would test the result of "cmp r2, #192" instead and return
	 * with the 8 dwords in d0-d7 unwritten when exactly 192 bytes
	 * were left.
	 */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords */
	bhs	1b			/*   yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/*   no, proceed to do it dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

	/*
	 * Fewer than 64 bytes (but at least 1) remain; nothing is preloaded.
	 */
.Lcongruent_loop:
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32 {d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/*   yes, so do it */

	/*
	 * d0 holds the final dword and r2 (< 8) says how many of its bytes
	 * belong to the copy.  Store them as an optional word, halfword and
	 * byte selected by bits 2, 1 and 0 of r2.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

	/*
	 * Common exit for every path that went through .Lsrc_aligner.
	 */
.Ldone:
	vpop	{d8-d15}		/* restore callee-saved NEON regs */
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Identity byte-index vector for vtbl; the src misalignment is added
	 * to every lane at .Lsrc_aligner.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
    278