/*-
 * Copyright (c) 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: memcpy_neon.S,v 1.3 2025/02/27 08:39:53 andvar Exp $")

	.text
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/* if not, does src == dst? */
	RETc(eq)			/* yes, (to either) return */

	mov	r3, r0			/* keep r0 unchanged */
#if 0
	cmp	r2, #16			/* copy less than 16 bytes? */
	bhs	.Ldst_aligner		/* nope, do it the long way */

1:	ldrb	ip, [r1], #1		/* load a byte from src */
	subs	r2, r2, #1		/* any more to transfer? */
	strb	ip, [r3], #1		/* save it to dst */
	bne	1b			/* yes, do next byte */
	RET				/* return */
#endif

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/* yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy from src to dst
	 * byte by byte until it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/* no, try next byte */
	RET				/* yes, we're done! */
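
	/*
	 * Register usage from here on (as set up above): r0 keeps the
	 * original dst for the return value, r1 is the src cursor, r2 the
	 * remaining length, and r3 the dst cursor.  For a misaligned src
	 * the code below rounds r1 down to a dword boundary and builds a
	 * byte-index vector in d0 (.Ltbl_value plus the misalignment), so
	 * that each vtbl.8 extracts the wanted unaligned dword from two
	 * consecutive aligned source dwords.
	 */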

.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	add	r4, r2, r3		/* keep a pointer to the end of dst */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/* aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* calculate leftover of each dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */

	vld1.64	{d1}, [r1:64]!		/* load a dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/* no, so read more */

.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]		/* yes, write final full dword */
	b	.Ldone			/* and we're done! */

.Lincongruent:
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/* no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/* yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/* no, load next dword */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/* no, handle dword by dword */
	vld1.64	{d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 4 more dwords? */
	blo	.Lincongruent_4dword	/* no, handle it */

1:	vld1.64	{d7-d10}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d6, d5			/* move out of the way of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/* no more data, skip the load */
	vld1.64	{d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d6, {d6-d7}, d0		/* reorder */
	vtbl.8	d7, {d7-d8}, d0		/* reorder */
	vtbl.8	d8, {d8-d9}, d0		/* reorder */
	vtbl.8	d9, {d9-d10}, d0	/* reorder */
	vst1.64	{d6-d9}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/* if 0, we're done! */
	vmov	d1, d10			/* keep leftover bytes for next pass */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bhs	1b			/* yes, do it again */
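
	/*
	 * Fewer than 64 bytes remain.  .Lincongruent_4dword below first
	 * drains up to 4 dwords at once, then .Lincongruent_dword uses a
	 * computed branch (add pc, ...) to jump into an unrolled sequence,
	 * Duff's-device style, so one load/vtbl/store group runs per
	 * remaining full dword before the final partial dword is written.
	 */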

	/*
	 * We have leftovers in d1 and new unreordered data in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/* no, handle dword by dword */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* we wrote 4 dwords */
	beq	.Ldone			/* if 0, we're done! */

.Lincongruent_dword:
#if 0
	cmp	r2, r5			/* enough in leftovers? */
	bls	.Lincongruent_finish	/* yes, finish it. */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/* no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/* yes, we're done! */
	b	.Lincongruent_dword	/* no, go get more */
#else
	cmp	r2, r5			/* are the leftover bytes enough? */
	bls	.Lincongruent_finish	/* yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* no partial dword left, done! */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */
#endif

.Lcongruent_main:
	vld1.32	{d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/* equal? we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/* no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/* no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/* no, then deal with 8 dwords */
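
	/*
	 * At this point at least 128 bytes remain to be stored and d0-d7
	 * already hold the next 64 bytes of source data, so the loop below
	 * can keep one 64-byte block in flight in each of the two register
	 * groups (d0-d7 and d8-d15).
	 */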
214 1.1 matt */ 215 1.1 matt 1: vldm r1!, {d8-d15} /* load next 8 dwords */ 216 1.1 matt vstm r3!, {d0-d7} /* store 8 more dwords */ 217 1.1 matt cmp r2, #192 /* can we write 16+8 dwords? */ 218 1.2 mlelstv blo 2f /* no, don't load the next 8 dwords */ 219 1.1 matt vldm r1!, {d0-d7} /* yes, load next 8 dwords */ 220 1.1 matt 2: vstm r3!, {d8-d15} /* store 8 more dwords */ 221 1.1 matt sub r2, r2, #128 /* we just stored 16 (8+8) dwords */ 222 1.1 matt beq .Ldone /* if 0, we're done! */ 223 1.1 matt cmp r2, #128 /* can we write 16 dwords */ 224 1.2 mlelstv bhs 1b /* yes, do it again */ 225 1.1 matt cmp r2, #64 /* have we loaded 8 dwords? */ 226 1.2 mlelstv blo .Lcongruent_loop /* no, proceed to do it dword */ 227 1.1 matt 228 1.1 matt /* 229 1.1 matt * We now have 8 dwords we can write in d0-d7. 230 1.1 matt */ 231 1.1 matt 3: vstm r3!, {d0-d7} /* store 8 more dwords */ 232 1.1 matt subs r2, r2, #64 /* we wrote 8 dwords */ 233 1.1 matt beq .Ldone /* if 0, we're done! */ 234 1.1 matt 235 1.1 matt .Lcongruent_loop: 236 1.1 matt vld1.32 {d0}, [r1]! /* load dword from src */ 237 1.1 matt cmp r2, #8 /* can we write a full dword? */ 238 1.2 mlelstv blo .Lfinish /* no, write last partial dword */ 239 1.1 matt .Lcongruent_loop_start: 240 1.1 matt vst1.32 {d0}, [r3]! /* store dword into dst */ 241 1.1 matt subs r2, r2, #8 /* subtract it from length */ 242 1.1 matt beq .Ldone /* if 0, we're done! */ 243 1.1 matt vld1.32 {d0}, [r1]! /* load dword from src */ 244 1.1 matt cmp r2, #8 /* can we write a full dword? */ 245 1.2 mlelstv bhs .Lcongruent_loop_start /* yes, so do it */ 246 1.1 matt 247 1.1 matt .Lfinish: 248 1.1 matt vmov r4, r5, d0 /* get last dword from NEON */ 249 1.1 matt tst r2, #4 /* do we have at least 4 bytes left? */ 250 1.1 matt strne r4, [r3], #4 /* store the 1st word */ 251 1.1 matt movne r4, r5 /* move 2nd word into place */ 252 1.1 matt tst r2, #2 /* do we have at least 2 bytes left? */ 253 1.1 matt #ifdef __ARMEB__ 254 1.1 matt movne r4, r4, ror #16 /* yes, swap halfwords */ 255 1.1 matt #endif 256 1.1 matt strneh r4, [r3], #2 /* yes, store the halfword */ 257 1.1 matt #ifdef __ARMEL__ 258 1.1 matt movne r4, r4, lsr #16 /* yes, discard just written bytes */ 259 1.1 matt #endif 260 1.1 matt tst r2, #1 /* do we have a final byte? */ 261 1.1 matt #ifdef __ARMEB__ 262 1.1 matt movne r4, r4, lsr #24 /* yes, move MSB to LSB */ 263 1.1 matt #endif 264 1.1 matt strneb r4, [r3], #1 /* yes, store it */ 265 1.1 matt 266 1.1 matt .Ldone: 267 1.1 matt pop {r4-r5} /* restore registers */ 268 1.1 matt RET 269 1.1 matt 270 1.1 matt .p2align 3 271 1.1 matt .Ltbl_value: 272 1.1 matt #ifdef __ARMEL__ 273 1.1 matt .quad 0x0706050403020100 274 1.1 matt #else 275 1.1 matt .quad 0x0001020304050607 276 1.1 matt #endif 277 1.1 matt END(memcpy) 278