/* $NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT	x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT	x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32
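
/*
 * Overview (informative only):
 *  - SRC0/DST0/LEN hold the incoming arguments; DST, SRC, TMP_* and DATA*
 *    are scratch registers used as working pointers and copy data.
 *  - Copies shorter than SMALLSIZE bytes use straight-line load/store
 *    sequences selected by the low bits of LEN.
 *  - Larger copies pre-align DST (when it is worth the overhead) and then
 *    move data in unrolled 16-byte ldp/stp blocks, up to 1024 bytes per
 *    loop iteration, finishing with one block per remaining length bit.
 *  - When NO_OVERLAP is not defined (memmove), the copy runs backward for
 *    SRC0 < DST0 so that overlapping regions are handled safely.
 *  - Under STRICT_ALIGNMENT, mismatched src/dst alignment is handled by
 *    the "shifting" copies below: aligned 8-byte loads are shifted by
 *    SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT bits and merged before each
 *    aligned store.
 */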

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =			*/
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)|	*/
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit);	*/

	b	9f				/* }				*/
5:						/* else {			*/
	ldr	DATA0, [SRC]			/* data0 = *src;		*/
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;	*/
9:						/* }				*/

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) {	*/
	mov	TMP_D, DST			/* tmp_d = dst;			*/

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) {	*/
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32;		*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) {	*/
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16;		*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) {	*/
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1;	*/
1:						/* }				*/

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit >> 3);	*/
9:						/* }				*/
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =			*/
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)|	*/
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit);	*/

	b	9f				/* }				*/
5:						/* else {			*/
	ldr	DATA0, [SRC]			/* data0 = *src;		*/
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0<<dst_src_abit;	*/
9:						/* }				*/

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) {	*/
	mov	TMP_D, DST			/* tmp_d = dst;			*/

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) {	*/
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32;		*/
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x;	*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) {	*/
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16;		*/
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x;	*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) {	*/
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8;		*/
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x;	*/
1:						/* }				*/

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit >> 3);	*/
9:						/* }				*/
#endif /* BYTE_ORDER */

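/*
 * Roughly, the loop below does the following (little-endian case shown;
 * data0/data1/data2 are uint64_t, "sda" = SRC_DST_ALIGNBIT and
 * "dsa" = DST_SRC_ALIGNBIT, shift counts taken mod 64 as the hardware
 * does; src/dst are aligned uint64_t pointers):
 *
 *	do {
 *		src -= 2;
 *		data2 = src[0]; data1 = src[1];
 *		data0 = (data0 << dsa) | (data1 >> sda);
 *		data1 = (data1 << dsa) | (data2 >> sda);
 *		dst -= 2;
 *		dst[0] = data1; dst[1] = data0;
 *		data0 = data2;		// carry into the next round
 *		len -= 16;
 *	} while (len >= 16);
 */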
backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =			*/
	lsl	TMP_X, DATA0, TMP_X		/* (data1 >> src_alignbit) |	*/
	orr	DATA1, DATA1, TMP_X		/* (data0 << -src_alignbit);	*/
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =			*/
	lsr	TMP_X, DATA0, TMP_X		/* (data1 << src_alignbit) |	*/
	orr	DATA1, DATA1, TMP_X		/* (data0 >> -src_alignbit);	*/
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

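/*
 * Roughly, the loop below does the following (little-endian case shown;
 * same conventions as the backward shifting copy above: uint64_t words,
 * "sda" = SRC_DST_ALIGNBIT, "dsa" = DST_SRC_ALIGNBIT, shifts mod 64):
 *
 *	do {
 *		data1 = src[0]; data2 = src[1]; src += 2;
 *		dst[0] = (data0 >> sda) | (data1 << dsa);
 *		dst[1] = (data1 >> sda) | (data2 << dsa);
 *		dst += 2;
 *		data0 = data2;		// carry into the next round
 *		len -= 16;
 *	} while (len >= 16);
 */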
shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, 16]
	prfm	PLDL1KEEP, [SRC0, 24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

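/*
 * Rough shape of the copy_forward path below (the backward path above
 * mirrors it with descending pointers); "copyN()" stands for the
 * corresponding unrolled ldp/stp block:
 *
 *	if (STRICT_ALIGNMENT || len >= 512)
 *		copy single bytes/halfwords/words until dst is STP_ALIGN
 *		aligned, decrementing len;
 *	while (len >= 1024) { copy1024(); len -= 1024; }
 *	if (len & 512) copy512();
 *	if (len & 256) copy256();
 *	if (len & 128) copy128();
 *	if (len &  64) copy64();
 *	if (len &  32) copy32();
 *	if (len &  16) copy16();
 *	if (len &   8) copy8();
 *	if (len &   4) copy4();
 *	if (len &   2) copy2();
 *	if (len &   1) copy1();
 */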
	.align	4
copy_forward:
	/* DST is not aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)