/*	$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $	*/

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */
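
/*
 * All three entry points share the code below.  As a rough reference
 * sketch in C (illustration only; the real entry points differ in
 * argument order and return type):
 *
 *	if (len == 0 || dst == src)
 *		return;
 *	if (src < dst && overlap is possible)
 *		copy backward, from (src + len) down to src;
 *	else
 *		copy forward, from src up to (src + len);
 *
 * Copying downward when src < dst is what makes memmove() and bcopy()
 * safe for overlapping buffers; memcpy() defines NO_OVERLAP and always
 * copies forward.
 */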
/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT	x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT	x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */
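
/*
 * The small copies above, and the tail copies throughout this file,
 * decompose the remaining length bit by bit instead of looping.
 * Reference sketch in C (illustration only), backward direction:
 *
 *	if (len & 4) { src -= 4; dst -= 4; *(uint32_t *)dst = *(uint32_t *)src; }
 *	if (len & 2) { src -= 2; dst -= 2; *(uint16_t *)dst = *(uint16_t *)src; }
 *	if (len & 1) { src -= 1; dst -= 1; *(uint8_t *)dst = *(uint8_t *)src; }
 *
 * Each tbz/load/store group is one such test: at most one transfer per
 * power of two and no data-dependent loop.
 */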
	.align	4
copy_backward:
	/* DST may be unaligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:
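
/*
 * The 1 KiB loop above is fully unrolled: 64 ldp/stp pairs move 16
 * bytes each, so the sub/cmp/branch overhead is paid once per KiB.
 * The remainder (< 1024) is handled below by one unrolled block per
 * power of two, roughly:
 *
 *	for (bit = 512; bit >= 16; bit >>= 1)
 *		if (len & bit)
 *			copy "bit" bytes backward;	// sketch only
 */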
	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN		/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src; */
	cmp	TMP_S, SRC0		/* } while (tmp_s < src); */
	blo	1b
	ret
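
/*
 * strict_backward copies between buffers whose 8-byte misalignments
 * differ, using only aligned uint64_t loads and stores.  Each
 * destination word is stitched from two adjacent source words, e.g.
 * (little-endian sketch, shift counts in bits):
 *
 *	out = (data_hi << dst_src_alignbit) | (data_lo >> src_dst_alignbit);
 *
 * where data_hi/data_lo are the aligned source words overlapping the
 * destination word.
 */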
strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
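
/*
 * Note on the first load below: SRC has been rounded down, and when
 * src + len is itself 8-byte aligned (SRC == SRC0) the word at [SRC]
 * lies entirely past the source buffer and must not be touched; the
 * cmp/beq guard skips that load.  DATA1 then holds junk, but the
 * shifts are arranged so those bits never reach the bytes stored.
 */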
#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0>>dst_src_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#endif /* BYTE_ORDER */


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop
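
/*
 * DATA0 is carried across iterations of the loop above: each pass
 * loads two new source words (ldp) and reuses the previous one for
 * the stitch, so every aligned source word is loaded exactly once.
 * Roughly:
 *
 *	while (len >= 16) {
 *		data2 = src[-2]; data1 = src[-1]; src -= 2;
 *		dst[-1] = stitch(data1, data0);
 *		dst[-2] = stitch(data2, data1); dst -= 2;
 *		data0 = data2; len -= 16;
 *	}
 */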
	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++; */
	cmp	SRC0, TMP_S		/* } while (src < tmp_s); */
	blo	1b
	ret
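
/*
 * strict_forward mirrors strict_backward: compute both misalignments
 * in bits, branch to the plain copy when they match, and otherwise
 * prime DATA0/DATA1 from the first aligned source word(s), store up
 * to the destination's next 8-byte boundary, and fall into the
 * forward shifting copy loop.
 */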
strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
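
/*
 * Head load: when the source offset within its word exceeds the
 * destination's (DST_SRC_ALIGNBIT < 0), the first destination word
 * spans two source words, hence the ldp below; otherwise one word is
 * enough.  Little-endian sketch:
 *
 *	data1 = (src_aligned[0] >> src_alignbit) |
 *		(src_aligned[1] << (64 - src_alignbit));
 */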
#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/* (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/* (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */
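
/*
 * At this point 8 - (dst_alignbit / 8) bytes have been stored, which
 * is what the "len -= 8; len += dst_alignbit >> 3" accounting above
 * reflects.  DST is now 8-byte aligned and DATA0 holds the pending
 * source word, so the loop below can stream 16 bytes per iteration
 * with aligned stp stores.
 */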
BYTE_ORDER */ 744 1.1 skrll /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ 745 1.1 skrll lsl DATA0, DATA0, SRC_DST_ALIGNBIT 746 1.1 skrll lsr TMP_X, DATA1, DST_SRC_ALIGNBIT 747 1.1 skrll orr DATA0, DATA0, TMP_X 748 1.1 skrll /* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */ 749 1.1 skrll lsl DATA1, DATA1, SRC_DST_ALIGNBIT 750 1.1 skrll lsr TMP_X, DATA2, DST_SRC_ALIGNBIT 751 1.1 skrll orr DATA1, DATA1, TMP_X 752 1.1 skrll #endif /* BYTE_ORDER */ 753 1.1 skrll stp DATA0, DATA1, [DST], #16 754 1.1 skrll mov DATA0, DATA2 755 1.1 skrll sub LEN, LEN, #16 756 1.1 skrll cmp LEN, #16 757 1.1 skrll bhs shifting_copy_loop 758 1.1 skrll 759 1.1 skrll 760 1.1 skrll /* write 8 bytes */ 761 1.1 skrll tbz LEN, #3, 9f 762 1.1 skrll ldr DATA1, [SRC], #8 763 1.1 skrll #if BYTE_ORDER == LITTLE_ENDIAN 764 1.1 skrll /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */ 765 1.1 skrll lsr DATA0, DATA0, SRC_DST_ALIGNBIT 766 1.1 skrll lsl TMP_X, DATA1, DST_SRC_ALIGNBIT 767 1.1 skrll orr DATA0, DATA0, TMP_X 768 1.1 skrll #else /* BYTE_ORDER */ 769 1.1 skrll /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ 770 1.1 skrll lsl DATA0, DATA0, SRC_DST_ALIGNBIT 771 1.1 skrll lsr TMP_X, DATA1, DST_SRC_ALIGNBIT 772 1.1 skrll orr DATA0, DATA0, TMP_X 773 1.1 skrll #endif /* BYTE_ORDER */ 774 1.1 skrll str DATA0, [DST], #8 775 1.1 skrll mov DATA0, DATA1 776 1.1 skrll sub LEN, LEN, #8 777 1.1 skrll 9: 778 1.1 skrll 779 1.1 skrll cbz LEN, shifting_copy_done 780 1.1 skrll 781 1.1 skrll /* copy last 1-7 bytes */ 782 1.1 skrll and TMP_X, DST_SRC_ALIGNBIT, #63 783 1.1 skrll cmp LEN, TMP_X, lsr #3 784 1.1 skrll bls 1f 785 1.1 skrll ldr DATA1, [SRC], #8 /* don't access out of range */ 786 1.1 skrll 1: 787 1.1 skrll 788 1.1 skrll #if BYTE_ORDER == LITTLE_ENDIAN 789 1.1 skrll /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */ 790 1.1 skrll lsr DATA0, DATA0, SRC_DST_ALIGNBIT 791 1.1 skrll lsl TMP_X, DATA1, DST_SRC_ALIGNBIT 792 1.1 skrll orr DATA0, DATA0, TMP_X 793 1.1 skrll #else /* BYTE_ORDER */ 794 1.1 skrll /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ 795 1.1 skrll lsl DATA0, DATA0, SRC_DST_ALIGNBIT 796 1.1 skrll lsr TMP_X, DATA1, DST_SRC_ALIGNBIT 797 1.1 skrll orr DATA0, DATA0, TMP_X 798 1.1 skrll #endif /* BYTE_ORDER */ 799 1.1 skrll 800 1.1 skrll #if BYTE_ORDER == LITTLE_ENDIAN 801 1.1 skrll /* if (len & 4) { *(uint32_t *)dst++ = data0; } */ 802 1.1 skrll tbz LEN, #2, 1f 803 1.1 skrll str DATA0w, [DST], #4 804 1.1 skrll lsr DATA0, DATA0, #32 805 1.1 skrll 1: 806 1.1 skrll /* if (len & 2) { *(uint16_t *)dst++ = data0; } */ 807 1.1 skrll tbz LEN, #1, 1f 808 1.1 skrll strh DATA0w, [DST], #2 809 1.1 skrll lsr DATA0, DATA0, #16 810 1.1 skrll 1: 811 1.1 skrll /* if (len & 1) { *(uint8_t *)dst++ = data0; } */ 812 1.1 skrll tbz LEN, #0, 1f 813 1.1 skrll strb DATA0w, [DST], #1 814 1.1 skrll 1: 815 1.1 skrll #else /* BYTE_ORDER */ 816 1.1 skrll /* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */ 817 1.1 skrll tbz LEN, #2, 1f 818 1.1 skrll lsr TMP_X, DATA0, #32 819 1.1 skrll str TMP_Xw, [DST], #4 820 1.1 skrll 1: 821 1.1 skrll /* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */ 822 1.1 skrll tbz LEN, #1, 1f 823 1.1 skrll lsr TMP_X, DATA0, #16 824 1.1 skrll strh TMP_Xw, [DST], #2 825 1.1 skrll 1: 826 1.1 skrll /* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */ 827 1.1 skrll tbz LEN, #0, 1f 828 1.1 skrll lsr TMP_X, DATA0, #8 829 1.1 skrll strb TMP_Xw, [DST], #1 830 1.1 skrll 1: 831 1.1 skrll #endif 
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST may be unaligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
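
/*
 * For large copies the destination is aligned up front so that the
 * bulk loop below runs with aligned stp stores (stores typically
 * benefit more from alignment than loads).  Under the 512-byte
 * threshold above, this prologue would cost more than it saves, so
 * smaller copies jump straight to the copy loops.
 */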
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:
	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)