/*	$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $	*/

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32
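
/*
 * Copy strategy:
 *  - memcpy/bcopy (NO_OVERLAP) always copy forward; memmove copies
 *    backward when the source is at a lower address than the destination.
 *  - Copies shorter than SMALLSIZE are dispatched by testing individual
 *    bits of LEN (16/8/4/2/1), so no loop is required.
 *  - Larger copies first align DST to STP_ALIGN (skipped for copies
 *    below 512 bytes on non-STRICT_ALIGNMENT builds, where the extra
 *    branches cost more than misaligned accesses), then run unrolled
 *    ldp/stp blocks of 1024/512/256/128/64/32/16 bytes followed by the
 *    usual 8/4/2/1 byte tail.
 *  - On STRICT_ALIGNMENT builds, src and dst with different alignments
 *    are handled by the "shifting copy" loops, which merge adjacent
 *    aligned 64-bit words with lsl/lsr/orr before storing.
 */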

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

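/*
 * Bulk backward copy, used once both pointers have the same 8-byte
 * alignment (or alignment is ignored entirely).  DST is aligned down to
 * STP_ALIGN first so the ldp/stp blocks below run on aligned addresses;
 * on non-STRICT_ALIGNMENT builds this pre-alignment is skipped for
 * copies under 512 bytes, where it costs more than it saves.
 */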
	.align	4
copy_backward:
	/* DST may not be aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* } while (tmp_s < src); */
	blo	1b
	ret

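/*
 * Backward copy where src and dst have different alignment within an
 * 8-byte word (STRICT_ALIGNMENT build).  Both pointers are rounded down
 * to an 8-byte boundary; the main loop then loads two aligned source
 * double-words per iteration and combines them with shifts by
 * SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT to form destination-aligned
 * double-words for stp.  The partial words at either end are handled
 * byte/halfword/word-wise.
 */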
strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/*  (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1 = data0 >> src_dst_abit; */
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/*  *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/*  data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/*  *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/*  data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/*  *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit >> 3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/*  (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1 = data0 << dst_src_abit; */
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/*  x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/*  *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/*  x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/*  *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/*  x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/*  *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit >> 3); */
9:						/* } */
#endif /* BYTE_ORDER */


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

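/*
 * Forward copy where src and dst have different alignment within an
 * 8-byte word (STRICT_ALIGNMENT build).  The leading partial destination
 * word is stored byte/halfword/word-wise, then the main loop loads two
 * aligned source double-words per iteration and merges them with shifts
 * by SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT into destination-aligned
 * double-words for stp; the 1-7 byte tail is written out last.
 */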
strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

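/*
 * Bulk forward copy.  DST is aligned up to STP_ALIGN by copying the
 * leading 1/2/4(/8) bytes first (skipped on non-STRICT_ALIGNMENT builds
 * for copies under 512 bytes, where the pre-alignment is pure overhead),
 * then unrolled ldp/stp blocks of 1024/512/256/128/64/32/16 bytes run,
 * and the remaining 8/4/2/1 byte tail is copied last.
 */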
	.align	4
copy_forward:
	/* DST may not be aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)