/*	$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $	*/

/*
 * Copyright (c) 2018 Ryo Shimizu <ryo (at) nerv.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32
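
/*
 * Rough structure of the code below: copies shorter than SMALLSIZE bytes
 * are done with a handful of individual loads and stores.  Longer copies
 * run 16 bytes at a time through ldp/stp, aligning DST first (always when
 * STRICT_ALIGNMENT is defined, otherwise only for copies of 512 bytes or
 * more, where the pre-alignment pays off).  For memmove/bcopy the copy
 * runs backward when src is below dst.  Under STRICT_ALIGNMENT, src and
 * dst with different alignment within an 8-byte word use a "shifting copy"
 * that merges two aligned 64-bit loads into each aligned 64-bit store.
 */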

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	backward_copy1k
backward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)	/* len &= 1023; len &= ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - len/2) */
	br	TMP_X
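	/*
	 * Note: the "br TMP_X" above jumps into the middle of the unrolled
	 * ldp/stp block below.  Each 16-byte copy is two 4-byte instructions
	 * (8 bytes of code), so backing off from 8f by (len & ~15)/2 bytes
	 * executes exactly the pairs needed for the remaining length.
	 */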
backward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	backward_copy1k
	cmp	LEN, #16
	bhs	backward_less1k

	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
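	/*
	 * Each ldrb/strb pair below is 8 bytes of code, so jumping to
	 * (8f - len*8) runs exactly len single-byte copies.
	 */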
	.rept	10
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	.endr
8:
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
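
	/*
	 * Shifting copy: SRC and DST are rounded down to 8-byte boundaries,
	 * and each aligned 64-bit store is assembled from two aligned
	 * 64-bit loads shifted by SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT bits.
	 * The code below first stores the partial word between the rounded
	 * DST and the original end, then falls into the main loop.
	 */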

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0<<dst_src_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3); */
9:						/* } */
#endif /* BYTE_ORDER */

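	/*
	 * Main backward loop for differently aligned src/dst.  Roughly
	 * equivalent C for the little-endian case (illustrative sketch;
	 * src/dst stand for the 8-byte-aligned SRC/DST as uint64_t *):
	 *
	 *	while (len >= 16) {
	 *		src -= 2; data2 = src[0]; data1 = src[1];
	 *		dst -= 2;
	 *		dst[1] = (data0 << dst_src_alignbit) |
	 *		         (data1 >> src_dst_alignbit);
	 *		dst[0] = (data1 << dst_src_alignbit) |
	 *		         (data2 >> src_dst_alignbit);
	 *		data0 = data2;
	 *		len -= 16;
	 *	}
	 */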

backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
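	/* as in backward_tiny: each ldrb/strb pair below is 8 bytes of code */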
	.rept	10
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	.endr
8:
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
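
	/* same shifting-copy setup as strict_backward, forward direction */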

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/* (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/* (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */
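
	/*
	 * Main forward loop for differently aligned src/dst.  Roughly
	 * equivalent C for the little-endian case (illustrative sketch;
	 * src/dst stand for the 8-byte-aligned SRC/DST as uint64_t *):
	 *
	 *	while (len >= 16) {
	 *		data1 = *src++; data2 = *src++;
	 *		*dst++ = (data0 >> src_dst_alignbit) |
	 *		         (data1 << dst_src_alignbit);
	 *		*dst++ = (data1 >> src_dst_alignbit) |
	 *		         (data2 << dst_src_alignbit);
	 *		data0 = data2;
	 *		len -= 16;
	 *	}
	 */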

shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST is not aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	forward_copy1k
forward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)	/* len &= 1023; len &= ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - len/2) */
	br	TMP_X
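	/*
	 * As in backward_less1k: each ldp/stp pair below is 8 bytes of
	 * code, so "8f - (len & ~15)/2" enters the unrolled block at the
	 * right offset for the remaining length.
	 */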
forward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	forward_copy1k
	cmp	LEN, #16
	bhs	forward_less1k

	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)