/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu <ryo (at) nerv.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */
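
/*
 * Note: the one body below is assembled three ways (memcpy, memmove,
 * bcopy); only the macros above differ.  bcopy takes (src, dst, len)
 * while memcpy/memmove take (dst, src, len), hence the swapped
 * SRC0/DST0 definitions.  memmove additionally clears NO_OVERLAP so
 * that the backward-copy paths are compiled in.
 */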

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32
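
/*
 * Rough shape of the code below (a sketch, not exact):
 *
 *	len < 10        -> computed jump into an unrolled byte loop
 *	len < SMALLSIZE -> copy using the widest loads the bits of len allow
 *	otherwise       -> align DST to STP_ALIGN, copy 1024-byte unrolled
 *	                   ldp/stp blocks, then finish as the small cases
 *
 * On !STRICT_ALIGNMENT builds the DST pre-alignment is skipped for
 * copies under 512 bytes, where it costs more than it saves.
 */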

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST may be unaligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	backward_copy1k
backward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)	/* tmp_d = len & 1023 & ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - tmp_d/2) */
	br	TMP_X
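/*
 * Each ldp/stp pair below assembles to 8 bytes of code that copies 16
 * bytes of data, so the computed branch above lands (tmp_d / 16) pairs
 * before the "8:" label and copies exactly tmp_d bytes.
 */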
backward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	backward_copy1k
	cmp	LEN, #16
	bhs	backward_less1k

	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-9 bytes (len < 10 here) */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
	.rept	10
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	.endr
8:
	ret
9:
	/* length is small (< 32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* src and dst are differently aligned; copy byte by byte */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src; */
	cmp	TMP_S, SRC0		/* } while (tmp_s < src); */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
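
/*
 * Shifting (funnel) copy for differently aligned src/dst: SRC and DST
 * are rounded down to 8-byte boundaries and only whole words are
 * loaded; each output word is assembled from two adjacent input words
 * with complementary shifts, e.g. for little endian:
 *
 *	*--dst = (lo >> src_dst_alignbit) | (hi << dst_src_alignbit);
 *
 * AArch64 register-specified shifts use the count mod 64, so the
 * negated DST_SRC_ALIGNBIT serves directly as the complementary count.
 */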

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0>>dst_src_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#endif /* BYTE_ORDER */


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-9 bytes (len < 10 here) */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
	.rept	10
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	.endr
8:
	ret
9:
	/* length is small (< 32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are differently aligned; copy byte by byte */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++; */
	cmp	SRC0, TMP_S		/* } while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
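
/*
 * Forward variant of the shifting copy above: the first, partially
 * filled destination word is stored piecewise, then whole words are
 * built from adjacent source words with complementary (mod-64) shifts.
 */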

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/* (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/* (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST may be unaligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	forward_copy1k
forward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)	/* tmp_d = len & 1023 & ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - tmp_d/2) */
	br	TMP_X
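/*
 * Same computed-jump trick as the backward path: each ldp/stp pair is
 * 8 bytes of code copying 16 bytes of data, so the branch above enters
 * the unrolled block with (tmp_d / 16) pairs remaining.
 */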
forward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	forward_copy1k
	cmp	LEN, #16
	bhs	forward_less1k

	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)