/*	$NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $	*/

/*
 * Copyright (c) 2018 Ryo Shimizu <ryo@nerv.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $")
#endif

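/*
 * This single source builds memcpy(), memmove(), or bcopy() depending on
 * whether MEMCOPY or MEMMOVE is defined; the blocks below select the
 * function name, the argument registers, and whether the overlap-handling
 * (backward copy) paths are compiled in.
 */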
#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32
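/*
 * Copies shorter than SMALLSIZE bytes skip the pre-alignment and bulk
 * ldp/stp loops entirely and are handled by the small-copy tails below.
 */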

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

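/*
 * Bulk backward copy: the main loop below moves 1024 bytes per iteration
 * with fully unrolled ldp/stp pairs, then the remaining length is handled
 * by testing each power-of-two bit (512 down to 1) and running a
 * correspondingly sized unrolled block.
 */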
backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src; */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

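/*
 * strict_backward handles src/dst that disagree in alignment modulo 8 on
 * machines where unaligned access is not allowed: every destination word
 * is spliced together from two adjacent aligned source words.  A rough
 * little-endian C sketch of one splice (illustrative only; the names are
 * hypothetical, and 0 < sd < 64 is assumed -- the assembly instead relies
 * on AArch64 variable shifts taking the shift amount modulo 64, which
 * covers both signs of the alignment difference):
 *
 *	uint64_t lo = src_aligned[i];		// lower-addressed word
 *	uint64_t hi = src_aligned[i + 1];	// next aligned word
 *	int sd = SRC_DST_ALIGNBIT;		// (src & 7)*8 - (dst & 7)*8
 *	dst_aligned[i] = (lo >> sd) | (hi << (64 - sd));
 */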
strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =			*/
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1<<dst_src_alignbit)|	*/
	orr	DATA1, DATA1, TMP_X		/*  (data0>>src_dst_alignbit);	*/

	b	9f				/* }				*/
5:						/* else {			*/
	ldr	DATA0, [SRC]			/* data0 = *src;		*/
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1 = data0>>src_dst_abit;	*/
9:						/* }				*/

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) {	*/
	mov	TMP_D, DST			/* tmp_d = dst;			*/

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) {	*/
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32;		*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) {	*/
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16;		*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) {	*/
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1;	*/
1:						/* }				*/

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3);	*/
9:						/* }				*/
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =			*/
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1>>dst_src_alignbit)|	*/
	orr	DATA1, DATA1, TMP_X		/*  (data0<<src_dst_alignbit);	*/

	b	9f				/* }				*/
5:						/* else {			*/
	ldr	DATA0, [SRC]			/* data0 = *src;		*/
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1 = data0>>dst_src_abit;	*/
9:						/* }				*/

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) {	*/
	mov	TMP_D, DST			/* tmp_d = dst;			*/

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) {	*/
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32;		*/
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x;	*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) {	*/
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16;		*/
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x;	*/
1:						/* }				*/
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) {	*/
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8;		*/
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x;	*/
1:						/* }				*/

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3);	*/
9:						/* }				*/
#endif /* BYTE_ORDER */


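/*
 * Steady state of the backward shifting copy: DATA0 carries the word
 * loaded in the previous iteration; each pass loads 16 more source bytes
 * and splices them into two aligned 8-byte stores.
 */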
backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


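/*
 * Function entry.  In the overlap-handling builds (memmove/bcopy),
 * zero-length and src == dst calls return immediately, and when the
 * destination lies above the source the copy runs backward so that
 * overlapping bytes are read before they are overwritten; otherwise a
 * forward copy is used.
 */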
	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++; */
	cmp	SRC0, TMP_S		/* } while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =			*/
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) |	*/
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit);	*/
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =			*/
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) |	*/
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit);	*/
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

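/*
 * Steady state of the forward shifting copy; this mirrors
 * backward_shifting_copy_loop above, with DATA0 carrying the word left
 * over from the previous iteration.
 */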
shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST is not aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

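/*
 * Bulk forward copy: same structure as the backward variant above --
 * a fully unrolled 1024-byte main loop followed by power-of-two sized
 * tail blocks selected by the bits of the remaining length.
 */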
forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)