memcpy_neon.S revision 1.1.8.2 1 1.1.8.2 tls /*-
2 1.1.8.2 tls * Copyright (c) 2013 The NetBSD Foundation, Inc.
3 1.1.8.2 tls * All rights reserved.
4 1.1.8.2 tls *
5 1.1.8.2 tls * This code is derived from software contributed to The NetBSD Foundation
6 1.1.8.2 tls * by Matt Thomas of 3am Software Foundry.
7 1.1.8.2 tls *
8 1.1.8.2 tls * Redistribution and use in source and binary forms, with or without
9 1.1.8.2 tls * modification, are permitted provided that the following conditions
10 1.1.8.2 tls * are met:
11 1.1.8.2 tls * 1. Redistributions of source code must retain the above copyright
12 1.1.8.2 tls * notice, this list of conditions and the following disclaimer.
13 1.1.8.2 tls * 2. Redistributions in binary form must reproduce the above copyright
14 1.1.8.2 tls * notice, this list of conditions and the following disclaimer in the
15 1.1.8.2 tls * documentation and/or other materials provided with the distribution.
16 1.1.8.2 tls *
17 1.1.8.2 tls * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 1.1.8.2 tls * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 1.1.8.2 tls * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 1.1.8.2 tls * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 1.1.8.2 tls * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 1.1.8.2 tls * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 1.1.8.2 tls * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 1.1.8.2 tls * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 1.1.8.2 tls * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 1.1.8.2 tls * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 1.1.8.2 tls * POSSIBILITY OF SUCH DAMAGE.
28 1.1.8.2 tls */
29 1.1.8.2 tls
30 1.1.8.2 tls #include <machine/asm.h>
31 1.1.8.2 tls
32 1.1.8.2 tls RCSID("$NetBSD: memcpy_neon.S,v 1.1.8.2 2013/02/25 00:23:56 tls Exp $")
33 1.1.8.2 tls
34 1.1.8.2 tls .text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * Forward copy using NEON; overlapping buffers are not handled.
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = dst (r0 is never written)
 * Uses:	r3 (dst cursor), ip, flags, d0-d7, d16-d23.
 *	r4-r5 are used as scratch but saved/restored on the stack.
 *	d8-d15 are callee-saved under the AAPCS and are deliberately
 *	NOT used.
 *
 * Strategy:
 *  1. Copy single bytes until dst is 8-byte (dword) aligned.
 *  2. If src is then dword aligned too ("congruent"), copy dwords until
 *     dst is 64-byte aligned, move 128/64-byte bursts, and finish dword
 *     by dword.
 *  3. Otherwise ("incongruent"), only ever read aligned dwords from src
 *     and use vtbl with the index vector (0..7) + misalignment to pick
 *     the 8 wanted bytes out of two adjacent dwords.  d1 always holds
 *     the previously read dword, of which the last r5 bytes have not
 *     been written yet.
 *  4. .Lfinish writes a final partial dword (1..7 bytes) from d0.
 */
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/* if not, does src == dst? */
	RETc(eq)			/* yes (to either), nothing to do */

	mov	r3, r0			/* r3 = dst cursor, keep r0 unchanged */
#if 0
	cmp	r2, #16			/* copy less than 16 bytes? */
	bge	.Ldst_aligner		/* nope, do it the long way */

1:	ldrb	ip, [r1], #1		/* load a byte from src */
	subs	r2, r2, #1		/* any more to transfer? */
	strb	ip, [r3], #1		/* save it to dst */
	bne	1b			/* yes, do next byte */
	RET				/* return */
#endif

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/* yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy byte by byte
	 * until it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/* no, try next byte */
	RET				/* yes, we're done! */

.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/* aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* r5 = unwritten bytes per src dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */

	vld1.64	{d1}, [r1:64]!		/* load a dword from src */

	/*
	 * Invariant here: d1 holds the last src dword read and its final
	 * r5 bytes are the next bytes to copy.  Never read another dword
	 * when those r5 bytes already cover the remaining length: that
	 * dword would lie entirely beyond the end of src.
	 */
.Lincongruent_check:
	cmp	r2, r5			/* do we already have enough? */
	bgt	.Lincongruent		/* no, so read more */

.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blt	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]		/* yes, write final full dword */
	b	.Ldone			/* and we're done! */

.Lincongruent:
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lincongruent_finish	/* no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/* yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent_check	/* no, see if another load is needed */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blt	.Lincongruent_dword	/* no, handle dword by dword */
	vld1.64	{d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blt	.Lincongruent_4dword	/* no, handle it */

	/*
	 * 64 bytes per iteration.  The second group of 4 dwords lives in
	 * d16-d20 (caller-saved) so that d2-d5 can be reloaded early.
	 */
1:	vld1.64	{d17-d20}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d16, d5			/* move leftover out of the load's way */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blt	2f			/* no more data, skip the load */
	vld1.64	{d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d16, {d16-d17}, d0	/* reorder */
	vtbl.8	d17, {d17-d18}, d0	/* reorder */
	vtbl.8	d18, {d18-d19}, d0	/* reorder */
	vtbl.8	d19, {d19-d20}, d0	/* reorder */
	vst1.64	{d16-d19}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we just wrote 8 dwords */
	beq	.Ldone			/* if 0, we're done! */
	vmov	d1, d20			/* last dword read is the new leftover */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bge	1b			/* yes, do it again */

	/*
	 * We have leftovers in d1 and, if at least 32 bytes remain, new
	 * untranslated data in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32
	blt	.Lincongruent_dword

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32
	beq	.Ldone

.Lincongruent_dword:
#if 0
	cmp	r2, r5			/* enough in leftovers? */
	ble	.Lincongruent_finish	/* yes, finish it. */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lincongruent_finish	/* no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/* yes, we're done! */
	b	.Lincongruent_dword	/* and go get it */
#else
	/*
	 * Fewer than 32 bytes remain.  Jump into an unrolled run of four
	 * 4-instruction (16-byte) "copy one dword" groups so that exactly
	 * r2 / 8 of them execute.  ip = 32 - (r2 & ~7) is the number of
	 * data bytes to skip; each data byte corresponds to 2 bytes of
	 * code, hence "lsl #1".  This relies on ARM (A32) encoding: every
	 * instruction is 4 bytes and pc reads as the address of the add
	 * plus 8, i.e. the first group after the nop.  Do not insert or
	 * remove instructions between the add and the final beq.
	 */
	cmp	r2, r5			/* are the bytes we have enough? */
	ble	.Lincongruent_finish	/* yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 (flags used by beq below) */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* count mod 8 == 0, we're done! */
	cmp	r2, r5			/* is the rest already in d1? */
	ble	.Lincongruent_finish	/* yes, don't read beyond src */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */
#endif

.Lcongruent_main:
	vld1.32	{d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/* if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/* no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blt	.Lcongruent_loop	/* no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blt	3f			/* no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte blocks per iteration,
	 * interleaving stores and loads.  The second block uses d16-d23
	 * because d8-d15 would have to be preserved for the caller.
	 */
1:	vldm	r1!, {d16-d23}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blt	2f			/* no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/* yes, load next 8 dwords */
2:	vstm	r3!, {d16-d23}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords;
					 * must set flags for the beq */
	beq	.Ldone			/* if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bge	1b			/* yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blt	.Lcongruent_loop	/* no, proceed to do it dword by dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/* if 0, we're done! */

.Lcongruent_loop:
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blt	.Lfinish		/* no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32	{d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/* if 0, we're done! */
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bge	.Lcongruent_loop_start	/* yes, so do it */

	/*
	 * Write the first r2 (1..7) bytes of d0 to dst as an optional
	 * word, halfword and byte, selected by bits 2, 1 and 0 of r2.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/* yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/* yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/* yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/* yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/* yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Identity byte indexes 0..7 for vtbl; the src misalignment is
	 * added to every lane at run time.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
278