memcpy.S revision 1.2.6.2 1 1.2.6.2 tls /* $NetBSD: memcpy.S,v 1.2.6.2 2013/06/23 06:26:13 tls Exp $ */
2 1.2.6.2 tls
3 1.2.6.2 tls /*
4 1.2.6.2 tls * Copyright (c) 1996-2002 Eduardo Horvath
5 1.2.6.2 tls * All rights reserved.
6 1.2.6.2 tls *
7 1.2.6.2 tls * Redistribution and use in source and binary forms, with or without
8 1.2.6.2 tls * modification, are permitted provided that the following conditions
9 1.2.6.2 tls * are met:
10 1.2.6.2 tls * 1. Redistributions of source code must retain the above copyright
11 1.2.6.2 tls * notice, this list of conditions and the following disclaimer.
12 1.2.6.2 tls *
13 1.2.6.2 tls * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
14 1.2.6.2 tls * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 1.2.6.2 tls * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 1.2.6.2 tls * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
17 1.2.6.2 tls * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 1.2.6.2 tls * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 1.2.6.2 tls * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 1.2.6.2 tls * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 1.2.6.2 tls * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 1.2.6.2 tls * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 1.2.6.2 tls * SUCH DAMAGE.
24 1.2.6.2 tls *
25 1.2.6.2 tls */
26 1.2.6.2 tls #include "strmacros.h"
27 1.2.6.2 tls #if defined(LIBC_SCCS) && !defined(lint)
28 1.2.6.2 tls RCSID("$NetBSD: memcpy.S,v 1.2.6.2 2013/06/23 06:26:13 tls Exp $")
29 1.2.6.2 tls #endif /* LIBC_SCCS and not lint */
30 1.2.6.2 tls
31 1.2.6.2 tls /*
32 1.2.6.2 tls * memcpy
33 1.2.6.2 tls * Assumes regions do not overlap;
34 1.2.6.2 tls *
35 1.2.6.2 tls * Must not use %g7 (see copyin/copyout above).
36 1.2.6.2 tls */
37 1.2.6.2 tls ENTRY(memcpy) /* dest, src, size */
38 1.2.6.2 tls /*
39 1.2.6.2 tls * Swap args for bcopy. Gcc generates calls to memcpy for
40 1.2.6.2 tls * structure assignments.
41 1.2.6.2 tls */
42 1.2.6.2 tls mov %o0, %o3
43 1.2.6.2 tls mov %o1, %o0
44 1.2.6.2 tls mov %o3, %o1
45 1.2.6.2 tls #if !defined(_KERNEL) || defined(_RUMPKERNEL)
46 1.2.6.2 tls ENTRY(bcopy) /* src, dest, size */
47 1.2.6.2 tls #endif
48 1.2.6.2 tls #ifdef DEBUG
49 1.2.6.2 tls #if defined(_KERNEL) && !defined(_RUMPKERNEL)
50 1.2.6.2 tls set pmapdebug, %o4
51 1.2.6.2 tls ld [%o4], %o4
52 1.2.6.2 tls btst 0x80, %o4 ! PDB_COPY
53 1.2.6.2 tls bz,pt %icc, 3f
54 1.2.6.2 tls nop
55 1.2.6.2 tls #endif
56 1.2.6.2 tls save %sp, -CC64FSZ, %sp
57 1.2.6.2 tls mov %i0, %o1
58 1.2.6.2 tls set 2f, %o0
59 1.2.6.2 tls mov %i1, %o2
60 1.2.6.2 tls call printf
61 1.2.6.2 tls mov %i2, %o3
62 1.2.6.2 tls ! ta 1; nop
63 1.2.6.2 tls restore
64 1.2.6.2 tls .data
65 1.2.6.2 tls 2: .asciz "memcpy(%p<-%p,%x)\n"
66 1.2.6.2 tls _ALIGN
67 1.2.6.2 tls .text
68 1.2.6.2 tls 3:
69 1.2.6.2 tls #endif
70 1.2.6.2 tls
71 1.2.6.2 tls cmp %o2, BCOPY_SMALL
72 1.2.6.2 tls
73 1.2.6.2 tls Lmemcpy_start:
74 1.2.6.2 tls bge,pt CCCR, 2f ! if >= this many, go be fancy.
75 1.2.6.2 tls cmp %o2, 256
76 1.2.6.2 tls
77 1.2.6.2 tls mov %o1, %o5 ! Save memcpy return value
78 1.2.6.2 tls /*
79 1.2.6.2 tls * Not much to copy, just do it a byte at a time.
80 1.2.6.2 tls */
81 1.2.6.2 tls deccc %o2 ! while (--len >= 0)
82 1.2.6.2 tls bl 1f
83 1.2.6.2 tls .empty
84 1.2.6.2 tls 0:
85 1.2.6.2 tls inc %o0
86 1.2.6.2 tls ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++;
87 1.2.6.2 tls stb %o4, [%o1]
88 1.2.6.2 tls deccc %o2
89 1.2.6.2 tls bge 0b
90 1.2.6.2 tls inc %o1
91 1.2.6.2 tls 1:
92 1.2.6.2 tls retl
93 1.2.6.2 tls mov %o5, %o0
94 1.2.6.2 tls NOTREACHED
95 1.2.6.2 tls
96 1.2.6.2 tls /*
97 1.2.6.2 tls * Plenty of data to copy, so try to do it optimally.
98 1.2.6.2 tls */
99 1.2.6.2 tls 2:
100 1.2.6.2 tls #ifdef USE_BLOCK_STORE_LOAD
101 1.2.6.2 tls ! If it is big enough, use VIS instructions
102 1.2.6.2 tls bge Lmemcpy_block
103 1.2.6.2 tls nop
104 1.2.6.2 tls #endif /* USE_BLOCK_STORE_LOAD */
105 1.2.6.2 tls Lmemcpy_fancy:
106 1.2.6.2 tls
107 1.2.6.2 tls !!
108 1.2.6.2 tls !! First align the output to an 8-byte entity
109 1.2.6.2 tls !!
110 1.2.6.2 tls
111 1.2.6.2 tls save %sp, -CC64FSZ, %sp
112 1.2.6.2 tls
113 1.2.6.2 tls mov %i0, %l0
114 1.2.6.2 tls mov %i1, %l1
115 1.2.6.2 tls
116 1.2.6.2 tls mov %i2, %l2
117 1.2.6.2 tls btst 1, %l1
118 1.2.6.2 tls
119 1.2.6.2 tls bz,pt %icc, 4f
120 1.2.6.2 tls btst 2, %l1
121 1.2.6.2 tls ldub [%l0], %l4 ! Load 1st byte
122 1.2.6.2 tls
123 1.2.6.2 tls deccc 1, %l2
124 1.2.6.2 tls ble,pn CCCR, Lmemcpy_finish ! XXXX
125 1.2.6.2 tls inc 1, %l0
126 1.2.6.2 tls
127 1.2.6.2 tls stb %l4, [%l1] ! Store 1st byte
128 1.2.6.2 tls inc 1, %l1 ! Update address
129 1.2.6.2 tls btst 2, %l1
130 1.2.6.2 tls 4:
131 1.2.6.2 tls bz,pt %icc, 4f
132 1.2.6.2 tls
133 1.2.6.2 tls btst 1, %l0
134 1.2.6.2 tls bz,a 1f
135 1.2.6.2 tls lduh [%l0], %l4 ! Load short
136 1.2.6.2 tls
137 1.2.6.2 tls ldub [%l0], %l4 ! Load bytes
138 1.2.6.2 tls
139 1.2.6.2 tls ldub [%l0+1], %l3
140 1.2.6.2 tls sllx %l4, 8, %l4
141 1.2.6.2 tls or %l3, %l4, %l4
142 1.2.6.2 tls
143 1.2.6.2 tls 1:
144 1.2.6.2 tls deccc 2, %l2
145 1.2.6.2 tls ble,pn CCCR, Lmemcpy_finish ! XXXX
146 1.2.6.2 tls inc 2, %l0
147 1.2.6.2 tls sth %l4, [%l1] ! Store 1st short
148 1.2.6.2 tls
149 1.2.6.2 tls inc 2, %l1
150 1.2.6.2 tls 4:
151 1.2.6.2 tls btst 4, %l1
152 1.2.6.2 tls bz,pt CCCR, 4f
153 1.2.6.2 tls
154 1.2.6.2 tls btst 3, %l0
155 1.2.6.2 tls bz,a,pt CCCR, 1f
156 1.2.6.2 tls lduw [%l0], %l4 ! Load word -1
157 1.2.6.2 tls
158 1.2.6.2 tls btst 1, %l0
159 1.2.6.2 tls bz,a,pt %icc, 2f
160 1.2.6.2 tls lduh [%l0], %l4
161 1.2.6.2 tls
162 1.2.6.2 tls ldub [%l0], %l4
163 1.2.6.2 tls
164 1.2.6.2 tls lduh [%l0+1], %l3
165 1.2.6.2 tls sllx %l4, 16, %l4
166 1.2.6.2 tls or %l4, %l3, %l4
167 1.2.6.2 tls
168 1.2.6.2 tls ldub [%l0+3], %l3
169 1.2.6.2 tls sllx %l4, 8, %l4
170 1.2.6.2 tls ba,pt %icc, 1f
171 1.2.6.2 tls or %l4, %l3, %l4
172 1.2.6.2 tls
173 1.2.6.2 tls 2:
174 1.2.6.2 tls lduh [%l0+2], %l3
175 1.2.6.2 tls sllx %l4, 16, %l4
176 1.2.6.2 tls or %l4, %l3, %l4
177 1.2.6.2 tls
178 1.2.6.2 tls 1:
179 1.2.6.2 tls deccc 4, %l2
180 1.2.6.2 tls ble,pn CCCR, Lmemcpy_finish ! XXXX
181 1.2.6.2 tls inc 4, %l0
182 1.2.6.2 tls
183 1.2.6.2 tls st %l4, [%l1] ! Store word
184 1.2.6.2 tls inc 4, %l1
185 1.2.6.2 tls 4:
186 1.2.6.2 tls !!
187 1.2.6.2 tls !! We are now 64-bit aligned in the dest.
188 1.2.6.2 tls !!
189 1.2.6.2 tls Lmemcpy_common:
190 1.2.6.2 tls
191 1.2.6.2 tls and %l0, 7, %l4 ! Shift amount
192 1.2.6.2 tls andn %l0, 7, %l0 ! Source addr
193 1.2.6.2 tls
194 1.2.6.2 tls brz,pt %l4, Lmemcpy_noshift8 ! No shift version...
195 1.2.6.2 tls
196 1.2.6.2 tls sllx %l4, 3, %l4 ! In bits
197 1.2.6.2 tls mov 8<<3, %l3
198 1.2.6.2 tls
199 1.2.6.2 tls ldx [%l0], %o0 ! Load word -1
200 1.2.6.2 tls sub %l3, %l4, %l3 ! Reverse shift
201 1.2.6.2 tls deccc 12*8, %l2 ! Have enough room?
202 1.2.6.2 tls
203 1.2.6.2 tls sllx %o0, %l4, %o0
204 1.2.6.2 tls bl,pn CCCR, 2f
205 1.2.6.2 tls and %l3, 0x38, %l3
206 1.2.6.2 tls Lmemcpy_unrolled8:
207 1.2.6.2 tls
208 1.2.6.2 tls /*
209 1.2.6.2 tls * This is about as close to optimal as you can get, since
210 1.2.6.2 tls * the shifts require EU0 and cannot be paired, and you have
211 1.2.6.2 tls * 3 dependent operations on the data.
212 1.2.6.2 tls */
213 1.2.6.2 tls
214 1.2.6.2 tls ! ldx [%l0+0*8], %o0 ! Already done
215 1.2.6.2 tls ! sllx %o0, %l4, %o0 ! Already done
216 1.2.6.2 tls ldx [%l0+1*8], %o1
217 1.2.6.2 tls ldx [%l0+2*8], %o2
218 1.2.6.2 tls ldx [%l0+3*8], %o3
219 1.2.6.2 tls ldx [%l0+4*8], %o4
220 1.2.6.2 tls ba,pt %icc, 1f
221 1.2.6.2 tls ldx [%l0+5*8], %o5
222 1.2.6.2 tls .align 8
223 1.2.6.2 tls 1:
224 1.2.6.2 tls srlx %o1, %l3, %g1
225 1.2.6.2 tls inc 6*8, %l0
226 1.2.6.2 tls
227 1.2.6.2 tls sllx %o1, %l4, %o1
228 1.2.6.2 tls or %g1, %o0, %g6
229 1.2.6.2 tls ldx [%l0+0*8], %o0
230 1.2.6.2 tls
231 1.2.6.2 tls stx %g6, [%l1+0*8]
232 1.2.6.2 tls srlx %o2, %l3, %g1
233 1.2.6.2 tls
234 1.2.6.2 tls sllx %o2, %l4, %o2
235 1.2.6.2 tls or %g1, %o1, %g6
236 1.2.6.2 tls ldx [%l0+1*8], %o1
237 1.2.6.2 tls
238 1.2.6.2 tls stx %g6, [%l1+1*8]
239 1.2.6.2 tls srlx %o3, %l3, %g1
240 1.2.6.2 tls
241 1.2.6.2 tls sllx %o3, %l4, %o3
242 1.2.6.2 tls or %g1, %o2, %g6
243 1.2.6.2 tls ldx [%l0+2*8], %o2
244 1.2.6.2 tls
245 1.2.6.2 tls stx %g6, [%l1+2*8]
246 1.2.6.2 tls srlx %o4, %l3, %g1
247 1.2.6.2 tls
248 1.2.6.2 tls sllx %o4, %l4, %o4
249 1.2.6.2 tls or %g1, %o3, %g6
250 1.2.6.2 tls ldx [%l0+3*8], %o3
251 1.2.6.2 tls
252 1.2.6.2 tls stx %g6, [%l1+3*8]
253 1.2.6.2 tls srlx %o5, %l3, %g1
254 1.2.6.2 tls
255 1.2.6.2 tls sllx %o5, %l4, %o5
256 1.2.6.2 tls or %g1, %o4, %g6
257 1.2.6.2 tls ldx [%l0+4*8], %o4
258 1.2.6.2 tls
259 1.2.6.2 tls stx %g6, [%l1+4*8]
260 1.2.6.2 tls srlx %o0, %l3, %g1
261 1.2.6.2 tls deccc 6*8, %l2 ! Have enough room?
262 1.2.6.2 tls
263 1.2.6.2 tls sllx %o0, %l4, %o0 ! Next loop
264 1.2.6.2 tls or %g1, %o5, %g6
265 1.2.6.2 tls ldx [%l0+5*8], %o5
266 1.2.6.2 tls
267 1.2.6.2 tls stx %g6, [%l1+5*8]
268 1.2.6.2 tls bge,pt CCCR, 1b
269 1.2.6.2 tls inc 6*8, %l1
270 1.2.6.2 tls
271 1.2.6.2 tls Lmemcpy_unrolled8_cleanup:
272 1.2.6.2 tls !!
273 1.2.6.2 tls !! Finished 8 byte block, unload the regs.
274 1.2.6.2 tls !!
275 1.2.6.2 tls srlx %o1, %l3, %g1
276 1.2.6.2 tls inc 5*8, %l0
277 1.2.6.2 tls
278 1.2.6.2 tls sllx %o1, %l4, %o1
279 1.2.6.2 tls or %g1, %o0, %g6
280 1.2.6.2 tls
281 1.2.6.2 tls stx %g6, [%l1+0*8]
282 1.2.6.2 tls srlx %o2, %l3, %g1
283 1.2.6.2 tls
284 1.2.6.2 tls sllx %o2, %l4, %o2
285 1.2.6.2 tls or %g1, %o1, %g6
286 1.2.6.2 tls
287 1.2.6.2 tls stx %g6, [%l1+1*8]
288 1.2.6.2 tls srlx %o3, %l3, %g1
289 1.2.6.2 tls
290 1.2.6.2 tls sllx %o3, %l4, %o3
291 1.2.6.2 tls or %g1, %o2, %g6
292 1.2.6.2 tls
293 1.2.6.2 tls stx %g6, [%l1+2*8]
294 1.2.6.2 tls srlx %o4, %l3, %g1
295 1.2.6.2 tls
296 1.2.6.2 tls sllx %o4, %l4, %o4
297 1.2.6.2 tls or %g1, %o3, %g6
298 1.2.6.2 tls
299 1.2.6.2 tls stx %g6, [%l1+3*8]
300 1.2.6.2 tls srlx %o5, %l3, %g1
301 1.2.6.2 tls
302 1.2.6.2 tls sllx %o5, %l4, %o5
303 1.2.6.2 tls or %g1, %o4, %g6
304 1.2.6.2 tls
305 1.2.6.2 tls stx %g6, [%l1+4*8]
306 1.2.6.2 tls inc 5*8, %l1
307 1.2.6.2 tls
308 1.2.6.2 tls mov %o5, %o0 ! Save our unused data
309 1.2.6.2 tls dec 5*8, %l2
310 1.2.6.2 tls 2:
311 1.2.6.2 tls inccc 12*8, %l2
312 1.2.6.2 tls bz,pn %icc, Lmemcpy_complete
313 1.2.6.2 tls
314 1.2.6.2 tls !! Not unrolled: copy one 8-byte word per iteration
315 1.2.6.2 tls Lmemcpy_aligned8:
316 1.2.6.2 tls ! ldx [%l0], %o0 ! Already done
317 1.2.6.2 tls ! sllx %o0, %l4, %o0 ! Shift high word
318 1.2.6.2 tls
319 1.2.6.2 tls deccc 8, %l2 ! Pre-decrement
320 1.2.6.2 tls bl,pn CCCR, Lmemcpy_finish
321 1.2.6.2 tls 1:
322 1.2.6.2 tls ldx [%l0+8], %o1 ! Load word 0
323 1.2.6.2 tls inc 8, %l0
324 1.2.6.2 tls
325 1.2.6.2 tls srlx %o1, %l3, %g6
326 1.2.6.2 tls or %g6, %o0, %g6 ! Combine
327 1.2.6.2 tls
328 1.2.6.2 tls stx %g6, [%l1] ! Store result
329 1.2.6.2 tls inc 8, %l1
330 1.2.6.2 tls
331 1.2.6.2 tls deccc 8, %l2
332 1.2.6.2 tls bge,pn CCCR, 1b
333 1.2.6.2 tls sllx %o1, %l4, %o0
334 1.2.6.2 tls
335 1.2.6.2 tls btst 7, %l2 ! Done?
336 1.2.6.2 tls bz,pt CCCR, Lmemcpy_complete
337 1.2.6.2 tls
338 1.2.6.2 tls !!
339 1.2.6.2 tls !! Load up the last dregs into %o0 and shift them into place
340 1.2.6.2 tls !!
341 1.2.6.2 tls srlx %l3, 3, %g6 ! # bytes in %o0
342 1.2.6.2 tls dec 8, %g6 ! - 8
343 1.2.6.2 tls !! n-8 - (by - 8) -> n - by
344 1.2.6.2 tls subcc %l2, %g6, %g0 ! # bytes we need
345 1.2.6.2 tls ble,pt %icc, Lmemcpy_finish
346 1.2.6.2 tls nop
347 1.2.6.2 tls ldx [%l0+8], %o1 ! Need another word
348 1.2.6.2 tls srlx %o1, %l3, %o1
349 1.2.6.2 tls ba,pt %icc, Lmemcpy_finish
350 1.2.6.2 tls or %o0, %o1, %o0 ! All loaded up.
351 1.2.6.2 tls
352 1.2.6.2 tls Lmemcpy_noshift8:
353 1.2.6.2 tls deccc 6*8, %l2 ! Have enough room?
354 1.2.6.2 tls bl,pn CCCR, 2f
355 1.2.6.2 tls nop
356 1.2.6.2 tls ba,pt %icc, 1f
357 1.2.6.2 tls nop
358 1.2.6.2 tls .align 32
359 1.2.6.2 tls 1:
360 1.2.6.2 tls ldx [%l0+0*8], %o0
361 1.2.6.2 tls ldx [%l0+1*8], %o1
362 1.2.6.2 tls ldx [%l0+2*8], %o2
363 1.2.6.2 tls stx %o0, [%l1+0*8]
364 1.2.6.2 tls stx %o1, [%l1+1*8]
365 1.2.6.2 tls stx %o2, [%l1+2*8]
366 1.2.6.2 tls
367 1.2.6.2 tls
368 1.2.6.2 tls ldx [%l0+3*8], %o3
369 1.2.6.2 tls ldx [%l0+4*8], %o4
370 1.2.6.2 tls ldx [%l0+5*8], %o5
371 1.2.6.2 tls inc 6*8, %l0
372 1.2.6.2 tls stx %o3, [%l1+3*8]
373 1.2.6.2 tls deccc 6*8, %l2
374 1.2.6.2 tls stx %o4, [%l1+4*8]
375 1.2.6.2 tls stx %o5, [%l1+5*8]
376 1.2.6.2 tls bge,pt CCCR, 1b
377 1.2.6.2 tls inc 6*8, %l1
378 1.2.6.2 tls 2:
379 1.2.6.2 tls inc 6*8, %l2
380 1.2.6.2 tls 1:
381 1.2.6.2 tls deccc 8, %l2
382 1.2.6.2 tls bl,pn %icc, 1f ! < 0 --> sub word
383 1.2.6.2 tls nop
384 1.2.6.2 tls ldx [%l0], %g6
385 1.2.6.2 tls inc 8, %l0
386 1.2.6.2 tls stx %g6, [%l1]
387 1.2.6.2 tls bg,pt %icc, 1b ! Exactly 0 --> done
388 1.2.6.2 tls inc 8, %l1
389 1.2.6.2 tls 1:
390 1.2.6.2 tls btst 7, %l2 ! Done?
391 1.2.6.2 tls bz,pt CCCR, Lmemcpy_complete
392 1.2.6.2 tls clr %l4
393 1.2.6.2 tls ldx [%l0], %o0
394 1.2.6.2 tls Lmemcpy_finish:
395 1.2.6.2 tls
396 1.2.6.2 tls brz,pn %l2, 2f ! 100% complete?
397 1.2.6.2 tls cmp %l2, 8 ! Exactly 8 bytes?
398 1.2.6.2 tls bz,a,pn CCCR, 2f
399 1.2.6.2 tls stx %o0, [%l1]
400 1.2.6.2 tls
401 1.2.6.2 tls btst 4, %l2 ! Word store?
402 1.2.6.2 tls bz CCCR, 1f
403 1.2.6.2 tls srlx %o0, 32, %g6 ! Shift high word down
404 1.2.6.2 tls stw %g6, [%l1]
405 1.2.6.2 tls inc 4, %l1
406 1.2.6.2 tls mov %o0, %g6 ! Operate on the low bits
407 1.2.6.2 tls 1:
408 1.2.6.2 tls btst 2, %l2
409 1.2.6.2 tls mov %g6, %o0
410 1.2.6.2 tls bz 1f
411 1.2.6.2 tls srlx %o0, 16, %g6
412 1.2.6.2 tls
413 1.2.6.2 tls sth %g6, [%l1] ! Store short
414 1.2.6.2 tls inc 2, %l1
415 1.2.6.2 tls mov %o0, %g6 ! Operate on low bytes
416 1.2.6.2 tls 1:
417 1.2.6.2 tls mov %g6, %o0
418 1.2.6.2 tls btst 1, %l2 ! Byte aligned?
419 1.2.6.2 tls bz 2f
420 1.2.6.2 tls srlx %o0, 8, %g6
421 1.2.6.2 tls
422 1.2.6.2 tls stb %g6, [%l1] ! Store last byte
423 1.2.6.2 tls inc 1, %l1 ! Update address
424 1.2.6.2 tls 2:
425 1.2.6.2 tls Lmemcpy_complete:
426 1.2.6.2 tls #if 0
427 1.2.6.2 tls !!
428 1.2.6.2 tls !! verify copy success.
429 1.2.6.2 tls !!
430 1.2.6.2 tls
431 1.2.6.2 tls mov %i0, %o2
432 1.2.6.2 tls mov %i1, %o4
433 1.2.6.2 tls mov %i2, %l4
434 1.2.6.2 tls 0:
435 1.2.6.2 tls ldub [%o2], %o1
436 1.2.6.2 tls inc %o2
437 1.2.6.2 tls ldub [%o4], %o3
438 1.2.6.2 tls inc %o4
439 1.2.6.2 tls cmp %o3, %o1
440 1.2.6.2 tls bnz 1f
441 1.2.6.2 tls dec %l4
442 1.2.6.2 tls brnz %l4, 0b
443 1.2.6.2 tls nop
444 1.2.6.2 tls ba 2f
445 1.2.6.2 tls nop
446 1.2.6.2 tls
447 1.2.6.2 tls 1:
448 1.2.6.2 tls set 0f, %o0
449 1.2.6.2 tls call printf
450 1.2.6.2 tls sub %i2, %l4, %o5
451 1.2.6.2 tls set 1f, %o0
452 1.2.6.2 tls mov %i0, %o2
453 1.2.6.2 tls mov %i1, %o1
454 1.2.6.2 tls call printf
455 1.2.6.2 tls mov %i2, %o3
456 1.2.6.2 tls ta 1
457 1.2.6.2 tls .data
458 1.2.6.2 tls 0: .asciz "memcpy failed: %x@%p != %x@%p byte %d\n"
459 1.2.6.2 tls 1: .asciz "memcpy(%p, %p, %lx)\n"
460 1.2.6.2 tls .align 8
461 1.2.6.2 tls .text
462 1.2.6.2 tls 2:
463 1.2.6.2 tls #endif
464 1.2.6.2 tls ret
465 1.2.6.2 tls restore %i1, %g0, %o0
466 1.2.6.2 tls
467 1.2.6.2 tls #ifdef USE_BLOCK_STORE_LOAD
468 1.2.6.2 tls
469 1.2.6.2 tls /*
470 1.2.6.2 tls * Block copy. Intended for copies of 256 bytes or more.
471 1.2.6.2 tls *
472 1.2.6.2 tls * Benchmarking has shown this always seems to be slower than
473 1.2.6.2 tls * the integer version, so this is disabled. Maybe someone will
474 1.2.6.2 tls * figure out why sometime.
475 1.2.6.2 tls */
476 1.2.6.2 tls
477 1.2.6.2 tls Lmemcpy_block:
478 1.2.6.2 tls sethi %hi(block_disable), %o3
479 1.2.6.2 tls ldx [ %o3 + %lo(block_disable) ], %o3
480 1.2.6.2 tls brnz,pn %o3, Lmemcpy_fancy
481 1.2.6.2 tls !! Make sure our trap table is installed
482 1.2.6.2 tls set _C_LABEL(trapbase), %o5
483 1.2.6.2 tls rdpr %tba, %o3
484 1.2.6.2 tls sub %o3, %o5, %o3
485 1.2.6.2 tls brnz,pn %o3, Lmemcpy_fancy ! No, then don't use block load/store
486 1.2.6.2 tls nop
487 1.2.6.2 tls #if defined(_KERNEL) && !defined(_RUMPKERNEL)
488 1.2.6.2 tls /*
489 1.2.6.2 tls * Kernel:
490 1.2.6.2 tls *
491 1.2.6.2 tls * Here we use VIS instructions to do a block copy.
492 1.2.6.2 tls * But before we can do that we need to save and enable the FPU.
493 1.2.6.2 tls * The last owner of the FPU registers is fplwp, and
494 1.2.6.2 tls * fplwp->l_md.md_fpstate is the current fpstate. If that's not
495 1.2.6.2 tls * null, call savefpstate() with it to store our current fp state.
496 1.2.6.2 tls *
497 1.2.6.2 tls * Next, allocate an aligned fpstate on the stack. We will properly
498 1.2.6.2 tls * nest calls on a particular stack so this should not be a problem.
499 1.2.6.2 tls *
500 1.2.6.2 tls * Now we grab either curlwp (or if we're on the interrupt stack
501 1.2.6.2 tls * lwp0). We stash its existing fpstate in a local register and
502 1.2.6.2 tls * put our new fpstate in curlwp->l_md.md_fpstate. We point
503 1.2.6.2 tls * fplwp at curlwp (or lwp0) and enable the FPU.
504 1.2.6.2 tls *
505 1.2.6.2 tls * If we are ever preempted, our FPU state will be saved in our
506 1.2.6.2 tls * fpstate. Then, when we're resumed and we take an FPDISABLED
507 1.2.6.2 tls * trap, the trap handler will be able to fish our FPU state out
508 1.2.6.2 tls * of curlwp (or lwp0).
509 1.2.6.2 tls *
510 1.2.6.2 tls * On exiting this routine we undo the damage: restore the original
511 1.2.6.2 tls * pointer to curlwp->l_md.md_fpstate, clear our fplwp, and disable
512 1.2.6.2 tls * the FPU.
513 1.2.6.2 tls *
514 1.2.6.2 tls *
515 1.2.6.2 tls * Register usage, Kernel only (after save):
516 1.2.6.2 tls *
517 1.2.6.2 tls * %i0 src
518 1.2.6.2 tls * %i1 dest
519 1.2.6.2 tls * %i2 size
520 1.2.6.2 tls *
521 1.2.6.2 tls * %l0 XXXX DEBUG old fpstate
522 1.2.6.2 tls * %l1 fplwp (hi bits only)
523 1.2.6.2 tls * %l2 orig fplwp
524 1.2.6.2 tls * %l3 orig fpstate
525 1.2.6.2 tls * %l5 curlwp
526 1.2.6.2 tls * %l6 old fpstate
527 1.2.6.2 tls *
528 1.2.6.2 tls * Register usage, Kernel and user:
529 1.2.6.2 tls *
530 1.2.6.2 tls * %g1 dest (retval for memcpy)
531 1.2.6.2 tls *
532 1.2.6.2 tls * %o0 src
533 1.2.6.2 tls * %o1 dest
534 1.2.6.2 tls * %o2 remaining length
535 1.2.6.2 tls * %o5 last safe fetchable address
536 1.2.6.2 tls */
537 1.2.6.2 tls
538 1.2.6.2 tls ENABLE_FPU(0)
539 1.2.6.2 tls
540 1.2.6.2 tls mov %i0, %o0 ! Src addr.
541 1.2.6.2 tls mov %i1, %o1 ! Store our dest ptr here.
542 1.2.6.2 tls mov %i2, %o2 ! Len counter
543 1.2.6.2 tls #endif /* _KERNEL */
544 1.2.6.2 tls
545 1.2.6.2 tls !!
546 1.2.6.2 tls !! First align the output to a 64-bit entity
547 1.2.6.2 tls !!
548 1.2.6.2 tls
549 1.2.6.2 tls mov %o1, %g1 ! memcpy retval
550 1.2.6.2 tls add %o0, %o2, %o5 ! End of source block
551 1.2.6.2 tls
552 1.2.6.2 tls andn %o0, 7, %o3 ! Start of block
553 1.2.6.2 tls dec %o5
554 1.2.6.2 tls fzero %f0
555 1.2.6.2 tls
556 1.2.6.2 tls andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
557 1.2.6.2 tls ldd [%o3], %f2 ! Load 1st word
558 1.2.6.2 tls
559 1.2.6.2 tls dec 8, %o3 ! Move %o3 1 word back
560 1.2.6.2 tls btst 1, %o1
561 1.2.6.2 tls bz 4f
562 1.2.6.2 tls
563 1.2.6.2 tls mov -7, %o4 ! Lowest src addr possible
564 1.2.6.2 tls alignaddr %o0, %o4, %o4 ! Base addr for load.
565 1.2.6.2 tls
566 1.2.6.2 tls cmp %o3, %o4
567 1.2.6.2 tls be,pt CCCR, 1f ! Already loaded?
568 1.2.6.2 tls mov %o4, %o3
569 1.2.6.2 tls fmovd %f2, %f0 ! No. Shift
570 1.2.6.2 tls ldd [%o3+8], %f2 ! And load
571 1.2.6.2 tls 1:
572 1.2.6.2 tls
573 1.2.6.2 tls faligndata %f0, %f2, %f4 ! Isolate 1st byte
574 1.2.6.2 tls
575 1.2.6.2 tls stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
576 1.2.6.2 tls inc 1, %o1 ! Update address
577 1.2.6.2 tls inc 1, %o0
578 1.2.6.2 tls dec 1, %o2
579 1.2.6.2 tls 4:
580 1.2.6.2 tls btst 2, %o1
581 1.2.6.2 tls bz 4f
582 1.2.6.2 tls
583 1.2.6.2 tls mov -6, %o4 ! Calculate src - 6
584 1.2.6.2 tls alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
585 1.2.6.2 tls
586 1.2.6.2 tls cmp %o3, %o4 ! Addresses same?
587 1.2.6.2 tls be,pt CCCR, 1f
588 1.2.6.2 tls mov %o4, %o3
589 1.2.6.2 tls fmovd %f2, %f0 ! Shuffle data
590 1.2.6.2 tls ldd [%o3+8], %f2 ! Load word 0
591 1.2.6.2 tls 1:
592 1.2.6.2 tls faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
593 1.2.6.2 tls
594 1.2.6.2 tls stda %f4, [%o1] ASI_FL16_P ! Store 1st short
595 1.2.6.2 tls dec 2, %o2
596 1.2.6.2 tls inc 2, %o1
597 1.2.6.2 tls inc 2, %o0
598 1.2.6.2 tls 4:
599 1.2.6.2 tls brz,pn %o2, Lmemcpy_blockfinish ! XXXX
600 1.2.6.2 tls
601 1.2.6.2 tls btst 4, %o1
602 1.2.6.2 tls bz 4f
603 1.2.6.2 tls
604 1.2.6.2 tls mov -4, %o4
605 1.2.6.2 tls alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
606 1.2.6.2 tls
607 1.2.6.2 tls cmp %o3, %o4 ! Addresses same?
608 1.2.6.2 tls beq,pt CCCR, 1f
609 1.2.6.2 tls mov %o4, %o3
610 1.2.6.2 tls fmovd %f2, %f0 ! Shuffle data
611 1.2.6.2 tls ldd [%o3+8], %f2 ! Load word 0
612 1.2.6.2 tls 1:
613 1.2.6.2 tls faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
614 1.2.6.2 tls
615 1.2.6.2 tls st %f5, [%o1] ! Store word
616 1.2.6.2 tls dec 4, %o2
617 1.2.6.2 tls inc 4, %o1
618 1.2.6.2 tls inc 4, %o0
619 1.2.6.2 tls 4:
620 1.2.6.2 tls brz,pn %o2, Lmemcpy_blockfinish ! XXXX
621 1.2.6.2 tls !!
622 1.2.6.2 tls !! We are now 64-bit aligned in the dest.
623 1.2.6.2 tls !!
624 1.2.6.2 tls Lmemcpy_block_common:
625 1.2.6.2 tls
626 1.2.6.2 tls mov -0, %o4
627 1.2.6.2 tls alignaddr %o0, %o4, %o4 ! base - shift
628 1.2.6.2 tls
629 1.2.6.2 tls cmp %o3, %o4 ! Addresses same?
630 1.2.6.2 tls beq,pt CCCR, 1f
631 1.2.6.2 tls mov %o4, %o3
632 1.2.6.2 tls fmovd %f2, %f0 ! Shuffle data
633 1.2.6.2 tls ldd [%o3+8], %f2 ! Load word 0
634 1.2.6.2 tls 1:
635 1.2.6.2 tls add %o3, 8, %o0 ! now use %o0 for src
636 1.2.6.2 tls
637 1.2.6.2 tls !!
638 1.2.6.2 tls !! Continue until our dest is block aligned
639 1.2.6.2 tls !!
640 1.2.6.2 tls Lmemcpy_block_aligned8:
641 1.2.6.2 tls 1:
642 1.2.6.2 tls brz %o2, Lmemcpy_blockfinish
643 1.2.6.2 tls btst BLOCK_ALIGN, %o1 ! Block aligned?
644 1.2.6.2 tls bz 1f
645 1.2.6.2 tls
646 1.2.6.2 tls faligndata %f0, %f2, %f4 ! Generate result
647 1.2.6.2 tls deccc 8, %o2
648 1.2.6.2 tls ble,pn %icc, Lmemcpy_blockfinish ! Should never happen
649 1.2.6.2 tls fmovd %f4, %f48
650 1.2.6.2 tls
651 1.2.6.2 tls std %f4, [%o1] ! Store result
652 1.2.6.2 tls inc 8, %o1
653 1.2.6.2 tls
654 1.2.6.2 tls fmovd %f2, %f0
655 1.2.6.2 tls inc 8, %o0
656 1.2.6.2 tls ba,pt %xcc, 1b ! Not yet.
657 1.2.6.2 tls ldd [%o0], %f2 ! Load next part
658 1.2.6.2 tls Lmemcpy_block_aligned64:
659 1.2.6.2 tls 1:
660 1.2.6.2 tls
661 1.2.6.2 tls /*
662 1.2.6.2 tls * 64-byte aligned -- ready for block operations.
663 1.2.6.2 tls *
664 1.2.6.2 tls * Here we have the destination block aligned, but the
665 1.2.6.2 tls * source pointer may not be. Sub-word alignment will
666 1.2.6.2 tls * be handled by faligndata instructions. But the source
667 1.2.6.2 tls * can still be potentially aligned to 8 different words
668 1.2.6.2 tls * in our 64-byte block, so we have 8 different copy routines.
669 1.2.6.2 tls *
670 1.2.6.2 tls * Once we figure out our source alignment, we branch
671 1.2.6.2 tls * to the appropriate copy routine, which sets up the
672 1.2.6.2 tls * alignment for faligndata and loads (sets) the values
673 1.2.6.2 tls * into the source registers and does the copy loop.
674 1.2.6.2 tls *
675 1.2.6.2 tls * When we're down to less than 1 block to store, we
676 1.2.6.2 tls * exit the copy loop and execute cleanup code.
677 1.2.6.2 tls *
678 1.2.6.2 tls * Block loads and stores are not properly interlocked.
679 1.2.6.2 tls * Stores save one reg/cycle, so you can start overwriting
680 1.2.6.2 tls * registers the cycle after the store is issued.
681 1.2.6.2 tls *
682 1.2.6.2 tls * Block loads require a block load to a different register
683 1.2.6.2 tls * block or a membar #Sync before accessing the loaded
684 1.2.6.2 tls * data.
685 1.2.6.2 tls *
686 1.2.6.2 tls * Since the faligndata instructions may be offset as far
687 1.2.6.2 tls * as 7 registers into a block (if you are shifting source
688 1.2.6.2 tls * 7 -> dest 0), you need 3 source register blocks for full
689 1.2.6.2 tls * performance: one you are copying, one you are loading,
690 1.2.6.2 tls * and one for interlocking. Otherwise, we would need to
691 1.2.6.2 tls * sprinkle the code with membar #Sync and lose the advantage
692 1.2.6.2 tls * of running faligndata in parallel with block stores. This
693 1.2.6.2 tls * means we are fetching a full 128 bytes ahead of the stores.
694 1.2.6.2 tls * We need to make sure the prefetch does not inadvertently
695 1.2.6.2 tls * cross a page boundary and fault on data that we will never
696 1.2.6.2 tls * store.
697 1.2.6.2 tls *
698 1.2.6.2 tls */
699 1.2.6.2 tls #if 1
700 1.2.6.2 tls and %o0, BLOCK_ALIGN, %o3
701 1.2.6.2 tls srax %o3, 3, %o3 ! Isolate the offset
702 1.2.6.2 tls
703 1.2.6.2 tls brz %o3, L100 ! 0->0
704 1.2.6.2 tls btst 4, %o3
705 1.2.6.2 tls bnz %xcc, 4f
706 1.2.6.2 tls btst 2, %o3
707 1.2.6.2 tls bnz %xcc, 2f
708 1.2.6.2 tls btst 1, %o3
709 1.2.6.2 tls ba,pt %xcc, L101 ! 0->1
710 1.2.6.2 tls nop /* XXX spitfire bug */
711 1.2.6.2 tls 2:
712 1.2.6.2 tls bz %xcc, L102 ! 0->2
713 1.2.6.2 tls nop
714 1.2.6.2 tls ba,pt %xcc, L103 ! 0->3
715 1.2.6.2 tls nop /* XXX spitfire bug */
716 1.2.6.2 tls 4:
717 1.2.6.2 tls bnz %xcc, 2f
718 1.2.6.2 tls btst 1, %o3
719 1.2.6.2 tls bz %xcc, L104 ! 0->4
720 1.2.6.2 tls nop
721 1.2.6.2 tls ba,pt %xcc, L105 ! 0->5
722 1.2.6.2 tls nop /* XXX spitfire bug */
723 1.2.6.2 tls 2:
724 1.2.6.2 tls bz %xcc, L106 ! 0->6
725 1.2.6.2 tls nop
726 1.2.6.2 tls ba,pt %xcc, L107 ! 0->7
727 1.2.6.2 tls nop /* XXX spitfire bug */
728 1.2.6.2 tls #else
729 1.2.6.2 tls
730 1.2.6.2 tls !!
731 1.2.6.2 tls !! Isolate the word offset, which just happens to be
732 1.2.6.2 tls !! the slot in our jump table.
733 1.2.6.2 tls !!
734 1.2.6.2 tls !! This is 6 insns, most of which cannot be paired,
735 1.2.6.2 tls !! which is about the same as the above version.
736 1.2.6.2 tls !!
737 1.2.6.2 tls rd %pc, %o4
738 1.2.6.2 tls 1:
739 1.2.6.2 tls and %o0, 0x31, %o3
740 1.2.6.2 tls add %o3, (Lmemcpy_block_jmp - 1b), %o3
741 1.2.6.2 tls jmpl %o4 + %o3, %g0
742 1.2.6.2 tls nop
743 1.2.6.2 tls
744 1.2.6.2 tls !!
745 1.2.6.2 tls !! Jump table
746 1.2.6.2 tls !!
747 1.2.6.2 tls
748 1.2.6.2 tls Lmemcpy_block_jmp:
749 1.2.6.2 tls ba,a,pt %xcc, L100
750 1.2.6.2 tls nop
751 1.2.6.2 tls ba,a,pt %xcc, L101
752 1.2.6.2 tls nop
753 1.2.6.2 tls ba,a,pt %xcc, L102
754 1.2.6.2 tls nop
755 1.2.6.2 tls ba,a,pt %xcc, L103
756 1.2.6.2 tls nop
757 1.2.6.2 tls ba,a,pt %xcc, L104
758 1.2.6.2 tls nop
759 1.2.6.2 tls ba,a,pt %xcc, L105
760 1.2.6.2 tls nop
761 1.2.6.2 tls ba,a,pt %xcc, L106
762 1.2.6.2 tls nop
763 1.2.6.2 tls ba,a,pt %xcc, L107
764 1.2.6.2 tls nop
765 1.2.6.2 tls #endif
766 1.2.6.2 tls
767 1.2.6.2 tls !!
768 1.2.6.2 tls !! Source is block aligned.
769 1.2.6.2 tls !!
770 1.2.6.2 tls !! Just load a block and go.
771 1.2.6.2 tls !!
772 1.2.6.2 tls L100:
773 1.2.6.2 tls #ifdef RETURN_NAME
774 1.2.6.2 tls sethi %hi(1f), %g1
775 1.2.6.2 tls ba,pt %icc, 2f
776 1.2.6.2 tls or %g1, %lo(1f), %g1
777 1.2.6.2 tls 1:
778 1.2.6.2 tls .asciz "L100"
779 1.2.6.2 tls .align 8
780 1.2.6.2 tls 2:
781 1.2.6.2 tls #endif
782 1.2.6.2 tls fmovd %f0 , %f62
783 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
784 1.2.6.2 tls inc BLOCK_SIZE, %o0
785 1.2.6.2 tls cmp %o0, %o5
786 1.2.6.2 tls bleu,a,pn %icc, 3f
787 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
788 1.2.6.2 tls ba,pt %icc, 3f
789 1.2.6.2 tls membar #Sync
790 1.2.6.2 tls
791 1.2.6.2 tls .align 32 ! ICache align.
792 1.2.6.2 tls 3:
793 1.2.6.2 tls faligndata %f62, %f0, %f32
794 1.2.6.2 tls inc BLOCK_SIZE, %o0
795 1.2.6.2 tls faligndata %f0, %f2, %f34
796 1.2.6.2 tls dec BLOCK_SIZE, %o2
797 1.2.6.2 tls faligndata %f2, %f4, %f36
798 1.2.6.2 tls cmp %o0, %o5
799 1.2.6.2 tls faligndata %f4, %f6, %f38
800 1.2.6.2 tls faligndata %f6, %f8, %f40
801 1.2.6.2 tls faligndata %f8, %f10, %f42
802 1.2.6.2 tls faligndata %f10, %f12, %f44
803 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
804 1.2.6.2 tls faligndata %f12, %f14, %f46
805 1.2.6.2 tls
806 1.2.6.2 tls bleu,a,pn %icc, 2f
807 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48
808 1.2.6.2 tls membar #Sync
809 1.2.6.2 tls 2:
810 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
811 1.2.6.2 tls faligndata %f14, %f16, %f32
812 1.2.6.2 tls inc BLOCK_SIZE, %o0
813 1.2.6.2 tls faligndata %f16, %f18, %f34
814 1.2.6.2 tls inc BLOCK_SIZE, %o1
815 1.2.6.2 tls faligndata %f18, %f20, %f36
816 1.2.6.2 tls dec BLOCK_SIZE, %o2
817 1.2.6.2 tls faligndata %f20, %f22, %f38
818 1.2.6.2 tls cmp %o0, %o5
819 1.2.6.2 tls faligndata %f22, %f24, %f40
820 1.2.6.2 tls faligndata %f24, %f26, %f42
821 1.2.6.2 tls faligndata %f26, %f28, %f44
822 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
823 1.2.6.2 tls faligndata %f28, %f30, %f46
824 1.2.6.2 tls
825 1.2.6.2 tls bleu,a,pn %icc, 2f
826 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
827 1.2.6.2 tls membar #Sync
828 1.2.6.2 tls 2:
829 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
830 1.2.6.2 tls faligndata %f30, %f48, %f32
831 1.2.6.2 tls inc BLOCK_SIZE, %o0
832 1.2.6.2 tls faligndata %f48, %f50, %f34
833 1.2.6.2 tls inc BLOCK_SIZE, %o1
834 1.2.6.2 tls faligndata %f50, %f52, %f36
835 1.2.6.2 tls dec BLOCK_SIZE, %o2
836 1.2.6.2 tls faligndata %f52, %f54, %f38
837 1.2.6.2 tls cmp %o0, %o5
838 1.2.6.2 tls faligndata %f54, %f56, %f40
839 1.2.6.2 tls faligndata %f56, %f58, %f42
840 1.2.6.2 tls faligndata %f58, %f60, %f44
841 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
842 1.2.6.2 tls faligndata %f60, %f62, %f46
843 1.2.6.2 tls bleu,a,pn %icc, 2f
844 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
845 1.2.6.2 tls membar #Sync
846 1.2.6.2 tls 2:
847 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
848 1.2.6.2 tls ba 3b
849 1.2.6.2 tls inc BLOCK_SIZE, %o1
850 1.2.6.2 tls
851 1.2.6.2 tls !!
852 1.2.6.2 tls !! Source at BLOCK_ALIGN+8
853 1.2.6.2 tls !!
854 1.2.6.2 tls !! We need to load almost 1 complete block by hand.
855 1.2.6.2 tls !!
856 1.2.6.2 tls L101:
857 1.2.6.2 tls #ifdef RETURN_NAME
858 1.2.6.2 tls sethi %hi(1f), %g1
859 1.2.6.2 tls ba,pt %icc, 2f
860 1.2.6.2 tls or %g1, %lo(1f), %g1
861 1.2.6.2 tls 1:
862 1.2.6.2 tls .asciz "L101"
863 1.2.6.2 tls .align 8
864 1.2.6.2 tls 2:
865 1.2.6.2 tls #endif
866 1.2.6.2 tls ! fmovd %f0, %f0 ! Hoist fmovd
867 1.2.6.2 tls ldd [%o0], %f2
868 1.2.6.2 tls inc 8, %o0
869 1.2.6.2 tls ldd [%o0], %f4
870 1.2.6.2 tls inc 8, %o0
871 1.2.6.2 tls ldd [%o0], %f6
872 1.2.6.2 tls inc 8, %o0
873 1.2.6.2 tls ldd [%o0], %f8
874 1.2.6.2 tls inc 8, %o0
875 1.2.6.2 tls ldd [%o0], %f10
876 1.2.6.2 tls inc 8, %o0
877 1.2.6.2 tls ldd [%o0], %f12
878 1.2.6.2 tls inc 8, %o0
879 1.2.6.2 tls ldd [%o0], %f14
880 1.2.6.2 tls inc 8, %o0
881 1.2.6.2 tls
882 1.2.6.2 tls cmp %o0, %o5
883 1.2.6.2 tls bleu,a,pn %icc, 3f
884 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
885 1.2.6.2 tls membar #Sync
886 1.2.6.2 tls 3:
887 1.2.6.2 tls faligndata %f0, %f2, %f32
888 1.2.6.2 tls inc BLOCK_SIZE, %o0
889 1.2.6.2 tls faligndata %f2, %f4, %f34
890 1.2.6.2 tls cmp %o0, %o5
891 1.2.6.2 tls faligndata %f4, %f6, %f36
892 1.2.6.2 tls dec BLOCK_SIZE, %o2
893 1.2.6.2 tls faligndata %f6, %f8, %f38
894 1.2.6.2 tls faligndata %f8, %f10, %f40
895 1.2.6.2 tls faligndata %f10, %f12, %f42
896 1.2.6.2 tls faligndata %f12, %f14, %f44
897 1.2.6.2 tls bleu,a,pn %icc, 2f
898 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48
899 1.2.6.2 tls membar #Sync
900 1.2.6.2 tls 2:
901 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
902 1.2.6.2 tls faligndata %f14, %f16, %f46
903 1.2.6.2 tls
904 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
905 1.2.6.2 tls
906 1.2.6.2 tls faligndata %f16, %f18, %f32
907 1.2.6.2 tls inc BLOCK_SIZE, %o0
908 1.2.6.2 tls faligndata %f18, %f20, %f34
909 1.2.6.2 tls inc BLOCK_SIZE, %o1
910 1.2.6.2 tls faligndata %f20, %f22, %f36
911 1.2.6.2 tls cmp %o0, %o5
912 1.2.6.2 tls faligndata %f22, %f24, %f38
913 1.2.6.2 tls dec BLOCK_SIZE, %o2
914 1.2.6.2 tls faligndata %f24, %f26, %f40
915 1.2.6.2 tls faligndata %f26, %f28, %f42
916 1.2.6.2 tls faligndata %f28, %f30, %f44
917 1.2.6.2 tls bleu,a,pn %icc, 2f
918 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
919 1.2.6.2 tls membar #Sync
920 1.2.6.2 tls 2:
921 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
922 1.2.6.2 tls faligndata %f30, %f48, %f46
923 1.2.6.2 tls
924 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
925 1.2.6.2 tls
926 1.2.6.2 tls faligndata %f48, %f50, %f32
927 1.2.6.2 tls inc BLOCK_SIZE, %o0
928 1.2.6.2 tls faligndata %f50, %f52, %f34
929 1.2.6.2 tls inc BLOCK_SIZE, %o1
930 1.2.6.2 tls faligndata %f52, %f54, %f36
931 1.2.6.2 tls cmp %o0, %o5
932 1.2.6.2 tls faligndata %f54, %f56, %f38
933 1.2.6.2 tls dec BLOCK_SIZE, %o2
934 1.2.6.2 tls faligndata %f56, %f58, %f40
935 1.2.6.2 tls faligndata %f58, %f60, %f42
936 1.2.6.2 tls faligndata %f60, %f62, %f44
937 1.2.6.2 tls bleu,a,pn %icc, 2f
938 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
939 1.2.6.2 tls membar #Sync
940 1.2.6.2 tls 2:
941 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
942 1.2.6.2 tls faligndata %f62, %f0, %f46
943 1.2.6.2 tls
944 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
945 1.2.6.2 tls ba 3b
946 1.2.6.2 tls inc BLOCK_SIZE, %o1
947 1.2.6.2 tls
948 1.2.6.2 tls !!
949 1.2.6.2 tls !! Source at BLOCK_ALIGN+16
950 1.2.6.2 tls !!
951 1.2.6.2 tls !! We need to load 6 doubles by hand.
952 1.2.6.2 tls !!
953 1.2.6.2 tls L102: ! Block-copy loop for a source that starts 16 bytes past a block boundary
954 1.2.6.2 tls #ifdef RETURN_NAME
955 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
956 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
957 1.2.6.2 tls or %g1, %lo(1f), %g1
958 1.2.6.2 tls 1:
959 1.2.6.2 tls .asciz "L102"
960 1.2.6.2 tls .align 8
961 1.2.6.2 tls 2:
962 1.2.6.2 tls #endif
963 1.2.6.2 tls ldd [%o0], %f4 ! Six hand loads fill %f4-%f14, %o0 advances by 8 after each
964 1.2.6.2 tls inc 8, %o0
965 1.2.6.2 tls fmovd %f0, %f2 ! Hoist fmovd (presumably %f0 was loaded before the dispatch to this label - confirm)
966 1.2.6.2 tls ldd [%o0], %f6
967 1.2.6.2 tls inc 8, %o0
968 1.2.6.2 tls
969 1.2.6.2 tls ldd [%o0], %f8
970 1.2.6.2 tls inc 8, %o0
971 1.2.6.2 tls ldd [%o0], %f10
972 1.2.6.2 tls inc 8, %o0
973 1.2.6.2 tls ldd [%o0], %f12
974 1.2.6.2 tls inc 8, %o0
975 1.2.6.2 tls ldd [%o0], %f14
976 1.2.6.2 tls inc 8, %o0
977 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
978 1.2.6.2 tls cmp %o0, %o5
979 1.2.6.2 tls bleu,a,pn %icc, 3f ! taken: the annulled-slot ldda runs and membar is skipped
980 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
981 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
982 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
983 1.2.6.2 tls faligndata %f2, %f4, %f32 ! stage 1: realign into the output set %f32-%f46
984 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! step the source pointer to the next block
985 1.2.6.2 tls faligndata %f4, %f6, %f34
986 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
987 1.2.6.2 tls faligndata %f6, %f8, %f36
988 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy (dec leaves the condition codes alone)
989 1.2.6.2 tls faligndata %f8, %f10, %f38
990 1.2.6.2 tls faligndata %f10, %f12, %f40
991 1.2.6.2 tls faligndata %f12, %f14, %f42
992 1.2.6.2 tls bleu,a,pn %icc, 2f
993 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
994 1.2.6.2 tls membar #Sync
995 1.2.6.2 tls 2:
996 1.2.6.2 tls faligndata %f14, %f16, %f44
997 1.2.6.2 tls
998 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
999 1.2.6.2 tls faligndata %f16, %f18, %f46 ! delay slot: completes the output set on both paths
1000 1.2.6.2 tls
1001 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1002 1.2.6.2 tls ! Stage 2: consume %f18-%f30 and the start of %f48-%f62, refill %f0-%f14
1003 1.2.6.2 tls faligndata %f18, %f20, %f32
1004 1.2.6.2 tls inc BLOCK_SIZE, %o0
1005 1.2.6.2 tls faligndata %f20, %f22, %f34
1006 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1007 1.2.6.2 tls faligndata %f22, %f24, %f36
1008 1.2.6.2 tls cmp %o0, %o5
1009 1.2.6.2 tls faligndata %f24, %f26, %f38
1010 1.2.6.2 tls dec BLOCK_SIZE, %o2
1011 1.2.6.2 tls faligndata %f26, %f28, %f40
1012 1.2.6.2 tls faligndata %f28, %f30, %f42
1013 1.2.6.2 tls bleu,a,pn %icc, 2f
1014 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1015 1.2.6.2 tls membar #Sync
1016 1.2.6.2 tls 2:
1017 1.2.6.2 tls faligndata %f30, %f48, %f44
1018 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1019 1.2.6.2 tls faligndata %f48, %f50, %f46
1020 1.2.6.2 tls
1021 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1022 1.2.6.2 tls ! Stage 3: consume %f50-%f62 and the start of %f0-%f14, refill %f16-%f30
1023 1.2.6.2 tls faligndata %f50, %f52, %f32
1024 1.2.6.2 tls inc BLOCK_SIZE, %o0
1025 1.2.6.2 tls faligndata %f52, %f54, %f34
1026 1.2.6.2 tls inc BLOCK_SIZE, %o1
1027 1.2.6.2 tls faligndata %f54, %f56, %f36
1028 1.2.6.2 tls cmp %o0, %o5
1029 1.2.6.2 tls faligndata %f56, %f58, %f38
1030 1.2.6.2 tls dec BLOCK_SIZE, %o2
1031 1.2.6.2 tls faligndata %f58, %f60, %f40
1032 1.2.6.2 tls faligndata %f60, %f62, %f42
1033 1.2.6.2 tls bleu,a,pn %icc, 2f
1034 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1035 1.2.6.2 tls membar #Sync
1036 1.2.6.2 tls 2:
1037 1.2.6.2 tls faligndata %f62, %f0, %f44
1038 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1039 1.2.6.2 tls faligndata %f0, %f2, %f46
1040 1.2.6.2 tls
1041 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1042 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1043 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1044 1.2.6.2 tls
1045 1.2.6.2 tls !!
1046 1.2.6.2 tls !! Source at BLOCK_ALIGN+24
1047 1.2.6.2 tls !!
1048 1.2.6.2 tls !! We need to load 5 doubles by hand.
1049 1.2.6.2 tls !!
1050 1.2.6.2 tls L103: ! Block-copy loop for a source that starts 24 bytes past a block boundary
1051 1.2.6.2 tls #ifdef RETURN_NAME
1052 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
1053 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
1054 1.2.6.2 tls or %g1, %lo(1f), %g1
1055 1.2.6.2 tls 1:
1056 1.2.6.2 tls .asciz "L103"
1057 1.2.6.2 tls .align 8
1058 1.2.6.2 tls 2:
1059 1.2.6.2 tls #endif
1060 1.2.6.2 tls fmovd %f0, %f4 ! seed the first faligndata input (presumably %f0 was loaded before the dispatch - confirm)
1061 1.2.6.2 tls ldd [%o0], %f6 ! Five hand loads fill %f6-%f14, %o0 advances by 8 after each
1062 1.2.6.2 tls inc 8, %o0
1063 1.2.6.2 tls ldd [%o0], %f8
1064 1.2.6.2 tls inc 8, %o0
1065 1.2.6.2 tls ldd [%o0], %f10
1066 1.2.6.2 tls inc 8, %o0
1067 1.2.6.2 tls ldd [%o0], %f12
1068 1.2.6.2 tls inc 8, %o0
1069 1.2.6.2 tls ldd [%o0], %f14
1070 1.2.6.2 tls inc 8, %o0
1071 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
1072 1.2.6.2 tls cmp %o0, %o5
1073 1.2.6.2 tls bleu,a,pn %icc, 2f ! taken: the annulled-slot ldda runs and membar is skipped
1074 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
1075 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
1076 1.2.6.2 tls 2:
1077 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! source pointer now names the block after the one just requested
1078 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
1079 1.2.6.2 tls faligndata %f4, %f6, %f32 ! stage 1: realign into the output set %f32-%f46
1080 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
1081 1.2.6.2 tls faligndata %f6, %f8, %f34
1082 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy (dec leaves the condition codes alone)
1083 1.2.6.2 tls faligndata %f8, %f10, %f36
1084 1.2.6.2 tls faligndata %f10, %f12, %f38
1085 1.2.6.2 tls faligndata %f12, %f14, %f40
1086 1.2.6.2 tls bleu,a,pn %icc, 2f
1087 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
1088 1.2.6.2 tls membar #Sync
1089 1.2.6.2 tls 2:
1090 1.2.6.2 tls faligndata %f14, %f16, %f42
1091 1.2.6.2 tls inc BLOCK_SIZE, %o0
1092 1.2.6.2 tls faligndata %f16, %f18, %f44
1093 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
1094 1.2.6.2 tls faligndata %f18, %f20, %f46 ! delay slot: completes the output set on both paths
1095 1.2.6.2 tls
1096 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1097 1.2.6.2 tls ! Stage 2: consume %f20-%f30 and the start of %f48-%f62, refill %f0-%f14
1098 1.2.6.2 tls faligndata %f20, %f22, %f32
1099 1.2.6.2 tls cmp %o0, %o5
1100 1.2.6.2 tls faligndata %f22, %f24, %f34
1101 1.2.6.2 tls dec BLOCK_SIZE, %o2
1102 1.2.6.2 tls faligndata %f24, %f26, %f36
1103 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1104 1.2.6.2 tls faligndata %f26, %f28, %f38
1105 1.2.6.2 tls faligndata %f28, %f30, %f40
1106 1.2.6.2 tls bleu,a,pn %icc, 2f ! unsigned test like every other guard here: a signed ble can skip a needed load when the addresses straddle the sign bit
1107 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1108 1.2.6.2 tls membar #Sync
1109 1.2.6.2 tls 2:
1110 1.2.6.2 tls faligndata %f30, %f48, %f42
1111 1.2.6.2 tls inc BLOCK_SIZE, %o0
1112 1.2.6.2 tls faligndata %f48, %f50, %f44
1113 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1114 1.2.6.2 tls faligndata %f50, %f52, %f46
1115 1.2.6.2 tls
1116 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1117 1.2.6.2 tls ! Stage 3: consume %f52-%f62 and the start of %f0-%f14, refill %f16-%f30
1118 1.2.6.2 tls faligndata %f52, %f54, %f32
1119 1.2.6.2 tls cmp %o0, %o5
1120 1.2.6.2 tls faligndata %f54, %f56, %f34
1121 1.2.6.2 tls dec BLOCK_SIZE, %o2
1122 1.2.6.2 tls faligndata %f56, %f58, %f36
1123 1.2.6.2 tls faligndata %f58, %f60, %f38
1124 1.2.6.2 tls inc BLOCK_SIZE, %o1
1125 1.2.6.2 tls faligndata %f60, %f62, %f40
1126 1.2.6.2 tls bleu,a,pn %icc, 2f
1127 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1128 1.2.6.2 tls membar #Sync
1129 1.2.6.2 tls 2:
1130 1.2.6.2 tls faligndata %f62, %f0, %f42
1131 1.2.6.2 tls inc BLOCK_SIZE, %o0
1132 1.2.6.2 tls faligndata %f0, %f2, %f44
1133 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1134 1.2.6.2 tls faligndata %f2, %f4, %f46
1135 1.2.6.2 tls
1136 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1137 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1138 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1139 1.2.6.2 tls
1140 1.2.6.2 tls !!
1141 1.2.6.2 tls !! Source at BLOCK_ALIGN+32
1142 1.2.6.2 tls !!
1143 1.2.6.2 tls !! We need to load 4 doubles by hand.
1144 1.2.6.2 tls !!
1145 1.2.6.2 tls L104: ! Block-copy loop for a source that starts 32 bytes past a block boundary
1146 1.2.6.2 tls #ifdef RETURN_NAME
1147 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
1148 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
1149 1.2.6.2 tls or %g1, %lo(1f), %g1
1150 1.2.6.2 tls 1:
1151 1.2.6.2 tls .asciz "L104"
1152 1.2.6.2 tls .align 8
1153 1.2.6.2 tls 2:
1154 1.2.6.2 tls #endif
1155 1.2.6.2 tls fmovd %f0, %f6 ! seed the first faligndata input (presumably %f0 was loaded before the dispatch - confirm)
1156 1.2.6.2 tls ldd [%o0], %f8 ! Four hand loads fill %f8-%f14, %o0 advances by 8 after each
1157 1.2.6.2 tls inc 8, %o0
1158 1.2.6.2 tls ldd [%o0], %f10
1159 1.2.6.2 tls inc 8, %o0
1160 1.2.6.2 tls ldd [%o0], %f12
1161 1.2.6.2 tls inc 8, %o0
1162 1.2.6.2 tls ldd [%o0], %f14
1163 1.2.6.2 tls inc 8, %o0
1164 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
1165 1.2.6.2 tls cmp %o0, %o5
1166 1.2.6.2 tls bleu,a,pn %icc, 2f ! taken: the annulled-slot ldda runs and membar is skipped
1167 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
1168 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
1169 1.2.6.2 tls 2:
1170 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! source pointer now names the block after the one just requested
1171 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
1172 1.2.6.2 tls faligndata %f6, %f8, %f32 ! stage 1: realign into the output set %f32-%f46
1173 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
1174 1.2.6.2 tls faligndata %f8, %f10, %f34
1175 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy (dec leaves the condition codes alone)
1176 1.2.6.2 tls faligndata %f10, %f12, %f36
1177 1.2.6.2 tls faligndata %f12, %f14, %f38
1178 1.2.6.2 tls bleu,a,pn %icc, 2f
1179 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
1180 1.2.6.2 tls membar #Sync
1181 1.2.6.2 tls 2:
1182 1.2.6.2 tls faligndata %f14, %f16, %f40
1183 1.2.6.2 tls faligndata %f16, %f18, %f42
1184 1.2.6.2 tls inc BLOCK_SIZE, %o0
1185 1.2.6.2 tls faligndata %f18, %f20, %f44
1186 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
1187 1.2.6.2 tls faligndata %f20, %f22, %f46 ! delay slot: completes the output set on both paths
1188 1.2.6.2 tls
1189 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1190 1.2.6.2 tls ! Stage 2: consume %f22-%f30 and the start of %f48-%f62, refill %f0-%f14
1191 1.2.6.2 tls faligndata %f22, %f24, %f32
1192 1.2.6.2 tls cmp %o0, %o5
1193 1.2.6.2 tls faligndata %f24, %f26, %f34
1194 1.2.6.2 tls faligndata %f26, %f28, %f36
1195 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1196 1.2.6.2 tls faligndata %f28, %f30, %f38
1197 1.2.6.2 tls bleu,a,pn %icc, 2f
1198 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1199 1.2.6.2 tls membar #Sync
1200 1.2.6.2 tls 2:
1201 1.2.6.2 tls faligndata %f30, %f48, %f40
1202 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! in this stage the count is updated after the guard instead of before it
1203 1.2.6.2 tls faligndata %f48, %f50, %f42
1204 1.2.6.2 tls inc BLOCK_SIZE, %o0
1205 1.2.6.2 tls faligndata %f50, %f52, %f44
1206 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1207 1.2.6.2 tls faligndata %f52, %f54, %f46
1208 1.2.6.2 tls
1209 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1210 1.2.6.2 tls ! Stage 3: consume %f54-%f62 and the start of %f0-%f14, refill %f16-%f30
1211 1.2.6.2 tls faligndata %f54, %f56, %f32
1212 1.2.6.2 tls cmp %o0, %o5
1213 1.2.6.2 tls faligndata %f56, %f58, %f34
1214 1.2.6.2 tls faligndata %f58, %f60, %f36
1215 1.2.6.2 tls inc BLOCK_SIZE, %o1
1216 1.2.6.2 tls faligndata %f60, %f62, %f38
1217 1.2.6.2 tls bleu,a,pn %icc, 2f
1218 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1219 1.2.6.2 tls membar #Sync
1220 1.2.6.2 tls 2:
1221 1.2.6.2 tls faligndata %f62, %f0, %f40
1222 1.2.6.2 tls dec BLOCK_SIZE, %o2
1223 1.2.6.2 tls faligndata %f0, %f2, %f42
1224 1.2.6.2 tls inc BLOCK_SIZE, %o0
1225 1.2.6.2 tls faligndata %f2, %f4, %f44
1226 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1227 1.2.6.2 tls faligndata %f4, %f6, %f46
1228 1.2.6.2 tls
1229 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1230 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1231 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1232 1.2.6.2 tls
1233 1.2.6.2 tls !!
1234 1.2.6.2 tls !! Source at BLOCK_ALIGN+40
1235 1.2.6.2 tls !!
1236 1.2.6.2 tls !! We need to load 3 doubles by hand.
1237 1.2.6.2 tls !!
1238 1.2.6.2 tls L105: ! Block-copy loop for a source that starts 40 bytes past a block boundary
1239 1.2.6.2 tls #ifdef RETURN_NAME
1240 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
1241 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
1242 1.2.6.2 tls or %g1, %lo(1f), %g1
1243 1.2.6.2 tls 1:
1244 1.2.6.2 tls .asciz "L105"
1245 1.2.6.2 tls .align 8
1246 1.2.6.2 tls 2:
1247 1.2.6.2 tls #endif
1248 1.2.6.2 tls fmovd %f0, %f8 ! seed the first faligndata input (presumably %f0 was loaded before the dispatch - confirm)
1249 1.2.6.2 tls ldd [%o0], %f10 ! Three hand loads fill %f10-%f14, %o0 advances by 8 after each
1250 1.2.6.2 tls inc 8, %o0
1251 1.2.6.2 tls ldd [%o0], %f12
1252 1.2.6.2 tls inc 8, %o0
1253 1.2.6.2 tls ldd [%o0], %f14
1254 1.2.6.2 tls inc 8, %o0
1255 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
1256 1.2.6.2 tls cmp %o0, %o5
1257 1.2.6.2 tls bleu,a,pn %icc, 2f ! taken: the annulled-slot ldda runs and membar is skipped
1258 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
1259 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
1260 1.2.6.2 tls 2:
1261 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! source pointer now names the block after the one just requested
1262 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
1263 1.2.6.2 tls faligndata %f8, %f10, %f32 ! stage 1: realign into the output set %f32-%f46
1264 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
1265 1.2.6.2 tls faligndata %f10, %f12, %f34
1266 1.2.6.2 tls faligndata %f12, %f14, %f36
1267 1.2.6.2 tls bleu,a,pn %icc, 2f
1268 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
1269 1.2.6.2 tls membar #Sync
1270 1.2.6.2 tls 2:
1271 1.2.6.2 tls faligndata %f14, %f16, %f38
1272 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy
1273 1.2.6.2 tls faligndata %f16, %f18, %f40
1274 1.2.6.2 tls inc BLOCK_SIZE, %o0
1275 1.2.6.2 tls faligndata %f18, %f20, %f42
1276 1.2.6.2 tls faligndata %f20, %f22, %f44
1277 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
1278 1.2.6.2 tls faligndata %f22, %f24, %f46 ! delay slot: completes the output set on both paths
1279 1.2.6.2 tls
1280 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1281 1.2.6.2 tls ! Stage 2: consume %f24-%f30 and the start of %f48-%f62, refill %f0-%f14
1282 1.2.6.2 tls faligndata %f24, %f26, %f32
1283 1.2.6.2 tls cmp %o0, %o5
1284 1.2.6.2 tls faligndata %f26, %f28, %f34
1285 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! dec leaves the condition codes from cmp intact for the branch below
1286 1.2.6.2 tls faligndata %f28, %f30, %f36
1287 1.2.6.2 tls bleu,a,pn %icc, 2f
1288 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1289 1.2.6.2 tls membar #Sync
1290 1.2.6.2 tls 2:
1291 1.2.6.2 tls faligndata %f30, %f48, %f38
1292 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1293 1.2.6.2 tls faligndata %f48, %f50, %f40
1294 1.2.6.2 tls inc BLOCK_SIZE, %o0
1295 1.2.6.2 tls faligndata %f50, %f52, %f42
1296 1.2.6.2 tls faligndata %f52, %f54, %f44
1297 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1298 1.2.6.2 tls faligndata %f54, %f56, %f46
1299 1.2.6.2 tls
1300 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1301 1.2.6.2 tls ! Stage 3: consume %f56-%f62 and the start of %f0-%f14, refill %f16-%f30
1302 1.2.6.2 tls faligndata %f56, %f58, %f32
1303 1.2.6.2 tls cmp %o0, %o5
1304 1.2.6.2 tls faligndata %f58, %f60, %f34
1305 1.2.6.2 tls dec BLOCK_SIZE, %o2
1306 1.2.6.2 tls faligndata %f60, %f62, %f36
1307 1.2.6.2 tls bleu,a,pn %icc, 2f
1308 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1309 1.2.6.2 tls membar #Sync
1310 1.2.6.2 tls 2:
1311 1.2.6.2 tls faligndata %f62, %f0, %f38
1312 1.2.6.2 tls inc BLOCK_SIZE, %o1
1313 1.2.6.2 tls faligndata %f0, %f2, %f40
1314 1.2.6.2 tls inc BLOCK_SIZE, %o0
1315 1.2.6.2 tls faligndata %f2, %f4, %f42
1316 1.2.6.2 tls faligndata %f4, %f6, %f44
1317 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1318 1.2.6.2 tls faligndata %f6, %f8, %f46
1319 1.2.6.2 tls
1320 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1321 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1322 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1323 1.2.6.2 tls
1324 1.2.6.2 tls
1325 1.2.6.2 tls !!
1326 1.2.6.2 tls !! Source at BLOCK_ALIGN+48
1327 1.2.6.2 tls !!
1328 1.2.6.2 tls !! We need to load 2 doubles by hand.
1329 1.2.6.2 tls !!
1330 1.2.6.2 tls L106: ! Block-copy loop for a source that starts 48 bytes past a block boundary
1331 1.2.6.2 tls #ifdef RETURN_NAME
1332 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
1333 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
1334 1.2.6.2 tls or %g1, %lo(1f), %g1
1335 1.2.6.2 tls 1:
1336 1.2.6.2 tls .asciz "L106"
1337 1.2.6.2 tls .align 8
1338 1.2.6.2 tls 2:
1339 1.2.6.2 tls #endif
1340 1.2.6.2 tls fmovd %f0, %f10 ! seed the first faligndata input (presumably %f0 was loaded before the dispatch - confirm)
1341 1.2.6.2 tls ldd [%o0], %f12 ! Two hand loads fill %f12-%f14, %o0 advances by 8 after each
1342 1.2.6.2 tls inc 8, %o0
1343 1.2.6.2 tls ldd [%o0], %f14
1344 1.2.6.2 tls inc 8, %o0
1345 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
1346 1.2.6.2 tls cmp %o0, %o5
1347 1.2.6.2 tls bleu,a,pn %icc, 2f ! taken: the annulled-slot ldda runs and membar is skipped
1348 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
1349 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
1350 1.2.6.2 tls 2:
1351 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! source pointer now names the block after the one just requested
1352 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
1353 1.2.6.2 tls faligndata %f10, %f12, %f32 ! stage 1: realign into the output set %f32-%f46
1354 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
1355 1.2.6.2 tls faligndata %f12, %f14, %f34
1356 1.2.6.2 tls bleu,a,pn %icc, 2f
1357 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
1358 1.2.6.2 tls membar #Sync
1359 1.2.6.2 tls 2:
1360 1.2.6.2 tls faligndata %f14, %f16, %f36
1361 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy
1362 1.2.6.2 tls faligndata %f16, %f18, %f38
1363 1.2.6.2 tls inc BLOCK_SIZE, %o0
1364 1.2.6.2 tls faligndata %f18, %f20, %f40
1365 1.2.6.2 tls faligndata %f20, %f22, %f42
1366 1.2.6.2 tls faligndata %f22, %f24, %f44
1367 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
1368 1.2.6.2 tls faligndata %f24, %f26, %f46 ! delay slot: completes the output set on both paths
1369 1.2.6.2 tls
1370 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1371 1.2.6.2 tls ! Stage 2: consume %f26-%f30 and the start of %f48-%f62, refill %f0-%f14
1372 1.2.6.2 tls faligndata %f26, %f28, %f32
1373 1.2.6.2 tls cmp %o0, %o5
1374 1.2.6.2 tls faligndata %f28, %f30, %f34
1375 1.2.6.2 tls bleu,a,pn %icc, 2f
1376 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1377 1.2.6.2 tls membar #Sync
1378 1.2.6.2 tls 2:
1379 1.2.6.2 tls faligndata %f30, %f48, %f36
1380 1.2.6.2 tls dec BLOCK_SIZE, %o2
1381 1.2.6.2 tls faligndata %f48, %f50, %f38
1382 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1383 1.2.6.2 tls faligndata %f50, %f52, %f40
1384 1.2.6.2 tls faligndata %f52, %f54, %f42
1385 1.2.6.2 tls inc BLOCK_SIZE, %o0
1386 1.2.6.2 tls faligndata %f54, %f56, %f44
1387 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1388 1.2.6.2 tls faligndata %f56, %f58, %f46
1389 1.2.6.2 tls
1390 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1391 1.2.6.2 tls ! Stage 3: consume %f58-%f62 and the start of %f0-%f14, refill %f16-%f30
1392 1.2.6.2 tls faligndata %f58, %f60, %f32
1393 1.2.6.2 tls cmp %o0, %o5
1394 1.2.6.2 tls faligndata %f60, %f62, %f34
1395 1.2.6.2 tls bleu,a,pn %icc, 2f
1396 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1397 1.2.6.2 tls membar #Sync
1398 1.2.6.2 tls 2:
1399 1.2.6.2 tls faligndata %f62, %f0, %f36
1400 1.2.6.2 tls dec BLOCK_SIZE, %o2
1401 1.2.6.2 tls faligndata %f0, %f2, %f38
1402 1.2.6.2 tls inc BLOCK_SIZE, %o1
1403 1.2.6.2 tls faligndata %f2, %f4, %f40
1404 1.2.6.2 tls faligndata %f4, %f6, %f42
1405 1.2.6.2 tls inc BLOCK_SIZE, %o0
1406 1.2.6.2 tls faligndata %f6, %f8, %f44
1407 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1408 1.2.6.2 tls faligndata %f8, %f10, %f46
1409 1.2.6.2 tls
1410 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1411 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1412 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1413 1.2.6.2 tls
1414 1.2.6.2 tls
1415 1.2.6.2 tls !!
1416 1.2.6.2 tls !! Source at BLOCK_ALIGN+56
1417 1.2.6.2 tls !!
1418 1.2.6.2 tls !! We need to load 1 double by hand.
1419 1.2.6.2 tls !!
1420 1.2.6.2 tls L107: ! Block-copy loop for a source that starts 56 bytes past a block boundary
1421 1.2.6.2 tls #ifdef RETURN_NAME
1422 1.2.6.2 tls sethi %hi(1f), %g1 ! %g1 = address of the tag string at 1: below
1423 1.2.6.2 tls ba,pt %icc, 2f ! jump over the inline string
1424 1.2.6.2 tls or %g1, %lo(1f), %g1
1425 1.2.6.2 tls 1:
1426 1.2.6.2 tls .asciz "L107"
1427 1.2.6.2 tls .align 8
1428 1.2.6.2 tls 2:
1429 1.2.6.2 tls #endif
1430 1.2.6.2 tls fmovd %f0, %f12 ! seed the first faligndata input (presumably %f0 was loaded before the dispatch - confirm)
1431 1.2.6.2 tls ldd [%o0], %f14 ! The single hand load fills %f14
1432 1.2.6.2 tls inc 8, %o0
1433 1.2.6.2 tls ! Guarded block load: issued only when %o0 <= %o5 (unsigned). NOTE(review): %o5 looks like the last safe block-load address - confirm in the setup code.
1434 1.2.6.2 tls cmp %o0, %o5
1435 1.2.6.2 tls bleu,a,pn %icc, 2f ! taken: the annulled-slot ldda runs and membar is skipped
1436 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16 ! block load into %f16-%f30
1437 1.2.6.2 tls membar #Sync ! reached only when the load was not issued
1438 1.2.6.2 tls 2:
1439 1.2.6.2 tls inc BLOCK_SIZE, %o0 ! source pointer now names the block after the one just requested
1440 1.2.6.2 tls 3: ! Loop top: each pass runs three stages that rotate through %f0-%f14, %f16-%f30 and %f48-%f62
1441 1.2.6.2 tls faligndata %f12, %f14, %f32 ! stage 1: realign into the output set %f32-%f46
1442 1.2.6.2 tls cmp %o0, %o5 ! guard for the next block load
1443 1.2.6.2 tls bleu,a,pn %icc, 2f
1444 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f48 ! prefetch the following block into %f48-%f62
1445 1.2.6.2 tls membar #Sync
1446 1.2.6.2 tls 2:
1447 1.2.6.2 tls faligndata %f14, %f16, %f34
1448 1.2.6.2 tls dec BLOCK_SIZE, %o2 ! one block less to copy
1449 1.2.6.2 tls faligndata %f16, %f18, %f36
1450 1.2.6.2 tls inc BLOCK_SIZE, %o0
1451 1.2.6.2 tls faligndata %f18, %f20, %f38
1452 1.2.6.2 tls faligndata %f20, %f22, %f40
1453 1.2.6.2 tls faligndata %f22, %f24, %f42
1454 1.2.6.2 tls faligndata %f24, %f26, %f44
1455 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone ! count used up: the tail code stores from %f32-%f46
1456 1.2.6.2 tls faligndata %f26, %f28, %f46 ! delay slot: completes the output set on both paths
1457 1.2.6.2 tls
1458 1.2.6.2 tls stda %f32, [%o1] ASI_STORE ! store the output set %f32-%f46 at the destination pointer
1459 1.2.6.2 tls ! Stage 2: consume %f28-%f30 and the start of %f48-%f62, refill %f0-%f14
1460 1.2.6.2 tls faligndata %f28, %f30, %f32
1461 1.2.6.2 tls cmp %o0, %o5
1462 1.2.6.2 tls bleu,a,pn %icc, 2f
1463 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f0
1464 1.2.6.2 tls membar #Sync
1465 1.2.6.2 tls 2:
1466 1.2.6.2 tls faligndata %f30, %f48, %f34
1467 1.2.6.2 tls dec BLOCK_SIZE, %o2
1468 1.2.6.2 tls faligndata %f48, %f50, %f36
1469 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! destination pointer moves past the block just stored
1470 1.2.6.2 tls faligndata %f50, %f52, %f38
1471 1.2.6.2 tls faligndata %f52, %f54, %f40
1472 1.2.6.2 tls inc BLOCK_SIZE, %o0
1473 1.2.6.2 tls faligndata %f54, %f56, %f42
1474 1.2.6.2 tls faligndata %f56, %f58, %f44
1475 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1476 1.2.6.2 tls faligndata %f58, %f60, %f46
1477 1.2.6.2 tls
1478 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1479 1.2.6.2 tls ! Stage 3: consume %f60-%f62 and the start of %f0-%f14, refill %f16-%f30
1480 1.2.6.2 tls faligndata %f60, %f62, %f32
1481 1.2.6.2 tls cmp %o0, %o5
1482 1.2.6.2 tls bleu,a,pn %icc, 2f
1483 1.2.6.2 tls ldda [%o0] ASI_BLK_P, %f16
1484 1.2.6.2 tls membar #Sync
1485 1.2.6.2 tls 2:
1486 1.2.6.2 tls faligndata %f62, %f0, %f34
1487 1.2.6.2 tls dec BLOCK_SIZE, %o2
1488 1.2.6.2 tls faligndata %f0, %f2, %f36
1489 1.2.6.2 tls inc BLOCK_SIZE, %o1
1490 1.2.6.2 tls faligndata %f2, %f4, %f38
1491 1.2.6.2 tls faligndata %f4, %f6, %f40
1492 1.2.6.2 tls inc BLOCK_SIZE, %o0
1493 1.2.6.2 tls faligndata %f6, %f8, %f42
1494 1.2.6.2 tls faligndata %f8, %f10, %f44
1495 1.2.6.2 tls
1496 1.2.6.2 tls brlez,pn %o2, Lmemcpy_blockdone
1497 1.2.6.2 tls faligndata %f10, %f12, %f46
1498 1.2.6.2 tls
1499 1.2.6.2 tls stda %f32, [%o1] ASI_STORE
1500 1.2.6.2 tls ba 3b ! next pass restarts at stage 1
1501 1.2.6.2 tls inc BLOCK_SIZE, %o1 ! delay slot: advance the destination pointer
1502 1.2.6.2 tls
1503 1.2.6.2 tls Lmemcpy_blockdone: ! Common exit of the block loops: write out %f32-%f46 one doubleword at a time
1504 1.2.6.2 tls inc BLOCK_SIZE, %o2 ! Fixup our overcommit
1505 1.2.6.2 tls membar #Sync ! Finish any pending loads
1506 1.2.6.2 tls #define FINISH_REG(f) \
1507 1.2.6.2 tls deccc 8, %o2; \
1508 1.2.6.2 tls bl,a Lmemcpy_blockfinish; \
1509 1.2.6.2 tls fmovd f, %f48; \
1510 1.2.6.2 tls std f, [%o1]; \
1511 1.2.6.2 tls inc 8, %o1
1512 1.2.6.2 tls ! The macro above takes 8 off %o2. If that goes negative it branches to the tail code, and only then does the annulled delay slot copy the register into %f48. Otherwise it stores the register at [%o1] and adds 8 to %o1.
1513 1.2.6.2 tls FINISH_REG(%f32)
1514 1.2.6.2 tls FINISH_REG(%f34)
1515 1.2.6.2 tls FINISH_REG(%f36)
1516 1.2.6.2 tls FINISH_REG(%f38)
1517 1.2.6.2 tls FINISH_REG(%f40)
1518 1.2.6.2 tls FINISH_REG(%f42)
1519 1.2.6.2 tls FINISH_REG(%f44)
1520 1.2.6.2 tls FINISH_REG(%f46)
1521 1.2.6.2 tls FINISH_REG(%f48)
1522 1.2.6.2 tls #undef FINISH_REG
1523 1.2.6.2 tls !!
1524 1.2.6.2 tls !! The low 3 bits have the sub-word bits needed to be
1525 1.2.6.2 tls !! stored [because (x-8)&0x7 == x].
1526 1.2.6.2 tls !!
1527 1.2.6.2 tls Lmemcpy_blockfinish: ! Store the last partial doubleword (held in %f48), then return
1528 1.2.6.2 tls brz,pn %o2, 2f ! 100% complete?
1529 1.2.6.2 tls fmovd %f48, %f4 ! delay slot, not annulled: the tail data moves to %f4 on both paths
1530 1.2.6.2 tls cmp %o2, 8 ! Exactly 8 bytes?
1531 1.2.6.2 tls bz,a,pn CCCR, 2f
1532 1.2.6.2 tls std %f4, [%o1] ! annulled slot: the full doubleword is stored only when the branch is taken
1533 1.2.6.2 tls
1534 1.2.6.2 tls btst 4, %o2 ! Word store?
1535 1.2.6.2 tls bz CCCR, 1f
1536 1.2.6.2 tls nop
1537 1.2.6.2 tls st %f4, [%o1] ! write one 32-bit word from %f4
1538 1.2.6.2 tls inc 4, %o1
1539 1.2.6.2 tls 1:
1540 1.2.6.2 tls btst 2, %o2 ! Halfword store?
1541 1.2.6.2 tls fzero %f0 ! zero filler for the faligndata shifts below, does not touch the integer condition codes
1542 1.2.6.2 tls bz 1f
1543 1.2.6.2 tls ! (the mov below sits in the delay slot of the bz above and always executes)
1544 1.2.6.2 tls mov -6, %o4
1545 1.2.6.2 tls alignaddr %o1, %o4, %g0 ! result discarded into %g0, only the alignment offset for faligndata is kept
1546 1.2.6.2 tls
1547 1.2.6.2 tls faligndata %f0, %f4, %f8 ! shift the pending bytes into position in %f8
1548 1.2.6.2 tls
1549 1.2.6.2 tls stda %f8, [%o1] ASI_FL16_P ! Store short
1550 1.2.6.2 tls inc 2, %o1
1551 1.2.6.2 tls 1:
1552 1.2.6.2 tls btst 1, %o2 ! Byte aligned?
1553 1.2.6.2 tls bz 2f
1554 1.2.6.2 tls ! (the mov below sits in the delay slot of the bz above and always executes)
1555 1.2.6.2 tls mov -7, %o0 ! Calculate dest - 7
1556 1.2.6.2 tls alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
1557 1.2.6.2 tls
1558 1.2.6.2 tls faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
1559 1.2.6.2 tls
1560 1.2.6.2 tls stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
1561 1.2.6.2 tls inc 1, %o1 ! Update address
1562 1.2.6.2 tls 2:
1563 1.2.6.2 tls membar #Sync
1564 1.2.6.2 tls #if 0
1565 1.2.6.2 tls !!
1566 1.2.6.2 tls !! verify copy success.
1567 1.2.6.2 tls !!
1568 1.2.6.2 tls !! Disabled debug code: byte-compares the two buffers named by %i0 and %i1 over %i2 bytes.
1569 1.2.6.2 tls mov %i0, %o2
1570 1.2.6.2 tls mov %i1, %o4
1571 1.2.6.2 tls mov %i2, %l4
1572 1.2.6.2 tls 0:
1573 1.2.6.2 tls ldub [%o2], %o1
1574 1.2.6.2 tls inc %o2
1575 1.2.6.2 tls ldub [%o4], %o3
1576 1.2.6.2 tls inc %o4
1577 1.2.6.2 tls cmp %o3, %o1
1578 1.2.6.2 tls bnz 1f ! mismatch: go report it
1579 1.2.6.2 tls dec %l4
1580 1.2.6.2 tls brnz %l4, 0b
1581 1.2.6.2 tls nop
1582 1.2.6.2 tls ba 2f ! all bytes matched
1583 1.2.6.2 tls nop
1584 1.2.6.2 tls
1585 1.2.6.2 tls 1:
1586 1.2.6.2 tls set block_disable, %o0
1587 1.2.6.2 tls stx %o0, [%o0] ! writes the (nonzero) address of block_disable into block_disable
1588 1.2.6.2 tls
1589 1.2.6.2 tls set 0f, %o0
1590 1.2.6.2 tls call prom_printf
1591 1.2.6.2 tls sub %i2, %l4, %o5
1592 1.2.6.2 tls set 1f, %o0
1593 1.2.6.2 tls mov %i0, %o2
1594 1.2.6.2 tls mov %i1, %o1
1595 1.2.6.2 tls call prom_printf
1596 1.2.6.2 tls mov %i2, %o3
1597 1.2.6.2 tls ta 1
1598 1.2.6.2 tls .data
1599 1.2.6.2 tls _ALIGN
1600 1.2.6.2 tls 0: .asciz "block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1601 1.2.6.2 tls 1: .asciz "memcpy(%p, %p, %lx)\r\n"
1602 1.2.6.2 tls _ALIGN
1603 1.2.6.2 tls .text
1604 1.2.6.2 tls 2:
1605 1.2.6.2 tls #endif
1606 1.2.6.2 tls #if defined(_KERNEL) && !defined(_RUMPKERNEL)
1607 1.2.6.2 tls
1608 1.2.6.2 tls /*
1609 1.2.6.2 tls * We have saved our possible fpstate, now disable the fpu
1610 1.2.6.2 tls * and continue with life.
1611 1.2.6.2 tls */
1612 1.2.6.2 tls RESTORE_FPU
1613 1.2.6.2 tls ret
1614 1.2.6.2 tls restore %g1, 0, %o0 ! Return DEST for memcpy
1615 1.2.6.2 tls #endif
1616 1.2.6.2 tls retl ! leaf return, follows the ret above in the kernel build
1617 1.2.6.2 tls mov %g1, %o0 ! delay slot: return value is whatever %g1 holds (the RETURN_NAME paths load a tag string address into it)
1618 1.2.6.2 tls /*
1619 1.2.6.2 tls * Use block_disable to turn off block insns for
1620 1.2.6.2 tls * memcpy/memset
1621 1.2.6.2 tls */
1622 1.2.6.2 tls .data
1623 1.2.6.2 tls .align 8
1624 1.2.6.2 tls .globl block_disable ! exported symbol
1625 1.2.6.2 tls block_disable: .xword 1 ! 64-bit flag, initial value 1. NOTE(review): the code that tests it is outside this excerpt - confirm the polarity there.
1626 1.2.6.2 tls .text
1627 1.2.6.2 tls #endif /* USE_BLOCK_STORE_LOAD */
1628