/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/

/*
 * Copyright (c) 1996-2002 Eduardo Horvath
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include "strmacros.h"
#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
#endif	/* LIBC_SCCS and not lint */

/*
 * memcpy
 * Assumes regions do not overlap;
 *
 * Must not use %g7 (see copyin/copyout above).
 */
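/*
 * For reference, the C contract this routine implements is simply
 * (illustrative sketch only, not part of the build):
 *
 *	void *memcpy(void * restrict dst, const void * restrict src,
 *	    size_t len);
 *
 * i.e. it returns the original dst, and the behaviour is undefined
 * when the regions overlap.
 */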
ENTRY(memcpy) /* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
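	/*
	 * Rough C equivalent of the argument shuffle below (sketch only):
	 * bcopy() takes (src, dst, len) while memcpy() takes (dst, src, len),
	 * so %o0 and %o1 are exchanged before falling into the shared body:
	 *
	 *	void *memcpy(void *dst, const void *src, size_t len)
	 *	{
	 *		bcopy(src, dst, len);
	 *		return (dst);
	 *	}
	 */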
	mov	%o0, %o3
	mov	%o1, %o0
	mov	%o3, %o1
#if !defined(_KERNEL) || defined(_RUMPKERNEL)
ENTRY(bcopy) /* src, dest, size */
#endif
#ifdef DEBUG
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	set	pmapdebug, %o4
	ld	[%o4], %o4
	btst	0x80, %o4	! PDB_COPY
	bz,pt	%icc, 3f
	nop
#endif
	save	%sp, -CC64FSZ, %sp
	mov	%i0, %o1
	set	2f, %o0
	mov	%i1, %o2
	call	printf
	mov	%i2, %o3
!	ta	1; nop
	restore
	.data
2:	.asciz	"memcpy(%p<-%p,%x)\n"
	_ALIGN
	.text
3:
#endif

	cmp	%o2, BCOPY_SMALL

Lmemcpy_start:
	bge,pt	CCCR, 2f	! if >= this many, go be fancy.
	cmp	%o2, 256

	mov	%o1, %o5	! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
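	/*
	 * The small-copy loop below is roughly this C (sketch only;
	 * at this point %o0 = src, %o1 = dst, %o2 = len and %o5 holds
	 * the value to return):
	 *
	 *	while (--len >= 0)
	 *		*dst++ = *src++;
	 *	return (ret);
	 */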
	deccc	%o2		! while (--len >= 0)
	bl	1f
	.empty
0:
	inc	%o0
	ldsb	[%o0 - 1], %o4	! (++dst)[-1] = *src++;
	stb	%o4, [%o1]
	deccc	%o2
	bge	0b
	inc	%o1
1:
	retl
	mov	%o5, %o0
	NOTREACHED

	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
2:
#ifdef USE_BLOCK_STORE_LOAD
	! If it is big enough, use VIS instructions
	bge	Lmemcpy_block
	nop
#endif /* USE_BLOCK_STORE_LOAD */
Lmemcpy_fancy:

	!!
	!! First align the output to an 8-byte entity
	!!

	save	%sp, -CC64FSZ, %sp

	mov	%i0, %l0
	mov	%i1, %l1

	mov	%i2, %l2
	btst	1, %l1

	bz,pt	%icc, 4f
	btst	2, %l1
	ldub	[%l0], %l4	! Load 1st byte

	deccc	1, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	inc	1, %l0

	stb	%l4, [%l1]	! Store 1st byte
	inc	1, %l1		! Update address
	btst	2, %l1
4:
	bz,pt	%icc, 4f

	btst	1, %l0
	bz,a	1f
	lduh	[%l0], %l4	! Load short

	ldub	[%l0], %l4	! Load bytes

	ldub	[%l0+1], %l3
	sllx	%l4, 8, %l4
	or	%l3, %l4, %l4

1:
	deccc	2, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	inc	2, %l0
	sth	%l4, [%l1]	! Store 1st short

	inc	2, %l1
4:
	btst	4, %l1
	bz,pt	CCCR, 4f

	btst	3, %l0
	bz,a,pt	CCCR, 1f
	lduw	[%l0], %l4	! Load word -1

	btst	1, %l0
	bz,a,pt	%icc, 2f
	lduh	[%l0], %l4

	ldub	[%l0], %l4

	lduh	[%l0+1], %l3
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

	ldub	[%l0+3], %l3
	sllx	%l4, 8, %l4
	ba,pt	%icc, 1f
	or	%l4, %l3, %l4

2:
	lduh	[%l0+2], %l3
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

1:
	deccc	4, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	inc	4, %l0

	st	%l4, [%l1]	! Store word
	inc	4, %l1
4:
	!!
	!! We are now 32-bit aligned in the dest.
	!!
Lmemcpy_common:

	and	%l0, 7, %l4	! Shift amount
	andn	%l0, 7, %l0	! Source addr

	brz,pt	%l4, Lmemcpy_noshift8	! No shift version...

	sllx	%l4, 3, %l4	! In bits
	mov	8<<3, %l3

	ldx	[%l0], %o0	! Load word -1
	sub	%l3, %l4, %l3	! Reverse shift
	deccc	12*8, %l2	! Have enough room?

	sllx	%o0, %l4, %o0
	bl,pn	CCCR, 2f
	and	%l3, 0x38, %l3
Lmemcpy_unrolled8:

	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 */
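	/*
	 * In rough C, each step of the unrolled loop below is a
	 * funnel-shift merge (sketch only; %l4 = (src & 7) * 8,
	 * %l3 = 64 - %l4, with the leftover high part carried in %o0):
	 *
	 *	next  = *src64++;
	 *	*dst64++ = carry | (next >> l3);
	 *	carry = next << l4;
	 *
	 * unrolled six doublewords at a time across %o0-%o5.
	 */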

!	ldx	[%l0+0*8], %o0	! Already done
!	sllx	%o0, %l4, %o0	! Already done
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ba,pt	%icc, 1f
	ldx	[%l0+5*8], %o5
	.align	8
1:
	srlx	%o1, %l3, %g1
	inc	6*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6
	ldx	[%l0+0*8], %o0

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6
	ldx	[%l0+1*8], %o1

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6
	ldx	[%l0+2*8], %o2

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6
	ldx	[%l0+3*8], %o3

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6
	ldx	[%l0+4*8], %o4

	stx	%g6, [%l1+4*8]
	srlx	%o0, %l3, %g1
	deccc	6*8, %l2	! Have enough room?

	sllx	%o0, %l4, %o0	! Next loop
	or	%g1, %o5, %g6
	ldx	[%l0+5*8], %o5

	stx	%g6, [%l1+5*8]
	bge,pt	CCCR, 1b
	inc	6*8, %l1

Lmemcpy_unrolled8_cleanup:
	!!
	!! Finished 8 byte block, unload the regs.
	!!
	srlx	%o1, %l3, %g1
	inc	5*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6

	stx	%g6, [%l1+4*8]
	inc	5*8, %l1

	mov	%o5, %o0	! Save our unused data
	dec	5*8, %l2
2:
	inccc	12*8, %l2
	bz,pn	%icc, Lmemcpy_complete

	!! Unrolled 8 times
Lmemcpy_aligned8:
!	ldx	[%l0], %o0	! Already done
!	sllx	%o0, %l4, %o0	! Shift high word

	deccc	8, %l2		! Pre-decrement
	bl,pn	CCCR, Lmemcpy_finish
1:
	ldx	[%l0+8], %o1	! Load word 0
	inc	8, %l0

	srlx	%o1, %l3, %g6
	or	%g6, %o0, %g6	! Combine

	stx	%g6, [%l1]	! Store result
	inc	8, %l1

	deccc	8, %l2
	bge,pn	CCCR, 1b
	sllx	%o1, %l4, %o0

	btst	7, %l2		! Done?
	bz,pt	CCCR, Lmemcpy_complete

	!!
	!! Loadup the last dregs into %o0 and shift it into place
	!!
	srlx	%l3, 3, %g6	! # bytes in %o0
	dec	8, %g6		! - 8
	!! n-8 - (by - 8) -> n - by
	subcc	%l2, %g6, %g0	! # bytes we need
	ble,pt	%icc, Lmemcpy_finish
	nop
	ldx	[%l0+8], %o1	! Need another word
	srlx	%o1, %l3, %o1
	ba,pt	%icc, Lmemcpy_finish
	or	%o0, %o1, %o0	! All loaded up.

Lmemcpy_noshift8:
	deccc	6*8, %l2	! Have enough room?
	bl,pn	CCCR, 2f
	nop
	ba,pt	%icc, 1f
	nop
	.align	32
1:
	ldx	[%l0+0*8], %o0
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	stx	%o0, [%l1+0*8]
	stx	%o1, [%l1+1*8]
	stx	%o2, [%l1+2*8]


	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ldx	[%l0+5*8], %o5
	inc	6*8, %l0
	stx	%o3, [%l1+3*8]
	deccc	6*8, %l2
	stx	%o4, [%l1+4*8]
	stx	%o5, [%l1+5*8]
	bge,pt	CCCR, 1b
	inc	6*8, %l1
2:
	inc	6*8, %l2
1:
	deccc	8, %l2
	bl,pn	%icc, 1f	! < 0 --> sub word
	nop
	ldx	[%l0], %g6
	inc	8, %l0
	stx	%g6, [%l1]
	bg,pt	%icc, 1b	! Exactly 0 --> done
	inc	8, %l1
1:
	btst	7, %l2		! Done?
	bz,pt	CCCR, Lmemcpy_complete
	clr	%l4
	ldx	[%l0], %o0
Lmemcpy_finish:

	brz,pn	%l2, 2f		! 100% complete?
	cmp	%l2, 8		! Exactly 8 bytes?
	bz,a,pn	CCCR, 2f
	stx	%o0, [%l1]

	btst	4, %l2		! Word store?
	bz	CCCR, 1f
	srlx	%o0, 32, %g6	! Shift high word down
	stw	%g6, [%l1]
	inc	4, %l1
	mov	%o0, %g6	! Operate on the low bits
1:
	btst	2, %l2
	mov	%g6, %o0
	bz	1f
	srlx	%o0, 16, %g6

	sth	%g6, [%l1]	! Store short
	inc	2, %l1
	mov	%o0, %g6	! Operate on low bytes
1:
	mov	%g6, %o0
	btst	1, %l2		! Byte aligned?
	bz	2f
	srlx	%o0, 8, %g6

	stb	%g6, [%l1]	! Store last byte
	inc	1, %l1		! Update address
2:
Lmemcpy_complete:
#if 0
	!!
	!! verify copy success.
	!!

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	dec	%l4
	brnz	%l4, 0b
	nop
	ba	2f
	nop

1:
	set	0f, %o0
	call	printf
	sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	printf
	mov	%i2, %o3
	ta	1
	.data
0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"memcpy(%p, %p, %lx)\n"
	.align	8
	.text
2:
#endif
	ret
	restore	%i1, %g0, %o0

#ifdef USE_BLOCK_STORE_LOAD

/*
 * Block copy.  Useful for >256 byte copies.
 *
 * Benchmarking has shown this always seems to be slower than
 * the integer version, so this is disabled.  Maybe someone will
 * figure out why sometime.
 */
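/*
 * Note that this path is also gated at run time: Lmemcpy_block first
 * checks the block_disable variable defined at the end of this file
 * (it is initialized to 1, i.e. disabled) and that our own trap table
 * is installed, and otherwise falls back to Lmemcpy_fancy above.
 */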

Lmemcpy_block:
	sethi	%hi(block_disable), %o3
	ldx	[ %o3 + %lo(block_disable) ], %o3
	brnz,pn	%o3, Lmemcpy_fancy
	!! Make sure our trap table is installed
	set	_C_LABEL(trapbase), %o5
	rdpr	%tba, %o3
	sub	%o3, %o5, %o3
	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
	nop
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
/*
 * Kernel:
 *
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fplwp, and
 * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
 * null, call savefpstate() with it to store our current fp state.
 *
 * Next, allocate an aligned fpstate on the stack.  We will properly
 * nest calls on a particular stack so this should not be a problem.
 *
 * Now we grab either curlwp (or if we're on the interrupt stack
 * lwp0).  We stash its existing fpstate in a local register and
 * put our new fpstate in curlwp->l_md.md_fpstate.  We point
 * fplwp at curlwp (or lwp0) and enable the FPU.
 *
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curlwp (or lwp0).
 *
 * On exiting this routine we undo the damage: restore the original
 * pointer to curlwp->l_md.md_fpstate, clear our fplwp, and disable
 * the FPU.
 *
 *
 * Register usage, Kernel only (after save):
 *
 * %i0		src
 * %i1		dest
 * %i2		size
 *
 * %l0		XXXX DEBUG old fpstate
 * %l1		fplwp (hi bits only)
 * %l2		orig fplwp
 * %l3		orig fpstate
 * %l5		curlwp
 * %l6		old fpstate
 *
 * Register usage, Kernel and user:
 *
 * %g1		src (retval for memcpy)
 *
 * %o0		src
 * %o1		dest
 * %o2		end dest
 * %o5		last safe fetchable address
 */
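/*
 * Rough pseudocode for the FPU borrowing described above (sketch
 * only; the actual work is done by the ENABLE_FPU/RESTORE_FPU
 * macros, presumably from strmacros.h):
 *
 *	if (fplwp != NULL && fplwp->l_md.md_fpstate != NULL)
 *		savefpstate(fplwp->l_md.md_fpstate);	// flush old owner
 *	l = curlwp ? curlwp : &lwp0;
 *	saved_fpstate = l->l_md.md_fpstate;		// restored on exit
 *	l->l_md.md_fpstate = <aligned fpstate on our stack>;
 *	fplwp = l;
 *	<turn the FPU on>;
 */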

	ENABLE_FPU(0)

	mov	%i0, %o0	! Src addr.
	mov	%i1, %o1	! Store our dest ptr here.
	mov	%i2, %o2	! Len counter
#endif	/* _KERNEL */

	!!
	!! First align the output to a 64-bit entity
	!!

	mov	%o1, %g1	! memcpy retval
	add	%o0, %o2, %o5	! End of source block

	andn	%o0, 7, %o3	! Start of block
	dec	%o5
	fzero	%f0

	andn	%o5, BLOCK_ALIGN, %o5	! Last safe addr.
	ldd	[%o3], %f2	! Load 1st word

	dec	8, %o3		! Move %o3 1 word back
	btst	1, %o1
	bz	4f

	mov	-7, %o4		! Lowest src addr possible
	alignaddr %o0, %o4, %o4	! Base addr for load.

	cmp	%o3, %o4
	be,pt	CCCR, 1f	! Already loaded?
	mov	%o4, %o3
	fmovd	%f2, %f0	! No. Shift
	ldd	[%o3+8], %f2	! And load
1:

	faligndata	%f0, %f2, %f4	! Isolate 1st byte

	stda	%f4, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1		! Update address
	inc	1, %o0
	dec	1, %o2
4:
	btst	2, %o1
	bz	4f

	mov	-6, %o4		! Calculate src - 6
	alignaddr %o0, %o4, %o4	! calculate shift mask and dest.

	cmp	%o3, %o4	! Addresses same?
	be,pt	CCCR, 1f
	mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	faligndata	%f0, %f2, %f4	! Move 1st short low part of f8

	stda	%f4, [%o1] ASI_FL16_P	! Store 1st short
	dec	2, %o2
	inc	2, %o1
	inc	2, %o0
4:
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX

	btst	4, %o1
	bz	4f

	mov	-4, %o4
	alignaddr %o0, %o4, %o4	! calculate shift mask and dest.

	cmp	%o3, %o4	! Addresses same?
	beq,pt	CCCR, 1f
	mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	faligndata	%f0, %f2, %f4	! Move 1st short low part of f8

	st	%f5, [%o1]	! Store word
	dec	4, %o2
	inc	4, %o1
	inc	4, %o0
4:
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	!!
	!! We are now 32-bit aligned in the dest.
	!!
Lmemcpy_block_common:

	mov	-0, %o4
	alignaddr %o0, %o4, %o4	! base - shift

	cmp	%o3, %o4	! Addresses same?
	beq,pt	CCCR, 1f
	mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	add	%o3, 8, %o0	! now use %o0 for src

	!!
	!! Continue until our dest is block aligned
	!!
Lmemcpy_block_aligned8:
1:
	brz	%o2, Lmemcpy_blockfinish
	btst	BLOCK_ALIGN, %o1	! Block aligned?
	bz	1f

	faligndata	%f0, %f2, %f4	! Generate result
	deccc	8, %o2
	ble,pn	%icc, Lmemcpy_blockfinish	! Should never happen
	fmovd	%f4, %f48

	std	%f4, [%o1]	! Store result
	inc	8, %o1

	fmovd	%f2, %f0
	inc	8, %o0
	ba,pt	%xcc, 1b	! Not yet.
	ldd	[%o0], %f2	! Load next part
Lmemcpy_block_aligned64:
1:

/*
 * 64-byte aligned -- ready for block operations.
 *
 * Here we have the destination block aligned, but the
 * source pointer may not be.  Sub-word alignment will
 * be handled by faligndata instructions.  But the source
 * can still be potentially aligned to 8 different words
 * in our 64-byte block, so we have 8 different copy routines.
 *
 * Once we figure out our source alignment, we branch
 * to the appropriate copy routine, which sets up the
 * alignment for faligndata and loads (sets) the values
 * into the source registers and does the copy loop.
 *
 * When we're down to less than 1 block to store, we
 * exit the copy loop and execute cleanup code.
 *
 * Block loads and stores are not properly interlocked.
 * Stores save one reg/cycle, so you can start overwriting
 * registers the cycle after the store is issued.
 *
 * Block loads require a block load to a different register
 * block or a membar #Sync before accessing the loaded
 * data.
 *
 * Since the faligndata instructions may be offset as far
 * as 7 registers into a block (if you are shifting source
 * 7 -> dest 0), you need 3 source register blocks for full
 * performance: one you are copying, one you are loading,
 * and one for interlocking.  Otherwise, we would need to
 * sprinkle the code with membar #Sync and lose the advantage
 * of running faligndata in parallel with block stores.  This
 * means we are fetching a full 128 bytes ahead of the stores.
 * We need to make sure the prefetch does not inadvertently
 * cross a page boundary and fault on data that we will never
 * store.
 *
 */
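/*
 * Conceptually, every L10x routine below runs the same loop
 * (illustrative outline only); they differ only in how many
 * doublewords must be loaded by hand before the first 64-byte block
 * load, to compensate for the source offset within its block:
 *
 *	for (each 64-byte chunk of the destination) {
 *		faligndata eight source pairs into %f32-%f46;
 *		if (next source block is still at or below %o5)
 *			block-load it into the spare register bank;
 *		else
 *			membar #Sync;		// drain outstanding loads
 *		block-store %f32-%f46 to the destination;
 *	}
 *
 * with the three register banks %f0-%f14, %f16-%f30 and %f48-%f62
 * rotating roles so loads run two blocks ahead of the stores.
 */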
#if 1
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3	! Isolate the offset

	brz	%o3, L100	! 0->0
	btst	4, %o3
	bnz	%xcc, 4f
	btst	2, %o3
	bnz	%xcc, 2f
	btst	1, %o3
	ba,pt	%xcc, L101	! 0->1
	nop	/* XXX spitfire bug */
2:
	bz	%xcc, L102	! 0->2
	nop
	ba,pt	%xcc, L103	! 0->3
	nop	/* XXX spitfire bug */
4:
	bnz	%xcc, 2f
	btst	1, %o3
	bz	%xcc, L104	! 0->4
	nop
	ba,pt	%xcc, L105	! 0->5
	nop	/* XXX spitfire bug */
2:
	bz	%xcc, L106	! 0->6
	nop
	ba,pt	%xcc, L107	! 0->7
	nop	/* XXX spitfire bug */
#else

	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	!!
	rd	%pc, %o4
1:
	and	%o0, 0x31, %o3
	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
	jmpl	%o4 + %o3, %g0
	nop

	!!
	!! Jump table
	!!

Lmemcpy_block_jmp:
	ba,a,pt	%xcc, L100
	nop
	ba,a,pt	%xcc, L101
	nop
	ba,a,pt	%xcc, L102
	nop
	ba,a,pt	%xcc, L103
	nop
	ba,a,pt	%xcc, L104
	nop
	ba,a,pt	%xcc, L105
	nop
	ba,a,pt	%xcc, L106
	nop
	ba,a,pt	%xcc, L107
	nop
#endif

	!!
	!! Source is block aligned.
	!!
	!! Just load a block and go.
	!!
L100:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L100"
	.align	8
2:
#endif
	fmovd	%f0 , %f62
	ldda	[%o0] ASI_BLK_P, %f0
	inc	BLOCK_SIZE, %o0
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	ldda	[%o0] ASI_BLK_P, %f16
	ba,pt	%icc, 3f
	membar	#Sync

	.align	32	! ICache align.
3:
	faligndata	%f62, %f0, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f2, %f4, %f36
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f38
	faligndata	%f6, %f8, %f40
	faligndata	%f8, %f10, %f42
	faligndata	%f10, %f12, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f12, %f14, %f46

	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f14, %f16, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f18, %f20, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f20, %f22, %f38
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f40
	faligndata	%f24, %f26, %f42
	faligndata	%f26, %f28, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f28, %f30, %f46

	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f30, %f48, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f52, %f54, %f38
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f40
	faligndata	%f56, %f58, %f42
	faligndata	%f58, %f60, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f60, %f62, %f46
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16	! Increment is at top
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+8
	!!
	!! We need to load almost 1 complete block by hand.
	!!
L101:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L101"
	.align	8
2:
#endif
!	fmovd	%f0, %f0	! Hoist fmovd
	ldd	[%o0], %f2
	inc	8, %o0
	ldd	[%o0], %f4
	inc	8, %o0
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
3:
	faligndata	%f0, %f2, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f34
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f6, %f8, %f38
	faligndata	%f8, %f10, %f40
	faligndata	%f10, %f12, %f42
	faligndata	%f12, %f14, %f44
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f14, %f16, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f16, %f18, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f20, %f22, %f36
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f40
	faligndata	%f26, %f28, %f42
	faligndata	%f28, %f30, %f44
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f30, %f48, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f48, %f50, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f52, %f54, %f36
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f40
	faligndata	%f58, %f60, %f42
	faligndata	%f60, %f62, %f44
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f62, %f0, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+16
	!!
	!! We need to load 6 doubles by hand.
	!!
L102:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L102"
	.align	8
2:
#endif
	ldd	[%o0], %f4
	inc	8, %o0
	fmovd	%f0, %f2	! Hoist fmovd
	ldd	[%o0], %f6
	inc	8, %o0

	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
3:
	faligndata	%f2, %f4, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f4, %f6, %f34
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f38
	faligndata	%f10, %f12, %f40
	faligndata	%f12, %f14, %f42
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f44

	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f16, %f18, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f18, %f20, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f20, %f22, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f22, %f24, %f36
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f26, %f28, %f40
	faligndata	%f28, %f30, %f42
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f48, %f50, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f50, %f52, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f52, %f54, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f54, %f56, %f36
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f58, %f60, %f40
	faligndata	%f60, %f62, %f42
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f0, %f2, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+24
	!!
	!! We need to load 5 doubles by hand.
	!!
L103:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L103"
	.align	8
2:
#endif
	fmovd	%f0, %f4
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f4, %f6, %f32
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f36
	faligndata	%f10, %f12, %f38
	faligndata	%f12, %f14, %f40
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f18, %f20, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f20, %f22, %f32
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f26, %f28, %f38
	faligndata	%f28, %f30, %f40
	ble,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f50, %f52, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f52, %f54, %f32
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f36
	faligndata	%f58, %f60, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f40
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f2, %f4, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+32
	!!
	!! We need to load 4 doubles by hand.
	!!
L104:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L104"
	.align	8
2:
#endif
	fmovd	%f0, %f6
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f6, %f8, %f32
	cmp	%o0, %o5
	faligndata	%f8, %f10, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f10, %f12, %f36
	faligndata	%f12, %f14, %f38
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f40
	faligndata	%f16, %f18, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f20, %f22, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f22, %f24, %f32
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f34
	faligndata	%f26, %f28, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f28, %f30, %f38
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f52, %f54, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f54, %f56, %f32
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f34
	faligndata	%f58, %f60, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f38
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f4, %f6, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+40
	!!
	!! We need to load 3 doubles by hand.
	!!
L105:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L105"
	.align	8
2:
#endif
	fmovd	%f0, %f8
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f8, %f10, %f32
	cmp	%o0, %o5
	faligndata	%f10, %f12, %f34
	faligndata	%f12, %f14, %f36
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f42
	faligndata	%f20, %f22, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f22, %f24, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f24, %f26, %f32
	cmp	%o0, %o5
	faligndata	%f26, %f28, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f28, %f30, %f36
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f48, %f50, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f42
	faligndata	%f52, %f54, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f54, %f56, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f56, %f58, %f32
	cmp	%o0, %o5
	faligndata	%f58, %f60, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f60, %f62, %f36
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f0, %f2, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f42
	faligndata	%f4, %f6, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f6, %f8, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+48
	!!
	!! We need to load 2 doubles by hand.
	!!
L106:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L106"
	.align	8
2:
#endif
	fmovd	%f0, %f10
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f10, %f12, %f32
	cmp	%o0, %o5
	faligndata	%f12, %f14, %f34
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f38
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f40
	faligndata	%f20, %f22, %f42
	faligndata	%f22, %f24, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f24, %f26, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f26, %f28, %f32
	cmp	%o0, %o5
	faligndata	%f28, %f30, %f34
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f40
	faligndata	%f52, %f54, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f56, %f58, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f58, %f60, %f32
	cmp	%o0, %o5
	faligndata	%f60, %f62, %f34
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f40
	faligndata	%f4, %f6, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f8, %f10, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+56
	!!
	!! We need to load 1 double by hand.
	!!
L107:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	or	%g1, %lo(1f), %g1
1:
	.asciz	"L107"
	.align	8
2:
#endif
	fmovd	%f0, %f12
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f12, %f14, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f36
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f38
	faligndata	%f20, %f22, %f40
	faligndata	%f22, %f24, %f42
	faligndata	%f24, %f26, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f26, %f28, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f28, %f30, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f38
	faligndata	%f52, %f54, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f42
	faligndata	%f56, %f58, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f58, %f60, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f60, %f62, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f38
	faligndata	%f4, %f6, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f42
	faligndata	%f8, %f10, %f44

	brlez,pn	%o2, Lmemcpy_blockdone
	faligndata	%f10, %f12, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	inc	BLOCK_SIZE, %o1

Lmemcpy_blockdone:
	inc	BLOCK_SIZE, %o2	! Fixup our overcommit
	membar	#Sync		! Finish any pending loads
#define	FINISH_REG(f)		\
	deccc	8, %o2;		\
	bl,a	Lmemcpy_blockfinish;	\
	fmovd	f, %f48;	\
	std	f, [%o1];	\
	inc	8, %o1
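
/*
 * FINISH_REG drains one pending result register at a time: while at
 * least 8 bytes remain it stores the register and advances %o1;
 * otherwise it parks the register in %f48 (the value
 * Lmemcpy_blockfinish expects) and branches there to store the
 * sub-doubleword tail.
 */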

	FINISH_REG(%f32)
	FINISH_REG(%f34)
	FINISH_REG(%f36)
	FINISH_REG(%f38)
	FINISH_REG(%f40)
	FINISH_REG(%f42)
	FINISH_REG(%f44)
	FINISH_REG(%f46)
	FINISH_REG(%f48)
#undef FINISH_REG
	!!
	!! The low 3 bits have the sub-word bits needed to be
	!! stored [because (x - 8) & 0x7 == x & 0x7].
	!!
Lmemcpy_blockfinish:
	brz,pn	%o2, 2f		! 100% complete?
	fmovd	%f48, %f4
	cmp	%o2, 8		! Exactly 8 bytes?
	bz,a,pn	CCCR, 2f
	std	%f4, [%o1]

	btst	4, %o2		! Word store?
	bz	CCCR, 1f
	nop
	st	%f4, [%o1]
	inc	4, %o1
1:
	btst	2, %o2
	fzero	%f0
	bz	1f

	mov	-6, %o4
	alignaddr %o1, %o4, %g0

	faligndata	%f0, %f4, %f8

	stda	%f8, [%o1] ASI_FL16_P	! Store short
	inc	2, %o1
1:
	btst	1, %o2		! Byte aligned?
	bz	2f

	mov	-7, %o0		! Calculate dest - 7
	alignaddr %o1, %o0, %g0	! Calculate shift mask and dest.

	faligndata	%f0, %f4, %f8	! Move 1st byte to low part of f8

	stda	%f8, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1		! Update address
2:
	membar	#Sync
#if 0
	!!
	!! verify copy success.
	!!

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	dec	%l4
	brnz	%l4, 0b
	nop
	ba	2f
	nop

1:
	set	block_disable, %o0
	stx	%o0, [%o0]

	set	0f, %o0
	call	prom_printf
	sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	prom_printf
	mov	%i2, %o3
	ta	1
	.data
	_ALIGN
0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
	_ALIGN
	.text
2:
#endif
#if defined(_KERNEL) && !defined(_RUMPKERNEL)

/*
 * We've saved our possible fpstate, now disable the FPU
 * and continue with life.
 */
	RESTORE_FPU
	ret
	restore	%g1, 0, %o0	! Return DEST for memcpy
#endif
	retl
	mov	%g1, %o0
/*
 * Use block_disable to turn off block insns for
 * memcpy/memset
 */
	.data
	.align	8
	.globl	block_disable
block_disable:	.xword	1
	.text
#endif /* USE_BLOCK_STORE_LOAD */
