/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * Endian-neutral names for the partial-word load/store instructions.
 *
 *   LWHI/SWHI  access the bytes from the given address up to the end of
 *              the aligned word that contains it.
 *   LWLO/SWLO  access the bytes from the start of the aligned word up to
 *              and including the given address.
 *
 * The copy routine below reads an unaligned word at p with the pair
 * "LWHI r, 0(p)" / "LWLO r, 3(p)", and uses a lone LWHI/SWHI to move the
 * 1-3 bytes that precede a word boundary.
 *
 * EB is presumably defined by the build for big-endian targets -- it is
 * not defined in this file.
 */
#ifdef EB
#  define LWHI	lwl		/* high part is left in big-endian */
#  define SWHI	swl		/* high part is left in big-endian */
#  define LWLO	lwr		/* low part is right in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#else
#  define LWHI	lwr		/* high part is right in little-endian */
#  define SWHI	swr		/* high part is right in little-endian */
#  define LWLO	lwl		/* low part is left in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

/*
 * pixman_mips_fast_memcpy -- memcpy-style forward byte copy.
 *
 * In:       a0 = dst, a1 = src, a2 = byte count
 * Out:      v0 = dst as passed in
 * Scratch:  AT, v1, a3, t0-t9 (a0, a1 and a2 are modified as well)
 *
 * A count below 8 (the entry compare is signed, so this includes zero and
 * negative values) goes straight to the byte loop, which copies nothing
 * when the count is <= 0.
 *
 * Otherwise the destination is first advanced to a word boundary, then the
 * bulk is moved in 64-byte unrolled chunks with prefetching, followed by at
 * most one 32-byte chunk, whole words, and finally single bytes.  Two
 * variants of that sequence exist:
 *   - src and dst have the same offset within a word: plain lw/sw;
 *   - otherwise ($unaligned): every source word is assembled with an
 *     LWHI/LWLO pair while stores remain aligned sw.
 *
 * Source and destination are walked upwards only; overlapping buffers are
 * not handled specially.
 *
 * The code is written with explicit branch delay slots: the instruction
 * following each branch or jump is executed whether or not the branch is
 * taken.  (This presumes LEAF_MIPS32R2 selects ".set noreorder" -- it is
 * defined in pixman-mips-dspr2-asm.h, not in this file.)
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

	slti	AT, a2, 8		/* AT = 1 when fewer than 8 bytes (signed) */
	bne	AT, zero, $last8	/* too short for word tricks: byte loop */
	move	v0, a0			/* delay slot: memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
	xor	t8, a1, a0
	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */

	bne	t8, zero, $unaligned	/* different offsets within a word */
	negu	a3, a0			/* delay slot: used by both paths below */

	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
	subu	a2, a2, a3	/* delay slot: a2 is the remaining byte count */

	/* Move the 1-3 bytes that lead up to the next word boundary. */
	LWHI	t8, 0(a1)
	addu	a1, a1, a3
	SWHI	t8, 0(a0)
	addu	a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */

	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* delay slot: subtract the remainder from a2 */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 once dst prefetching must stop */
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise, start with using pref30 */
	pref	30, 64(a0)
$loop16w:
	/* First 32 bytes of the 64-byte chunk: load t0-t7, then store. */
	pref	0, 96(a1)
	lw	t0, 0(a1)
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
	lw	t1, 4(a1)		/* delay slot */
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$skip_pref30_96:
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	/* Second 32 bytes of the chunk. */
	lw	t0, 32(a1)
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
	lw	t1, 36(a1)		/* delay slot */
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$skip_pref30_128:
	lw	t2, 40(a1)
	lw	t3, 44(a1)
	lw	t4, 48(a1)
	lw	t5, 52(a1)
	lw	t6, 56(a1)
	lw	t7, 60(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" cut-off */
	bne	a0, a3, $loop16w
	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
	move	a2, t8		/* a2 = bytes left after the 64-byte chunks */

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* the t8 is the remainder count past 32-bytes */
	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
	 nop

	lw	t0, 0(a1)
	lw	t1, 4(a1)
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $last8	/* fewer than 4 bytes left: no whole word */
	subu	a3, t8, a2	/* delay slot: a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
	addiu	a1, a1, 4
	addiu	a0, a0, 4
	bne	a0, a3, $wordCopy_loop
	sw	t3, -4(a0)	/* delay slot: a0 was already advanced */

/* For the last (<8) bytes */
$last8:
	blez	a2, leave	/* nothing (or a non-positive count) left */
	addu	a3, a0, a2	/* delay slot: a3 is the last dst address */
$last8loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $last8loop
	sb	v1, -1(a0)	/* delay slot: a0 was already advanced */

leave:	j	ra
	nop

/*
 * UNALIGNED case
 */

$unaligned:
	/* got here with a3="negu a0" */
	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
	beqz	a3, $ua_chk16w
	subu	a2, a2, a3	/* delay slot: bytes left after initial a3 bytes */

	/* Read one unaligned source word, store only its first a3 bytes. */
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
	SWHI	v1, 0(a0)
	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */

$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */
	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* delay slot: subtract the remainder from a2 */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 once dst prefetching must stop */
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise,  start with using pref30 */
	pref	30, 64(a0)
$ua_loop16w:
	/* First 32 bytes of the 64-byte chunk: unaligned loads, aligned stores. */
	pref	0, 96(a1)
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	bgtz	v1, $ua_skip_pref30_96	/* skip "pref 30, 96(a0)" */
	LWLO	t1, 7(a1)		/* delay slot */
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	/* Second 32 bytes of the chunk. */
	LWHI	t0, 32(a1)
	LWLO	t0, 35(a1)
	LWHI	t1, 36(a1)
	bgtz	v1, $ua_skip_pref30_128	/* skip "pref 30, 128(a0)" */
	LWLO	t1, 39(a1)		/* delay slot */
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
	LWHI	t2, 40(a1)
	LWLO	t2, 43(a1)
	LWHI	t3, 44(a1)
	LWLO	t3, 47(a1)
	LWHI	t4, 48(a1)
	LWLO	t4, 51(a1)
	LWHI	t5, 52(a1)
	LWLO	t5, 55(a1)
	LWHI	t6, 56(a1)
	LWLO	t6, 59(a1)
	LWHI	t7, 60(a1)
	LWLO	t7, 63(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" cut-off */
	bne	a0, a3, $ua_loop16w
	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
	move	a2, t8		/* a2 = bytes left after the 64-byte chunks */

/*
 * Here the dest is word-aligned (the src is not) and there are less than
 * 64 bytes to go
 */

$ua_chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* the t8 is the remainder count */
	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
	 nop			/* keep the first load out of the delay slot: */
				/* it would otherwise run on the taken path */
				/* too and could read past the end of src */

	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	LWLO	t1, 7(a1)
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$ua_chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $ua_smallCopy	/* fewer than 4 bytes left: no whole word */
	subu	a3, t8, a2	/* delay slot: a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addiu	a1, a1, 4
	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
	bne	a0, a3, $ua_wordCopy_loop
	sw	v1, -4(a0)	/* delay slot: a0 was already advanced */

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
	beqz	a2, leave
	addu	a3, a0, a2	/* delay slot: a3 is the last dst address */
$ua_smallCopy_loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $ua_smallCopy_loop
	sb	v1, -1(a0)	/* delay slot: a0 was already advanced */

	j	ra
	nop

END(pixman_mips_fast_memcpy)