/* (code-viewer export artifact) Home | History | Annotate | Line # | Download | only in string */
      1 /*	$NetBSD: memcpy_xscale.S,v 1.6 2023/01/19 18:03:03 mlelstv Exp $	*/
      2 
      3 /*
      4  * Copyright 2003 Wasabi Systems, Inc.
      5  * All rights reserved.
      6  *
      7  * Written by Steve C. Woodford for Wasabi Systems, Inc.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *      This product includes software developed for the NetBSD Project by
     20  *      Wasabi Systems, Inc.
     21  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
     22  *    or promote products derived from this software without specific prior
     23  *    written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
     26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 #include <machine/asm.h>
     39 
     40 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
      41 ENTRY(memcpy)
	/*
	 * void *memcpy(void *dst, const void *src, size_t len)
	 * In:  r0 = dst, r1 = src, r2 = len.  Returns r0 (dst) unchanged.
	 * r3 is the working destination cursor so r0 survives; ip is
	 * scratch; r4-r9 are pushed/popped around the bulk-copy loops.
	 */
      42 	pld	[r1]
      43 	cmp	r2, #0x0c
      44 	bls	.Lmemcpy_short		/* <= 12 bytes */
      45 	mov	r3, r0			/* We must not clobber r0 */
      46 
      47 	/* Word-align the destination buffer */
      48 	ands	ip, r3, #0x03		/* Already word aligned? */
      49 	beq	.Lmemcpy_wordaligned	/* Yup */
      50 	cmp	ip, #0x02
	/*
	 * Copy (4 - ip) bytes, ip in {1,2,3}: one byte always, a second
	 * if ip <= 2 ("ls"), a third if ip == 1 ("lo").
	 */
      51 	ldrb	ip, [r1], #0x01
      52 	sub	r2, r2, #0x01
      53 	strb	ip, [r3], #0x01
      54 	ldrbls	ip, [r1], #0x01
      55 	subls	r2, r2, #0x01
      56 	strbls	ip, [r3], #0x01
      57 	ldrblo	ip, [r1], #0x01
      58 	sublo	r2, r2, #0x01
      59 	strblo	ip, [r3], #0x01
      60 
      61 	/* Destination buffer is now word aligned */
      62 .Lmemcpy_wordaligned:
      63 	ands	ip, r1, #0x03		/* Is src also word-aligned? */
      64 	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
      65 
      66 	/* Quad-align the destination buffer */
      67 	tst	r3, #0x07		/* Already quad aligned? */
      68 	ldrne	ip, [r1], #0x04
      69 	push	{r4-r9}		/* Free up some registers */
      70 	subne	r2, r2, #0x04
      71 	strne	ip, [r3], #0x04
      72 
      73 	/* Destination buffer quad aligned, source is at least word aligned */
      74 	subs	r2, r2, #0x80
      75 	blo	.Lmemcpy_w_lessthan128
      76 
      77 	/* Copy 128 bytes at a time */
	/*
	 * Loads and strd stores are interleaved so the XScale pipeline
	 * always has a load in flight while a store drains; pld keeps
	 * the prefetch stream one cache line ahead.
	 */
      78 .Lmemcpy_w_loop128:
      79 	ldr	r4, [r1], #0x04		/* LD:00-03 */
      80 	ldr	r5, [r1], #0x04		/* LD:04-07 */
      81 	pld	[r1, #0x18]		/* Prefetch 0x20 */
      82 	ldr	r6, [r1], #0x04		/* LD:08-0b */
      83 	ldr	r7, [r1], #0x04		/* LD:0c-0f */
      84 	ldr	r8, [r1], #0x04		/* LD:10-13 */
      85 	ldr	r9, [r1], #0x04		/* LD:14-17 */
      86 	strd	r4, r5, [r3], #0x08	/* ST:00-07 */
      87 	ldr	r4, [r1], #0x04		/* LD:18-1b */
      88 	ldr	r5, [r1], #0x04		/* LD:1c-1f */
      89 	strd	r6, r7, [r3], #0x08	/* ST:08-0f */
      90 	ldr	r6, [r1], #0x04		/* LD:20-23 */
      91 	ldr	r7, [r1], #0x04		/* LD:24-27 */
      92 	pld	[r1, #0x18]		/* Prefetch 0x40 */
      93 	strd	r8, r9, [r3], #0x08	/* ST:10-17 */
      94 	ldr	r8, [r1], #0x04		/* LD:28-2b */
      95 	ldr	r9, [r1], #0x04		/* LD:2c-2f */
      96 	strd	r4, r5, [r3], #0x08	/* ST:18-1f */
      97 	ldr	r4, [r1], #0x04		/* LD:30-33 */
      98 	ldr	r5, [r1], #0x04		/* LD:34-37 */
      99 	strd	r6, r7, [r3], #0x08	/* ST:20-27 */
     100 	ldr	r6, [r1], #0x04		/* LD:38-3b */
     101 	ldr	r7, [r1], #0x04		/* LD:3c-3f */
     102 	strd	r8, r9, [r3], #0x08	/* ST:28-2f */
     103 	ldr	r8, [r1], #0x04		/* LD:40-43 */
     104 	ldr	r9, [r1], #0x04		/* LD:44-47 */
     105 	pld	[r1, #0x18]		/* Prefetch 0x60 */
     106 	strd	r4, r5, [r3], #0x08	/* ST:30-37 */
     107 	ldr	r4, [r1], #0x04		/* LD:48-4b */
     108 	ldr	r5, [r1], #0x04		/* LD:4c-4f */
     109 	strd	r6, r7, [r3], #0x08	/* ST:38-3f */
     110 	ldr	r6, [r1], #0x04		/* LD:50-53 */
     111 	ldr	r7, [r1], #0x04		/* LD:54-57 */
     112 	strd	r8, r9, [r3], #0x08	/* ST:40-47 */
     113 	ldr	r8, [r1], #0x04		/* LD:58-5b */
     114 	ldr	r9, [r1], #0x04		/* LD:5c-5f */
     115 	strd	r4, r5, [r3], #0x08	/* ST:48-4f */
     116 	ldr	r4, [r1], #0x04		/* LD:60-63 */
     117 	ldr	r5, [r1], #0x04		/* LD:64-67 */
     118 	pld	[r1, #0x18]		/* Prefetch 0x80 */
     119 	strd	r6, r7, [r3], #0x08	/* ST:50-57 */
     120 	ldr	r6, [r1], #0x04		/* LD:68-6b */
     121 	ldr	r7, [r1], #0x04		/* LD:6c-6f */
     122 	strd	r8, r9, [r3], #0x08	/* ST:58-5f */
     123 	ldr	r8, [r1], #0x04		/* LD:70-73 */
     124 	ldr	r9, [r1], #0x04		/* LD:74-77 */
     125 	strd	r4, r5, [r3], #0x08	/* ST:60-67 */
     126 	ldr	r4, [r1], #0x04		/* LD:78-7b */
     127 	ldr	r5, [r1], #0x04		/* LD:7c-7f */
     128 	strd	r6, r7, [r3], #0x08	/* ST:68-6f */
     129 	strd	r8, r9, [r3], #0x08	/* ST:70-77 */
     130 	subs	r2, r2, #0x80
     131 	strd	r4, r5, [r3], #0x08	/* ST:78-7f */
     132 	bhs	.Lmemcpy_w_loop128
     133 
     134 .Lmemcpy_w_lessthan128:
     135 	adds	r2, r2, #0x80		/* Adjust for extra sub */
     136 	popeq	{r4-r9}
     137 	RETc(eq)			/* Return now if done */
     138 	subs	r2, r2, #0x20
     139 	blo	.Lmemcpy_w_lessthan32
     140 
     141 	/* Copy 32 bytes at a time */
     142 .Lmemcpy_w_loop32:
     143 	ldr	r4, [r1], #0x04
     144 	ldr	r5, [r1], #0x04
     145 	pld	[r1, #0x18]
     146 	ldr	r6, [r1], #0x04
     147 	ldr	r7, [r1], #0x04
     148 	ldr	r8, [r1], #0x04
     149 	ldr	r9, [r1], #0x04
     150 	strd	r4, r5, [r3], #0x08
     151 	ldr	r4, [r1], #0x04
     152 	ldr	r5, [r1], #0x04
     153 	strd	r6, r7, [r3], #0x08
     154 	strd	r8, r9, [r3], #0x08
     155 	subs	r2, r2, #0x20
     156 	strd	r4, r5, [r3], #0x08
     157 	bhs	.Lmemcpy_w_loop32
     158 
     159 .Lmemcpy_w_lessthan32:
     160 	adds	r2, r2, #0x20		/* Adjust for extra sub */
     161 	popeq	{r4-r9}
     162 	RETc(eq)			/* Return now if done */
     163 
	/*
	 * Computed jump into the three 8-byte copy chunks below.  Each
	 * chunk is four instructions (16 bytes).  r4 = 0x18 - (r2 & 0x18),
	 * so r4 << 1 skips one chunk per 8 bytes that need NOT be copied
	 * (pc reads two instructions ahead; the nop pads the table base).
	 */
     164 	and	r4, r2, #0x18
     165 	rsbs	r4, r4, #0x18
     166 	addne	pc, pc, r4, lsl #1
     167 	nop
     168 
     169 	/* At least 24 bytes remaining */
     170 	ldr	r4, [r1], #0x04
     171 	ldr	r5, [r1], #0x04
     172 	sub	r2, r2, #0x08
     173 	strd	r4, r5, [r3], #0x08
     174 
     175 	/* At least 16 bytes remaining */
     176 	ldr	r4, [r1], #0x04
     177 	ldr	r5, [r1], #0x04
     178 	sub	r2, r2, #0x08
     179 	strd	r4, r5, [r3], #0x08
     180 
     181 	/* At least 8 bytes remaining */
     182 	ldr	r4, [r1], #0x04
     183 	ldr	r5, [r1], #0x04
     184 	subs	r2, r2, #0x08
     185 	strd	r4, r5, [r3], #0x08
     186 
     187 	/* Less than 8 bytes remaining */
     188 	pop	{r4-r9}
     189 	RETc(eq)			/* Return now if done */
	/* Copy one final word if >= 4 bytes remain, then 0-3 tail bytes */
     190 	subs	r2, r2, #0x04
     191 	ldrhs	ip, [r1], #0x04
     192 	strhs	ip, [r3], #0x04
     193 	RETc(eq)			/* Return now if done */
     194 	addlo	r2, r2, #0x04
	/* 1-3 bytes left: 1st always, 2nd if r2 >= 2, 3rd if r2 > 2 */
     195 	ldrb	ip, [r1], #0x01
     196 	cmp	r2, #0x02
     197 	ldrbhs	r2, [r1], #0x01
     198 	strb	ip, [r3], #0x01
     199 	ldrbhi	ip, [r1]
     200 	strbhs	r2, [r3], #0x01
     201 	strbhi	ip, [r3]
     202 	RET
    203 
    204 
     205 /*
     206  * At this point, it has not been possible to word align both buffers.
     207  * The destination buffer is word aligned, but the source buffer is not.
     208  */
     209 .Lmemcpy_bad_align:
     210 	push	{r4-r7}
	/*
	 * Round src down to a word boundary and pre-load the first word
	 * into ip.  The misalignment (src & 3, still in ip from above)
	 * selects the splice variant: 1 -> bad1, 2 -> bad2, 3 -> bad3.
	 * Whole aligned words are then read and recombined with shifts;
	 * ip always carries the look-ahead word between iterations.
	 */
     211 	bic	r1, r1, #0x03
     212 	cmp	ip, #2
     213 	ldr	ip, [r1], #0x04
     214 	bhi	.Lmemcpy_bad3
     215 	beq	.Lmemcpy_bad2
     216 	b	.Lmemcpy_bad1
     217 
	/* src is 1 byte past a word boundary: splice with 8/24-bit shifts */
     218 .Lmemcpy_bad1_loop16:
     219 #ifdef __ARMEB__
     220 	mov	r4, ip, lsl #8
     221 #else
     222 	mov	r4, ip, lsr #8
     223 #endif
     224 	ldr	r5, [r1], #0x04
     225 	pld	[r1, #0x018]
     226 	ldr	r6, [r1], #0x04
     227 	ldr	r7, [r1], #0x04
     228 	ldr	ip, [r1], #0x04
     229 #ifdef __ARMEB__
     230 	orr	r4, r4, r5, lsr #24
     231 	mov	r5, r5, lsl #8
     232 	orr	r5, r5, r6, lsr #24
     233 	mov	r6, r6, lsl #8
     234 	orr	r6, r6, r7, lsr #24
     235 	mov	r7, r7, lsl #8
     236 	orr	r7, r7, ip, lsr #24
     237 #else
     238 	orr	r4, r4, r5, lsl #24
     239 	mov	r5, r5, lsr #8
     240 	orr	r5, r5, r6, lsl #24
     241 	mov	r6, r6, lsr #8
     242 	orr	r6, r6, r7, lsl #24
     243 	mov	r7, r7, lsr #8
     244 	orr	r7, r7, ip, lsl #24
     245 #endif
     246 	str	r4, [r3], #0x04
     247 	str	r5, [r3], #0x04
     248 	str	r6, [r3], #0x04
     249 	str	r7, [r3], #0x04
     250 	sub	r2, r2, #0x10
     251 
     252 .Lmemcpy_bad1:
     253 	cmp	r2, #0x20
     254 	bhs	.Lmemcpy_bad1_loop16
     255 	cmp	r2, #0x10
     256 	blo	.Lmemcpy_bad1_loop16_short
     257 
     258 	/* copy last 16 bytes (without preload) */
     259 #ifdef __ARMEB__
     260 	mov	r4, ip, lsl #8
     261 #else
     262 	mov	r4, ip, lsr #8
     263 #endif
     264 	ldr	r5, [r1], #0x04
     265 	ldr	r6, [r1], #0x04
     266 	ldr	r7, [r1], #0x04
     267 	ldr	ip, [r1], #0x04
     268 #ifdef __ARMEB__
     269 	orr	r4, r4, r5, lsr #24
     270 	mov	r5, r5, lsl #8
     271 	orr	r5, r5, r6, lsr #24
     272 	mov	r6, r6, lsl #8
     273 	orr	r6, r6, r7, lsr #24
     274 	mov	r7, r7, lsl #8
     275 	orr	r7, r7, ip, lsr #24
     276 #else
     277 	orr	r4, r4, r5, lsl #24
     278 	mov	r5, r5, lsr #8
     279 	orr	r5, r5, r6, lsl #24
     280 	mov	r6, r6, lsr #8
     281 	orr	r6, r6, r7, lsl #24
     282 	mov	r7, r7, lsr #8
     283 	orr	r7, r7, ip, lsl #24
     284 #endif
     285 	str	r4, [r3], #0x04
     286 	str	r5, [r3], #0x04
     287 	str	r6, [r3], #0x04
     288 	str	r7, [r3], #0x04
     289 	subs	r2, r2, #0x10
     290 	popeq	{r4-r7}
     291 	RETc(eq)			/* Return now if done */
     292 
     293 .Lmemcpy_bad1_loop16_short:
     294 	subs	r2, r2, #0x04
     295 	sublo	r1, r1, #0x03		/* rewind src to its true byte position */
     296 	blo	.Lmemcpy_bad_done
     297 
     298 .Lmemcpy_bad1_loop4:
     299 #ifdef __ARMEB__
     300 	mov	r4, ip, lsl #8
     301 #else
     302 	mov	r4, ip, lsr #8
     303 #endif
     304 	ldr	ip, [r1], #0x04
     305 	subs	r2, r2, #0x04
     306 #ifdef __ARMEB__
     307 	orr	r4, r4, ip, lsr #24
     308 #else
     309 	orr	r4, r4, ip, lsl #24
     310 #endif
     311 	str	r4, [r3], #0x04
     312 	bhs	.Lmemcpy_bad1_loop4
     313 	sub	r1, r1, #0x03		/* rewind src to its true byte position */
     314 	b	.Lmemcpy_bad_done
    315 
	/* src is 2 bytes past a word boundary: splice with 16-bit shifts */
     316 .Lmemcpy_bad2_loop16:
     317 #ifdef __ARMEB__
     318 	mov	r4, ip, lsl #16
     319 #else
     320 	mov	r4, ip, lsr #16
     321 #endif
     322 	ldr	r5, [r1], #0x04
     323 	pld	[r1, #0x018]
     324 	ldr	r6, [r1], #0x04
     325 	ldr	r7, [r1], #0x04
     326 	ldr	ip, [r1], #0x04
     327 #ifdef __ARMEB__
     328 	orr	r4, r4, r5, lsr #16
     329 	mov	r5, r5, lsl #16
     330 	orr	r5, r5, r6, lsr #16
     331 	mov	r6, r6, lsl #16
     332 	orr	r6, r6, r7, lsr #16
     333 	mov	r7, r7, lsl #16
     334 	orr	r7, r7, ip, lsr #16
     335 #else
     336 	orr	r4, r4, r5, lsl #16
     337 	mov	r5, r5, lsr #16
     338 	orr	r5, r5, r6, lsl #16
     339 	mov	r6, r6, lsr #16
     340 	orr	r6, r6, r7, lsl #16
     341 	mov	r7, r7, lsr #16
     342 	orr	r7, r7, ip, lsl #16
     343 #endif
     344 	str	r4, [r3], #0x04
     345 	str	r5, [r3], #0x04
     346 	str	r6, [r3], #0x04
     347 	str	r7, [r3], #0x04
     348 	sub	r2, r2, #0x10
     349 
     350 .Lmemcpy_bad2:
     351 	cmp	r2, #0x20
     352 	bhs	.Lmemcpy_bad2_loop16
     353 	cmp	r2, #0x10
     354 	blo	.Lmemcpy_bad2_loop16_short
     355 
     356 	/* copy last 16 bytes (without preload) */
     357 #ifdef __ARMEB__
     358 	mov	r4, ip, lsl #16
     359 #else
     360 	mov	r4, ip, lsr #16
     361 #endif
     362 	ldr	r5, [r1], #0x04
     363 	ldr	r6, [r1], #0x04
     364 	ldr	r7, [r1], #0x04
     365 	ldr	ip, [r1], #0x04
     366 #ifdef __ARMEB__
     367 	orr	r4, r4, r5, lsr #16
     368 	mov	r5, r5, lsl #16
     369 	orr	r5, r5, r6, lsr #16
     370 	mov	r6, r6, lsl #16
     371 	orr	r6, r6, r7, lsr #16
     372 	mov	r7, r7, lsl #16
     373 	orr	r7, r7, ip, lsr #16
     374 #else
     375 	orr	r4, r4, r5, lsl #16
     376 	mov	r5, r5, lsr #16
     377 	orr	r5, r5, r6, lsl #16
     378 	mov	r6, r6, lsr #16
     379 	orr	r6, r6, r7, lsl #16
     380 	mov	r7, r7, lsr #16
     381 	orr	r7, r7, ip, lsl #16
     382 #endif
     383 	str	r4, [r3], #0x04
     384 	str	r5, [r3], #0x04
     385 	str	r6, [r3], #0x04
     386 	str	r7, [r3], #0x04
     387 	subs	r2, r2, #0x10
     388 	popeq	{r4-r7}
     389 	RETc(eq)			/* Return now if done */
     390 
     391 .Lmemcpy_bad2_loop16_short:
     392 	subs	r2, r2, #0x04
     393 	sublo	r1, r1, #0x02		/* rewind src to its true byte position */
     394 	blo	.Lmemcpy_bad_done
     395 
     396 .Lmemcpy_bad2_loop4:
     397 #ifdef __ARMEB__
     398 	mov	r4, ip, lsl #16
     399 #else
     400 	mov	r4, ip, lsr #16
     401 #endif
     402 	ldr	ip, [r1], #0x04
     403 	subs	r2, r2, #0x04
     404 #ifdef __ARMEB__
     405 	orr	r4, r4, ip, lsr #16
     406 #else
     407 	orr	r4, r4, ip, lsl #16
     408 #endif
     409 	str	r4, [r3], #0x04
     410 	bhs	.Lmemcpy_bad2_loop4
     411 	sub	r1, r1, #0x02		/* rewind src to its true byte position */
     412 	b	.Lmemcpy_bad_done
    413 
	/* src is 3 bytes past a word boundary: splice with 24/8-bit shifts */
     414 .Lmemcpy_bad3_loop16:
     415 #ifdef __ARMEB__
     416 	mov	r4, ip, lsl #24
     417 #else
     418 	mov	r4, ip, lsr #24
     419 #endif
     420 	ldr	r5, [r1], #0x04
     421 	pld	[r1, #0x018]
     422 	ldr	r6, [r1], #0x04
     423 	ldr	r7, [r1], #0x04
     424 	ldr	ip, [r1], #0x04
     425 #ifdef __ARMEB__
     426 	orr	r4, r4, r5, lsr #8
     427 	mov	r5, r5, lsl #24
     428 	orr	r5, r5, r6, lsr #8
     429 	mov	r6, r6, lsl #24
     430 	orr	r6, r6, r7, lsr #8
     431 	mov	r7, r7, lsl #24
     432 	orr	r7, r7, ip, lsr #8
     433 #else
     434 	orr	r4, r4, r5, lsl #8
     435 	mov	r5, r5, lsr #24
     436 	orr	r5, r5, r6, lsl #8
     437 	mov	r6, r6, lsr #24
     438 	orr	r6, r6, r7, lsl #8
     439 	mov	r7, r7, lsr #24
     440 	orr	r7, r7, ip, lsl #8
     441 #endif
     442 	str	r4, [r3], #0x04
     443 	str	r5, [r3], #0x04
     444 	str	r6, [r3], #0x04
     445 	str	r7, [r3], #0x04
     446 	sub	r2, r2, #0x10
     447 
     448 .Lmemcpy_bad3:
     449 	cmp	r2, #0x20
     450 	bhs	.Lmemcpy_bad3_loop16
     451 	cmp	r2, #0x10
     452 	blo	.Lmemcpy_bad3_loop16_short
     453 
     454 	/* copy last 16 bytes (without preload) */
     455 #ifdef __ARMEB__
     456 	mov	r4, ip, lsl #24
     457 #else
     458 	mov	r4, ip, lsr #24
     459 #endif
     460 	ldr	r5, [r1], #0x04
     461 	ldr	r6, [r1], #0x04
     462 	ldr	r7, [r1], #0x04
     463 	ldr	ip, [r1], #0x04
     464 #ifdef __ARMEB__
     465 	orr	r4, r4, r5, lsr #8
     466 	mov	r5, r5, lsl #24
     467 	orr	r5, r5, r6, lsr #8
     468 	mov	r6, r6, lsl #24
     469 	orr	r6, r6, r7, lsr #8
     470 	mov	r7, r7, lsl #24
     471 	orr	r7, r7, ip, lsr #8
     472 #else
     473 	orr	r4, r4, r5, lsl #8
     474 	mov	r5, r5, lsr #24
     475 	orr	r5, r5, r6, lsl #8
     476 	mov	r6, r6, lsr #24
     477 	orr	r6, r6, r7, lsl #8
     478 	mov	r7, r7, lsr #24
     479 	orr	r7, r7, ip, lsl #8
     480 #endif
     481 	str	r4, [r3], #0x04
     482 	str	r5, [r3], #0x04
     483 	str	r6, [r3], #0x04
     484 	str	r7, [r3], #0x04
     485 	subs	r2, r2, #0x10
     486 	popeq	{r4-r7}
     487 	RETc(eq)			/* Return now if done */
     488 
     489 .Lmemcpy_bad3_loop16_short:
     490 	subs	r2, r2, #0x04
     491 	sublo	r1, r1, #0x01		/* rewind src to its true byte position */
     492 	blo	.Lmemcpy_bad_done
     493 
     494 .Lmemcpy_bad3_loop4:
     495 #ifdef __ARMEB__
     496 	mov	r4, ip, lsl #24
     497 #else
     498 	mov	r4, ip, lsr #24
     499 #endif
     500 	ldr	ip, [r1], #0x04
     501 	subs	r2, r2, #0x04
     502 #ifdef __ARMEB__
     503 	orr	r4, r4, ip, lsr #8
     504 #else
     505 	orr	r4, r4, ip, lsl #8
     506 #endif
     507 	str	r4, [r3], #0x04
     508 	bhs	.Lmemcpy_bad3_loop4
     509 	sub	r1, r1, #0x01		/* rewind src to its true byte position */
     510 
	/*
	 * Common tail for the bad-alignment paths.  r2 was left 4 short
	 * by the loop above; restore it, then copy the 0-3 final bytes
	 * (2nd byte if r2 >= 2, 3rd if r2 > 2).
	 */
     511 .Lmemcpy_bad_done:
     512 	pop	{r4-r7}
     513 	adds	r2, r2, #0x04
     514 	RETc(eq)
     515 	ldrb	ip, [r1], #0x01
     516 	cmp	r2, #0x02
     517 	ldrbhs	r2, [r1], #0x01
     518 	strb	ip, [r3], #0x01
     519 	ldrbhi	ip, [r1]
     520 	strbhs	r2, [r3], #0x01
     521 	strbhi	ip, [r3]
     522 	RET
    523 
    524 
     525 /*
     526  * Handle short copies (less than 16 bytes), possibly misaligned.
     527  * Some of these are *very* common, thanks to the network stack,
     528  * and so are handled specially.
     529  */
     530 .Lmemcpy_short:
     531 #ifndef _STANDALONE
	/*
	 * Jump table: each entry is a single 4-byte instruction, so
	 * pc + (r2 << 2) lands on entry r2 (pc reads two instructions
	 * ahead; the nop pads the table base).  r2 <= 12 is guaranteed
	 * by the "bls .Lmemcpy_short" at the function entry.
	 */
     532 	add	pc, pc, r2, lsl #2
     533 	nop
     534 	RET				/* 0x00 */
     535 	b	.Lmemcpy_bytewise	/* 0x01 */
     536 	b	.Lmemcpy_bytewise	/* 0x02 */
     537 	b	.Lmemcpy_bytewise	/* 0x03 */
     538 	b	.Lmemcpy_4		/* 0x04 */
     539 	b	.Lmemcpy_bytewise	/* 0x05 */
     540 	b	.Lmemcpy_6		/* 0x06 */
     541 	b	.Lmemcpy_bytewise	/* 0x07 */
     542 	b	.Lmemcpy_8		/* 0x08 */
     543 	b	.Lmemcpy_bytewise	/* 0x09 */
     544 	b	.Lmemcpy_bytewise	/* 0x0a */
     545 	b	.Lmemcpy_bytewise	/* 0x0b */
     546 	b	.Lmemcpy_c		/* 0x0c */
     547 #endif
	/* Generic 1-byte-at-a-time copy; entered with r2 >= 1 */
     548 .Lmemcpy_bytewise:
     549 	mov	r3, r0			/* We must not clobber r0 */
     550 	ldrb	ip, [r1], #0x01
     551 1:	subs	r2, r2, #0x01
     552 	strb	ip, [r3], #0x01
     553 	ldrbne	ip, [r1], #0x01
     554 	bne	1b
     555 	RET
    556 
     557 #ifndef _STANDALONE
     558 /******************************************************************************
     559  * Special case for 4 byte copies
     560  */
     561 #define	LMEMCPY_4_LOG2	6	/* 64 bytes */
     562 #define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
     563 	LMEMCPY_4_PAD
     564 .Lmemcpy_4:
	/*
	 * Dispatch on r2 = (dst & 3) << 2 | (src & 3).  r3 = pc - 0x14 is
	 * the address of .Lmemcpy_4 itself; since that label is 64-byte
	 * aligned and every case below is padded to 64 bytes, case N is
	 * at r3 + (N << LMEMCPY_4_LOG2).  Case 0000 simply falls through.
	 */
     565 	and	r2, r1, #0x03
     566 	orr	r2, r2, r0, lsl #2
     567 	ands	r2, r2, #0x0f
     568 	sub	r3, pc, #0x14
     569 	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
     570 
     571 /*
     572  * 0000: dst is 32-bit aligned, src is 32-bit aligned
     573  */
     574 	ldr	r2, [r1]
     575 	str	r2, [r0]
     576 	RET
     577 	LMEMCPY_4_PAD
     578 
     579 /*
     580  * 0001: dst is 32-bit aligned, src is 8-bit aligned
     581  */
     582 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
     583 	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
     584 #ifdef __ARMEB__
     585 	mov	r3, r3, lsl #8		/* r3 = 012. */
     586 	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
     587 #else
     588 	mov	r3, r3, lsr #8		/* r3 = .210 */
     589 	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
     590 #endif
     591 	str	r3, [r0]
     592 	RET
     593 	LMEMCPY_4_PAD
     594 
     595 /*
     596  * 0010: dst is 32-bit aligned, src is 16-bit aligned
     597  */
     598 #ifdef __ARMEB__
     599 	ldrh	r3, [r1]
     600 	ldrh	r2, [r1, #0x02]
     601 #else
     602 	ldrh	r3, [r1, #0x02]
     603 	ldrh	r2, [r1]
     604 #endif
     605 	orr	r3, r2, r3, lsl #16
     606 	str	r3, [r0]
     607 	RET
     608 	LMEMCPY_4_PAD
     609 
     610 /*
     611  * 0011: dst is 32-bit aligned, src is 8-bit aligned
     612  */
     613 	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
     614 	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
     615 #ifdef __ARMEB__
     616 	mov	r3, r3, lsl #24		/* r3 = 0... */
     617 	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
     618 #else
     619 	mov	r3, r3, lsr #24		/* r3 = ...0 */
     620 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
     621 #endif
     622 	str	r3, [r0]
     623 	RET
     624 	LMEMCPY_4_PAD
     625 
     626 /*
     627  * 0100: dst is 8-bit aligned, src is 32-bit aligned
     628  */
     629 	ldr	r2, [r1]
     630 #ifdef __ARMEB__
     631 	strb	r2, [r0, #0x03]
     632 	mov	r3, r2, lsr #8
     633 	mov	r1, r2, lsr #24
     634 	strb	r1, [r0]
     635 #else
     636 	strb	r2, [r0]
     637 	mov	r3, r2, lsr #8
     638 	mov	r1, r2, lsr #24
     639 	strb	r1, [r0, #0x03]
     640 #endif
     641 	strh	r3, [r0, #0x01]
     642 	RET
     643 	LMEMCPY_4_PAD
     644 
     645 /*
     646  * 0101: dst is 8-bit aligned, src is 8-bit aligned
     647  */
     648 	ldrb	r2, [r1]
     649 	ldrh	r3, [r1, #0x01]
     650 	ldrb	r1, [r1, #0x03]
     651 	strb	r2, [r0]
     652 	strh	r3, [r0, #0x01]
     653 	strb	r1, [r0, #0x03]
     654 	RET
     655 	LMEMCPY_4_PAD
     656 
     657 /*
     658  * 0110: dst is 8-bit aligned, src is 16-bit aligned
     659  */
     660 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
     661 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
     662 #ifdef __ARMEB__
     663 	mov	r1, r2, lsr #8		/* r1 = ...0 */
     664 	strb	r1, [r0]
     665 	mov	r2, r2, lsl #8		/* r2 = .01. */
     666 	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
     667 #else
     668 	strb	r2, [r0]
     669 	mov	r2, r2, lsr #8		/* r2 = ...1 */
     670 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
     671 	mov	r3, r3, lsr #8		/* r3 = ...3 */
     672 #endif
     673 	strh	r2, [r0, #0x01]
     674 	strb	r3, [r0, #0x03]
     675 	RET
     676 	LMEMCPY_4_PAD
     677 
     678 /*
     679  * 0111: dst is 8-bit aligned, src is 8-bit aligned
     680  */
     681 	ldrb	r2, [r1]
     682 	ldrh	r3, [r1, #0x01]
     683 	ldrb	r1, [r1, #0x03]
     684 	strb	r2, [r0]
     685 	strh	r3, [r0, #0x01]
     686 	strb	r1, [r0, #0x03]
     687 	RET
     688 	LMEMCPY_4_PAD
     689 
     690 /*
     691  * 1000: dst is 16-bit aligned, src is 32-bit aligned
     692  */
     693 	ldr	r2, [r1]
     694 #ifdef __ARMEB__
     695 	strh	r2, [r0, #0x02]
     696 	mov	r3, r2, lsr #16
     697 	strh	r3, [r0]
     698 #else
     699 	strh	r2, [r0]
     700 	mov	r3, r2, lsr #16
     701 	strh	r3, [r0, #0x02]
     702 #endif
     703 	RET
     704 	LMEMCPY_4_PAD
     705 
     706 /*
     707  * 1001: dst is 16-bit aligned, src is 8-bit aligned
     708  */
     709 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
     710 	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
     711 	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
     712 	strh	r1, [r0]
     713 #ifdef __ARMEB__
     714 	mov	r2, r2, lsl #8		/* r2 = 012. */
     715 	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
     716 #else
     717 	mov	r2, r2, lsr #24		/* r2 = ...2 */
     718 	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
     719 #endif
     720 	strh	r2, [r0, #0x02]
     721 	RET
     722 	LMEMCPY_4_PAD
     723 
     724 /*
     725  * 1010: dst is 16-bit aligned, src is 16-bit aligned
     726  */
     727 	ldrh	r2, [r1]
     728 	ldrh	r3, [r1, #0x02]
     729 	strh	r2, [r0]
     730 	strh	r3, [r0, #0x02]
     731 	RET
     732 	LMEMCPY_4_PAD
     733 
     734 /*
     735  * 1011: dst is 16-bit aligned, src is 8-bit aligned
     736  */
     737 	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
     738 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
     739 	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
     740 	strh	r1, [r0, #0x02]
     741 #ifdef __ARMEB__
     742 	mov	r3, r3, lsr #24		/* r3 = ...1 */
     743 	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
     744 #else
     745 	mov	r3, r3, lsl #8		/* r3 = 321. */
     746 	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
     747 #endif
     748 	strh	r3, [r0]
     749 	RET
     750 	LMEMCPY_4_PAD
     751 
     752 /*
     753  * 1100: dst is 8-bit aligned, src is 32-bit aligned
     754  */
     755 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
     756 #ifdef __ARMEB__
     757 	strb	r2, [r0, #0x03]
     758 	mov	r3, r2, lsr #8
     759 	mov	r1, r2, lsr #24
     760 	strh	r3, [r0, #0x01]
     761 	strb	r1, [r0]
     762 #else
     763 	strb	r2, [r0]
     764 	mov	r3, r2, lsr #8
     765 	mov	r1, r2, lsr #24
     766 	strh	r3, [r0, #0x01]
     767 	strb	r1, [r0, #0x03]
     768 #endif
     769 	RET
     770 	LMEMCPY_4_PAD
     771 
     772 /*
     773  * 1101: dst is 8-bit aligned, src is 8-bit aligned
     774  */
     775 	ldrb	r2, [r1]
     776 	ldrh	r3, [r1, #0x01]
     777 	ldrb	r1, [r1, #0x03]
     778 	strb	r2, [r0]
     779 	strh	r3, [r0, #0x01]
     780 	strb	r1, [r0, #0x03]
     781 	RET
     782 	LMEMCPY_4_PAD
     783 
     784 /*
     785  * 1110: dst is 8-bit aligned, src is 16-bit aligned
     786  */
     787 #ifdef __ARMEB__
     788 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
     789 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
     790 	strb	r3, [r0, #0x03]
     791 	mov	r3, r3, lsr #8		/* r3 = ...2 */
     792 	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
     793 	strh	r3, [r0, #0x01]
     794 	mov	r2, r2, lsr #8		/* r2 = ...0 */
     795 	strb	r2, [r0]
     796 #else
     797 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
     798 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
     799 	strb	r2, [r0]
     800 	mov	r2, r2, lsr #8		/* r2 = ...1 */
     801 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
     802 	strh	r2, [r0, #0x01]
     803 	mov	r3, r3, lsr #8		/* r3 = ...3 */
     804 	strb	r3, [r0, #0x03]
     805 #endif
     806 	RET
     807 	LMEMCPY_4_PAD
     808 
     809 /*
     810  * 1111: dst is 8-bit aligned, src is 8-bit aligned
     811  */
     812 	ldrb	r2, [r1]
     813 	ldrh	r3, [r1, #0x01]
     814 	ldrb	r1, [r1, #0x03]
     815 	strb	r2, [r0]
     816 	strh	r3, [r0, #0x01]
     817 	strb	r1, [r0, #0x03]
     818 	RET
     819 	LMEMCPY_4_PAD
    820 
    821 
     822 /******************************************************************************
     823  * Special case for 6 byte copies
     824  */
     825 #define	LMEMCPY_6_LOG2	6	/* 64 bytes */
     826 #define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
     827 	LMEMCPY_6_PAD
     828 .Lmemcpy_6:
	/*
	 * Same dispatch scheme as .Lmemcpy_4: r2 = (dst & 3) << 2 |
	 * (src & 3); r3 = pc - 0x14 = address of .Lmemcpy_6 (64-byte
	 * aligned), and each case below is padded to 64 bytes, so case
	 * N is at r3 + (N << LMEMCPY_6_LOG2).  Case 0000 falls through.
	 */
     829 	and	r2, r1, #0x03
     830 	orr	r2, r2, r0, lsl #2
     831 	ands	r2, r2, #0x0f
     832 	sub	r3, pc, #0x14
     833 	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
     834 
     835 /*
     836  * 0000: dst is 32-bit aligned, src is 32-bit aligned
     837  */
     838 	ldr	r2, [r1]
     839 	ldrh	r3, [r1, #0x04]
     840 	str	r2, [r0]
     841 	strh	r3, [r0, #0x04]
     842 	RET
     843 	LMEMCPY_6_PAD
     844 
     845 /*
     846  * 0001: dst is 32-bit aligned, src is 8-bit aligned
     847  */
     848 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
     849 	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
     850 #ifdef __ARMEB__
     851 	mov	r2, r2, lsl #8		/* r2 = 012. */
     852 	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
     853 #else
     854 	mov	r2, r2, lsr #8		/* r2 = .210 */
     855 	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
     856 #endif
     857 	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
     858 	str	r2, [r0]
     859 	strh	r3, [r0, #0x04]
     860 	RET
     861 	LMEMCPY_6_PAD
     862 
     863 /*
     864  * 0010: dst is 32-bit aligned, src is 16-bit aligned
     865  */
     866 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
     867 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
     868 #ifdef __ARMEB__
     869 	mov	r1, r3, lsr #16		/* r1 = ..23 */
     870 	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
     871 	str	r1, [r0]
     872 	strh	r3, [r0, #0x04]
     873 #else
     874 	mov	r1, r3, lsr #16		/* r1 = ..54 */
     875 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
     876 	str	r2, [r0]
     877 	strh	r1, [r0, #0x04]
     878 #endif
     879 	RET
     880 	LMEMCPY_6_PAD
     881 
     882 /*
     883  * 0011: dst is 32-bit aligned, src is 8-bit aligned
     884  */
     885 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
     886 	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
     887 	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
     888 #ifdef __ARMEB__
     889 	mov	r2, r2, lsl #24		/* r2 = 0... */
     890 	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
     891 	mov	r3, r3, lsl #8		/* r3 = 234. */
     892 	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
     893 #else
     894 	mov	r2, r2, lsr #24		/* r2 = ...0 */
     895 	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
     896 	mov	r1, r1, lsl #8		/* r1 = xx5. */
     897 	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
     898 #endif
     899 	str	r2, [r0]
     900 	strh	r1, [r0, #0x04]
     901 	RET
     902 	LMEMCPY_6_PAD
     903 
     904 /*
     905  * 0100: dst is 8-bit aligned, src is 32-bit aligned
     906  */
     907 	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
     908 	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
     909 	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
     910 	strh	r1, [r0, #0x01]
     911 #ifdef __ARMEB__
     912 	mov	r1, r3, lsr #24		/* r1 = ...0 */
     913 	strb	r1, [r0]
     914 	mov	r3, r3, lsl #8		/* r3 = 123. */
     915 	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
     916 #else
     917 	strb	r3, [r0]
     918 	mov	r3, r3, lsr #24		/* r3 = ...3 */
     919 	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
     920 	mov	r2, r2, lsr #8		/* r2 = ...5 */
     921 #endif
     922 	strh	r3, [r0, #0x03]
     923 	strb	r2, [r0, #0x05]
     924 	RET
     925 	LMEMCPY_6_PAD
     926 
     927 /*
     928  * 0101: dst is 8-bit aligned, src is 8-bit aligned
     929  */
     930 	ldrb	r2, [r1]
     931 	ldrh	r3, [r1, #0x01]
     932 	ldrh	ip, [r1, #0x03]
     933 	ldrb	r1, [r1, #0x05]
     934 	strb	r2, [r0]
     935 	strh	r3, [r0, #0x01]
     936 	strh	ip, [r0, #0x03]
     937 	strb	r1, [r0, #0x05]
     938 	RET
     939 	LMEMCPY_6_PAD
     940 
     941 /*
     942  * 0110: dst is 8-bit aligned, src is 16-bit aligned
     943  */
     944 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
     945 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
     946 #ifdef __ARMEB__
     947 	mov	r3, r2, lsr #8		/* r3 = ...0 */
     948 	strb	r3, [r0]
     949 	strb	r1, [r0, #0x05]
     950 	mov	r3, r1, lsr #8		/* r3 = .234 */
     951 	strh	r3, [r0, #0x03]
     952 	mov	r3, r2, lsl #8		/* r3 = .01. */
     953 	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
     954 	strh	r3, [r0, #0x01]
     955 #else
     956 	strb	r2, [r0]
     957 	mov	r3, r1, lsr #24
     958 	strb	r3, [r0, #0x05]
     959 	mov	r3, r1, lsr #8		/* r3 = .543 */
     960 	strh	r3, [r0, #0x03]
     961 	mov	r3, r2, lsr #8		/* r3 = ...1 */
     962 	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
     963 	strh	r3, [r0, #0x01]
     964 #endif
     965 	RET
     966 	LMEMCPY_6_PAD
     967 
     968 /*
     969  * 0111: dst is 8-bit aligned, src is 8-bit aligned
     970  */
     971 	ldrb	r2, [r1]
     972 	ldrh	r3, [r1, #0x01]
     973 	ldrh	ip, [r1, #0x03]
     974 	ldrb	r1, [r1, #0x05]
     975 	strb	r2, [r0]
     976 	strh	r3, [r0, #0x01]
     977 	strh	ip, [r0, #0x03]
     978 	strb	r1, [r0, #0x05]
     979 	RET
     980 	LMEMCPY_6_PAD
     981 
     982 /*
     983  * 1000: dst is 16-bit aligned, src is 32-bit aligned
     984  */
     985 #ifdef __ARMEB__
     986 	ldr	r2, [r1]		/* r2 = 0123 */
     987 	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
     988 	mov	r1, r2, lsr #16		/* r1 = ..01 */
     989 	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
     990 	strh	r1, [r0]
     991 	str	r3, [r0, #0x02]
     992 #else
     993 	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
     994 	ldr	r3, [r1]		/* r3 = 3210 */
     995 	mov	r2, r2, lsl #16		/* r2 = 54.. */
     996 	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
     997 	strh	r3, [r0]
     998 	str	r2, [r0, #0x02]
     999 #endif
    1000 	RET
    1001 	LMEMCPY_6_PAD
    1002 
    1003 /*
    1004  * 1001: dst is 16-bit aligned, src is 8-bit aligned
    1005  */
    1006 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
    1007 	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
    1008 	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
    1009 #ifdef __ARMEB__
    1010 	mov	r2, r2, lsr #8		/* r2 = .345 */
    1011 	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
    1012 #else
    1013 	mov	r2, r2, lsl #8		/* r2 = 543. */
    1014 	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
    1015 #endif
    1016 	strh	r1, [r0]
    1017 	str	r2, [r0, #0x02]
    1018 	RET
    1019 	LMEMCPY_6_PAD
    1020 
    1021 /*
    1022  * 1010: dst is 16-bit aligned, src is 16-bit aligned
    1023  */
    1024 	ldrh	r2, [r1]
    1025 	ldr	r3, [r1, #0x02]
    1026 	strh	r2, [r0]
    1027 	str	r3, [r0, #0x02]
    1028 	RET
    1029 	LMEMCPY_6_PAD
    1030 
    1031 /*
    1032  * 1011: dst is 16-bit aligned, src is 8-bit aligned
    1033  */
    1034 	ldrb	r3, [r1]		/* r3 = ...0 */
    1035 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
    1036 	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
    1037 #ifdef __ARMEB__
    1038 	mov	r3, r3, lsl #8		/* r3 = ..0. */
    1039 	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
    1040 	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
    1041 #else
    1042 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
    1043 	mov	r1, r1, lsl #24		/* r1 = 5... */
    1044 	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
    1045 #endif
    1046 	strh	r3, [r0]
    1047 	str	r1, [r0, #0x02]
    1048 	RET
    1049 	LMEMCPY_6_PAD
    1050 
    1051 /*
    1052  * 1100: dst is 8-bit aligned, src is 32-bit aligned
    1053  */
    1054 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
    1055 	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
    1056 #ifdef __ARMEB__
    1057 	mov	r3, r2, lsr #24		/* r3 = ...0 */
    1058 	strb	r3, [r0]
    1059 	mov	r2, r2, lsl #8		/* r2 = 123. */
    1060 	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
    1061 #else
    1062 	strb	r2, [r0]
    1063 	mov	r2, r2, lsr #8		/* r2 = .321 */
    1064 	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
    1065 	mov	r1, r1, lsr #8		/* r1 = ...5 */
    1066 #endif
    1067 	str	r2, [r0, #0x01]
    1068 	strb	r1, [r0, #0x05]
    1069 	RET
    1070 	LMEMCPY_6_PAD
    1071 
    1072 /*
    1073  * 1101: dst is 8-bit aligned, src is 8-bit aligned
    1074  */
    1075 	ldrb	r2, [r1]
    1076 	ldrh	r3, [r1, #0x01]
    1077 	ldrh	ip, [r1, #0x03]
    1078 	ldrb	r1, [r1, #0x05]
    1079 	strb	r2, [r0]
    1080 	strh	r3, [r0, #0x01]
    1081 	strh	ip, [r0, #0x03]
    1082 	strb	r1, [r0, #0x05]
    1083 	RET
    1084 	LMEMCPY_6_PAD
   1085 
   1086 /*
   1087  * 1110: dst is 8-bit aligned, src is 16-bit aligned
   1088  */
   1089 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
   1090 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
   1091 #ifdef __ARMEB__
   1092 	mov	r3, r2, lsr #8		/* r3 = ...0 */
   1093 	strb	r3, [r0]
   1094 	mov	r2, r2, lsl #24		/* r2 = 1... */
   1095 	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
   1096 #else
   1097 	strb	r2, [r0]
   1098 	mov	r2, r2, lsr #8		/* r2 = ...1 */
   1099 	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
   1100 	mov	r1, r1, lsr #24		/* r1 = ...5 */
   1101 #endif
   1102 	str	r2, [r0, #0x01]
   1103 	strb	r1, [r0, #0x05]
   1104 	RET
   1105 	LMEMCPY_6_PAD
   1106 
   1107 /*
   1108  * 1111: dst is 8-bit aligned, src is 8-bit aligned
   1109  */
   1110 	ldrb	r2, [r1]
   1111 	ldr	r3, [r1, #0x01]
   1112 	ldrb	r1, [r1, #0x05]
   1113 	strb	r2, [r0]
   1114 	str	r3, [r0, #0x01]
   1115 	strb	r1, [r0, #0x05]
   1116 	RET
   1117 	LMEMCPY_6_PAD
   1118 
   1119 
/******************************************************************************
 * Special case for 8 byte copies
 *
 * A computed goto dispatches on the low two bits of src (r1) and dst (r0):
 * the 16 alignment combinations live in consecutive 2^LMEMCPY_8_LOG2-byte
 * slots after the five dispatch instructions, each slot padded out by
 * LMEMCPY_8_PAD, so no case may outgrow its slot.  Case "wxyz" (binary)
 * handles dst & 3 == wx and src & 3 == yz.  In the per-line comments the
 * digits 0-7 name the eight source bytes in memory order; '.' and 'x' mark
 * don't-care bytes.
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f		/* r2 = ((dst & 3) << 2) | (src & 3) */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_8 (pc reads as . + 8) */
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* case 0 falls through */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]		/* 16-bit aligned: src & 3 == 1 */
	ldr	ip, [r1, #0x03]		/* 32-bit aligned */
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
#ifdef __ARMEB__
	strh	r3, [r0, #0x01]
	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
#else
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
#endif
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, r3, lsr #16		/* r1 = ..45 */
	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
#else
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
#endif
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r1, r2, lsl #24		/* r1 = 2... */
	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
#else
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
#endif
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
	mov	r2, r2, lsr #24		/* r2 = ...1 */
	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
#else
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
#endif
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
#ifdef __ARMEB__
	strb	r3, [r0, #0x07]
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
#endif
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
#ifdef __ARMEB__
	strh	ip, [r0, #0x05]
	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
#else
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #24		/* ip = 1... */
	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
	strb	r1, [r0, #0x07]
	mov	r1, r1, lsr #8		/* r1 = ...6 */
	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
#else
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
#endif
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]		/* 32-bit aligned: src & 3 == 3 */
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]		/* 32-bit aligned: dst & 3 == 3 */
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD
   1458 
/******************************************************************************
 * Special case for 12 byte copies
 *
 * A computed goto dispatches on the low two bits of src (r1) and dst (r0):
 * the 16 alignment combinations live in consecutive 2^LMEMCPY_C_LOG2-byte
 * slots after the five dispatch instructions, each slot padded out by
 * LMEMCPY_C_PAD, so no case may outgrow its slot.  Case "wxyz" (binary)
 * handles dst & 3 == wx and src & 3 == yz.  In the per-line comments the
 * hex digits 0-B name the twelve source bytes in memory order; '.' and 'x'
 * mark don't-care bytes.
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
.Lmemcpy_c:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f		/* r2 = ((dst & 3) << 2) | (src & 3) */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c (pc reads as . + 8) */
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* case 0 falls through */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
#ifdef __ARMEB__
	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsr #24		/* r2 = ...7 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
	mov	r1, r1, lsl #8		/* r1 = 012. */
	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
#else
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #16		/* r3 = 45.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #24		/* r3 = 4... */
	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r2, lsl #24		/* r1 = 3... */
	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
	mov	r1, r3, lsl #24		/* r1 = 7... */
	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
#else
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
#endif
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]		/* 16-bit aligned: src & 3 == 1 */
	ldr	ip, [r1, #0x03]		/* 32-bit aligned */
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, ror #8		/* r2 = 1..0 */
	strb	r2, [r0]
	mov	r2, r2, lsr #16		/* r2 = ..1. */
	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsl #8		/* r2 = 345. */
	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
	mov	r2, ip, lsl #8		/* r2 = 789. */
	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
#endif
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
#ifdef __ARMEB__
	mov	r2, r3, lsr #16		/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsl #16		/* r3 = 34.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
	mov	ip, ip, lsl #16		/* ip = 78.. */
	orr	ip, ip, r1, lsr #16	/* ip = 789A */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
#else
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
#endif
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, ip, lsl #16		/* r1 = 23.. */
	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
	mov	r3, r3, lsl #16		/* r3 = 67.. */
	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
#else
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
#endif
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 2... */
	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
	mov	r3, r3, lsl #24		/* r3 = 6... */
	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
#endif
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #24		/* r2 = ...9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
	mov	r1, r1, lsl #8		/* r1 = ..0. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
#else
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
	mov	r2, ip, lsl #8		/* r2 = 567. */
	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
	str	r2, [r0, #0x05]
	mov	r2, r1, lsr #8		/* r2 = ..9A */
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
#ifdef __ARMEB__
	strh	r3, [r0, #0x09]
	mov	r3, r3, lsr #16		/* r3 = ..78 */
	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
	mov	ip, ip, lsr #16		/* ip = ..34 */
	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
	mov	r1, r1, lsr #16		/* r1 = ..x0 */
#else
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
#endif
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
	ldr	ip, [r1, #0x06]		/* ip = 6789 */
	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
	ldrh	r1, [r1]		/* r1 = ..01 */
	strb	r2, [r0, #0x0b]
	mov	r2, r2, lsr #8		/* r2 = ...A */
	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
	mov	ip, ip, lsr #8		/* ip = .678 */
	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
	mov	r3, r3, lsr #8		/* r3 = .234 */
	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
	mov	r1, r1, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
#else
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* 32-bit aligned: src & 3 == 3 */
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]		/* 32-bit aligned: dst & 3 == 3 */
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
END(memcpy)
#endif	/* !_STANDALONE */
   1895