Home | History | Annotate | Line # | Download | only in string
bcopy.S revision 1.1.4.3
      1  1.1.4.3  martin /* $NetBSD: bcopy.S,v 1.1.4.3 2020/04/21 19:37:41 martin Exp $ */
      2      1.1   skrll 
      3      1.1   skrll /*
      4      1.1   skrll  * Copyright (c) 2018 Ryo Shimizu <ryo (at) nerv.org>
      5      1.1   skrll  * All rights reserved.
      6      1.1   skrll  *
      7      1.1   skrll  * Redistribution and use in source and binary forms, with or without
      8      1.1   skrll  * modification, are permitted provided that the following conditions
      9      1.1   skrll  * are met:
     10      1.1   skrll  * 1. Redistributions of source code must retain the above copyright
     11      1.1   skrll  *    notice, this list of conditions and the following disclaimer.
     12      1.1   skrll  * 2. Redistributions in binary form must reproduce the above copyright
     13      1.1   skrll  *    notice, this list of conditions and the following disclaimer in the
     14      1.1   skrll  *    documentation and/or other materials provided with the distribution.
     15      1.1   skrll  *
     16      1.1   skrll  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17      1.1   skrll  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18      1.1   skrll  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19      1.1   skrll  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20      1.1   skrll  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21      1.1   skrll  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22      1.1   skrll  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23      1.1   skrll  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24      1.1   skrll  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25      1.1   skrll  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26      1.1   skrll  * POSSIBILITY OF SUCH DAMAGE.
     27      1.1   skrll  */
     28      1.1   skrll 
     29      1.1   skrll #include <machine/asm.h>
     30      1.1   skrll 
     31      1.1   skrll #if defined(LIBC_SCCS)
     32  1.1.4.3  martin RCSID("$NetBSD: bcopy.S,v 1.1.4.3 2020/04/21 19:37:41 martin Exp $")
     33      1.1   skrll #endif
     34      1.1   skrll 
     35      1.1   skrll #if defined(MEMCOPY)
     36      1.1   skrll 
     37      1.1   skrll /*
     38      1.1   skrll  * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
     39      1.1   skrll  */
     40      1.1   skrll #define FUNCTION		memcpy
     41      1.1   skrll #define NO_OVERLAP
     42      1.1   skrll #define SRC0			x1
     43      1.1   skrll #define DST0			x0
     44      1.1   skrll #define LEN			x2
     45      1.1   skrll 
     46      1.1   skrll #elif defined(MEMMOVE)
     47      1.1   skrll 
     48      1.1   skrll /*
     49      1.1   skrll  * void *memmove(void *dst, const void *src, size_t len);
     50      1.1   skrll  */
     51      1.1   skrll #define FUNCTION		memmove
     52      1.1   skrll #undef NO_OVERLAP
     53      1.1   skrll #define SRC0			x1
     54      1.1   skrll #define DST0			x0
     55      1.1   skrll #define LEN			x2
     56      1.1   skrll 
     57      1.1   skrll #else /* !MEMCOPY && !MEMMOVE */
     58      1.1   skrll 
     59      1.1   skrll /*
     60      1.1   skrll  * void bcopy(const void *src, void *dst, size_t len);
     61      1.1   skrll  */
     62      1.1   skrll #define FUNCTION		bcopy
     63      1.1   skrll #define NO_OVERLAP
     64      1.1   skrll #define SRC0			x0
     65      1.1   skrll #define DST0			x1
     66      1.1   skrll #define LEN			x2
     67      1.1   skrll 
     68      1.1   skrll #endif /* MEMCOPY/MEMMOVE/BCOPY */
     69      1.1   skrll 
     70      1.1   skrll /* caller-saved temporary registers. breakable. */
     71      1.1   skrll #define TMP_X			x3
     72      1.1   skrll #define TMP_Xw			w3
     73      1.1   skrll #define TMP_D			x4
     74      1.1   skrll #define TMP_S			x5
     75      1.1   skrll #define DST			x6
     76      1.1   skrll #define SRC			x7
     77      1.1   skrll #define DATA0			x8
     78      1.1   skrll #define DATA0w			w8
     79      1.1   skrll #define DATA1			x9
     80      1.1   skrll #define DATA1w			w9
     81      1.1   skrll #define DATA2			x10
     82      1.1   skrll #define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
     83      1.1   skrll #define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
     84      1.1   skrll #define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
     85      1.1   skrll #define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */
     86      1.1   skrll 
     87      1.1   skrll #define STP_ALIGN		16	/* align before stp/ldp. 8 or 16 */
     88      1.1   skrll #define SMALLSIZE		32
     89      1.1   skrll 
     90      1.1   skrll 	.text
     91      1.1   skrll 	.align	5
     92      1.1   skrll 
     93      1.1   skrll #ifndef NO_OVERLAP
     94      1.1   skrll #ifndef STRICT_ALIGNMENT
      95      1.1   skrll backward_ignore_align:
	/*
	 * Backward (descending) copy used when overlap is possible
	 * (memmove) and unaligned access is permitted (!STRICT_ALIGNMENT).
	 * In: SRC0 = src, DST0 = dst, LEN = len.
	 * SRC0 and DST are advanced one past the end of each buffer and
	 * the copy runs from high addresses down; DST0 (memmove's return
	 * value in x0) is never modified.
	 */
      96      1.1   skrll 	prfm	PLDL1KEEP, [SRC0]
      97      1.1   skrll 	add	SRC0, SRC0, LEN
      98      1.1   skrll 	add	DST, DST0, LEN
      99      1.1   skrll 	cmp	LEN, #SMALLSIZE
     100      1.1   skrll 	bcs	copy_backward
     101      1.1   skrll copy_backward_small:
	/*
	 * LEN < SMALLSIZE(32): classify by size (bcs = unsigned >=),
	 * then copy the remainder with descending power-of-two accesses
	 * selected by the bits of LEN.
	 */
     102      1.1   skrll 	cmp	LEN, #8
     103      1.1   skrll 	bcs	9f
     104      1.1   skrll 
     105      1.1   skrll 	/* 0 <= len < 8 */
     106      1.1   skrll 	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     107      1.1   skrll 	tbz	LEN, #2, 1f
     108      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     109      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     110      1.1   skrll 1:
     111      1.1   skrll 	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     112      1.1   skrll 	tbz	LEN, #1, 1f
     113      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     114      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     115      1.1   skrll 1:
     116      1.1   skrll 	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     117      1.1   skrll 	tbz	LEN, #0, 1f
     118      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     119      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     120      1.1   skrll 1:
     121      1.1   skrll 	ret
     122      1.1   skrll 9:
     123      1.1   skrll 
     124      1.1   skrll 	cmp	LEN, #16
     125      1.1   skrll 	bcs	9f
     126      1.1   skrll 
     127      1.1   skrll 	/* 8 <= len < 16 */
     128      1.1   skrll 	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
     129      1.1   skrll 	ldr	TMP_X, [SRC0, #-8]!
     130      1.1   skrll 	str	TMP_X, [DST, #-8]!
     131      1.1   skrll 	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     132      1.1   skrll 	tbz	LEN, #2, 1f
     133      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     134      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     135      1.1   skrll 1:
     136      1.1   skrll 	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     137      1.1   skrll 	tbz	LEN, #1, 1f
     138      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     139      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     140      1.1   skrll 1:
     141      1.1   skrll 	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     142      1.1   skrll 	tbz	LEN, #0, 1f
     143      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     144      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     145      1.1   skrll 1:
     146      1.1   skrll 	ret
     147      1.1   skrll 9:
     148      1.1   skrll 
     149      1.1   skrll 	/* 16 <= len < 32 */
     150      1.1   skrll 	ldp	DATA0, DATA1, [SRC0, #-16]!
     151      1.1   skrll 	stp	DATA0, DATA1, [DST, #-16]!
     152      1.1   skrll 	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
     153      1.1   skrll 	tbz	LEN, #3, 1f
     154      1.1   skrll 	ldr	TMP_X, [SRC0, #-8]!
     155      1.1   skrll 	str	TMP_X, [DST, #-8]!
     156      1.1   skrll 1:
     157      1.1   skrll 	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     158      1.1   skrll 	tbz	LEN, #2, 1f
     159      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     160      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     161      1.1   skrll 1:
     162      1.1   skrll 	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     163      1.1   skrll 	tbz	LEN, #1, 1f
     164      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     165      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     166      1.1   skrll 1:
     167      1.1   skrll 	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     168      1.1   skrll 	tbz	LEN, #0, 1f
     169      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     170      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     171      1.1   skrll 1:
     172      1.1   skrll 	ret
    173      1.1   skrll #endif /* !STRICT_ALIGNMENT */
    174      1.1   skrll 
     175      1.1   skrll 	.align	4
     176      1.1   skrll copy_backward:
	/*
	 * Backward bulk copy.  On entry SRC0 and DST point one past the
	 * end of the buffers.  Strategy: align DST down to STP_ALIGN with
	 * byte/half/word(/dword) stores, stream 1 KiB blocks of ldp/stp,
	 * then finish with power-of-two tails selected by the bits of LEN,
	 * largest to smallest.
	 */
     177      1.1   skrll 	/* DST is not aligned at this point */
     178      1.1   skrll #ifndef STRICT_ALIGNMENT
     179      1.1   skrll 	cmp	LEN, #512	/* pre-alignment can be overhead when small */
     180      1.1   skrll 	bcc	9f
     181      1.1   skrll #endif
     182      1.1   skrll 	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     183      1.1   skrll 	tbz	DST, #0, 1f
     184      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     185      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     186      1.1   skrll 	sub	LEN, LEN, #1
     187      1.1   skrll 1:
     188      1.1   skrll 	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     189      1.1   skrll 	tbz	DST, #1, 1f
     190      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     191      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     192      1.1   skrll 	sub	LEN, LEN, #2
     193      1.1   skrll 1:
     194      1.1   skrll 	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     195      1.1   skrll 	tbz	DST, #2, 1f
     196      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     197      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     198      1.1   skrll 	sub	LEN, LEN, #4
     199      1.1   skrll 1:
     200      1.1   skrll #if (STP_ALIGN > 8)
     201      1.1   skrll 	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
     202      1.1   skrll 	tbz	DST, #3, 1f
     203      1.1   skrll 	ldr	TMP_X, [SRC0, #-8]!
     204      1.1   skrll 	str	TMP_X, [DST, #-8]!
     205      1.1   skrll 	sub	LEN, LEN, #8
     206      1.1   skrll 1:
     207      1.1   skrll #endif /* (STP_ALIGN > 8) */
     208      1.1   skrll 9:
     209      1.1   skrll 
     210  1.1.4.1  martin backward_copy1k:
     211  1.1.4.1  martin 	/* while (len >= 1024) */
     212  1.1.4.1  martin 	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
     213      1.1   skrll 	cmp	LEN, #1024
     214  1.1.4.1  martin 	blo	9f
     215  1.1.4.1  martin 1:
     216      1.1   skrll 	sub	LEN, LEN, #1024
     217      1.1   skrll 	.rept	(1024 / 16)
     218      1.1   skrll 	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
     219      1.1   skrll 	stp	DATA0, DATA1, [DST, #-16]!
     220      1.1   skrll 	.endr
     221      1.1   skrll 	cmp	LEN, #1024
     222  1.1.4.1  martin 	bhs	1b
     223  1.1.4.1  martin 9:
     224      1.1   skrll 
	/* LEN < 1024 from here on; peel off each remaining bit of LEN */
     225  1.1.4.1  martin 	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
     226  1.1.4.1  martin 	tbz	LEN, #9, 1f
     227  1.1.4.1  martin 	.rept	(512 / 16)
     228  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0, #-16]!
     229  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     230  1.1.4.1  martin 	.endr
     231  1.1.4.1  martin 1:
     232  1.1.4.1  martin 	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
     233  1.1.4.1  martin 	tbz	LEN, #8, 1f
     234  1.1.4.1  martin 	.rept	(256 / 16)
     235  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0, #-16]!
     236  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     237  1.1.4.1  martin 	.endr
     238  1.1.4.1  martin 1:
     239  1.1.4.1  martin 	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
     240  1.1.4.1  martin 	tbz	LEN, #7, 1f
     241  1.1.4.1  martin 	.rept	(128 / 16)
     242  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0, #-16]!
     243  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     244  1.1.4.1  martin 	.endr
     245  1.1.4.1  martin 1:
     246  1.1.4.1  martin 	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
     247  1.1.4.1  martin 	tbz	LEN, #6, 1f
     248  1.1.4.1  martin 	.rept	(64 / 16)
     249  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0, #-16]!
     250  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     251  1.1.4.1  martin 	.endr
     252  1.1.4.1  martin 1:
     253  1.1.4.1  martin 	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
     254  1.1.4.1  martin 	tbz	LEN, #5, 1f
     255  1.1.4.1  martin 	.rept	(32 / 16)
     256  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0, #-16]!
     257  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     258  1.1.4.1  martin 	.endr
     259  1.1.4.1  martin 1:
     260      1.1   skrll 	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
     261      1.1   skrll 	tbz	LEN, #4, 1f
     262      1.1   skrll 	ldp	DATA0, DATA1, [SRC0, #-16]!
     263  1.1.4.1  martin 	stp	DATA0, DATA1, [DST, #-16]!
     264      1.1   skrll 1:
     265      1.1   skrll 	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
     266      1.1   skrll 	tbz	LEN, #3, 1f
     267      1.1   skrll 	ldr	TMP_X, [SRC0, #-8]!
     268      1.1   skrll 	str	TMP_X, [DST, #-8]!
     269      1.1   skrll 1:
     270      1.1   skrll 	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     271      1.1   skrll 	tbz	LEN, #2, 1f
     272      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     273      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     274      1.1   skrll 1:
     275      1.1   skrll 	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     276      1.1   skrll 	tbz	LEN, #1, 1f
     277      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     278      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     279      1.1   skrll 1:
     280      1.1   skrll 	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     281      1.1   skrll 	tbz	LEN, #0, 1f
     282      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     283      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     284      1.1   skrll 1:
     285      1.1   skrll 	ret
    286      1.1   skrll #endif /* !NO_OVERLAP */
    287      1.1   skrll 
    288      1.1   skrll 
    289      1.1   skrll #if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
    290      1.1   skrll 	.align	5
    291      1.1   skrll backward_copy:
    292      1.1   skrll 	prfm	PLDL1KEEP, [SRC0]
    293      1.1   skrll 	add	DST, DST0, LEN
    294      1.1   skrll 	add	SRC0, SRC0, LEN
    295      1.1   skrll 	cmp	LEN, #SMALLSIZE
    296      1.1   skrll 	bcs	strict_backward
    297      1.1   skrll 
    298      1.1   skrll 	cmp	LEN, #10
    299      1.1   skrll 	bcs	9f
    300      1.1   skrll backward_tiny:
    301      1.1   skrll 	/* copy 1-10 bytes */
    302  1.1.4.1  martin 1:	sub	LEN, LEN, #1
    303      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
    304      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
    305  1.1.4.1  martin 	cbz	LEN, 1b
    306      1.1   skrll 	ret
     307      1.1   skrll 9:
	/*
	 * Small (10 <= LEN < SMALLSIZE) strict-alignment backward copy.
	 * If src and dst have the same misalignment within 8 bytes, align
	 * dst down and use the widest legal accesses; otherwise fall back
	 * to a byte-at-a-time loop.
	 */
     308      1.1   skrll 	/* length is small(<32), and src or dst may be unaligned */
     309      1.1   skrll 	eor	TMP_X, SRC0, DST0
     310      1.1   skrll 	ands	TMP_X, TMP_X, #7
     311      1.1   skrll 	bne	notaligned_backward_small
     312      1.1   skrll 
     313      1.1   skrll samealign_backward_small:
     314      1.1   skrll 	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     315      1.1   skrll 	tbz	DST, #0, 1f
     316      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     317      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     318      1.1   skrll 	sub	LEN, LEN, #1
     319      1.1   skrll 1:
     320      1.1   skrll 	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     321      1.1   skrll 	tbz	DST, #1, 1f
     322      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     323      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     324      1.1   skrll 	sub	LEN, LEN, #2
     325      1.1   skrll 1:
     326      1.1   skrll 	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     327      1.1   skrll 	tbz	DST, #2, 1f
     328      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     329      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     330      1.1   skrll 	sub	LEN, LEN, #4
     331      1.1   skrll 1:
     332      1.1   skrll 	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
     333      1.1   skrll 	tbz	LEN, #4, 1f
     334      1.1   skrll 	ldp	DATA0, DATA1, [SRC0, #-16]!
     335      1.1   skrll 	stp	DATA0, DATA1, [DST, #-16]!
     336      1.1   skrll 1:
     337      1.1   skrll 	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
     338      1.1   skrll 	tbz	LEN, #3, 1f
     339      1.1   skrll 	ldr	TMP_X, [SRC0, #-8]!
     340      1.1   skrll 	str	TMP_X, [DST, #-8]!
     341      1.1   skrll 1:
     342      1.1   skrll 	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
     343      1.1   skrll 	tbz	LEN, #2, 1f
     344      1.1   skrll 	ldr	TMP_Xw, [SRC0, #-4]!
     345      1.1   skrll 	str	TMP_Xw, [DST, #-4]!
     346      1.1   skrll 1:
     347      1.1   skrll 	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
     348      1.1   skrll 	tbz	LEN, #1, 1f
     349      1.1   skrll 	ldrh	TMP_Xw, [SRC0, #-2]!
     350      1.1   skrll 	strh	TMP_Xw, [DST, #-2]!
     351      1.1   skrll 1:
     352      1.1   skrll 	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
     353      1.1   skrll 	tbz	LEN, #0, 1f
     354      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     355      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!
     356      1.1   skrll 1:
     357      1.1   skrll 	ret
     358      1.1   skrll 
     359      1.1   skrll notaligned_backward_small:
     360      1.1   skrll 	/* length is small, and src or dst may be unaligned */
     361      1.1   skrll 	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
     362      1.1   skrll 1:					/* do { */
     363      1.1   skrll 	ldrb	TMP_Xw, [SRC0, #-1]!
     364      1.1   skrll 	strb	TMP_Xw, [DST, #-1]!	/*  *--(char *)dst = *--(char *)src */
     365      1.1   skrll 	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
     366      1.1   skrll 	blo	1b
     367      1.1   skrll 	ret
    368      1.1   skrll 
     369      1.1   skrll strict_backward:
	/*
	 * Backward copy where src and dst have different misalignments
	 * (STRICT_ALIGNMENT).  Alignments are expressed in bits
	 * ((addr & 7) * 8) so data words can be merged across the
	 * boundary with variable shifts.  SRC/DST are rounded down to
	 * 8-byte alignment; the partial last word is written first, then
	 * backward_shifting_copy_loop takes over.
	 */
     370      1.1   skrll 	/* src or dst may be unaligned */
     371      1.1   skrll 	and	SRC_ALIGNBIT, SRC0, #7
     372      1.1   skrll 	and	DST_ALIGNBIT, DST, #7
     373      1.1   skrll 	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
     374      1.1   skrll 	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
     375      1.1   skrll 	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
     376      1.1   skrll 	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */
     377      1.1   skrll 
     378      1.1   skrll 	and	SRC, SRC0, #~7
     379      1.1   skrll 	and	DST, DST, #~7
     380      1.1   skrll 	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
     381      1.1   skrll 
     382      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
     383      1.1   skrll 	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */
     384      1.1   skrll 
     385      1.1   skrll 	cmp	SRC, SRC0			/* don't access out of range */
     386      1.1   skrll 	beq	1f
     387      1.1   skrll 	ldr	DATA1, [SRC]
     388      1.1   skrll 1:
     389      1.1   skrll 	ldr	DATA0, [SRC, #-8]!
     390      1.1   skrll 
     391      1.1   skrll 	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
     392      1.1   skrll 	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
     393      1.1   skrll 	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */
     394      1.1   skrll 
     395      1.1   skrll 	b	9f				/* }                          */
     396      1.1   skrll 5:						/* else {                     */
     397      1.1   skrll 	ldr	DATA0, [SRC]			/*  data0 = *src;             */
     398      1.1   skrll 	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/*  data1=data0>>src_dst_abit;*/
     399      1.1   skrll 9:						/* }                          */
     400      1.1   skrll 
     401      1.1   skrll 	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
     402      1.1   skrll 	mov	TMP_D, DST		/*   tmp_d = dst;                     */
     403      1.1   skrll 
     404      1.1   skrll 	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
     405      1.1   skrll 	str	DATA1w, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = data1; */
     406      1.1   skrll 	lsr	DATA1, DATA1, #32	/*      data1 >>= 32;                 */
     407      1.1   skrll 1:					/*    }                               */
     408      1.1   skrll 	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
     409      1.1   skrll 	strh	DATA1w, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = data1; */
     410      1.1   skrll 	lsr	DATA1, DATA1, #16	/*      data1 >>= 16;                 */
     411      1.1   skrll 1:					/*    }                               */
     412      1.1   skrll 	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
     413      1.1   skrll 	strb	DATA1w, [TMP_D]		/*      *(uint8_t *)tmp_d = data1;    */
     414      1.1   skrll 1:					/*    }                               */
     415      1.1   skrll 
     416      1.1   skrll 	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
     417      1.1   skrll 9:					/* }                                  */
     418      1.1   skrll #else /* BYTE_ORDER */
     419      1.1   skrll 	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */
     420      1.1   skrll 
     421      1.1   skrll 	cmp	SRC, SRC0			/* don't access out of range */
     422      1.1   skrll 	beq	1f
     423      1.1   skrll 	ldr	DATA1, [SRC]
     424      1.1   skrll 1:
     425      1.1   skrll 	ldr	DATA0, [SRC, #-8]!
     426      1.1   skrll 
     427      1.1   skrll 	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
     428      1.1   skrll 	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
     429      1.1   skrll 	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */
     430      1.1   skrll 
     431      1.1   skrll 	b	9f				/* }                          */
     432      1.1   skrll 5:						/* else {                     */
     433      1.1   skrll 	ldr	DATA0, [SRC]			/*  data0 = *src;             */
     434      1.1   skrll 	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/*  data1=data0>>dst_src_abit;*/
     435      1.1   skrll 9:						/* }                          */
     436      1.1   skrll 
     437      1.1   skrll 	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
     438      1.1   skrll 	mov	TMP_D, DST		/*   tmp_d = dst;                     */
     439      1.1   skrll 
     440      1.1   skrll 	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
     441      1.1   skrll 	lsr	TMP_X, DATA1, #32	/*      x = data1 >> 32;              */
     442      1.1   skrll 	str	TMP_Xw, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = x;     */
     443      1.1   skrll 1:					/*    }                               */
     444      1.1   skrll 	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
     445      1.1   skrll 	lsr	TMP_X, DATA1, #16	/*      x = data1 >> 16;              */
     446      1.1   skrll 	strh	TMP_Xw, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = x;     */
     447      1.1   skrll 1:					/*    }                               */
     448      1.1   skrll 	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
     449      1.1   skrll 	lsr	TMP_X, DATA1, #8	/*      x = data1 >> 8;               */
     450      1.1   skrll 	strb	TMP_Xw, [TMP_D], #1	/*      *(uint8_t *)tmp_d++ = x;      */
     451      1.1   skrll 1:					/*    }                               */
     452      1.1   skrll 
     453      1.1   skrll 	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
     454      1.1   skrll 9:					/* }                                  */
     455      1.1   skrll #endif /* BYTE_ORDER */
    456      1.1   skrll 
    457      1.1   skrll 
     458      1.1   skrll backward_shifting_copy_loop:
	/*
	 * Main backward loop for differing src/dst misalignment.
	 * Invariant: DATA0 holds the 8-byte word loaded on the previous
	 * iteration (the lowest-addressed word consumed so far); each pass
	 * loads 16 more bytes, merges adjacent words across the alignment
	 * boundary with the precomputed shift amounts, and stores 16
	 * aligned bytes.  Tail: one optional 8-byte store, then the last
	 * 1-7 bytes, guarding against reading below the source buffer.
	 */
     459      1.1   skrll 	ldp	DATA2, DATA1, [SRC, #-16]!
     460      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
     461      1.1   skrll 	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
     462      1.1   skrll 	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
     463      1.1   skrll 	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
     464      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     465      1.1   skrll 	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
     466      1.1   skrll 	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
     467      1.1   skrll 	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
     468      1.1   skrll 	orr	DATA1, DATA1, TMP_X
     469      1.1   skrll #else /* BYTE_ORDER */
     470      1.1   skrll 	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
     471      1.1   skrll 	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
     472      1.1   skrll 	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
     473      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     474      1.1   skrll 	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
     475      1.1   skrll 	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
     476      1.1   skrll 	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
     477      1.1   skrll 	orr	DATA1, DATA1, TMP_X
     478      1.1   skrll #endif /* BYTE_ORDER */
     479      1.1   skrll 	stp	DATA1, DATA0, [DST, #-16]!
     480      1.1   skrll 	mov	DATA0, DATA2
     481      1.1   skrll 	sub	LEN, LEN, #16
     482      1.1   skrll 	cmp	LEN, #16
     483      1.1   skrll 	bhs	backward_shifting_copy_loop
     484      1.1   skrll 
     485      1.1   skrll 
     486      1.1   skrll 	/* write 8 bytes */
     487      1.1   skrll 	tbz	LEN, #3, 9f
     488      1.1   skrll 
     489      1.1   skrll 	ldr	DATA1, [SRC, #-8]!
     490      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
     491      1.1   skrll 	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
     492      1.1   skrll 	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
     493      1.1   skrll 	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
     494      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     495      1.1   skrll #else /* BYTE_ORDER */
     496      1.1   skrll 	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
     497      1.1   skrll 	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
     498      1.1   skrll 	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
     499      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     500      1.1   skrll #endif /* BYTE_ORDER */
     501      1.1   skrll 	str	DATA0, [DST, #-8]!
     502      1.1   skrll 	mov	DATA0, DATA1
     503      1.1   skrll 	sub	LEN, LEN, #8
     504      1.1   skrll 9:
     505      1.1   skrll 
     506      1.1   skrll 	cbz	LEN, backward_shifting_copy_done
     507      1.1   skrll 
     508      1.1   skrll 	/* copy last 1-7 bytes */
     509      1.1   skrll 	and	TMP_X, SRC_DST_ALIGNBIT, #63
     510      1.1   skrll 	cmp	LEN, TMP_X, lsr #3
     511      1.1   skrll 	bls	1f
     512      1.1   skrll 	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
     513      1.1   skrll 1:
     514      1.1   skrll 
     515      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
     516      1.1   skrll 	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
     517      1.1   skrll 	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
     518      1.1   skrll 	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
     519      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     520      1.1   skrll #else /* BYTE_ORDER */
     521      1.1   skrll 	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
     522      1.1   skrll 	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
     523      1.1   skrll 	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
     524      1.1   skrll 	orr	DATA0, DATA0, TMP_X
     525      1.1   skrll #endif /* BYTE_ORDER */
     526      1.1   skrll 
	/* store the tail of DATA0 widest-first, rotating/shifting the
	 * next piece into the low bits before each store */
     527      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
     528      1.1   skrll 	tbz	LEN, #2, 1f
     529      1.1   skrll 	ror	DATA0, DATA0, #32
     530      1.1   skrll 	str	DATA0w, [DST, #-4]!
     531      1.1   skrll 1:
     532      1.1   skrll 	tbz	LEN, #1, 1f
     533      1.1   skrll 	ror	DATA0, DATA0, #48
     534      1.1   skrll 	strh	DATA0w, [DST, #-2]!
     535      1.1   skrll 1:
     536      1.1   skrll 	tbz	LEN, #0, 1f
     537      1.1   skrll 	ror	DATA0, DATA0, #56
     538      1.1   skrll 	strb	DATA0w, [DST, #-1]!
     539      1.1   skrll 1:
     540      1.1   skrll #else /* BYTE_ORDER */
     541      1.1   skrll 	tbz	LEN, #2, 1f
     542      1.1   skrll 	str	DATA0w, [DST, #-4]!
     543      1.1   skrll 	lsr	DATA0, DATA0, #32
     544      1.1   skrll 1:
     545      1.1   skrll 	tbz	LEN, #1, 1f
     546      1.1   skrll 	strh	DATA0w, [DST, #-2]!
     547      1.1   skrll 	lsr	DATA0, DATA0, #16
     548      1.1   skrll 1:
     549      1.1   skrll 	tbz	LEN, #0, 1f
     550      1.1   skrll 	strb	DATA0w, [DST, #-1]!
     551      1.1   skrll 1:
     552      1.1   skrll #endif /* BYTE_ORDER */
     553      1.1   skrll backward_shifting_copy_done:
     554      1.1   skrll 	ret
    555      1.1   skrll #endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */
    556      1.1   skrll 
    557      1.1   skrll 
    558      1.1   skrll 	.align	5
    559      1.1   skrll ENTRY(FUNCTION)
    560      1.1   skrll #ifdef STRICT_ALIGNMENT
    561      1.1   skrll 	cbz	LEN, done
    562      1.1   skrll #ifndef NO_OVERLAP
    563      1.1   skrll 	cmp	SRC0, DST0
    564      1.1   skrll 	beq	done
    565      1.1   skrll 	bcc	backward_copy
    566      1.1   skrll #endif /* NO_OVERLAP */
    567      1.1   skrll 	mov	DST, DST0
    568      1.1   skrll 	cmp	LEN, #SMALLSIZE
    569      1.1   skrll 	bcs	strict_forward
    570      1.1   skrll 
    571      1.1   skrll 	cmp	LEN, #10
    572      1.1   skrll 	bcs	9f
    573      1.1   skrll forward_tiny:
    574      1.1   skrll 	/* copy 1-10 bytes */
    575  1.1.4.1  martin 1:	sub	LEN, LEN, #1
    576      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
    577      1.1   skrll 	strb	TMP_Xw, [DST], #1
    578  1.1.4.1  martin 	cbz	LEN, 1b
    579      1.1   skrll 	ret
     580      1.1   skrll 9:
	/*
	 * Small (10 <= LEN < SMALLSIZE) strict-alignment forward copy.
	 * Same scheme as the backward variant: if src and dst share the
	 * same misalignment within 8 bytes, align dst up and use the
	 * widest legal accesses; otherwise byte-copy.
	 */
     581      1.1   skrll 	/* length is small(<32), and src or dst may be unaligned */
     582      1.1   skrll 	eor	TMP_X, SRC0, DST0
     583      1.1   skrll 	ands	TMP_X, TMP_X, #7
     584      1.1   skrll 	bne	notaligned_forward_small
     585      1.1   skrll samealign_forward_small:
     586      1.1   skrll 	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
     587      1.1   skrll 	tbz	DST, #0, 1f
     588      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
     589      1.1   skrll 	strb	TMP_Xw, [DST], #1
     590      1.1   skrll 	sub	LEN, LEN, #1
     591      1.1   skrll 1:
     592      1.1   skrll 	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
     593      1.1   skrll 	tbz	DST, #1, 1f
     594      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
     595      1.1   skrll 	strh	TMP_Xw, [DST], #2
     596      1.1   skrll 	sub	LEN, LEN, #2
     597      1.1   skrll 1:
     598      1.1   skrll 	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
     599      1.1   skrll 	tbz	DST, #2, 1f
     600      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
     601      1.1   skrll 	str	TMP_Xw, [DST], #4
     602      1.1   skrll 	sub	LEN, LEN, #4
     603      1.1   skrll 1:
     604      1.1   skrll 	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
     605      1.1   skrll 	tbz	LEN, #4, 1f
     606      1.1   skrll 	ldp	DATA0, DATA1, [SRC0], #16
     607      1.1   skrll 	stp	DATA0, DATA1, [DST], #16
     608      1.1   skrll 1:
     609      1.1   skrll 	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
     610      1.1   skrll 	tbz	LEN, #3, 1f
     611      1.1   skrll 	ldr	TMP_X, [SRC0], #8
     612      1.1   skrll 	str	TMP_X, [DST], #8
     613      1.1   skrll 1:
     614      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
     615      1.1   skrll 	tbz	LEN, #2, 1f
     616      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
     617      1.1   skrll 	str	TMP_Xw, [DST], #4
     618      1.1   skrll 1:
     619      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
     620      1.1   skrll 	tbz	LEN, #1, 1f
     621      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
     622      1.1   skrll 	strh	TMP_Xw, [DST], #2
     623      1.1   skrll 1:
     624      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
     625      1.1   skrll 	tbz	LEN, #0, 1f
     626      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
     627      1.1   skrll 	strb	TMP_Xw, [DST], #1
     628      1.1   skrll 1:
     629      1.1   skrll 	ret
     630      1.1   skrll 
     631      1.1   skrll notaligned_forward_small:
     632      1.1   skrll 	/* src and dst are not aligned... */
     633      1.1   skrll 	prfm	PLDL1KEEP, [SRC0]
     634      1.1   skrll 	prfm	PLDL1KEEP, [SRC0, #8]
     635      1.1   skrll 	prfm	PLDL1KEEP, [SRC0, #16]
     636      1.1   skrll 	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
     637      1.1   skrll 1:					/* do { */
     638      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
     639      1.1   skrll 	strb	TMP_Xw, [DST], #1	/*  *(char *)dst++ = *(char *)src++ */
     640      1.1   skrll 	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
     641      1.1   skrll 	blo	1b
     642      1.1   skrll 	ret
    643      1.1   skrll 
    644      1.1   skrll strict_forward:
    645      1.1   skrll 	/* src or dst may be unaligned */
    646      1.1   skrll 	and	SRC_ALIGNBIT, SRC0, #7	/* src & 7 */
    647      1.1   skrll 	and	DST_ALIGNBIT, DST0, #7	/* dst & 7 */
    648      1.1   skrll 	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3	/* misalignment in bits */
    649      1.1   skrll 	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
    650      1.1   skrll 	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
    651      1.1   skrll 	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */
    652      1.1   skrll 
	/* alignments differ: round both pointers down to 8 and funnel-shift */
    653      1.1   skrll 	and	SRC, SRC0, #~7
    654      1.1   skrll 	and	DST, DST0, #~7
    655      1.1   skrll 	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
    656      1.1   skrll 
    657      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
    658      1.1   skrll 	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
    659      1.1   skrll 	ldp	DATA1, DATA0, [SRC], #16
    660      1.1   skrll 	neg	TMP_X, SRC_ALIGNBIT
    661      1.1   skrll 	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
    662      1.1   skrll 	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
    663      1.1   skrll 	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
    664      1.1   skrll 	b	9f
    665      1.1   skrll 5:
    666      1.1   skrll 	ldr	DATA0, [SRC], #8
    667      1.1   skrll 	lsr	DATA1, DATA0, SRC_ALIGNBIT
    668      1.1   skrll 9:
    669      1.1   skrll 
	/* store the first (partial) destination word */
    670      1.1   skrll 	cbz	DST_ALIGNBIT, 5f
    671      1.1   skrll 	mov	TMP_D, DST0
    672      1.1   skrll 	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
    673      1.1   skrll 	tbz	TMP_D, #0, 1f
    674      1.1   skrll 	strb	DATA1w, [TMP_D], #1
    675      1.1   skrll 	lsr	DATA1, DATA1, #8
    676      1.1   skrll 1:
    677      1.1   skrll 	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
    678      1.1   skrll 	tbz	TMP_D, #1, 1f
    679      1.1   skrll 	strh	DATA1w, [TMP_D], #2
    680      1.1   skrll 	lsr	DATA1, DATA1, #16
    681      1.1   skrll 1:
    682      1.1   skrll 	/* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1; } */
    683      1.1   skrll 	tbz	TMP_D, #2, 1f
    684      1.1   skrll 	str	DATA1w, [TMP_D], #4
    685      1.1   skrll 1:
    686      1.1   skrll 	add	DST, DST, #8
    687      1.1   skrll 	b	9f
    688      1.1   skrll 5:
    689      1.1   skrll 	str	DATA1, [DST], #8
    690      1.1   skrll 9:
    691      1.1   skrll 	sub	LEN, LEN, #8
    692      1.1   skrll 	add	LEN, LEN, DST_ALIGNBIT, lsr #3	/* i.e. len -= (8 - (dst & 7)) */
    693      1.1   skrll #else /* BYTE_ORDER */
    694      1.1   skrll 	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
    695      1.1   skrll 	ldp	DATA1, DATA0, [SRC], #16
    696      1.1   skrll 	neg	TMP_X, SRC_ALIGNBIT
    697      1.1   skrll 	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
    698      1.1   skrll 	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
    699      1.1   skrll 	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
    700      1.1   skrll 	b	9f
    701      1.1   skrll 5:
    702      1.1   skrll 	ldr	DATA0, [SRC], #8
    703      1.1   skrll 	lsl	DATA1, DATA0, SRC_ALIGNBIT
    704      1.1   skrll 9:
    705      1.1   skrll 
	/* store the first (partial) destination word, big-endian layout */
    706      1.1   skrll 	cbz	DST_ALIGNBIT, 5f
    707      1.1   skrll 	mov	TMP_D, DST0
    708      1.1   skrll 	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
    709      1.1   skrll 	tbz	TMP_D, #0, 1f
    710      1.1   skrll 	lsr	TMP_X, DATA1, #56
    711      1.1   skrll 	strb	TMP_Xw, [TMP_D], #1
    712      1.1   skrll 1:
    713      1.1   skrll 	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
    714      1.1   skrll 	tbz	TMP_D, #1, 1f
    715      1.1   skrll 	lsr	TMP_X, DATA1, #48
    716      1.1   skrll 	strh	TMP_Xw, [TMP_D], #2
    717      1.1   skrll 1:
    718      1.1   skrll 	/* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
    719      1.1   skrll 	tbz	TMP_D, #2, 1f
    720      1.1   skrll 	lsr	TMP_X, DATA1, #32
    721      1.1   skrll 	str	TMP_Xw, [TMP_D], #4
    722      1.1   skrll 1:
    723      1.1   skrll 	add	DST, DST, #8
    724      1.1   skrll 	b	9f
    725      1.1   skrll 5:
    726      1.1   skrll 	str	DATA1, [DST], #8
    727      1.1   skrll 9:
    728      1.1   skrll 	sub	LEN, LEN, #8
    729      1.1   skrll 	add	LEN, LEN, DST_ALIGNBIT, lsr #3	/* i.e. len -= (8 - (dst & 7)) */
    730      1.1   skrll #endif /* BYTE_ORDER */
    731      1.1   skrll 
    732      1.1   skrll shifting_copy_loop:
	/* main funnel-shift loop: 16 bytes per iteration; DATA0 carries the
	 * previously-loaded word across iterations */
    733      1.1   skrll 	ldp	DATA1, DATA2, [SRC], #16
    734      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
    735      1.1   skrll 	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
    736      1.1   skrll 	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
    737      1.1   skrll 	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
    738      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    739      1.1   skrll 	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
    740      1.1   skrll 	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
    741      1.1   skrll 	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
    742      1.1   skrll 	orr	DATA1, DATA1, TMP_X
    743      1.1   skrll #else /* BYTE_ORDER */
    744      1.1   skrll 	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
    745      1.1   skrll 	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
    746      1.1   skrll 	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
    747      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    748      1.1   skrll 	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
    749      1.1   skrll 	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
    750      1.1   skrll 	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
    751      1.1   skrll 	orr	DATA1, DATA1, TMP_X
    752      1.1   skrll #endif /* BYTE_ORDER */
    753      1.1   skrll 	stp	DATA0, DATA1, [DST], #16
    754      1.1   skrll 	mov	DATA0, DATA2	/* carry word into the next iteration */
    755      1.1   skrll 	sub	LEN, LEN, #16
    756      1.1   skrll 	cmp	LEN, #16
    757      1.1   skrll 	bhs	shifting_copy_loop	/* while (len >= 16) */
    758      1.1   skrll 
    759      1.1   skrll 
    760      1.1   skrll 	/* write 8 bytes */
    761      1.1   skrll 	tbz	LEN, #3, 9f
    762      1.1   skrll 	ldr	DATA1, [SRC], #8
    763      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
    764      1.1   skrll 	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
    765      1.1   skrll 	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
    766      1.1   skrll 	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
    767      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    768      1.1   skrll #else /* BYTE_ORDER */
    769      1.1   skrll 	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
    770      1.1   skrll 	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
    771      1.1   skrll 	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
    772      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    773      1.1   skrll #endif /* BYTE_ORDER */
    774      1.1   skrll 	str	DATA0, [DST], #8
    775      1.1   skrll 	mov	DATA0, DATA1
    776      1.1   skrll 	sub	LEN, LEN, #8
    777      1.1   skrll 9:
    778      1.1   skrll 
    779      1.1   skrll 	cbz	LEN, shifting_copy_done
    780      1.1   skrll 
    781      1.1   skrll 	/* copy last 1-7 bytes */
	/* only load another source word if the carried DATA0 does not
	 * already hold enough bytes for the tail */
    782      1.1   skrll 	and	TMP_X, DST_SRC_ALIGNBIT, #63
    783      1.1   skrll 	cmp	LEN, TMP_X, lsr #3
    784      1.1   skrll 	bls	1f
    785      1.1   skrll 	ldr	DATA1, [SRC], #8	/* don't access out of range */
    786      1.1   skrll 1:
    787      1.1   skrll 
    788      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
    789      1.1   skrll 	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
    790      1.1   skrll 	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
    791      1.1   skrll 	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
    792      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    793      1.1   skrll #else /* BYTE_ORDER */
    794      1.1   skrll 	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
    795      1.1   skrll 	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
    796      1.1   skrll 	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
    797      1.1   skrll 	orr	DATA0, DATA0, TMP_X
    798      1.1   skrll #endif /* BYTE_ORDER */
    799      1.1   skrll 
    800      1.1   skrll #if BYTE_ORDER == LITTLE_ENDIAN
    801      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
    802      1.1   skrll 	tbz	LEN, #2, 1f
    803      1.1   skrll 	str	DATA0w, [DST], #4
    804      1.1   skrll 	lsr	DATA0, DATA0, #32
    805      1.1   skrll 1:
    806      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
    807      1.1   skrll 	tbz	LEN, #1, 1f
    808      1.1   skrll 	strh	DATA0w, [DST], #2
    809      1.1   skrll 	lsr	DATA0, DATA0, #16
    810      1.1   skrll 1:
    811      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
    812      1.1   skrll 	tbz	LEN, #0, 1f
    813      1.1   skrll 	strb	DATA0w, [DST], #1
    814      1.1   skrll 1:
    815      1.1   skrll #else /* BYTE_ORDER */
    816      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
    817      1.1   skrll 	tbz	LEN, #2, 1f
    818      1.1   skrll 	lsr	TMP_X, DATA0, #32
    819      1.1   skrll 	str	TMP_Xw, [DST], #4
    820      1.1   skrll 1:
    821      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
    822      1.1   skrll 	tbz	LEN, #1, 1f
    823      1.1   skrll 	lsr	TMP_X, DATA0, #16
    824      1.1   skrll 	strh	TMP_Xw, [DST], #2
    825      1.1   skrll 1:
    826      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
    827      1.1   skrll 	tbz	LEN, #0, 1f
    828      1.1   skrll 	lsr	TMP_X, DATA0, #8
    829      1.1   skrll 	strb	TMP_Xw, [DST], #1
    830      1.1   skrll 1:
    831      1.1   skrll #endif /* BYTE_ORDER */
    832      1.1   skrll shifting_copy_done:
    833      1.1   skrll 	ret
    834      1.1   skrll 
    835      1.1   skrll #else /* STRICT_ALIGNMENT */
    836      1.1   skrll #ifndef NO_OVERLAP
	/* memmove semantics: if src < dst the regions may overlap forward,
	 * so copy descending instead */
    837      1.1   skrll 	cbz	LEN, done
    838      1.1   skrll 	cmp	SRC0, DST0
    839      1.1   skrll 	beq	done
    840      1.1   skrll 	bcc	backward_ignore_align	/* src < dst: copy backward */
    841      1.1   skrll #endif /* NO_OVERLAP */
    842      1.1   skrll 
    843      1.1   skrll 	prfm	PLDL1KEEP, [SRC0]
    844      1.1   skrll 	cmp	LEN, #SMALLSIZE
    845      1.1   skrll 	bcs	copy_forward	/* len >= SMALLSIZE (unsigned) */
    846      1.1   skrll 	mov	DST, DST0
    847      1.1   skrll 
    848      1.1   skrll copy_forward_small:
	/* len < SMALLSIZE; unaligned accesses are allowed here */
    849      1.1   skrll 	cmp	LEN, #8
    850      1.1   skrll 	bcs	9f
    851      1.1   skrll 
    852      1.1   skrll 	/* 0 <= len < 8 */
    853      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
    854      1.1   skrll 	tbz	LEN, #2, 1f
    855      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
    856      1.1   skrll 	str	TMP_Xw, [DST], #4
    857      1.1   skrll 1:
    858      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
    859      1.1   skrll 	tbz	LEN, #1, 1f
    860      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
    861      1.1   skrll 	strh	TMP_Xw, [DST], #2
    862      1.1   skrll 1:
    863      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
    864      1.1   skrll 	tbz	LEN, #0, 1f
    865      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
    866      1.1   skrll 	strb	TMP_Xw, [DST], #1
    867      1.1   skrll 1:
    868      1.1   skrll 	ret
    869      1.1   skrll 9:
    870      1.1   skrll 
    871      1.1   skrll 	prfm	PLDL1KEEP, [SRC0, #8]
    872      1.1   skrll 	cmp	LEN, #16
    873      1.1   skrll 	bcs	9f
    874      1.1   skrll 
    875      1.1   skrll 	/* 8 <= len < 16 */
    876      1.1   skrll 	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
    877      1.1   skrll 	ldr	TMP_X, [SRC0], #8
    878      1.1   skrll 	str	TMP_X, [DST], #8
    879      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
    880      1.1   skrll 	tbz	LEN, #2, 1f
    881      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
    882      1.1   skrll 	str	TMP_Xw, [DST], #4
    883      1.1   skrll 1:
    884      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
    885      1.1   skrll 	tbz	LEN, #1, 1f
    886      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
    887      1.1   skrll 	strh	TMP_Xw, [DST], #2
    888      1.1   skrll 1:
    889      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
    890      1.1   skrll 	tbz	LEN, #0, 1f
    891      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
    892      1.1   skrll 	strb	TMP_Xw, [DST], #1
    893      1.1   skrll 1:
    894      1.1   skrll 	ret
    895      1.1   skrll 9:
    896      1.1   skrll 
    897      1.1   skrll 	/* 16 <= len < 32 */
    898      1.1   skrll 	prfm	PLDL1KEEP, [SRC0, 16]
    899      1.1   skrll 	prfm	PLDL1KEEP, [SRC0, 24]
    900      1.1   skrll 	ldp	DATA0, DATA1, [SRC0], #16
    901      1.1   skrll 	stp	DATA0, DATA1, [DST], #16
    902      1.1   skrll 	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
    903      1.1   skrll 	tbz	LEN, #3, 1f
    904      1.1   skrll 	ldr	TMP_X, [SRC0], #8
    905      1.1   skrll 	str	TMP_X, [DST], #8
    906      1.1   skrll 1:
    907      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
    908      1.1   skrll 	tbz	LEN, #2, 1f
    909      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
    910      1.1   skrll 	str	TMP_Xw, [DST], #4
    911      1.1   skrll 1:
    912      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
    913      1.1   skrll 	tbz	LEN, #1, 1f
    914      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
    915      1.1   skrll 	strh	TMP_Xw, [DST], #2
    916      1.1   skrll 1:
    917      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
    918      1.1   skrll 	tbz	LEN, #0, 1f
    919      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
    920      1.1   skrll 	strb	TMP_Xw, [DST], #1
    921      1.1   skrll 1:
    922      1.1   skrll 	ret
    923      1.1   skrll #endif /* !STRICT_ALIGNMENT */
    924      1.1   skrll 
    925      1.1   skrll 	.align	4
    926      1.1   skrll copy_forward:
    927      1.1   skrll 	/* DST is not aligned at this point */
    928      1.1   skrll 	mov	DST, DST0
    929      1.1   skrll #ifndef STRICT_ALIGNMENT
    930      1.1   skrll 	cmp	LEN, #512	/* pre-alignment can be overhead when small */
    931      1.1   skrll 	bcc	9f		/* len < 512: skip destination pre-alignment */
    932      1.1   skrll #endif /* STRICT_ALIGNMENT */
	/* align DST for the stp stores below */
    933      1.1   skrll 	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
    934      1.1   skrll 	tbz	DST, #0, 1f
    935      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
    936      1.1   skrll 	strb	TMP_Xw, [DST], #1
    937      1.1   skrll 	sub	LEN, LEN, #1
    938      1.1   skrll 1:
    939      1.1   skrll 	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
    940      1.1   skrll 	tbz	DST, #1, 1f
    941      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
    942      1.1   skrll 	strh	TMP_Xw, [DST], #2
    943      1.1   skrll 	sub	LEN, LEN, #2
    944      1.1   skrll 1:
    945      1.1   skrll 	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
    946      1.1   skrll 	tbz	DST, #2, 1f
    947      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
    948      1.1   skrll 	str	TMP_Xw, [DST], #4
    949      1.1   skrll 	sub	LEN, LEN, #4
    950      1.1   skrll 1:
    951      1.1   skrll #if (STP_ALIGN > 8)
    952      1.1   skrll 	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
    953      1.1   skrll 	tbz	DST, #3, 1f
    954      1.1   skrll 	ldr	TMP_X, [SRC0], #8
    955      1.1   skrll 	str	TMP_X, [DST], #8
    956      1.1   skrll 	sub	LEN, LEN, #8
    957      1.1   skrll 1:
    958      1.1   skrll #endif /* (STP_ALIGN > 8) */
    959      1.1   skrll 9:
    960      1.1   skrll 
    961  1.1.4.1  martin forward_copy1k:
    962  1.1.4.1  martin 	/* while (len >= 1024) */
    963  1.1.4.1  martin 	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
    964      1.1   skrll 	cmp	LEN, #1024
    965  1.1.4.1  martin 	blo	9f
    966  1.1.4.1  martin 1:
    967      1.1   skrll 	sub	LEN, LEN, #1024
    968      1.1   skrll 	.rept	(1024 / 16)
    969      1.1   skrll 	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
    970      1.1   skrll 	stp	DATA0, DATA1, [DST], #16
    971      1.1   skrll 	.endr
    972      1.1   skrll 	cmp	LEN, #1024
    973  1.1.4.1  martin 	bhs	1b
    974  1.1.4.1  martin 9:
    975      1.1   skrll 
	/* binary-decomposed tail: one fully-unrolled block per LEN bit */
    976  1.1.4.1  martin 	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; */
    977  1.1.4.1  martin 	tbz	LEN, #9, 1f
    978  1.1.4.1  martin 	.rept	(512 / 16)
    979  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0], #16
    980  1.1.4.1  martin 	stp	DATA0, DATA1, [DST], #16
    981  1.1.4.1  martin 	.endr
    982  1.1.4.1  martin 1:
    983  1.1.4.1  martin 	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; */
    984  1.1.4.1  martin 	tbz	LEN, #8, 1f
    985  1.1.4.1  martin 	.rept	(256 / 16)
    986  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0], #16
    987  1.1.4.1  martin 	stp	DATA0, DATA1, [DST], #16
    988  1.1.4.1  martin 	.endr
    989  1.1.4.1  martin 1:
    990  1.1.4.1  martin 	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; */
    991  1.1.4.1  martin 	tbz	LEN, #7, 1f
    992  1.1.4.1  martin 	.rept	(128 / 16)
    993  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0], #16
    994  1.1.4.1  martin 	stp	DATA0, DATA1, [DST], #16
    995  1.1.4.1  martin 	.endr
    996  1.1.4.1  martin 1:
    997  1.1.4.1  martin 	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; */
    998  1.1.4.1  martin 	tbz	LEN, #6, 1f
    999  1.1.4.1  martin 	.rept	(64 / 16)
   1000  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0], #16
   1001  1.1.4.1  martin 	stp	DATA0, DATA1, [DST], #16
   1002  1.1.4.1  martin 	.endr
   1003  1.1.4.1  martin 1:
   1004  1.1.4.1  martin 	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; */
   1005  1.1.4.1  martin 	tbz	LEN, #5, 1f
   1006  1.1.4.1  martin 	.rept	(32 / 16)
   1007  1.1.4.1  martin 	ldp	DATA0, DATA1, [SRC0], #16
   1008  1.1.4.1  martin 	stp	DATA0, DATA1, [DST], #16
   1009  1.1.4.1  martin 	.endr
   1010  1.1.4.1  martin 1:
   1011      1.1   skrll 	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
   1012      1.1   skrll 	tbz	LEN, #4, 1f
   1013      1.1   skrll 	ldp	DATA0, DATA1, [SRC0], #16
   1014      1.1   skrll 	stp	DATA0, DATA1, [DST], #16
   1015      1.1   skrll 1:
   1016      1.1   skrll 	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
   1017      1.1   skrll 	tbz	LEN, #3, 1f
   1018      1.1   skrll 	ldr	TMP_X, [SRC0], #8
   1019      1.1   skrll 	str	TMP_X, [DST], #8
   1020      1.1   skrll 1:
   1021      1.1   skrll 	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
   1022      1.1   skrll 	tbz	LEN, #2, 1f
   1023      1.1   skrll 	ldr	TMP_Xw, [SRC0], #4
   1024      1.1   skrll 	str	TMP_Xw, [DST], #4
   1025      1.1   skrll 1:
   1026      1.1   skrll 	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
   1027      1.1   skrll 	tbz	LEN, #1, 1f
   1028      1.1   skrll 	ldrh	TMP_Xw, [SRC0], #2
   1029      1.1   skrll 	strh	TMP_Xw, [DST], #2
   1030      1.1   skrll 1:
   1031      1.1   skrll 	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
   1032      1.1   skrll 	tbz	LEN, #0, 1f
   1033      1.1   skrll 	ldrb	TMP_Xw, [SRC0], #1
   1034      1.1   skrll 	strb	TMP_Xw, [DST], #1
   1035      1.1   skrll 1:
   1036      1.1   skrll done:
   1037      1.1   skrll 	ret
   1038      1.1   skrll END(FUNCTION)
   1039