i386/string/swab.S

 1.1       cgd /*
1.11      salo  * Written by J.T. Conklin <jtc (at) NetBSD.org>.
 1.8       jtc  * Public domain.
 1.1       cgd  */
 1.1       cgd
 1.7       jtc #include <machine/asm.h>
 1.7       jtc
 1.2       jtc #if defined(LIBC_SCCS)
1.14  uebayasi 	RCSID("$NetBSD: swab.S,v 1.14 2014/05/23 02:34:19 uebayasi Exp $")
 1.1       cgd #endif
 1.1       cgd
 1.1       cgd /*
 1.1       cgd  * On the i486, this code is negligibly faster than the code generated
 1.3       jtc  * by gcc at about half the size.  If my i386 databook is correct, it
 1.1       cgd  * should be considerably faster than the gcc code on an i386.
 1.1       cgd  */
 1.1       cgd
 1.1       cgd ENTRY(swab)
 1.1       cgd 	pushl	%esi
 1.1       cgd 	pushl	%edi
 1.1       cgd 	movl	12(%esp),%esi
 1.1       cgd 	movl	16(%esp),%edi
 1.1       cgd 	movl	20(%esp),%ecx
 1.1       cgd
 1.4       jtc 	shrl	$1,%ecx
 1.1       cgd 	testl	$7,%ecx			# copy first group of 1 to 7 words
1.12    rpaulo 	jz	L2			# while swapping alternate bytes.
1.10    kleink 	_ALIGN_TEXT,0x90
 1.1       cgd L1:	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	decl	%ecx
 1.1       cgd 	testl	$7,%ecx
 1.4       jtc 	jnz	L1
 1.3       jtc
 1.1       cgd L2:	shrl	$3,%ecx			# copy remainder 8 words at a time
 1.1       cgd 	jz	L4			# while swapping alternate bytes.
1.10    kleink 	_ALIGN_TEXT,0x90
 1.1       cgd L3:	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	lodsw
 1.6       jtc 	rorw	$8,%ax
 1.1       cgd 	stosw
 1.1       cgd 	decl	%ecx
 1.1       cgd 	jnz	L3
 1.1       cgd
 1.6       jtc L4:	popl	%edi
 1.1       cgd 	popl	%esi
 1.1       cgd 	ret
1.14  uebayasi END(swab)