swab.S revision 1.14 1 1.1 cgd /*
2 1.11 salo * Written by J.T. Conklin <jtc (at) NetBSD.org>.
3 1.8 jtc * Public domain.
4 1.1 cgd */
5 1.1 cgd
6 1.7 jtc #include <machine/asm.h>
7 1.7 jtc
8 1.2 jtc #if defined(LIBC_SCCS)
9 1.14 uebayasi RCSID("$NetBSD: swab.S,v 1.14 2014/05/23 02:34:19 uebayasi Exp $")
10 1.1 cgd #endif
11 1.1 cgd
12 1.1 cgd /*
13 1.1 cgd * On the i486, this code is negligibly faster than the code generated
14 1.3 jtc * by gcc at about half the size. If my i386 databook is correct, it
15 1.1 cgd * should be considerably faster than the gcc code on an i386.
16 1.1 cgd */
17 1.1 cgd
18 1.1 cgd ENTRY(swab)
19 1.1 cgd pushl %esi
20 1.1 cgd pushl %edi
21 1.1 cgd movl 12(%esp),%esi
22 1.1 cgd movl 16(%esp),%edi
23 1.1 cgd movl 20(%esp),%ecx
24 1.1 cgd
25 1.4 jtc shrl $1,%ecx
26 1.1 cgd testl $7,%ecx # copy first group of 1 to 7 words
27 1.12 rpaulo jz L2 # while swapping alternate bytes.
28 1.10 kleink _ALIGN_TEXT,0x90
29 1.1 cgd L1: lodsw
30 1.6 jtc rorw $8,%ax
31 1.1 cgd stosw
32 1.1 cgd decl %ecx
33 1.1 cgd testl $7,%ecx
34 1.4 jtc jnz L1
35 1.3 jtc
36 1.1 cgd L2: shrl $3,%ecx # copy remainder 8 words at a time
37 1.1 cgd jz L4 # while swapping alternate bytes.
38 1.10 kleink _ALIGN_TEXT,0x90
39 1.1 cgd L3: lodsw
40 1.6 jtc rorw $8,%ax
41 1.1 cgd stosw
42 1.1 cgd lodsw
43 1.6 jtc rorw $8,%ax
44 1.1 cgd stosw
45 1.1 cgd lodsw
46 1.6 jtc rorw $8,%ax
47 1.1 cgd stosw
48 1.1 cgd lodsw
49 1.6 jtc rorw $8,%ax
50 1.1 cgd stosw
51 1.1 cgd lodsw
52 1.6 jtc rorw $8,%ax
53 1.1 cgd stosw
54 1.1 cgd lodsw
55 1.6 jtc rorw $8,%ax
56 1.1 cgd stosw
57 1.1 cgd lodsw
58 1.6 jtc rorw $8,%ax
59 1.1 cgd stosw
60 1.1 cgd lodsw
61 1.6 jtc rorw $8,%ax
62 1.1 cgd stosw
63 1.1 cgd decl %ecx
64 1.1 cgd jnz L3
65 1.1 cgd
66 1.6 jtc L4: popl %edi
67 1.1 cgd popl %esi
68 1.1 cgd ret
69 1.14 uebayasi END(swab)
70