dnl  AMD K6 mpn_lshift -- mpn left shift.
2
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
30
31 include(`../config.m4')
32
33
34 C K6: 3.0 cycles/limb
35
36
37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38 C unsigned shift);
39 C
40 C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
41 C instructions. This is despite every second fetch being unaligned.
42
43
C Stack argument slots (byte offsets as passed to defframe).
defframe(PARAM_DST,  4)
defframe(PARAM_SRC,  8)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SHIFT,16)

	TEXT
	ALIGN(32)

C mpn_lshift: shift the size-limb number at src left by `shift' bits, store
C the result at dst, and return in eax the bits shifted out of the top limb.
C
C Clobbers eax, ecx, edx, mm0, mm6 and mm7.  ebx is saved and restored, and
C emms is executed before returning from the MMX path.  Neither size nor
C shift is validated here; presumably callers guarantee size >= 1 and
C 1 <= shift <= 31 -- TODO confirm against the mpn interface contract.

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C ebx is pushed before the size test, so both the single-limb
	C path and the multi-limb path start with it already saved.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, ZF set iff size==1

	movl	PARAM_SHIFT, %ecx	C movl leaves the decl flags intact
	jnz	L(multi_limb)


	C --- size == 1 ---
	C eax is 0 here, so the double shift leaves exactly the top `shift'
	C bits of the source limb in eax.

	movl	(%ebx), %edx		C edx = src[0]
	movl	PARAM_DST, %ebx		C ebx now points at dst

	shldl(	%cl, %edx, %eax)	C eax = bits shifted out (retval)

	shll	%cl, %edx		C edx = src[0] << shift

	movl	%edx, (%ebx)		C dst[0] = shifted limb
	popl	%ebx

	ret


	ALIGN(16)	C original author's note: avoid offset 0x1f
	nop		C original author's note: avoid bad cache line crossing
L(multi_limb):
	C --- size >= 2 ---
	C On entry:
	C   eax = size-1
	C   ebx = src
	C   ecx = shift
	C   edx = not yet set

	movl	(%ebx,%eax,4), %edx	C edx = src[size-1], the top limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6	C mm6 = shift
	addl	$32, %ecx		C ecx = 32-shift

	shrl	%cl, %edx		C edx = top limb >> (32-shift) = retval

	movd	%ecx, %mm7		C mm7 = 32-shift
	movl	PARAM_DST, %ecx		C ecx = dst from here on

L(shift_loop):
	C Produces dst[eax] on each pass, going from the top limb downwards.
	C
	C   eax = limb index, counts size-1 down to 1
	C   ebx = src
	C   ecx = dst
	C   edx = return value, held until the end
	C   mm0 = scratch
	C   mm6 = shift
	C   mm7 = 32-shift

	movq	-4(%ebx,%eax,4), %mm0	C mm0 = src[eax]:src[eax-1] (hi:lo)
	decl	%eax			C sets ZF for the jnz below

	psrlq	%mm7, %mm0		C low dword = the shifted output limb

	movd	%mm0, 4(%ecx,%eax,4)	C store at the pre-decrement index
	jnz	L(shift_loop)		C flags still those from decl


	C Lowest limb: nothing below it to shift in, just src[0] << shift.

	movd	(%ebx), %mm0		C mm0 = src[0]
	popl	%ebx

	psllq	%mm6, %mm0
	movl	%edx, %eax		C return value into eax

	movd	%mm0, (%ecx)		C dst[0] = low 32 bits of the shift

	emms
	ret

EPILOGUE()
131