mul_1.asm revision 1.1.1.3 1 dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer.
2
3 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
30
31 include(`../config.m4')
32
33 C cycles/limb
34 C AMD K8,K9 3.65
35 C AMD K10 3.30 3.68
36 C AMD bull 4.04 4.29
37 C AMD pile 4.33
38 C AMD steam
39 C AMD excavator
40 C AMD bobcat 5.73
41 C AMD jaguar 5.87
42 C Intel P4 12.5
43 C Intel core2 4.38
44 C Intel NHM 4.28
45 C Intel SBR 2.69
46 C Intel IBR 2.55
47 C Intel HWL 2.41
48 C Intel BWL 2.49
49 C Intel SKL 2.50
50 C Intel atom 20.3
51 C Intel SLM 7.8
52 C VIA nano 4.25
53
54 C The loop of this code is the result of running a code generation and
55 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
57 C TODO
58 C * Move loop code into feed-in blocks, to save insn for zeroing regs.
59
C Argument registers.  Each macro body is the register used under STD64; the
C trailing comment names the register the same argument arrives in under DOS64.
60 define(`rp', `%rdi') C DOS64: rcx   (destination pointer)
61 define(`up', `%rsi') C DOS64: rdx   (source pointer)
62 define(`n_param', `%rdx') C DOS64: r8    (limb count as passed in)
63 define(`v0', `%rcx') C DOS64: r9    (single-limb multiplier)
64
C Working copy of the limb count / loop index.  Under STD64 it lives in rbx,
C which is why both prologues push rbx before using it.
65 define(`n', `%rbx')
66
67 ABI_SUPPORT(DOS64)
68 ABI_SUPPORT(STD64)
69
C DOS64 overrides.  rp, v0 and n are rebound to the registers the arguments
C arrive in (rcx, r9, r8); up stays rsi, and the prologues copy rdx into it.
C The bare words r9 and r8 are redefined as rdi and rbx, so scratch uses
C written as %r9 and %r8 in the code become %rdi and %rbx, leaving the real
C r9 and r8 to v0 and n.
C NOTE(review): this presumably relies on the doubled quotes keeping the v0
C and n bodies from being re-expanded -- confirm against the IFDOS and quoting
C conventions defined in config.m4.
70 IFDOS(` define(`up', ``%rsi'') ') dnl
71 IFDOS(` define(`rp', ``%rcx'') ') dnl
72 IFDOS(` define(`v0', ``%r9'') ') dnl
73 IFDOS(` define(`r9', ``rdi'') ') dnl
74 IFDOS(` define(`n', ``%r8'') ') dnl
75 IFDOS(` define(`r8', ``rbx'') ') dnl
76
C mpn_mul_1c -- multiply the n-limb operand at up by the single limb v0, with
C an extra carry-in limb added to the lowest product limb.
C
C Arguments (STD64 / DOS64):
C   rp        rdi / rcx
C   up        rsi / rdx (copied to rsi below)
C   n         rdx / r8
C   v0        rcx / r9
C   carry-in  r8  / stack slot
C
C Only the first limb is handled here: u[0]*v0 is formed in rdx:rax and the
C carry-in is added to it.  Control then jumps to L(common) inside mpn_mul_1,
C which stores the result limbs through rp, returns the final high limb in
C rax, and restores the registers pushed here.
77 ASM_START()
78 TEXT
79 ALIGN(16)
80 PROLOGUE(mpn_mul_1c)
81 IFDOS(``push %rsi '') C rsi will hold up; callee-saved under the Microsoft x64 ABI
82 IFDOS(``push %rdi '') C rdi stands in for scratch r9 under DOS64; callee-saved too
83 IFDOS(``mov %rdx, %rsi '') C up arrives in rdx, which mul is about to overwrite
84
85 mov (up), %rax C read first u limb early
86 push %rbx C rbx holds n (STD64) or stands in for scratch r8 (DOS64)
87 IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
88 IFDOS(` mov n, %r11 ') C copy of n in r11, consumed by the n mod 4 dispatch
89 mul v0 C rdx:rax = u[0] * v0
90
91 IFSTD(` add %r8, %rax ') C add the carry-in limb (fifth argument, in r8)
92 IFDOS(` add 64(%rsp), %rax ') C carry-in is on the stack: 40 + 3*8 (3 push insns)
93 adc $0, %rdx C propagate the carry into the high limb
94 jmp L(common) C continue in the shared body of mpn_mul_1
95
96 EPILOGUE()
97
C mpn_mul_1 -- multiply the n-limb operand at up by the single limb v0, store
C the n result limbs at rp and return the most significant (carry-out) limb
C in rax.
C
C Arguments (STD64 / DOS64): rp rdi/rcx, up rsi/rdx, n rdx/r8, v0 rcx/r9.
C The first source limb is read before n is examined, so callers presumably
C must pass n >= 1.
C
C Structure:
C   * The prologue forms u[0]*v0 in rdx:rax.
C   * L(common), also entered from mpn_mul_1c, dispatches on n mod 4 to one of
C     four feed-in blocks L(b0)..L(b3).
C   * Each feed-in block biases the pointers and index and enters the 4-way
C     unrolled loop at the matching point (L(L0), L(L1), L(top), L(L3)).
C   * L(end) handles the last product and the final two stores.
C
C Invariants once the feed-in block has run:
C   rp  = rp_in + 8*n - 16, so (rp) and 8(rp) are the last two result limbs
C   up  = up_in + 8*n, just past the last source limb
C   n   = negative limb index, stepped by 4 until it reaches exactly zero
C   r8..r11 rotate as holders of product limbs that are not yet stored
C Under DOS64 the names r8 and r9 written below are remapped by the IFDOS
C defines near the top of the file.
98 ALIGN(16)
99 PROLOGUE(mpn_mul_1)
100 IFDOS(``push %rsi '') C rsi will hold up; callee-saved under the Microsoft x64 ABI
101 IFDOS(``push %rdi '') C rdi stands in for scratch r9 under DOS64; callee-saved too
102 IFDOS(``mov %rdx, %rsi '') C up arrives in rdx, which mul is about to overwrite
103
104 mov (up), %rax C read first u limb early
105 push %rbx C rbx holds n (STD64) or stands in for scratch r8 (DOS64)
106 IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
107 IFDOS(` mov n, %r11 ') C copy of n in r11, consumed by the dispatch below
108 mul v0 C rdx:rax = u[0] * v0
109
110 L(common): C entry from mpn_mul_1c: rdx:rax = first product (+ carry-in), r11 = n
111 IFSTD(` mov %r11, n ') C STD64 keeps n in rbx; under DOS64 n is already in place
112
113 and $3, R32(%r11) C r11 = n mod 4; ZF set when n is a multiple of 4
114 lea -16(rp,n,8), rp C rp -> second-last result limb; lea leaves the flags alone
115 jz L(b0) C n mod 4 == 0
116 cmp $2, R32(%r11)
117 jb L(b1) C n mod 4 == 1
118 jz L(b2) C n mod 4 == 2; fall through for n mod 4 == 3
119
C n mod 4 == 3: first product goes to r10 (low) and r11 (high), the second
C product is started here, and the loop is entered at L(L3).
120 L(b3): mov %rax, %r10
121 mov %rdx, %r11
122 mov 8(up), %rax C second source limb
123 mul v0 C rdx:rax = u[1] * v0
124 lea (up,n,8), up C up -> just past the last source limb
125 not n C n = -(n+1), so the add of 4 at L(L3) lands on a multiple of 4
126 jmp L(L3)
127
C n mod 4 == 0: first product goes to r9 (low) and r10 (high); enter at L(L0).
128 L(b0): mov %rax, %r9
129 mov %rdx, %r10
130 mov 8(up), %rax C second source limb, multiplied at L(L0)
131 lea (up,n,8), up C up -> just past the last source limb
132 neg n C n = -n
133 jmp L(L0)
134
C n mod 4 == 1: the low limb of the first product is already final.  For
C n == 1 it is the only limb and L(n1) stores it and returns.  Otherwise it
C is stored at once and the loop is entered at L(L1) with the high limb in r9.
135 L(b1): mov %rax, %r8
136 cmp $1, n
137 jz L(n1) C single-limb operand: 8(rp) is rp_in[0], rdx is the return value
138 mov %rdx, %r9
139 lea (up,n,8), up C up -> just past the last source limb
140 neg n
141 mov %r8, 16(rp,n,8) C store the lowest result limb (address equals rp_in)
142 inc n C n = -n + 1, so L(L1) loads u[1]
143 jmp L(L1)
144
C n mod 4 == 2: first product goes to r11 (low) and r8 (high).  For n == 2
C the index becomes zero and L(end) finishes directly; otherwise execution
C falls into the top of the loop.
145 L(b2): mov %rax, %r11
146 mov %rdx, %r8
147 mov 8(up), %rax C second source limb, multiplied at L(top) or L(end)
148 lea (up,n,8), up C up -> just past the last source limb
149 neg n
150 add $2, n C n = -n + 2
151 jns L(end) C n was exactly 2: no loop iterations needed
152
C Main loop, four limbs per iteration.  Each stage multiplies one source
C limb, adds the low half into the high half left by the previous stage, and
C keeps the new high half (plus carry) for the next stage.  Stores lag behind
C the arithmetic, and the loads placed between add and adc are mov
C instructions, which do not disturb the carry flag.
153 ALIGN(16)
154 L(top): mul v0 C product of the limb loaded at the end of the previous pass
155 mov %rdx, %r9
156 add %rax, %r8
157 adc $0, %r9
158 mov %r8, 8(rp,n,8) C store the limb just completed
159 mov %r11, (rp,n,8) C delayed store of the limb completed before it
160 L(L1): mov (up,n,8), %rax
161 mul v0
162 add %rax, %r9
163 mov %rdx, %r10
164 mov 8(up,n,8), %rax C next source limb; carry from the add is still pending
165 adc $0, %r10
166 L(L0): mul v0
167 add %rax, %r10
168 mov %rdx, %r11
169 mov 16(up,n,8), %rax C next source limb; carry from the add is still pending
170 adc $0, %r11
171 mul v0
172 mov %r9, 16(rp,n,8) C store the limb completed at L(L1)
173 L(L3): add %rax, %r11
174 mov %r10, 24(rp,n,8) C store the limb completed at L(L0)
175 mov %rdx, %r8
176 adc $0, %r8
177 add $4, n C advance four limbs; sign flag drives the js below
178 mov -8(up,n,8), %rax C source limb for L(top) or L(end); flags unaffected
179 js L(top) C loop while the index is still negative
180
C Wind-down: n is zero here, so rp addresses the last two result limbs.
181 L(end): mul v0 C product of the last source limb
182 add %rax, %r8
183 adc $0, %rdx C rdx = carry-out limb
184 mov %r11, (rp) C second-last result limb
185 L(n1): mov %r8, 8(rp) C last result limb (the only one when n == 1)
186 mov %rdx, %rax C return the most significant limb
187
188 pop %rbx C undo the prologue pushes in reverse order
189 IFDOS(``pop %rdi '')
190 IFDOS(``pop %rsi '')
191 ret
192 EPILOGUE()
193 ASM_END()
194