dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm:  We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
C (No register window save is performed; the arguments are actually live in
C %o0-%o3 throughout.)
define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
C mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
C Multiply {up,n} by the single limb v, store the low limb of each product
C at rp[i], and return the final carry limb in %o0.  v is split into two
C 16-bit halves which are converted to doubles; each limb product is formed
C as two FP multiplies whose 48-bit results are fixed back to integers via
C an on-stack scratch area (two alternating 16-byte halves, toggled with
C xor %o5,16).  The loop is software-pipelined; .L5 .. .L1 drain the pipe.
	add	%sp, -FSIZE, %sp	C allocate scratch frame
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]

C n == 1: compute the lone product, then join the common tail at .L1.
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L1
	add	%o0, -16, %o0	C (delay slot) bias rp for tail's fixed offsets

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2	C (delay slot)

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L2
	add	%o0, -12, %o0	C (delay slot) bias rp for tail's fixed offsets

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2	C (delay slot)

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0	C (delay slot) bias rp for tail's fixed offsets

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2	C (delay slot)

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	b	.L4
	add	%o0, -4, %o0	C (delay slot) bias rp for tail's fixed offsets

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2	C (delay slot)
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%o0, 4, %o0	C rp++
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
C --  2
	nop
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	fanop
C --  3
	nop
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4
	srlx	%g4, 32, %g3	C new cy
	add	%o1, 4, %o1	C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5	C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2	C (delay slot)
C END MAIN LOOP

C Wind-down: .L5 .. .L1 drain the software pipeline one stage at a time;
C the shorter-n entry paths branch into the appropriate stage.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g4, %g3, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3	C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3	C new cy

.L3:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3	C new cy

.L2:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3	C new cy

.L1:	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3	C new cy

	mov	%g3, %o0	C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C (delay slot) deallocate scratch frame
EPILOGUE(mpn_mul_1)