addmul_2.asm revision 1.1.1.1.4.2 1 dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
2 dnl number and add the result to an n-limb vector.
3
4 dnl Copyright 2002, 2003 Free Software Foundation, Inc.
5
6 dnl This file is part of the GNU MP Library.
7
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
12
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
17
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23 C cycles/limb
24 C UltraSPARC 1&2: 9
25 C UltraSPARC 3: 10
26
27 C Algorithm: We use 16 floating-point multiplies per limb product, with the
28 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
29 C split into 32-bit pieces. We sum four 48-bit partial products using
30 C floating-point add, then convert the resulting four 50-bit quantities and
31 C transfer them to the integer unit.
32
33 C Possible optimizations:
34 C 1. Align the stack area where we transfer the four 50-bit product-sums
35 C to a 32-byte boundary. That would minimize the cache collision.
36 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
37 C be to align the area to map to the area immediately before up?)
38 C 2. Perform two of the fp->int conversions with integer instructions. We
39 C can get almost ten free IEU slots, if we clean up bookkeeping and the
40 C silly carry-limb code.
41 C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
42 C code.
43
44 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
45 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
46 C FI = 20
47 C L = 9 x un * vn
48 C WDFI = 10 x vn / 2
49 C WD = 4
50
51 C Instruction classification (as per UltraSPARC functional units).
52 C Assuming silly carry code is fixed. Includes bookkeeping.
53 C
54 C mpn_addmul_X mpn_mul_X
55 C 1 2 1 2
56 C ========== ==========
57 C FM 8 16 8 16
58 C FA 10 18 10 18
59 C MEM 12 12 10 10
60 C ISHIFT 6 6 6 6
61 C IADDLOG 11 11 10 10
62 C BRANCH 1 1 1 1
63 C
64 C TOTAL IEU 17 17 16 16
65 C TOTAL 48 64 45 61
66 C
67 C IEU cycles 8.5 8.5 8 8
68 C MEM cycles 12 12 10 10
69 C ISSUE cycles 12 16 11.25 15.25
70 C FPU cycles 10 18 10 18
71 C cycles/loop 12 18 12 18
72 C cycles/limb 12 9 12 9
73
74
75 C INPUT PARAMETERS
76 C rp[n + 1] i0
77 C up[n] i1
78 C n i2
79 C vp[2] i3
80
81
C Assembler preamble.  ASM_START and REGISTER are macros defined outside this
C file.  NOTE(review): REGISTER presumably emits the assembler register
C directive that marks a global register as scratch -- confirm in the shared
C m4 definitions.  Both %g2 and %g3 are used as temporaries by the code below
C (%g3 is aliased to rlimb).
82 ASM_START()
83 REGISTER(%g2,#scratch)
84 REGISTER(%g3,#scratch)
85
86 C Combine registers:
87 C u00_hi= u32_hi
88 C u00_lo= u32_lo
89 C a000 = out000
90 C a016 = out016
91 C Free: f52 f54
92
93
C Register allocation.  Every name below is an m4 alias for a SPARC register.
C
C p000..p112b: destinations of the fmuld partial products.  The number in the
C name matches the number in the name of the v piece that was multiplied.
C p096a/p112a receive the u00 products and p096b/p112b receive the u32
C products.
94 define(`p000', `%f8') define(`p016',`%f10')
95 define(`p032',`%f12') define(`p048',`%f14')
96 define(`p064',`%f16') define(`p080',`%f18')
97 define(`p096a',`%f20') define(`p112a',`%f22')
98 define(`p096b',`%f56') define(`p112b',`%f58')
99
C out000/out016: results of fdtox, spilled to the stack scratch area with std
C so that the integer unit can reload them with ldx.
100 define(`out000',`%f0') define(`out016',`%f6')
101
C v000..v112: the eight 16-bit pieces of the 2-limb v operand, each converted
C to double during initialization.  v000..v048 come from vp[0] and
C v064..v112 from vp[1].
102 define(`v000',`%f24') define(`v016',`%f26')
103 define(`v032',`%f28') define(`v048',`%f30')
104 define(`v064',`%f44') define(`v080',`%f46')
105 define(`v096',`%f48') define(`v112',`%f50')
106
C u00/u32: the low and the high 32 bits of the current up limb as doubles,
C produced by fxtod from the u00_hi and u32_hi register pairs.
107 define(`u00',`%f32') define(`u32', `%f34')
108
C a000..a080: sums of partial products, written by faddd (and directly by
C fmuld or fmovd while the pipeline is being filled).
109 define(`a000',`%f36') define(`a016',`%f38')
110 define(`a032',`%f40') define(`a048',`%f42')
111 define(`a064',`%f60') define(`a080',`%f62')
112
C u00_hi:u00_lo and u32_hi:u32_lo are the two 32-bit halves of the register
C pairs starting at %f2 and %f4.  The hi halves are cleared once during
C initialization; the lo halves are loaded with 32 bits of up[i] by ld, after
C which fxtod on the hi name converts the whole 64-bit pair.
113 define(`u00_hi',`%f2') define(`u32_hi',`%f4')
114 define(`u00_lo',`%f3') define(`u32_lo',`%f5')
115
C Integer side.
C cy:        running carry between 32-bit result words.
C rlimb:     sum of a reloaded product word, an rp word and the carry.
C i00, i16:  64-bit integers reloaded from the stack scratch area.
C r00, r32:  low and high 32 bits of rp[i], read with lduw.
C xffffffff: mask with the low 32 bits set (-1 shifted right by 32).
C xffff:     mask with the low 16 bits set (-1 shifted right by 48); only the
C            non-VIS initialization path sets and uses it.  It shares %o0
C            with the value written in the delay slot of the final return.
116 define(`cy',`%g1')
117 define(`rlimb',`%g3')
118 define(`i00',`%l0') define(`i16',`%l1')
119 define(`r00',`%l2') define(`r32',`%l3')
120 define(`xffffffff',`%l7')
121 define(`xffff',`%o0')
122
123
124 PROLOGUE(mpn_addmul_2)
C mpn_addmul_2: multiply the n-limb operand at up by the 2-limb operand at vp
C and add the product into the vector at rp.  All multiplies are done in the
C floating-point unit; see the algorithm notes near the top of this file.
C In  (after the save below): %i0 = rp, %i1 = up, %i2 = n, %i3 = vp.
C Out: the final carry limb, written to %o0 in the delay slot of the return.
125
126 C Initialization. (1) Split v operand into eight 16-bit chunks and store them
127 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
128 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
129 C This code could be better scheduled.
130
C New register window plus a 256-byte stack frame.  The scratch slots used
C throughout this function live at %sp+2223+0 .. %sp+2223+56.
C NOTE(review): 2223 is presumably 2047 (the SPARC V9 stack bias) plus 176 --
C confirm against the ABI.
131 save %sp, -256, %sp
132
133 ifdef(`HAVE_VIS',
134 ` mov -1, %g4
C VIS path: every ldda below goes through ASI 0xD2 and the offsets step by
C two bytes, so each load picks up one 16-bit piece of vp directly into an
C fp register.  Offsets 6,4,2,0 cover vp[0] and 14,12,10,8 cover vp[1],
C giving the same piece order as the non-VIS path.
C NOTE(review): 0xD2 is presumably the VIS 16-bit short floating-point load
C ASI -- confirm against the UltraSPARC manual.
135 wr %g0, 0xD2, %asi
136 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
137 ldda [%i3+6] %asi, v000
138 ldda [%i3+4] %asi, v016
139 ldda [%i3+2] %asi, v032
140 ldda [%i3+0] %asi, v048
C The integer-to-double conversions are interleaved with the remaining loads.
141 fxtod v000, v000
142 ldda [%i3+14] %asi, v064
143 fxtod v016, v016
144 ldda [%i3+12] %asi, v080
145 fxtod v032, v032
146 ldda [%i3+10] %asi, v096
147 fxtod v048, v048
148 ldda [%i3+8] %asi, v112
149 fxtod v064, v064
150 fxtod v080, v080
151 fxtod v096, v096
152 fxtod v112, v112
C Clear the upper halves of the u register pairs; only the lower halves are
C reloaded per limb.
153 fzero u00_hi
154 fzero u32_hi
155 ',
156 ` mov -1, %g4
C Non-VIS path: split vp[0] and vp[1] into 16-bit pieces with integer shifts
C and masks, spill each piece as a 64-bit integer to the stack scratch area,
C then reload the pieces into fp registers and convert them to double.
157 ldx [%i3+0], %l0 C vp[0]
158 srlx %g4, 48, xffff C store mask in register `xffff'
159 ldx [%i3+8], %l1 C vp[1]
160
C Pieces of vp[0] go to slots 0..24 and pieces of vp[1] to slots 32..56.
C The top piece of each limb needs no mask because a right shift by 48
C leaves only 16 bits.
161 and %l0, xffff, %g2
162 stx %g2, [%sp+2223+0]
163 srlx %l0, 16, %g3
164 and %g3, xffff, %g3
165 stx %g3, [%sp+2223+8]
166 srlx %l0, 32, %g2
167 and %g2, xffff, %g2
168 stx %g2, [%sp+2223+16]
169 srlx %l0, 48, %g3
170 stx %g3, [%sp+2223+24]
171 and %l1, xffff, %g2
172 stx %g2, [%sp+2223+32]
173 srlx %l1, 16, %g3
174 and %g3, xffff, %g3
175 stx %g3, [%sp+2223+40]
176 srlx %l1, 32, %g2
177 and %g2, xffff, %g2
178 stx %g2, [%sp+2223+48]
179 srlx %l1, 48, %g3
180 stx %g3, [%sp+2223+56]
181
182 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
183
C Reload the spilled pieces as 64-bit fp values and convert each to double.
184 ldd [%sp+2223+0], v000
185 ldd [%sp+2223+8], v016
186 ldd [%sp+2223+16], v032
187 ldd [%sp+2223+24], v048
188 fxtod v000, v000
189 ldd [%sp+2223+32], v064
190 fxtod v016, v016
191 ldd [%sp+2223+40], v080
192 fxtod v032, v032
193 ldd [%sp+2223+48], v096
194 fxtod v048, v048
195 ldd [%sp+2223+56], v112
196 fxtod v064, v064
C The 32-bit word at offset 0 is the upper half of the 64-bit piece stored
C there (big-endian layout), and that piece is below 2^16, so the word is
C zero.  It is used to clear the upper halves of the u register pairs.
197 ld [%sp+2223+0], u00_hi C zero u00_hi
198 fxtod v080, v080
199 ld [%sp+2223+0], u32_hi C zero u32_hi
200 fxtod v096, v096
201 fxtod v112, v112
202 ')
203 C Initialization done.
C Clear the integer carry-chain state (%g2, rlimb, %g4) and move rp back by
C one limb: the integer code advances %i0 by 8 before each pair of stw
C stores, and reads the rp word to be added at [%i0+4+8] before that advance.
204 mov 0, %g2
205 mov 0, rlimb
206 mov 0, %g4
207 add %i0, -8, %i0 C BOOKKEEPING
208
209 C Start software pipeline.
C Feed-in for the first limb: load its low 32 bits into u00_lo and its high
C 32 bits into u32_lo, convert both register pairs to double, and issue the
C eight u00 products.  The first four products are written straight into
C a000..a048 because nothing has been accumulated yet; the other four go to
C p064, p080, p096a and p112a.
210
211 ld [%i1+4], u00_lo C read low 32 bits of up[i]
212 fxtod u00_hi, u00
213 C mid
214 ld [%i1+0], u32_lo C read high 32 bits of up[i]
215 fmuld u00, v000, a000
216 fmuld u00, v016, a016
217 fmuld u00, v032, a032
218 fmuld u00, v048, a048
C %i2 counts the limbs still to be loaded; %i1 is advanced past the limb
C that was just read.
219 add %i2, -1, %i2 C BOOKKEEPING
220 fmuld u00, v064, p064
221 add %i1, 8, %i1 C BOOKKEEPING
222 fxtod u32_hi, u32
223 fmuld u00, v080, p080
224 fmuld u00, v096, p096a
C More limbs remain: continue at .L_2_or_more, otherwise fall through to .L1.
C The fmuld after the branch sits in its delay slot and executes on both
C paths.
225 brnz,pt %i2, .L_2_or_more
226 fmuld u00, v112, p112a
227
C Single-limb case: reached by fall-through when no limbs remain after the
C first one.  Finish the eight u32 products, spill the first two converted
C sums to slots 16/24 and the next two to slots 0/8, then join wind-down
C phase 2, skipping the main loop and wind-down phase 1.  No rp word has
C been read or written at this point.
228 .L1: fdtox a000, out000
229 fmuld u32, v000, p000
230 fdtox a016, out016
231 fmuld u32, v016, p016
C a064/a080 are plain copies here; in the main loop the corresponding step
C is an faddd with p096b/p112b, which have not been written yet on this path.
232 fmovd p064, a064
233 fmuld u32, v032, p032
234 fmovd p080, a080
235 fmuld u32, v048, p048
236 std out000, [%sp+2223+16]
237 faddd p000, a032, a000
238 fmuld u32, v064, p064
239 std out016, [%sp+2223+24]
C NOTE(review): u00 is not read again on this path (the wind-down code has
C no further multiplies), so this conversion looks redundant here.
240 fxtod u00_hi, u00
241 faddd p016, a048, a016
242 fmuld u32, v080, p080
243 faddd p032, a064, a032
244 fmuld u32, v096, p096b
245 faddd p048, a080, a048
246 fmuld u32, v112, p112b
247 C mid
248 fdtox a000, out000
249 fdtox a016, out016
250 faddd p064, p096a, a064
251 faddd p080, p112a, a080
252 std out000, [%sp+2223+0]
C The std after the branch is in its delay slot.
253 b .L_wd2
254 std out016, [%sp+2223+8]
255
C Feed-in when at least two limbs are present.  Same FP work as .L1 for the
C first limb, interleaved with loading the second limb and issuing its u00
C products.  The integer carry chain has not started yet: nothing is read
C from or written to rp in this block.
256 .L_2_or_more:
257 ld [%i1+4], u00_lo C read low 32 bits of up[i]
258 fdtox a000, out000
259 fmuld u32, v000, p000
260 fdtox a016, out016
261 fmuld u32, v016, p016
C Plain copies: p096b/p112b do not exist yet for the first limb.
262 fmovd p064, a064
263 fmuld u32, v032, p032
264 fmovd p080, a080
265 fmuld u32, v048, p048
266 std out000, [%sp+2223+16]
267 faddd p000, a032, a000
268 fmuld u32, v064, p064
269 std out016, [%sp+2223+24]
270 fxtod u00_hi, u00
271 faddd p016, a048, a016
272 fmuld u32, v080, p080
273 faddd p032, a064, a032
274 fmuld u32, v096, p096b
275 faddd p048, a080, a048
276 fmuld u32, v112, p112b
277 C mid
278 ld [%i1+0], u32_lo C read high 32 bits of up[i]
279 fdtox a000, out000
280 fmuld u00, v000, p000
281 fdtox a016, out016
282 fmuld u00, v016, p016
283 faddd p064, p096a, a064
284 fmuld u00, v032, p032
285 faddd p080, p112a, a080
286 fmuld u00, v048, p048
287 add %i2, -1, %i2 C BOOKKEEPING
288 std out000, [%sp+2223+0]
289 faddd p000, a032, a000
290 fmuld u00, v064, p064
291 add %i1, 8, %i1 C BOOKKEEPING
292 std out016, [%sp+2223+8]
293 fxtod u32_hi, u32
294 faddd p016, a048, a016
295 fmuld u00, v080, p080
296 faddd p032, a064, a032
297 fmuld u00, v096, p096a
298 faddd p048, a080, a048
C Limbs still remain after the second one: enter the main loop.  The fmuld
C in the delay slot executes on both paths.
299 brnz,pt %i2, .L_3_or_more
300 fmuld u00, v112, p112a
301
C Exactly two limbs: skip the main loop and go to wind-down phase 1.
302 b .Lend
303 nop
304
305 C 64 32 0
306 C . . .
307 C . |__rXXX_| 32
308 C . |___cy___| 34
309 C . |_______i00__| 50
310 C |_______i16__| . 50
311
312
313 C BEGIN MAIN LOOP
C Each iteration consists of two half-steps separated by the midloop marker.
C
C FP side: the first half multiplies u32 by the eight v pieces, the second
C half multiplies u00 (the low 32 bits of the limb loaded at the top of this
C iteration).  faddd folds the new products into the a sums, fdtox converts
C a000 and a016 to 64-bit integers, and std spills them to the stack scratch
C area (slots 16/24 in the first half, slots 0/8 in the second).
C
C Integer side: each half reloads into i00 and i16 the two values spilled
C earlier to the same slots (the ldx precedes the std that overwrites the
C slot), adds one 32-bit word of rp and the running carry to i00, adds i16
C shifted left by 16, and stores the low 32 bits with stw.  The first half
C produces the word at [%i0+4], the second half the word at [%i0+0].  The
C next carry is i16 shifted right by 16 plus the bits above bit 31 of the
C sum, and is completed at the start of the following half-step.
C
C The bare C lines split the code into groups of four instructions.
C NOTE(review): the nop instructions presumably pad those groups for
C instruction issue -- confirm before removing them.
314 .align 16
315 .L_3_or_more:
316 .Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
317 and %g2, xffffffff, %g2
318 fdtox a000, out000
319 fmuld u32, v000, p000
320 C
321 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
322 add %g2, rlimb, %l5
323 fdtox a016, out016
324 fmuld u32, v016, p016
325 C
326 srlx %l5, 32, cy
327 ldx [%sp+2223+16], i00
328 faddd p064, p096b, a064
329 fmuld u32, v032, p032
330 C
331 add %g4, cy, cy C new cy
332 ldx [%sp+2223+24], i16
333 faddd p080, p112b, a080
334 fmuld u32, v048, p048
335 C
336 nop
337 std out000, [%sp+2223+16]
338 faddd p000, a032, a000
339 fmuld u32, v064, p064
340 C
341 add i00, r00, rlimb
342 add %i0, 8, %i0 C BOOKKEEPING
343 std out016, [%sp+2223+24]
344 fxtod u00_hi, u00
345 C
346 sllx i16, 16, %g2
347 add cy, rlimb, rlimb
348 faddd p016, a048, a016
349 fmuld u32, v080, p080
350 C
351 srlx i16, 16, %g4
352 add %g2, rlimb, %l5
353 faddd p032, a064, a032
354 fmuld u32, v096, p096b
355 C
356 stw %l5, [%i0+4]
357 nop
358 faddd p048, a080, a048
359 fmuld u32, v112, p112b
360 C midloop
361 ld [%i1+0], u32_lo C read high 32 bits of up[i]
362 and %g2, xffffffff, %g2
363 fdtox a000, out000
364 fmuld u00, v000, p000
365 C
366 lduw [%i0+0], r32 C read high 32 bits of rp[i]
367 add %g2, rlimb, %l5
368 fdtox a016, out016
369 fmuld u00, v016, p016
370 C
371 srlx %l5, 32, cy
372 ldx [%sp+2223+0], i00
373 faddd p064, p096a, a064
374 fmuld u00, v032, p032
375 C
376 add %g4, cy, cy C new cy
377 ldx [%sp+2223+8], i16
378 faddd p080, p112a, a080
379 fmuld u00, v048, p048
380 C
381 add %i2, -1, %i2 C BOOKKEEPING
382 std out000, [%sp+2223+0]
383 faddd p000, a032, a000
384 fmuld u00, v064, p064
385 C
386 add i00, r32, rlimb
387 add %i1, 8, %i1 C BOOKKEEPING
388 std out016, [%sp+2223+8]
389 fxtod u32_hi, u32
390 C
391 sllx i16, 16, %g2
392 add cy, rlimb, rlimb
393 faddd p016, a048, a016
394 fmuld u00, v080, p080
395 C
396 srlx i16, 16, %g4
397 add %g2, rlimb, %l5
398 faddd p032, a064, a032
399 fmuld u00, v096, p096a
400 C
401 stw %l5, [%i0+0]
402 faddd p048, a080, a048
C Loop while limbs remain; the fmuld in the delay slot executes on both paths.
403 brnz,pt %i2, .Loop
404 fmuld u00, v112, p112a
405 C END MAIN LOOP
406
407 C WIND-DOWN PHASE 1
C Same structure as one main-loop iteration, but no further limb of up is
C loaded.  The first half still issues the eight u32 products for the last
C limb; the second half has no multiplies and only adds, converts and spills.
C The integer side keeps running unchanged: reload two spilled values, add
C the rp word and the carry, store one 32-bit word per half.
408 .Lend: and %g2, xffffffff, %g2
409 fdtox a000, out000
410 fmuld u32, v000, p000
411 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
412 add %g2, rlimb, %l5
413 fdtox a016, out016
414 fmuld u32, v016, p016
415 srlx %l5, 32, cy
416 ldx [%sp+2223+16], i00
417 faddd p064, p096b, a064
418 fmuld u32, v032, p032
419 add %g4, cy, cy C new cy
420 ldx [%sp+2223+24], i16
421 faddd p080, p112b, a080
422 fmuld u32, v048, p048
423 std out000, [%sp+2223+16]
424 faddd p000, a032, a000
425 fmuld u32, v064, p064
426 add i00, r00, rlimb
427 add %i0, 8, %i0 C BOOKKEEPING
428 std out016, [%sp+2223+24]
429 sllx i16, 16, %g2
430 add cy, rlimb, rlimb
431 faddd p016, a048, a016
432 fmuld u32, v080, p080
433 srlx i16, 16, %g4
434 add %g2, rlimb, %l5
435 faddd p032, a064, a032
436 fmuld u32, v096, p096b
437 stw %l5, [%i0+4]
438 faddd p048, a080, a048
439 fmuld u32, v112, p112b
440 C mid
441 and %g2, xffffffff, %g2
442 fdtox a000, out000
443 lduw [%i0+0], r32 C read high 32 bits of rp[i]
444 add %g2, rlimb, %l5
445 fdtox a016, out016
446 srlx %l5, 32, cy
447 ldx [%sp+2223+0], i00
448 faddd p064, p096a, a064
449 add %g4, cy, cy C new cy
450 ldx [%sp+2223+8], i16
451 faddd p080, p112a, a080
452 std out000, [%sp+2223+0]
453 add i00, r32, rlimb
454 std out016, [%sp+2223+8]
455 sllx i16, 16, %g2
456 add cy, rlimb, rlimb
457 srlx i16, 16, %g4
458 add %g2, rlimb, %l5
459 stw %l5, [%i0+0]
460
461 C WIND-DOWN PHASE 2
C Also entered directly from .L1 in the single-limb case.  No multiplies or
C adds remain on the FP side: the first half converts a032 and a048, the
C second half converts a064 and a080, and each pair is spilled to the slots
C that the integer side has just reloaded.  The integer side still adds rp
C words (r00, then r32) and stores one 32-bit result word per half.
462 .L_wd2: and %g2, xffffffff, %g2
463 fdtox a032, out000
464 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
465 add %g2, rlimb, %l5
466 fdtox a048, out016
467 srlx %l5, 32, cy
468 ldx [%sp+2223+16], i00
469 add %g4, cy, cy C new cy
470 ldx [%sp+2223+24], i16
471 std out000, [%sp+2223+16]
472 add i00, r00, rlimb
473 add %i0, 8, %i0 C BOOKKEEPING
474 std out016, [%sp+2223+24]
475 sllx i16, 16, %g2
476 add cy, rlimb, rlimb
477 srlx i16, 16, %g4
478 add %g2, rlimb, %l5
479 stw %l5, [%i0+4]
480 C mid
481 and %g2, xffffffff, %g2
482 fdtox a064, out000
483 lduw [%i0+0], r32 C read high 32 bits of rp[i]
484 add %g2, rlimb, %l5
485 fdtox a080, out016
486 srlx %l5, 32, cy
487 ldx [%sp+2223+0], i00
488 add %g4, cy, cy C new cy
489 ldx [%sp+2223+8], i16
490 std out000, [%sp+2223+0]
491 add i00, r32, rlimb
492 std out016, [%sp+2223+8]
493 sllx i16, 16, %g2
494 add cy, rlimb, rlimb
495 srlx i16, 16, %g4
496 add %g2, rlimb, %l5
497 stw %l5, [%i0+0]
498
499 C WIND-DOWN PHASE 3
C Drains the last spilled values.  The only FP work left is converting p096b
C and p112b, which are spilled to slots 16/24 and feed the return value.
C Unlike the earlier phases, no rp word is read here: the reloaded value
C goes straight into rlimb before the carry and the shifted i16 are added,
C and the two stw stores write the next limb position of rp.
500 .L_wd3: and %g2, xffffffff, %g2
501 fdtox p096b, out000
502 add %g2, rlimb, %l5
503 fdtox p112b, out016
504 srlx %l5, 32, cy
505 ldx [%sp+2223+16], rlimb
506 add %g4, cy, cy C new cy
507 ldx [%sp+2223+24], i16
508 std out000, [%sp+2223+16]
509 add %i0, 8, %i0 C BOOKKEEPING
510 std out016, [%sp+2223+24]
511 sllx i16, 16, %g2
512 add cy, rlimb, rlimb
513 srlx i16, 16, %g4
514 add %g2, rlimb, %l5
515 stw %l5, [%i0+4]
516 C mid
517 and %g2, xffffffff, %g2
518 add %g2, rlimb, %l5
519 srlx %l5, 32, cy
520 ldx [%sp+2223+0], rlimb
521 add %g4, cy, cy C new cy
522 ldx [%sp+2223+8], i16
523 sllx i16, 16, %g2
524 add cy, rlimb, rlimb
525 srlx i16, 16, %g4
526 add %g2, rlimb, %l5
527 stw %l5, [%i0+0]
528
C Final carry step: complete the carry out of the last stored word, then
C reload the conversions of p096b and p112b that were spilled above.
529 and %g2, xffffffff, %g2
530 add %g2, rlimb, %l5
531 srlx %l5, 32, cy
532 ldx [%sp+2223+16], i00
533 add %g4, cy, cy C new cy
534 ldx [%sp+2223+24], i16
535
C Return value = i00 + carry + (i16 shifted left by 16).  The last add sits
C in the delay slot of the return; its sources are the global registers %g2
C and cy (%g1).  NOTE(review): the result presumably lands in the %o0 seen
C by the caller because return restores the register window before the
C delay slot executes -- confirm against the SPARC V9 manual.
536 sllx i16, 16, %g2
537 add i00, cy, cy
538 return %i7+8
539 add %g2, cy, %o0
540 EPILOGUE(mpn_addmul_2)
541