addmul_1.asm revision 1.1.1.1.2.1 1 dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 dnl result to a second limb vector.
3
4 dnl Contributed to the GNU project by Torbjorn Granlund.
5
6 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
7 dnl Foundation, Inc.
8
9 dnl This file is part of the GNU MP Library.
10
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of the GNU Lesser General Public License as published
13 dnl by the Free Software Foundation; either version 3 of the License, or (at
14 dnl your option) any later version.
15
16 dnl The GNU MP Library is distributed in the hope that it will be useful, but
17 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 dnl License for more details.
20
21 dnl You should have received a copy of the GNU Lesser General Public License
22 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
23
24 include(`../config.m4')
25
26 C cycles/limb
27 C Itanium: 3.0
28 C Itanium 2: 2.0
29
30 C TODO
31 C * Further optimize feed-in and wind-down code, both for speed and code size.
32 C * Handle low limb input and results specially, using a common stf8 in the
33 C epilogue.
34 C * Use 1 c/l carry propagation scheme in wind-down code.
35 C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
36 C * Work out final differences with mul_1.asm. That function is 300 bytes
37 C smaller than this due to better loop scheduling and thus simpler feed-in
38 C code.
39
40 C INPUT PARAMETERS
C rp: limb vector that is read as the addend and receives the result
41 define(`rp', `r32')
C up: limb vector that is multiplied by vl
42 define(`up', `r33')
C n: limb count; the setup code uses n mod 4 and the quantity n - 1
43 define(`n', `r34')
C vl: the single multiplier limb
44 define(`vl', `r35')
45
C mpn_addmul_1: entry, register setup and dispatch on n mod 4.
C The routine multiplies each limb loaded through up by vl, adds the limb
C loaded through rp, and stores the result limbs through r20, a copy of rp.
C ar.lc is saved in r2 here and restored before every return.
46 ASM_START()
47 PROLOGUE(mpn_addmul_1)
48 .prologue
49 .save ar.lc, r2
50 .body
51
C With HAVE_ABI_32 defined, widen both pointer arguments with addp4 and
C zero-extend n from 32 bits.
52 ifdef(`HAVE_ABI_32',
53 ` addp4 rp = 0, rp C M I
54 addp4 up = 0, up C M I
55 zxt4 n = n C I
56 ;;
57 ')
C r15 = n - 1, r20 = copy of rp used by all stores, r2 = saved ar.lc.
58 {.mmi
59 adds r15 = -1, n C M I
60 mov r20 = rp C M I
61 mov.i r2 = ar.lc C I0
62 }
C Load the first limb through up into f7 and the first limb through rp into
C f8, post-incrementing both pointers by 8; r14 = n mod 4.
63 {.mmi
64 ldf8 f7 = [up], 8 C M
65 ldf8 f8 = [rp], 8 C M
66 and r14 = 3, n C M I
67 ;;
68 }
C f6 = vl moved into a floating-point register for xma; p10 is set when
C n mod 4 is 0; r31 = n - 1 shifted right by 2.
69 {.mmi
70 setf.sig f6 = vl C M2 M3
71 cmp.eq p10, p0 = 0, r14 C M I
72 shr.u r31 = r15, 2 C I0
73 }
C p11 is set when n mod 4 is 2, p12 when n mod 4 is 3.
74 {.mmi
75 cmp.eq p11, p0 = 2, r14 C M I
76 cmp.eq p12, p0 = 3, r14 C M I
77 nop.i 0 C I
78 ;;
79 }
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and
C p9, so the predicated adds later start out on their no-carry variants.
C ar.lc is loaded with r31 as the counter for the br.cloop instructions.
80 {.mii
81 cmp.ne p6, p7 = r0, r0 C M I
82 mov.i ar.lc = r31 C I0
83 cmp.ne p8, p9 = r0, r0 C M I
84 }
C Dispatch on n mod 4: 0 goes to .Lb00, 2 to .Lb10, 3 to .Lb11; the
C remaining case, 1, falls through to .Lb01.
85 {.bbb
86 (p10) br.dptk .Lb00 C B
87 (p11) br.dptk .Lb10 C B
88 (p12) br.dptk .Lb11 C B
89 ;;
90 }
91
C Feed-in code for n mod 4 = 1.  ar.lc was loaded with n - 1 shifted right
C by 2, so this br.cloop falls through only when n = 1.
92 .Lb01: br.cloop.dptk .grt1 C B
93
C n = 1: form the single product f7 * f6 + f8, store its low limb through
C r20, return its high limb in r8, restore ar.lc and return.
94 xma.l f39 = f7, f6, f8 C F
95 xma.hu f43 = f7, f6, f8 C F
96 ;;
97 getf.sig r8 = f43 C M2
98 stf8 [r20] = f39 C M2 M3
99 mov.i ar.lc = r2 C I0
100 br.ret.sptk.many b0 C B
101
C n >= 5: load four more limb pairs (f32..f35 through up, f44..f47 through
C rp) while the first product is formed in f39 (low) and f43 (high).
102 .grt1:
103 ldf8 f32 = [up], 8
104 ldf8 f44 = [rp], 8
105 ;;
106 ldf8 f33 = [up], 8
107 ldf8 f45 = [rp], 8
108 ;;
109 ldf8 f34 = [up], 8
110 xma.l f39 = f7, f6, f8
111 ldf8 f46 = [rp], 8
112 xma.hu f43 = f7, f6, f8
113 ;;
114 ldf8 f35 = [up], 8
115 ldf8 f47 = [rp], 8
116 br.cloop.dptk .grt5
117
C Fall-through, ar.lc exhausted: no further loads.  The lowest result limb
C f39 is stored directly, the remaining four products are formed, and their
C halves are moved to integer registers before joining the wind-down code
C at .Lcj5.
118 xma.l f36 = f32, f6, f44
119 xma.hu f40 = f32, f6, f44
120 ;;
121 stf8 [r20] = f39, 8
122 xma.l f37 = f33, f6, f45
123 xma.hu f41 = f33, f6, f45
124 ;;
125 getf.sig r31 = f43
126 getf.sig r24 = f36
127 xma.l f38 = f34, f6, f46
128 xma.hu f42 = f34, f6, f46
129 ;;
130 getf.sig r28 = f40
131 getf.sig r25 = f37
132 xma.l f39 = f35, f6, f47
133 xma.hu f43 = f35, f6, f47
134 ;;
135 getf.sig r29 = f41
136 getf.sig r26 = f38
137 br .Lcj5
138
C More limbs remain: prime the registers for the main loop.  r30 is zeroed
C so that the first add of r30 and r27 at .Loop or .Le0 passes r27, the low
C limb of the first product, through unchanged.
139 .grt5:
140 mov r30 = 0
141 xma.l f36 = f32, f6, f44
142 xma.hu f40 = f32, f6, f44
143 ;;
144 ldf8 f32 = [up], 8
145 xma.l f37 = f33, f6, f45
146 ldf8 f44 = [rp], 8
147 xma.hu f41 = f33, f6, f45
148 ;;
149 ldf8 f33 = [up], 8
150 getf.sig r27 = f39
151 ;;
152 getf.sig r31 = f43
153 xma.l f38 = f34, f6, f46
154 ldf8 f45 = [rp], 8
155 xma.hu f42 = f34, f6, f46
156 ;;
157 ldf8 f34 = [up], 8
158 getf.sig r24 = f36
159 ;;
160 getf.sig r28 = f40
161 xma.l f39 = f35, f6, f47
162 ldf8 f46 = [rp], 8
163 xma.hu f43 = f35, f6, f47
164 ;;
165 ldf8 f35 = [up], 8
166 getf.sig r25 = f37
C Enter the main loop while ar.lc is nonzero, otherwise go straight to the
C wind-down code at .Le0.
167 br.cloop.dptk .Loop
168 br .Le0
169
170
C Feed-in code for n mod 4 = 2.  The second limb pair is loaded into f35 and
C f47; the br.cloop falls through only when ar.lc is zero, i.e. n = 2.
171 .Lb10: ldf8 f35 = [up], 8
172 ldf8 f47 = [rp], 8
173 br.cloop.dptk .grt2
174
C n = 2: two products.  The low limb of the first is stored directly, its
C high limb goes to r30, the low limb of the second to r27 and the high limb
C of the second to r8; .Lcj2 then adds r30 and r27 and finishes.
175 xma.l f38 = f7, f6, f8
176 xma.hu f42 = f7, f6, f8
177 ;;
178 xma.l f39 = f35, f6, f47
179 xma.hu f43 = f35, f6, f47
180 ;;
181 getf.sig r30 = f42
182 stf8 [r20] = f38, 8
183 getf.sig r27 = f39
184 getf.sig r8 = f43
185 br .Lcj2
186
C n >= 6: load four more limb pairs while forming the first two products.
187 .grt2:
188 ldf8 f32 = [up], 8
189 ldf8 f44 = [rp], 8
190 ;;
191 ldf8 f33 = [up], 8
192 xma.l f38 = f7, f6, f8
193 ldf8 f45 = [rp], 8
194 xma.hu f42 = f7, f6, f8
195 ;;
196 ldf8 f34 = [up], 8
197 xma.l f39 = f35, f6, f47
198 ldf8 f46 = [rp], 8
199 xma.hu f43 = f35, f6, f47
200 ;;
201 ldf8 f35 = [up], 8
202 ldf8 f47 = [rp], 8
203 br.cloop.dptk .grt6
204
C Fall-through, ar.lc exhausted: no further loads.  The lowest result limb
C f38 is stored directly, the remaining products are formed and the
C wind-down code is joined at .Lcj6.
205 stf8 [r20] = f38, 8
206 xma.l f36 = f32, f6, f44
207 xma.hu f40 = f32, f6, f44
208 ;;
209 getf.sig r30 = f42
210 getf.sig r27 = f39
211 xma.l f37 = f33, f6, f45
212 xma.hu f41 = f33, f6, f45
213 ;;
214 getf.sig r31 = f43
215 getf.sig r24 = f36
216 xma.l f38 = f34, f6, f46
217 xma.hu f42 = f34, f6, f46
218 ;;
219 getf.sig r28 = f40
220 getf.sig r25 = f37
221 xma.l f39 = f35, f6, f47
222 xma.hu f43 = f35, f6, f47
223 br .Lcj6
224
C More limbs remain: prime the registers and enter the main loop at .LL10.
C r29 is zeroed so that the add of r29 and r26 at .LL10 passes r26, the low
C limb of the first product, through unchanged.
225 .grt6:
226 mov r29 = 0
227 xma.l f36 = f32, f6, f44
228 xma.hu f40 = f32, f6, f44
229 ;;
230 ldf8 f32 = [up], 8
231 getf.sig r26 = f38
232 ;;
233 getf.sig r30 = f42
234 xma.l f37 = f33, f6, f45
235 ldf8 f44 = [rp], 8
236 xma.hu f41 = f33, f6, f45
237 ;;
238 ldf8 f33 = [up], 8
239 getf.sig r27 = f39
240 ;;
241 getf.sig r31 = f43
242 xma.l f38 = f34, f6, f46
243 ldf8 f45 = [rp], 8
244 xma.hu f42 = f34, f6, f46
245 ;;
246 ldf8 f34 = [up], 8
247 getf.sig r24 = f36
248 br .LL10
249
250
C Feed-in code for n mod 4 = 3.  Two more limb pairs are loaded into f34/f46
C and f35/f47; the br.cloop falls through only when ar.lc is zero, i.e. n = 3.
251 .Lb11: ldf8 f34 = [up], 8
252 ldf8 f46 = [rp], 8
253 ;;
254 ldf8 f35 = [up], 8
255 ldf8 f47 = [rp], 8
256 br.cloop.dptk .grt3
257 ;;
258
C n = 3: three products.  The low limb of the first is stored directly; the
C high limbs go to r29 and r30, the low limbs of the second and third to r26
C and r27, and the high limb of the third to r8.  The adds are done from
C .Lcj3 onwards.
259 xma.l f37 = f7, f6, f8
260 xma.hu f41 = f7, f6, f8
261 xma.l f38 = f34, f6, f46
262 xma.hu f42 = f34, f6, f46
263 xma.l f39 = f35, f6, f47
264 xma.hu f43 = f35, f6, f47
265 ;;
266 getf.sig r29 = f41
267 stf8 [r20] = f37, 8
268 getf.sig r26 = f38
269 getf.sig r30 = f42
270 getf.sig r27 = f39
271 getf.sig r8 = f43
272 br .Lcj3
273
C n >= 7: load four more limb pairs while forming the first three products.
274 .grt3:
275 ldf8 f32 = [up], 8
276 xma.l f37 = f7, f6, f8
277 ldf8 f44 = [rp], 8
278 xma.hu f41 = f7, f6, f8
279 ;;
280 ldf8 f33 = [up], 8
281 xma.l f38 = f34, f6, f46
282 ldf8 f45 = [rp], 8
283 xma.hu f42 = f34, f6, f46
284 ;;
285 ldf8 f34 = [up], 8
286 xma.l f39 = f35, f6, f47
287 ldf8 f46 = [rp], 8
288 xma.hu f43 = f35, f6, f47
289 ;;
290 ldf8 f35 = [up], 8
291 getf.sig r25 = f37 C FIXME
292 ldf8 f47 = [rp], 8
293 br.cloop.dptk .grt7
294
C Fall-through, ar.lc exhausted: no further loads.  The lowest result limb
C f37 is stored directly before f37 is overwritten by a later product; the
C wind-down code is joined at .Lcj7.
295 getf.sig r29 = f41
296 stf8 [r20] = f37, 8 C FIXME
297 xma.l f36 = f32, f6, f44
298 getf.sig r26 = f38
299 xma.hu f40 = f32, f6, f44
300 ;;
301 getf.sig r30 = f42
302 xma.l f37 = f33, f6, f45
303 getf.sig r27 = f39
304 xma.hu f41 = f33, f6, f45
305 ;;
306 getf.sig r31 = f43
307 xma.l f38 = f34, f6, f46
308 getf.sig r24 = f36
309 xma.hu f42 = f34, f6, f46
310 br .Lcj7
311
C More limbs remain: prime the registers and enter the main loop at .LL11.
C r28 is zeroed so that the add of r28 and r25 at .LL11 passes r25, the low
C limb of the first product, through unchanged.
312 .grt7:
313 getf.sig r29 = f41
314 xma.l f36 = f32, f6, f44
315 mov r28 = 0
316 xma.hu f40 = f32, f6, f44
317 ;;
318 ldf8 f32 = [up], 8
319 getf.sig r26 = f38
320 ;;
321 getf.sig r30 = f42
322 xma.l f37 = f33, f6, f45
323 ldf8 f44 = [rp], 8
324 xma.hu f41 = f33, f6, f45
325 ;;
326 ldf8 f33 = [up], 8
327 getf.sig r27 = f39
328 br .LL11
329
330
C Feed-in code for n mod 4 = 0.  Three more limb pairs are loaded and the
C first product is started; the br.cloop falls through only when ar.lc is
C zero, i.e. n = 4.
331 .Lb00: ldf8 f33 = [up], 8
332 ldf8 f45 = [rp], 8
333 ;;
334 ldf8 f34 = [up], 8
335 ldf8 f46 = [rp], 8
336 ;;
337 ldf8 f35 = [up], 8
338 xma.l f36 = f7, f6, f8
339 ldf8 f47 = [rp], 8
340 xma.hu f40 = f7, f6, f8
341 br.cloop.dptk .grt4
342
C n = 4: four products.  The low limb of the first is stored directly; the
C other halves are moved to integer registers and the adds are done from
C .Lcj4 onwards, which also fetches the last high limb f43 into r8.
343 xma.l f37 = f33, f6, f45
344 xma.hu f41 = f33, f6, f45
345 xma.l f38 = f34, f6, f46
346 xma.hu f42 = f34, f6, f46
347 ;;
348 getf.sig r28 = f40
349 stf8 [r20] = f36, 8
350 xma.l f39 = f35, f6, f47
351 getf.sig r25 = f37
352 xma.hu f43 = f35, f6, f47
353 ;;
354 getf.sig r29 = f41
355 getf.sig r26 = f38
356 getf.sig r30 = f42
357 getf.sig r27 = f39
358 br .Lcj4
359
C n >= 8: load four more limb pairs while forming the remaining three of
C the first four products.
360 .grt4:
361 ldf8 f32 = [up], 8
362 xma.l f37 = f33, f6, f45
363 ldf8 f44 = [rp], 8
364 xma.hu f41 = f33, f6, f45
365 ;;
366 ldf8 f33 = [up], 8
367 xma.l f38 = f34, f6, f46
368 ldf8 f45 = [rp], 8
369 xma.hu f42 = f34, f6, f46
370 ;;
371 ldf8 f34 = [up], 8
372 getf.sig r24 = f36 C FIXME
373 xma.l f39 = f35, f6, f47
374 ldf8 f46 = [rp], 8
375 getf.sig r28 = f40
376 xma.hu f43 = f35, f6, f47
377 ;;
378 ldf8 f35 = [up], 8
379 getf.sig r25 = f37
380 ldf8 f47 = [rp], 8
381 br.cloop.dptk .grt8
382
C Fall-through, ar.lc exhausted: no further loads.  The lowest result limb
C f36 is stored directly before f36 is overwritten by the next product; the
C wind-down code is joined at .Lcj8.
383 getf.sig r29 = f41
384 stf8 [r20] = f36, 8 C FIXME
385 xma.l f36 = f32, f6, f44
386 getf.sig r26 = f38
387 getf.sig r30 = f42
388 xma.hu f40 = f32, f6, f44
389 ;;
390 xma.l f37 = f33, f6, f45
391 getf.sig r27 = f39
392 xma.hu f41 = f33, f6, f45
393 br .Lcj8
394
C More limbs remain: prime the registers and enter the main loop at .LL00.
C r31 is zeroed so that the add of r31 and r24 at .LL00 passes r24, the low
C limb of the first product, through unchanged.
395 .grt8:
396 getf.sig r29 = f41
397 xma.l f36 = f32, f6, f44
398 mov r31 = 0
399 xma.hu f40 = f32, f6, f44
400 ;;
401 ldf8 f32 = [up], 8
402 getf.sig r26 = f38
403 br .LL00
404
405
C Main loop: four result limbs per iteration, in four stages.  Each stage
C  - moves product halves from floating-point to integer registers,
C  - starts the xma.l and xma.hu pair for a later limb,
C  - adds the high limb of one product to the low limb of the following
C    product, plus 1 when the incoming carry predicate is set,
C  - loads the next operands through up and rp,
C  - derives the outgoing carry predicates: cmp.leu is used when a carry
C    came in and cmp.ltu when none did,
C  - stores the result limb through r20.
C The carry alternates between the predicate pairs p6/p7 and p8/p9.
C .LL00, .LL11 and .LL10 are entry points used by the feed-in code.
406 C *** MAIN LOOP START ***
407 ALIGN(32) C insn fed cycle #
408 .Loop:
C Stage 1: r14 = r30 + r27 + carry in p6; carry out to p8/p9.
409 .pred.rel "mutex", p6, p7 C num by i1 i2
410 getf.sig r29 = f41 C 00 16 0 0
411 xma.l f36 = f32, f6, f44 C 01 06,15 0 0
412 (p6) add r14 = r30, r27, 1 C 02 0 0
413 ldf8 f47 = [rp], 8 C 03 0 0
414 xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
415 (p7) add r14 = r30, r27 C 05 0 0
416 ;;
417 .pred.rel "mutex", p6, p7
418 ldf8 f32 = [up], 8 C 06 1 1
419 (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
420 (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
421 getf.sig r26 = f38 C 09 25 2 1
422 st8 [r20] = r14, 8 C 10 2 1
423 nop.b 0 C 11 2 1
424 ;;
C Stage 2: r16 = r31 + r24 + carry in p8; carry out to p6/p7.
425 .LL00:
426 .pred.rel "mutex", p8, p9
427 getf.sig r30 = f42 C 12 28 3 2
428 xma.l f37 = f33, f6, f45 C 13 18,27 3 2
429 (p8) add r16 = r31, r24, 1 C 14 3 2
430 ldf8 f44 = [rp], 8 C 15 3 2
431 xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
432 (p9) add r16 = r31, r24 C 17 3 2
433 ;;
434 .pred.rel "mutex", p8, p9
435 ldf8 f33 = [up], 8 C 18 4 3
436 (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
437 (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
438 getf.sig r27 = f39 C 21 37 5 3
439 st8 [r20] = r16, 8 C 22 5 3
440 nop.b 0 C 23 5 3
441 ;;
C Stage 3: r14 = r28 + r25 + carry in p6; carry out to p8/p9.
442 .LL11:
443 .pred.rel "mutex", p6, p7
444 getf.sig r31 = f43 C 24 40 6 4
445 xma.l f38 = f34, f6, f46 C 25 30,39 6 4
446 (p6) add r14 = r28, r25, 1 C 26 6 4
447 ldf8 f45 = [rp], 8 C 27 6 4
448 xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
449 (p7) add r14 = r28, r25 C 29 6 4
450 ;;
451 .pred.rel "mutex", p6, p7
452 ldf8 f34 = [up], 8 C 30 7 5
453 (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
454 (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
455 getf.sig r24 = f36 C 33 01 8 5
456 st8 [r20] = r14, 8 C 34 8 5
457 nop.b 0 C 35 8 5
458 ;;
C Stage 4: r16 = r29 + r26 + carry in p8; carry out to p6/p7.
459 .LL10:
460 .pred.rel "mutex", p8, p9
461 getf.sig r28 = f40 C 36 04 9 6
462 xma.l f39 = f35, f6, f47 C 37 42,03 9 6
463 (p8) add r16 = r29, r26, 1 C 38 9 6
464 ldf8 f46 = [rp], 8 C 39 9 6
465 xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
466 (p9) add r16 = r29, r26 C 41 9 6
467 ;;
468 .pred.rel "mutex", p8, p9
469 ldf8 f35 = [up], 8 C 42 10 7
470 (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
471 (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
472 getf.sig r25 = f37 C 45 13 11 7
473 st8 [r20] = r16, 8 C 46 11 7
C Repeat while ar.lc is nonzero; otherwise fall through into the wind-down.
474 br.cloop.dptk .Loop C 47 11 7
475 C *** MAIN LOOP END ***
476 ;;
C Wind-down code.  It repeats the add, carry and store stages of the main
C loop without loading any further limbs through up, and with fewer
C multiplies at each step.  The labels .Lcj8 down to .Lcj2 are the join
C points branched to by the feed-in code.  The carry keeps alternating
C between the predicate pairs p6/p7 and p8/p9.
477 .Le0:
478 .pred.rel "mutex", p6, p7
479 getf.sig r29 = f41 C
480 xma.l f36 = f32, f6, f44 C
481 (p6) add r14 = r30, r27, 1 C
C Last load through rp; f47 is consumed by the final xma pair after .Lcj7.
482 ldf8 f47 = [rp], 8 C
483 xma.hu f40 = f32, f6, f44 C
484 (p7) add r14 = r30, r27 C
485 ;;
486 .pred.rel "mutex", p6, p7
487 (p6) cmp.leu p8, p9 = r14, r27 C
488 (p7) cmp.ltu p8, p9 = r14, r27 C
489 getf.sig r26 = f38 C
490 st8 [r20] = r14, 8 C
491 ;;
492 .pred.rel "mutex", p8, p9
493 getf.sig r30 = f42 C
494 xma.l f37 = f33, f6, f45 C
495 (p8) add r16 = r31, r24, 1 C
496 xma.hu f41 = f33, f6, f45 C
497 (p9) add r16 = r31, r24 C
498 ;;
499 .pred.rel "mutex", p8, p9
500 (p8) cmp.leu p6, p7 = r16, r24 C
501 (p9) cmp.ltu p6, p7 = r16, r24 C
502 getf.sig r27 = f39 C
503 st8 [r20] = r16, 8 C
504 ;;
C Join point used by the .Lb00 feed-in code.
505 .Lcj8:
506 .pred.rel "mutex", p6, p7
507 getf.sig r31 = f43 C
508 xma.l f38 = f34, f6, f46 C
509 (p6) add r14 = r28, r25, 1 C
510 xma.hu f42 = f34, f6, f46 C
511 (p7) add r14 = r28, r25 C
512 ;;
513 .pred.rel "mutex", p6, p7
514 (p6) cmp.leu p8, p9 = r14, r25 C
515 (p7) cmp.ltu p8, p9 = r14, r25 C
516 getf.sig r24 = f36 C
517 st8 [r20] = r14, 8 C
518 ;;
C Join point used by the .Lb11 feed-in code.  The last xma pair is issued
C here.
519 .Lcj7:
520 .pred.rel "mutex", p8, p9
521 getf.sig r28 = f40 C
522 xma.l f39 = f35, f6, f47 C
523 (p8) add r16 = r29, r26, 1 C
524 xma.hu f43 = f35, f6, f47 C
525 (p9) add r16 = r29, r26 C
526 ;;
527 .pred.rel "mutex", p8, p9
528 (p8) cmp.leu p6, p7 = r16, r26 C
529 (p9) cmp.ltu p6, p7 = r16, r26 C
530 getf.sig r25 = f37 C
531 st8 [r20] = r16, 8 C
532 ;;
C Join point used by the .Lb10 feed-in code.  From here on there are no
C multiplies, only transfers, adds and stores.
533 .Lcj6:
534 .pred.rel "mutex", p6, p7
535 getf.sig r29 = f41 C
536 (p6) add r14 = r30, r27, 1 C
537 (p7) add r14 = r30, r27 C
538 ;;
539 .pred.rel "mutex", p6, p7
540 (p6) cmp.leu p8, p9 = r14, r27 C
541 (p7) cmp.ltu p8, p9 = r14, r27 C
542 getf.sig r26 = f38 C
543 st8 [r20] = r14, 8 C
544 ;;
C Join point used by the .Lb01 feed-in code.
545 .Lcj5:
546 .pred.rel "mutex", p8, p9
547 getf.sig r30 = f42 C
548 (p8) add r16 = r31, r24, 1 C
549 (p9) add r16 = r31, r24 C
550 ;;
551 .pred.rel "mutex", p8, p9
552 (p8) cmp.leu p6, p7 = r16, r24 C
553 (p9) cmp.ltu p6, p7 = r16, r24 C
554 getf.sig r27 = f39 C
555 st8 [r20] = r16, 8 C
556 ;;
C Join point for n = 4.  r8 receives f43, the high limb of the last product.
557 .Lcj4:
558 .pred.rel "mutex", p6, p7
559 getf.sig r8 = f43 C
560 (p6) add r14 = r28, r25, 1 C
561 (p7) add r14 = r28, r25 C
562 ;;
563 .pred.rel "mutex", p6, p7
564 st8 [r20] = r14, 8 C
565 (p6) cmp.leu p8, p9 = r14, r25 C
566 (p7) cmp.ltu p8, p9 = r14, r25 C
567 ;;
C Join point for n = 3; that path has already placed the high limb in r8.
568 .Lcj3:
569 .pred.rel "mutex", p8, p9
570 (p8) add r16 = r29, r26, 1 C
571 (p9) add r16 = r29, r26 C
572 ;;
573 .pred.rel "mutex", p8, p9
574 st8 [r20] = r16, 8 C
575 (p8) cmp.leu p6, p7 = r16, r26 C
576 (p9) cmp.ltu p6, p7 = r16, r26 C
577 ;;
C Join point for n = 2; that path has already placed the high limb in r8.
578 .Lcj2:
579 .pred.rel "mutex", p6, p7
580 (p6) add r14 = r30, r27, 1 C
581 (p7) add r14 = r30, r27 C
582 ;;
583 .pred.rel "mutex", p6, p7
C Final store; r20 is not post-incremented here.
584 st8 [r20] = r14 C
585 (p6) cmp.leu p8, p9 = r14, r27 C
586 (p7) cmp.ltu p8, p9 = r14, r27 C
587 ;;
C Add the final carry to the high limb in r8, restore ar.lc from r2, return.
588 (p8) add r8 = 1, r8 C M I
589 mov.i ar.lc = r2 C I0
590 br.ret.sptk.many b0 C B
591 EPILOGUE()
592 ASM_END()
593