1 dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 dnl result to a second limb vector.
3
4 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
5 dnl Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
13
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
18
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C cycles/limb
25 C Itanium: 3.0
26 C Itanium 2: 2.0
27
28 C TODO
29 C * Further optimize feed-in and wind-down code, both for speed and code size.
30 C * Handle low limb input and results specially, using a common stf8 in the
31 C epilogue.
32 C * Use 1 c/l carry propagation scheme in wind-down code.
33 C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
34 C * Work out final differences with mul_1.asm. That function is 300 bytes
35 C smaller than this due to better loop scheduling and thus simpler feed-in
36 C code.
37
38 C INPUT PARAMETERS
39 define(`rp', `r32') C result vector: limbs are loaded from it and the sums are stored back
40 define(`up', `r33') C source vector: limbs are only loaded from it
41 define(`n', `r34') C limb count, drives the n mod 4 dispatch and the loop counter
42 define(`vl', `r35') C single limb that every source limb is multiplied by
43
44 ASM_START()
C ---------------------------------------------------------------------------
C mpn_addmul_1
C
C Multiplies the n limbs at up by the single limb vl and adds the products
C into the n limbs at rp.  The most significant product half plus the final
C carry is left in r8 before each br.ret (presumably the return value
C register of the calling convention -- confirm against the ABI document).
C
C NOTE(review): the first limb of each vector is loaded unconditionally and
C the loop count is derived from n - 1, so n = 0 does not look supported;
C presumably callers guarantee n >= 1.
C
C Register use, as visible in the code below:
C   r2          copy of ar.lc taken at entry, written back before returning
C   r20         store pointer; starts at rp and is post-incremented by 8 on
C               every store except the very last one
C   r15         n - 1, only used to derive the loop count
C   f6          multiplier vl
C   f7, f8      first limb of up and first limb of rp
C   f32-f35     limbs loaded from up
C   f44-f47     limbs loaded from rp
C   f36-f39     xma.l results (low product halves), moved to r24-r27
C   f40-f43     xma.hu results (high product halves), moved to r28-r31
C   r14, r16    sums that are stored to the result vector
C   p6/p7 and p8/p9
C               complementary predicate pairs; p6 or p8 set means that the
C               previous addition produced a carry
C
C Each xma already includes the limb from rp as its addend, so the integer
C code only has to add the high half of one product to the low half of the
C next one and propagate the carry.
C
C Structure: entry code and dispatch on n mod 4, four feed-in sections
C (.Lb01, .Lb10, .Lb11, .Lb00), the main loop handling four limbs per
C iteration, and the wind-down code (.Le0 and .Lcj8 down to .Lcj2).
C ---------------------------------------------------------------------------
45 PROLOGUE(mpn_addmul_1)
46 .prologue
C Unwind annotation: ar.lc is kept in r2 for the duration of the function.
47 .save ar.lc, r2
48 .body
49
C With the 32-bit ABI both pointers go through addp4 and n is zero-extended
C from 32 bits before anything else uses them.
50 ifdef(`HAVE_ABI_32',
51 ` addp4 rp = 0, rp C M I
52 addp4 up = 0, up C M I
53 zxt4 n = n C I
54 ;;
55 ')
C r15 = n - 1, r20 = store pointer into the result vector, r2 = entry ar.lc.
56 {.mmi
57 adds r15 = -1, n C M I
58 mov r20 = rp C M I
59 mov.i r2 = ar.lc C I0
60 }
C Load the first limb of each vector, advancing both pointers by 8;
C r14 = n mod 4.
61 {.mmi
62 ldf8 f7 = [up], 8 C M
63 ldf8 f8 = [rp], 8 C M
64 and r14 = 3, n C M I
65 ;;
66 }
C f6 = vl for use by the xma instructions; p10 = (n mod 4 is 0);
C r31 = (n - 1) / 4, the value that is put into ar.lc below.
67 {.mmi
68 setf.sig f6 = vl C M2 M3
69 cmp.eq p10, p0 = 0, r14 C M I
70 shr.u r31 = r15, 2 C I0
71 }
C p11 = (n mod 4 is 2), p12 = (n mod 4 is 3).
72 {.mmi
73 cmp.eq p11, p0 = 2, r14 C M I
74 cmp.eq p12, p0 = 3, r14 C M I
75 nop.i 0 C I
76 ;;
77 }
C Comparing r0 with itself for inequality is always false, so p6 and p8 are
C cleared and p7 and p9 are set: no carry is pending.  ar.lc = (n - 1) / 4.
78 {.mii
79 cmp.ne p6, p7 = r0, r0 C M I
80 mov.i ar.lc = r31 C I0
81 cmp.ne p8, p9 = r0, r0 C M I
82 }
C Dispatch on n mod 4; the remaining case n mod 4 = 1 falls through to .Lb01.
83 {.bbb
84 (p10) br.dptk .Lb00 C B
85 (p11) br.dptk .Lb10 C B
86 (p12) br.dptk .Lb11 C B
87 ;;
88 }
89
C ---- Feed-in for n mod 4 = 1 ----
C br.cloop branches, decrementing ar.lc, only while ar.lc is nonzero.  The
C fall-through path is therefore taken when (n - 1) / 4 is 0, that is n = 1.
90 .Lb01: br.cloop.dptk .grt1 C B
91
C n = 1: store the low half of up[0] * vl + rp[0] and return the high half.
92 xma.l f39 = f7, f6, f8 C F
93 xma.hu f43 = f7, f6, f8 C F
94 ;;
95 getf.sig r8 = f43 C M2
96 stf8 [r20] = f39 C M2 M3
97 mov.i ar.lc = r2 C I0
98 br.ret.sptk.many b0 C B
99
C n >= 5: load the next four limbs of each vector while the first product
C is being formed.
100 .grt1:
101 ldf8 f32 = [up], 8
102 ldf8 f44 = [rp], 8
103 ;;
104 ldf8 f33 = [up], 8
105 ldf8 f45 = [rp], 8
106 ;;
107 ldf8 f34 = [up], 8
108 xma.l f39 = f7, f6, f8
109 ldf8 f46 = [rp], 8
110 xma.hu f43 = f7, f6, f8
111 ;;
112 ldf8 f35 = [up], 8
113 ldf8 f47 = [rp], 8
C Falls through when ar.lc has reached 0, which here means n = 5.
114 br.cloop.dptk .grt5
115
C n = 5: form the remaining four products.  The low half of the first
C product needs no addition and is stored directly; .Lcj5 adds up the rest.
116 xma.l f36 = f32, f6, f44
117 xma.hu f40 = f32, f6, f44
118 ;;
119 stf8 [r20] = f39, 8
120 xma.l f37 = f33, f6, f45
121 xma.hu f41 = f33, f6, f45
122 ;;
123 getf.sig r31 = f43
124 getf.sig r24 = f36
125 xma.l f38 = f34, f6, f46
126 xma.hu f42 = f34, f6, f46
127 ;;
128 getf.sig r28 = f40
129 getf.sig r25 = f37
130 xma.l f39 = f35, f6, f47
131 xma.hu f43 = f35, f6, f47
132 ;;
133 getf.sig r29 = f41
134 getf.sig r26 = f38
135 br .Lcj5
136
C n >= 9: r30 = 0 stands in for the nonexistent high half in the first
C addition r30 + r27 performed at .Loop or .Le0, so that the low half of the
C first product is stored unchanged.
137 .grt5:
138 mov r30 = 0
139 xma.l f36 = f32, f6, f44
140 xma.hu f40 = f32, f6, f44
141 ;;
142 ldf8 f32 = [up], 8
143 xma.l f37 = f33, f6, f45
144 ldf8 f44 = [rp], 8
145 xma.hu f41 = f33, f6, f45
146 ;;
147 ldf8 f33 = [up], 8
148 getf.sig r27 = f39
149 ;;
150 getf.sig r31 = f43
151 xma.l f38 = f34, f6, f46
152 ldf8 f45 = [rp], 8
153 xma.hu f42 = f34, f6, f46
154 ;;
155 ldf8 f34 = [up], 8
156 getf.sig r24 = f36
157 ;;
158 getf.sig r28 = f40
159 xma.l f39 = f35, f6, f47
160 ldf8 f46 = [rp], 8
161 xma.hu f43 = f35, f6, f47
162 ;;
163 ldf8 f35 = [up], 8
164 getf.sig r25 = f37
C Enter the main loop while ar.lc is nonzero, otherwise go straight to the
C wind-down code.
165 br.cloop.dptk .Loop
166 br .Le0
167
168
C ---- Feed-in for n mod 4 = 2 ----
169 .Lb10: ldf8 f35 = [up], 8
170 ldf8 f47 = [rp], 8
C Falls through when ar.lc is 0, which here means n = 2.
171 br.cloop.dptk .grt2
172
C n = 2: both products are formed here, the first low half is stored
C directly, r8 gets the last high half and .Lcj2 does the one addition.
173 xma.l f38 = f7, f6, f8
174 xma.hu f42 = f7, f6, f8
175 ;;
176 xma.l f39 = f35, f6, f47
177 xma.hu f43 = f35, f6, f47
178 ;;
179 getf.sig r30 = f42
180 stf8 [r20] = f38, 8
181 getf.sig r27 = f39
182 getf.sig r8 = f43
183 br .Lcj2
184
C n >= 6: load four more limbs of each vector and start the first two
C products.
185 .grt2:
186 ldf8 f32 = [up], 8
187 ldf8 f44 = [rp], 8
188 ;;
189 ldf8 f33 = [up], 8
190 xma.l f38 = f7, f6, f8
191 ldf8 f45 = [rp], 8
192 xma.hu f42 = f7, f6, f8
193 ;;
194 ldf8 f34 = [up], 8
195 xma.l f39 = f35, f6, f47
196 ldf8 f46 = [rp], 8
197 xma.hu f43 = f35, f6, f47
198 ;;
199 ldf8 f35 = [up], 8
200 ldf8 f47 = [rp], 8
C Falls through when ar.lc has reached 0, which here means n = 6.
201 br.cloop.dptk .grt6
202
C n = 6: store the first low half directly, form the remaining products and
C continue in the wind-down code at .Lcj6.
203 stf8 [r20] = f38, 8
204 xma.l f36 = f32, f6, f44
205 xma.hu f40 = f32, f6, f44
206 ;;
207 getf.sig r30 = f42
208 getf.sig r27 = f39
209 xma.l f37 = f33, f6, f45
210 xma.hu f41 = f33, f6, f45
211 ;;
212 getf.sig r31 = f43
213 getf.sig r24 = f36
214 xma.l f38 = f34, f6, f46
215 xma.hu f42 = f34, f6, f46
216 ;;
217 getf.sig r28 = f40
218 getf.sig r25 = f37
219 xma.l f39 = f35, f6, f47
220 xma.hu f43 = f35, f6, f47
221 br .Lcj6
222
C n >= 10: r29 = 0 stands in for the nonexistent high half in the first
C addition r29 + r26 performed at .LL10.
223 .grt6:
224 mov r29 = 0
225 xma.l f36 = f32, f6, f44
226 xma.hu f40 = f32, f6, f44
227 ;;
228 ldf8 f32 = [up], 8
229 getf.sig r26 = f38
230 ;;
231 getf.sig r30 = f42
232 xma.l f37 = f33, f6, f45
233 ldf8 f44 = [rp], 8
234 xma.hu f41 = f33, f6, f45
235 ;;
236 ldf8 f33 = [up], 8
237 getf.sig r27 = f39
238 ;;
239 getf.sig r31 = f43
240 xma.l f38 = f34, f6, f46
241 ldf8 f45 = [rp], 8
242 xma.hu f42 = f34, f6, f46
243 ;;
244 ldf8 f34 = [up], 8
245 getf.sig r24 = f36
C Join the main loop at its last stage.
246 br .LL10
247
248
C ---- Feed-in for n mod 4 = 3 ----
249 .Lb11: ldf8 f34 = [up], 8
250 ldf8 f46 = [rp], 8
251 ;;
252 ldf8 f35 = [up], 8
253 ldf8 f47 = [rp], 8
C Falls through when ar.lc is 0, which here means n = 3.
254 br.cloop.dptk .grt3
255 ;;
256
C n = 3: all three products are formed here, the first low half is stored
C directly, r8 gets the last high half and .Lcj3 does the two additions.
257 xma.l f37 = f7, f6, f8
258 xma.hu f41 = f7, f6, f8
259 xma.l f38 = f34, f6, f46
260 xma.hu f42 = f34, f6, f46
261 xma.l f39 = f35, f6, f47
262 xma.hu f43 = f35, f6, f47
263 ;;
264 getf.sig r29 = f41
265 stf8 [r20] = f37, 8
266 getf.sig r26 = f38
267 getf.sig r30 = f42
268 getf.sig r27 = f39
269 getf.sig r8 = f43
270 br .Lcj3
271
C n >= 7: load four more limbs of each vector and start the first three
C products.
272 .grt3:
273 ldf8 f32 = [up], 8
274 xma.l f37 = f7, f6, f8
275 ldf8 f44 = [rp], 8
276 xma.hu f41 = f7, f6, f8
277 ;;
278 ldf8 f33 = [up], 8
279 xma.l f38 = f34, f6, f46
280 ldf8 f45 = [rp], 8
281 xma.hu f42 = f34, f6, f46
282 ;;
283 ldf8 f34 = [up], 8
284 xma.l f39 = f35, f6, f47
285 ldf8 f46 = [rp], 8
286 xma.hu f43 = f35, f6, f47
287 ;;
288 ldf8 f35 = [up], 8
289 getf.sig r25 = f37 C FIXME
290 ldf8 f47 = [rp], 8
C Falls through when ar.lc has reached 0, which here means n = 7.
291 br.cloop.dptk .grt7
292
C n = 7: store the first low half directly, form the remaining products and
C continue in the wind-down code at .Lcj7.
293 getf.sig r29 = f41
294 stf8 [r20] = f37, 8 C FIXME
295 xma.l f36 = f32, f6, f44
296 getf.sig r26 = f38
297 xma.hu f40 = f32, f6, f44
298 ;;
299 getf.sig r30 = f42
300 xma.l f37 = f33, f6, f45
301 getf.sig r27 = f39
302 xma.hu f41 = f33, f6, f45
303 ;;
304 getf.sig r31 = f43
305 xma.l f38 = f34, f6, f46
306 getf.sig r24 = f36
307 xma.hu f42 = f34, f6, f46
308 br .Lcj7
309
C n >= 11: r28 = 0 stands in for the nonexistent high half in the first
C addition r28 + r25 performed at .LL11.
310 .grt7:
311 getf.sig r29 = f41
312 xma.l f36 = f32, f6, f44
313 mov r28 = 0
314 xma.hu f40 = f32, f6, f44
315 ;;
316 ldf8 f32 = [up], 8
317 getf.sig r26 = f38
318 ;;
319 getf.sig r30 = f42
320 xma.l f37 = f33, f6, f45
321 ldf8 f44 = [rp], 8
322 xma.hu f41 = f33, f6, f45
323 ;;
324 ldf8 f33 = [up], 8
325 getf.sig r27 = f39
C Join the main loop at its third stage.
326 br .LL11
327
328
C ---- Feed-in for n mod 4 = 0 ----
329 .Lb00: ldf8 f33 = [up], 8
330 ldf8 f45 = [rp], 8
331 ;;
332 ldf8 f34 = [up], 8
333 ldf8 f46 = [rp], 8
334 ;;
335 ldf8 f35 = [up], 8
336 xma.l f36 = f7, f6, f8
337 ldf8 f47 = [rp], 8
338 xma.hu f40 = f7, f6, f8
C Falls through when ar.lc is 0, which here means n = 4.
339 br.cloop.dptk .grt4
340
C n = 4: form the remaining three products, store the first low half
C directly and continue in the wind-down code at .Lcj4.
341 xma.l f37 = f33, f6, f45
342 xma.hu f41 = f33, f6, f45
343 xma.l f38 = f34, f6, f46
344 xma.hu f42 = f34, f6, f46
345 ;;
346 getf.sig r28 = f40
347 stf8 [r20] = f36, 8
348 xma.l f39 = f35, f6, f47
349 getf.sig r25 = f37
350 xma.hu f43 = f35, f6, f47
351 ;;
352 getf.sig r29 = f41
353 getf.sig r26 = f38
354 getf.sig r30 = f42
355 getf.sig r27 = f39
356 br .Lcj4
357
C n >= 8: load four more limbs of each vector and form the products of
C limbs 1 to 3.
358 .grt4:
359 ldf8 f32 = [up], 8
360 xma.l f37 = f33, f6, f45
361 ldf8 f44 = [rp], 8
362 xma.hu f41 = f33, f6, f45
363 ;;
364 ldf8 f33 = [up], 8
365 xma.l f38 = f34, f6, f46
366 ldf8 f45 = [rp], 8
367 xma.hu f42 = f34, f6, f46
368 ;;
369 ldf8 f34 = [up], 8
370 getf.sig r24 = f36 C FIXME
371 xma.l f39 = f35, f6, f47
372 ldf8 f46 = [rp], 8
373 getf.sig r28 = f40
374 xma.hu f43 = f35, f6, f47
375 ;;
376 ldf8 f35 = [up], 8
377 getf.sig r25 = f37
378 ldf8 f47 = [rp], 8
C Falls through when ar.lc has reached 0, which here means n = 8.
379 br.cloop.dptk .grt8
380
C n = 8: store the first low half directly, form two more products and
C continue in the wind-down code at .Lcj8.
381 getf.sig r29 = f41
382 stf8 [r20] = f36, 8 C FIXME
383 xma.l f36 = f32, f6, f44
384 getf.sig r26 = f38
385 getf.sig r30 = f42
386 xma.hu f40 = f32, f6, f44
387 ;;
388 xma.l f37 = f33, f6, f45
389 getf.sig r27 = f39
390 xma.hu f41 = f33, f6, f45
391 br .Lcj8
392
C n >= 12: r31 = 0 stands in for the nonexistent high half in the first
C addition r31 + r24 performed at .LL00.
393 .grt8:
394 getf.sig r29 = f41
395 xma.l f36 = f32, f6, f44
396 mov r31 = 0
397 xma.hu f40 = f32, f6, f44
398 ;;
399 ldf8 f32 = [up], 8
400 getf.sig r26 = f38
C Join the main loop at its second stage.
401 br .LL00
402
403
C The loop body consists of four stages of two instruction groups each, one
C stage per result limb.  In every stage:
C   - the first group adds the high half of the previous product to the low
C     half of the current one, plus 1 when the incoming carry predicate is
C     set, and starts the xma pair for a limb loaded earlier;
C   - the second group derives the outgoing carry predicates, stores the
C     sum through r20 and loads the next limb from up.
C Carry detection: with a carry coming in, the addition wrapped when the sum
C is lower than or equal to the addend (cmp.leu); without one, when the sum
C is strictly lower (cmp.ltu).
C The stages alternate between r14 with p6/p7 as incoming carry and r16 with
C p8/p9 as incoming carry.  The .pred.rel mutex annotations tell the
C assembler that the two predicates of a pair are never both set, so the two
C predicated writes to one register within a group do not conflict.
C .LL00, .LL11 and .LL10 are the entry points used by the feed-in code above.
404 C *** MAIN LOOP START ***
405 ALIGN(32) C insn fed cycle #
406 .Loop:
407 .pred.rel "mutex", p6, p7 C num by i1 i2
408 getf.sig r29 = f41 C 00 16 0 0
409 xma.l f36 = f32, f6, f44 C 01 06,15 0 0
410 (p6) add r14 = r30, r27, 1 C 02 0 0
411 ldf8 f47 = [rp], 8 C 03 0 0
412 xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
413 (p7) add r14 = r30, r27 C 05 0 0
414 ;;
415 .pred.rel "mutex", p6, p7
416 ldf8 f32 = [up], 8 C 06 1 1
417 (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
418 (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
419 getf.sig r26 = f38 C 09 25 2 1
420 st8 [r20] = r14, 8 C 10 2 1
421 nop.b 0 C 11 2 1
422 ;;
C Entry point from .grt8 (n mod 4 = 0).
423 .LL00:
424 .pred.rel "mutex", p8, p9
425 getf.sig r30 = f42 C 12 28 3 2
426 xma.l f37 = f33, f6, f45 C 13 18,27 3 2
427 (p8) add r16 = r31, r24, 1 C 14 3 2
428 ldf8 f44 = [rp], 8 C 15 3 2
429 xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
430 (p9) add r16 = r31, r24 C 17 3 2
431 ;;
432 .pred.rel "mutex", p8, p9
433 ldf8 f33 = [up], 8 C 18 4 3
434 (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
435 (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
436 getf.sig r27 = f39 C 21 37 5 3
437 st8 [r20] = r16, 8 C 22 5 3
438 nop.b 0 C 23 5 3
439 ;;
C Entry point from .grt7 (n mod 4 = 3).
440 .LL11:
441 .pred.rel "mutex", p6, p7
442 getf.sig r31 = f43 C 24 40 6 4
443 xma.l f38 = f34, f6, f46 C 25 30,39 6 4
444 (p6) add r14 = r28, r25, 1 C 26 6 4
445 ldf8 f45 = [rp], 8 C 27 6 4
446 xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
447 (p7) add r14 = r28, r25 C 29 6 4
448 ;;
449 .pred.rel "mutex", p6, p7
450 ldf8 f34 = [up], 8 C 30 7 5
451 (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
452 (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
453 getf.sig r24 = f36 C 33 01 8 5
454 st8 [r20] = r14, 8 C 34 8 5
455 nop.b 0 C 35 8 5
456 ;;
C Entry point from .grt6 (n mod 4 = 2).
457 .LL10:
458 .pred.rel "mutex", p8, p9
459 getf.sig r28 = f40 C 36 04 9 6
460 xma.l f39 = f35, f6, f47 C 37 42,03 9 6
461 (p8) add r16 = r29, r26, 1 C 38 9 6
462 ldf8 f46 = [rp], 8 C 39 9 6
463 xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
464 (p9) add r16 = r29, r26 C 41 9 6
465 ;;
466 .pred.rel "mutex", p8, p9
467 ldf8 f35 = [up], 8 C 42 10 7
468 (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
469 (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
470 getf.sig r25 = f37 C 45 13 11 7
471 st8 [r20] = r16, 8 C 46 11 7
472 br.cloop.dptk .Loop C 47 11 7
473 C *** MAIN LOOP END ***
474 ;;
C ---- Wind-down ----
C Same add, carry and store stages as in the loop, but with no further loads
C from up; the only remaining load is the last limb of rp.  .Le0 is reached
C by falling out of the loop or directly from .grt5.  Each .LcjK label below
C is the entry point used by the feed-in code when n is exactly K, and is
C also passed through by every longer operand.
475 .Le0:
476 .pred.rel "mutex", p6, p7
477 getf.sig r29 = f41 C
478 xma.l f36 = f32, f6, f44 C
479 (p6) add r14 = r30, r27, 1 C
480 ldf8 f47 = [rp], 8 C
481 xma.hu f40 = f32, f6, f44 C
482 (p7) add r14 = r30, r27 C
483 ;;
484 .pred.rel "mutex", p6, p7
485 (p6) cmp.leu p8, p9 = r14, r27 C
486 (p7) cmp.ltu p8, p9 = r14, r27 C
487 getf.sig r26 = f38 C
488 st8 [r20] = r14, 8 C
489 ;;
490 .pred.rel "mutex", p8, p9
491 getf.sig r30 = f42 C
492 xma.l f37 = f33, f6, f45 C
493 (p8) add r16 = r31, r24, 1 C
494 xma.hu f41 = f33, f6, f45 C
495 (p9) add r16 = r31, r24 C
496 ;;
497 .pred.rel "mutex", p8, p9
498 (p8) cmp.leu p6, p7 = r16, r24 C
499 (p9) cmp.ltu p6, p7 = r16, r24 C
500 getf.sig r27 = f39 C
501 st8 [r20] = r16, 8 C
502 ;;
C Entry for n = 8: two products are still to be formed.
503 .Lcj8:
504 .pred.rel "mutex", p6, p7
505 getf.sig r31 = f43 C
506 xma.l f38 = f34, f6, f46 C
507 (p6) add r14 = r28, r25, 1 C
508 xma.hu f42 = f34, f6, f46 C
509 (p7) add r14 = r28, r25 C
510 ;;
511 .pred.rel "mutex", p6, p7
512 (p6) cmp.leu p8, p9 = r14, r25 C
513 (p7) cmp.ltu p8, p9 = r14, r25 C
514 getf.sig r24 = f36 C
515 st8 [r20] = r14, 8 C
516 ;;
C Entry for n = 7: the last product is formed here.
517 .Lcj7:
518 .pred.rel "mutex", p8, p9
519 getf.sig r28 = f40 C
520 xma.l f39 = f35, f6, f47 C
521 (p8) add r16 = r29, r26, 1 C
522 xma.hu f43 = f35, f6, f47 C
523 (p9) add r16 = r29, r26 C
524 ;;
525 .pred.rel "mutex", p8, p9
526 (p8) cmp.leu p6, p7 = r16, r26 C
527 (p9) cmp.ltu p6, p7 = r16, r26 C
528 getf.sig r25 = f37 C
529 st8 [r20] = r16, 8 C
530 ;;
C Entry for n = 6: from here on only additions and stores remain.
531 .Lcj6:
532 .pred.rel "mutex", p6, p7
533 getf.sig r29 = f41 C
534 (p6) add r14 = r30, r27, 1 C
535 (p7) add r14 = r30, r27 C
536 ;;
537 .pred.rel "mutex", p6, p7
538 (p6) cmp.leu p8, p9 = r14, r27 C
539 (p7) cmp.ltu p8, p9 = r14, r27 C
540 getf.sig r26 = f38 C
541 st8 [r20] = r14, 8 C
542 ;;
C Entry for n = 5.
543 .Lcj5:
544 .pred.rel "mutex", p8, p9
545 getf.sig r30 = f42 C
546 (p8) add r16 = r31, r24, 1 C
547 (p9) add r16 = r31, r24 C
548 ;;
549 .pred.rel "mutex", p8, p9
550 (p8) cmp.leu p6, p7 = r16, r24 C
551 (p9) cmp.ltu p6, p7 = r16, r24 C
552 getf.sig r27 = f39 C
553 st8 [r20] = r16, 8 C
554 ;;
C Entry for n = 4.  r8 receives the high half of the last product; the
C n = 3 and n = 2 feed-in paths load r8 themselves before jumping in below.
555 .Lcj4:
556 .pred.rel "mutex", p6, p7
557 getf.sig r8 = f43 C
558 (p6) add r14 = r28, r25, 1 C
559 (p7) add r14 = r28, r25 C
560 ;;
561 .pred.rel "mutex", p6, p7
562 st8 [r20] = r14, 8 C
563 (p6) cmp.leu p8, p9 = r14, r25 C
564 (p7) cmp.ltu p8, p9 = r14, r25 C
565 ;;
C Entry for n = 3.
566 .Lcj3:
567 .pred.rel "mutex", p8, p9
568 (p8) add r16 = r29, r26, 1 C
569 (p9) add r16 = r29, r26 C
570 ;;
571 .pred.rel "mutex", p8, p9
572 st8 [r20] = r16, 8 C
573 (p8) cmp.leu p6, p7 = r16, r26 C
574 (p9) cmp.ltu p6, p7 = r16, r26 C
575 ;;
C Entry for n = 2: the final addition.  The store below is the last one and
C does not advance r20.
576 .Lcj2:
577 .pred.rel "mutex", p6, p7
578 (p6) add r14 = r30, r27, 1 C
579 (p7) add r14 = r30, r27 C
580 ;;
581 .pred.rel "mutex", p6, p7
582 st8 [r20] = r14 C
583 (p6) cmp.leu p8, p9 = r14, r27 C
584 (p7) cmp.ltu p8, p9 = r14, r27 C
585 ;;
C Fold the carry out of the final addition into r8, put the entry value of
C ar.lc back and return.
586 (p8) add r8 = 1, r8 C M I
587 mov.i ar.lc = r2 C I0
588 br.ret.sptk.many b0 C B
589 EPILOGUE()
590 ASM_END()
591