mul_2.asm revision 1.1.1.1.2.1 1 dnl IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and
2 dnl store the low n+1 limbs of the result; the most significant limb is returned.
3
4 dnl Contributed to the GNU project by Torbjorn Granlund.
5
6 dnl Copyright 2004, 2011 Free Software Foundation, Inc.
7
8 dnl This file is part of the GNU MP Library.
9
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of the GNU Lesser General Public License as published
12 dnl by the Free Software Foundation; either version 3 of the License, or (at
13 dnl your option) any later version.
14
15 dnl The GNU MP Library is distributed in the hope that it will be useful, but
16 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
18 dnl License for more details.
19
20 dnl You should have received a copy of the GNU Lesser General Public License
21 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22
23 include(`../config.m4')
24
25 C cycles/limb
26 C Itanium: ?
27 C Itanium 2: 1.5
28
29 C TODO
30 C * Clean up variable names, and try to decrease the number of distinct
31 C registers used.
32 C * Clean up feed-in code to not require zeroing several registers.
33 C * Make sure we don't depend on uninitialized predicate registers.
34 C * Could perhaps save a few cycles by using 1 c/l carry propagation in
35 C wind-down code.
36 C * Ultimately rewrite. The problem with this code is that it first uses a
37 C loaded u value in one xma pair, then leaves it live over several unrelated
38 C xma pairs, before it uses it again. It should actually be quite possible
39 C to just swap some aligned xma pairs around. But we should then schedule
40 C u loads further from the first use.
41
42 C INPUT PARAMETERS
43 define(`rp',`r32')
44 define(`up',`r33')
45 define(`n',`r34')
46 define(`vp',`r35')
47
48 define(`srp',`r3')
49
50 define(`v0',`f6')
51 define(`v1',`f7')
52
53 define(`s0',`r14')
54 define(`acc0',`r15')
55
56 define(`pr0_0',`r16') define(`pr0_1',`r17')
57 define(`pr0_2',`r18') define(`pr0_3',`r19')
58
59 define(`pr1_0',`r20') define(`pr1_1',`r21')
60 define(`pr1_2',`r22') define(`pr1_3',`r23')
61
62 define(`acc1_0',`r24') define(`acc1_1',`r25')
63 define(`acc1_2',`r26') define(`acc1_3',`r27')
64
65 dnl define(`',`r28')
66 dnl define(`',`r29')
67 dnl define(`',`r30')
68 dnl define(`',`r31')
69
70 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
71 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
72
73 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
74 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
75
76 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
77 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
78
79 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
80 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
81
82 define(`u_0',`f44') define(`u_1',`f45')
83 define(`u_2',`f46') define(`u_3',`f47')
84
85 define(`ux',`f49')
86 define(`uy',`f51')
87
88 ASM_START()
89 PROLOGUE(mpn_mul_2)
90 .prologue
91 .save ar.lc, r2
92 .body
93
94 ifdef(`HAVE_ABI_32',`
95 .mmi; addp4 rp = 0, rp C M I
96 addp4 up = 0, up C M I
97 addp4 vp = 0, vp C M I
98 .mmi; nop 1
99 nop 1
100 zxt4 n = n C I
101 ;;')
102
103 .mmi; ldf8 ux = [up], 8 C M
104 ldf8 v0 = [vp], 8 C M
105 mov r2 = ar.lc C I0
106 .mmi; nop 1 C M
107 and r14 = 3, n C M I
108 add n = -2, n C M I
109 ;;
110 .mmi; ldf8 uy = [up], 8 C M
111 ldf8 v1 = [vp] C M
112 shr.u n = n, 2 C I
113 .mmi; nop 1 C M
114 cmp.eq p10, p0 = 1, r14 C M I
115 cmp.eq p11, p0 = 2, r14 C M I
116 ;;
117 .mmi; nop 1 C M
118 cmp.eq p12, p0 = 3, r14 C M I
119 mov ar.lc = n C I0
120 .bbb; (p10) br.dptk L(b01) C B
121 (p11) br.dptk L(b10) C B
122 (p12) br.dptk L(b11) C B
123 ;;
124
125 ALIGN(32)
126 L(b00): ldf8 u_1 = [up], 8
127 mov acc1_2 = 0
128 mov pr1_2 = 0
129 mov pr0_3 = 0
130 cmp.ne p8, p9 = r0, r0
131 ;;
132 xma.l fp0b_3 = ux, v0, f0
133 cmp.ne p12, p13 = r0, r0
134 ldf8 u_2 = [up], 8
135 xma.hu fp1a_3 = ux, v0, f0
136 br.cloop.dptk L(gt4)
137
138 xma.l fp0b_0 = uy, v0, f0
139 xma.hu fp1a_0 = uy, v0, f0
140 ;;
141 getfsig acc0 = fp0b_3
142 xma.l fp1b_3 = ux, v1, fp1a_3
143 xma.hu fp2a_3 = ux, v1, fp1a_3
144 ;;
145 xma.l fp0b_1 = u_1, v0, f0
146 xma.hu fp1a_1 = u_1, v0, f0
147 ;;
148 getfsig pr0_0 = fp0b_0
149 xma.l fp1b_0 = uy, v1, fp1a_0
150 xma.hu fp2a_0 = uy, v1, fp1a_0
151 ;;
152 getfsig pr1_3 = fp1b_3
153 getfsig acc1_3 = fp2a_3
154 xma.l fp0b_2 = u_2, v0, f0
155 xma.hu fp1a_2 = u_2, v0, f0
156 br L(cj4)
157
158 L(gt4): xma.l fp0b_0 = uy, v0, f0
159 xma.hu fp1a_0 = uy, v0, f0
160 ;;
161 getfsig acc0 = fp0b_3
162 xma.l fp1b_3 = ux, v1, fp1a_3
163 ldf8 u_3 = [up], 8
164 xma.hu fp2a_3 = ux, v1, fp1a_3
165 ;;
166 xma.l fp0b_1 = u_1, v0, f0
167 xma.hu fp1a_1 = u_1, v0, f0
168 ;;
169 getfsig pr0_0 = fp0b_0
170 xma.l fp1b_0 = uy, v1, fp1a_0
171 xma.hu fp2a_0 = uy, v1, fp1a_0
172 ;;
173 ldf8 u_0 = [up], 8
174 getfsig pr1_3 = fp1b_3
175 xma.l fp0b_2 = u_2, v0, f0
176 ;;
177 getfsig acc1_3 = fp2a_3
178 xma.hu fp1a_2 = u_2, v0, f0
179 br L(00)
180
181
182 ALIGN(32)
183 L(b01): ldf8 u_0 = [up], 8 C M
184 mov acc1_1 = 0 C M I
185 mov pr1_1 = 0 C M I
186 mov pr0_2 = 0 C M I
187 cmp.ne p6, p7 = r0, r0 C M I
188 ;;
189 xma.l fp0b_2 = ux, v0, f0 C F
190 cmp.ne p10, p11 = r0, r0 C M I
191 ldf8 u_1 = [up], 8 C M
192 xma.hu fp1a_2 = ux, v0, f0 C F
193 ;;
194 xma.l fp0b_3 = uy, v0, f0 C F
195 xma.hu fp1a_3 = uy, v0, f0 C F
196 ;;
197 getfsig acc0 = fp0b_2 C M
198 xma.l fp1b_2 = ux, v1,fp1a_2 C F
199 ldf8 u_2 = [up], 8 C M
200 xma.hu fp2a_2 = ux, v1,fp1a_2 C F
201 br.cloop.dptk L(gt5)
202
203 xma.l fp0b_0 = u_0, v0, f0 C F
204 xma.hu fp1a_0 = u_0, v0, f0 C F
205 ;;
206 getfsig pr0_3 = fp0b_3 C M
207 xma.l fp1b_3 = uy, v1,fp1a_3 C F
208 xma.hu fp2a_3 = uy, v1,fp1a_3 C F
209 ;;
210 getfsig pr1_2 = fp1b_2 C M
211 getfsig acc1_2 = fp2a_2 C M
212 xma.l fp0b_1 = u_1, v0, f0 C F
213 xma.hu fp1a_1 = u_1, v0, f0 C F
214 br L(cj5)
215
216 L(gt5): xma.l fp0b_0 = u_0, v0, f0
217 xma.hu fp1a_0 = u_0, v0, f0
218 ;;
219 getfsig pr0_3 = fp0b_3
220 xma.l fp1b_3 = uy, v1, fp1a_3
221 xma.hu fp2a_3 = uy, v1, fp1a_3
222 ;;
223 ldf8 u_3 = [up], 8
224 getfsig pr1_2 = fp1b_2
225 xma.l fp0b_1 = u_1, v0, f0
226 ;;
227 getfsig acc1_2 = fp2a_2
228 xma.hu fp1a_1 = u_1, v0, f0
229 br L(01)
230
231
232 ALIGN(32)
233 L(b10): br.cloop.dptk L(gt2)
234 xma.l fp0b_1 = ux, v0, f0
235 xma.hu fp1a_1 = ux, v0, f0
236 ;;
237 xma.l fp0b_2 = uy, v0, f0
238 xma.hu fp1a_2 = uy, v0, f0
239 ;;
240 stf8 [rp] = fp0b_1, 8
241 xma.l fp1b_1 = ux, v1, fp1a_1
242 xma.hu fp2a_1 = ux, v1, fp1a_1
243 ;;
244 getfsig acc0 = fp0b_2
245 xma.l fp1b_2 = uy, v1, fp1a_2
246 xma.hu fp2a_2 = uy, v1, fp1a_2
247 ;;
248 getfsig pr1_1 = fp1b_1
249 getfsig acc1_1 = fp2a_1
250 mov ar.lc = r2
251 getfsig pr1_2 = fp1b_2
252 getfsig r8 = fp2a_2
253 ;;
254 add s0 = pr1_1, acc0
255 ;;
256 st8 [rp] = s0, 8
257 cmp.ltu p8, p9 = s0, pr1_1
258 sub r31 = -1, acc1_1
259 ;;
260 .pred.rel "mutex", p8, p9
261 (p8) add acc0 = pr1_2, acc1_1, 1
262 (p9) add acc0 = pr1_2, acc1_1
263 (p8) cmp.leu p10, p0 = r31, pr1_2
264 (p9) cmp.ltu p10, p0 = r31, pr1_2
265 ;;
266 st8 [rp] = acc0, 8
267 (p10) add r8 = 1, r8
268 br.ret.sptk.many b0
269
270 L(gt2): ldf8 u_3 = [up], 8
271 mov acc1_0 = 0
272 mov pr1_0 = 0
273 ;;
274 mov pr0_1 = 0
275 xma.l fp0b_1 = ux, v0, f0
276 ldf8 u_0 = [up], 8
277 xma.hu fp1a_1 = ux, v0, f0
278 ;;
279 xma.l fp0b_2 = uy, v0, f0
280 xma.hu fp1a_2 = uy, v0, f0
281 ;;
282 getfsig acc0 = fp0b_1
283 xma.l fp1b_1 = ux, v1, fp1a_1
284 xma.hu fp2a_1 = ux, v1, fp1a_1
285 ;;
286 ldf8 u_1 = [up], 8
287 xma.l fp0b_3 = u_3, v0, f0
288 xma.hu fp1a_3 = u_3, v0, f0
289 ;;
290 getfsig pr0_2 = fp0b_2
291 xma.l fp1b_2 = uy, v1, fp1a_2
292 xma.hu fp2a_2 = uy, v1, fp1a_2
293 ;;
294 ldf8 u_2 = [up], 8
295 getfsig pr1_1 = fp1b_1
296 ;;
297 .mfi; getfsig acc1_1 = fp2a_1
298 xma.l fp0b_0 = u_0, v0, f0
299 cmp.ne p8, p9 = r0, r0
300 .mfb; cmp.ne p12, p13 = r0, r0
301 xma.hu fp1a_0 = u_0, v0, f0
302 br L(10)
303
304
305 ALIGN(32)
306 L(b11): mov acc1_3 = 0
307 mov pr1_3 = 0
308 mov pr0_0 = 0
309 ldf8 u_2 = [up], 8
310 cmp.ne p6, p7 = r0, r0
311 br.cloop.dptk L(gt3)
312 ;;
313 xma.l fp0b_0 = ux, v0, f0
314 xma.hu fp1a_0 = ux, v0, f0
315 ;;
316 cmp.ne p10, p11 = r0, r0
317 xma.l fp0b_1 = uy, v0, f0
318 xma.hu fp1a_1 = uy, v0, f0
319 ;;
320 getfsig acc0 = fp0b_0
321 xma.l fp1b_0 = ux, v1, fp1a_0
322 xma.hu fp2a_0 = ux, v1, fp1a_0
323 ;;
324 xma.l fp0b_2 = u_2, v0, f0
325 xma.hu fp1a_2 = u_2, v0, f0
326 ;;
327 getfsig pr0_1 = fp0b_1
328 xma.l fp1b_1 = uy, v1, fp1a_1
329 xma.hu fp2a_1 = uy, v1, fp1a_1
330 ;;
331 getfsig pr1_0 = fp1b_0
332 getfsig acc1_0 = fp2a_0
333 br L(cj3)
334
335 L(gt3): xma.l fp0b_0 = ux, v0, f0
336 cmp.ne p10, p11 = r0, r0
337 ldf8 u_3 = [up], 8
338 xma.hu fp1a_0 = ux, v0, f0
339 ;;
340 xma.l fp0b_1 = uy, v0, f0
341 xma.hu fp1a_1 = uy, v0, f0
342 ;;
343 getfsig acc0 = fp0b_0
344 xma.l fp1b_0 = ux, v1, fp1a_0
345 ldf8 u_0 = [up], 8
346 xma.hu fp2a_0 = ux, v1, fp1a_0
347 ;;
348 xma.l fp0b_2 = u_2, v0, f0
349 xma.hu fp1a_2 = u_2, v0, f0
350 ;;
351 getfsig pr0_1 = fp0b_1
352 xma.l fp1b_1 = uy, v1, fp1a_1
353 xma.hu fp2a_1 = uy, v1, fp1a_1
354 ;;
355 ldf8 u_1 = [up], 8
356 getfsig pr1_0 = fp1b_0
357 ;;
358 getfsig acc1_0 = fp2a_0
359 xma.l fp0b_3 = u_3, v0, f0
360 xma.hu fp1a_3 = u_3, v0, f0
361 br L(11)
362
363
364 C *** MAIN LOOP START ***
365 ALIGN(32)
366 L(top): C 00
367 .pred.rel "mutex", p8, p9
368 .pred.rel "mutex", p12, p13
369 ldf8 u_3 = [up], 8
370 getfsig pr1_2 = fp1b_2
371 (p8) cmp.leu p6, p7 = acc0, pr0_1
372 (p9) cmp.ltu p6, p7 = acc0, pr0_1
373 (p12) cmp.leu p10, p11 = s0, pr1_0
374 (p13) cmp.ltu p10, p11 = s0, pr1_0
375 ;; C 01
376 .pred.rel "mutex", p6, p7
377 getfsig acc1_2 = fp2a_2
378 st8 [rp] = s0, 8
379 xma.l fp0b_1 = u_1, v0, f0
380 (p6) add acc0 = pr0_2, acc1_0, 1
381 (p7) add acc0 = pr0_2, acc1_0
382 xma.hu fp1a_1 = u_1, v0, f0
383 ;; C 02
384 L(01):
385 .pred.rel "mutex", p10, p11
386 getfsig pr0_0 = fp0b_0
387 xma.l fp1b_0 = u_0, v1, fp1a_0
388 (p10) add s0 = pr1_1, acc0, 1
389 (p11) add s0 = pr1_1, acc0
390 xma.hu fp2a_0 = u_0, v1, fp1a_0
391 nop 1
392 ;; C 03
393 .pred.rel "mutex", p6, p7
394 .pred.rel "mutex", p10, p11
395 ldf8 u_0 = [up], 8
396 getfsig pr1_3 = fp1b_3
397 (p6) cmp.leu p8, p9 = acc0, pr0_2
398 (p7) cmp.ltu p8, p9 = acc0, pr0_2
399 (p10) cmp.leu p12, p13 = s0, pr1_1
400 (p11) cmp.ltu p12, p13 = s0, pr1_1
401 ;; C 04
402 .pred.rel "mutex", p8, p9
403 getfsig acc1_3 = fp2a_3
404 st8 [rp] = s0, 8
405 xma.l fp0b_2 = u_2, v0, f0
406 (p8) add acc0 = pr0_3, acc1_1, 1
407 (p9) add acc0 = pr0_3, acc1_1
408 xma.hu fp1a_2 = u_2, v0, f0
409 ;; C 05
410 L(00):
411 .pred.rel "mutex", p12, p13
412 getfsig pr0_1 = fp0b_1
413 xma.l fp1b_1 = u_1, v1, fp1a_1
414 (p12) add s0 = pr1_2, acc0, 1
415 (p13) add s0 = pr1_2, acc0
416 xma.hu fp2a_1 = u_1, v1, fp1a_1
417 nop 1
418 ;; C 06
419 .pred.rel "mutex", p8, p9
420 .pred.rel "mutex", p12, p13
421 ldf8 u_1 = [up], 8
422 getfsig pr1_0 = fp1b_0
423 (p8) cmp.leu p6, p7 = acc0, pr0_3
424 (p9) cmp.ltu p6, p7 = acc0, pr0_3
425 (p12) cmp.leu p10, p11 = s0, pr1_2
426 (p13) cmp.ltu p10, p11 = s0, pr1_2
427 ;; C 07
428 .pred.rel "mutex", p6, p7
429 getfsig acc1_0 = fp2a_0
430 st8 [rp] = s0, 8
431 xma.l fp0b_3 = u_3, v0, f0
432 (p6) add acc0 = pr0_0, acc1_2, 1
433 (p7) add acc0 = pr0_0, acc1_2
434 xma.hu fp1a_3 = u_3, v0, f0
435 ;; C 08
436 L(11):
437 .pred.rel "mutex", p10, p11
438 getfsig pr0_2 = fp0b_2
439 xma.l fp1b_2 = u_2, v1, fp1a_2
440 (p10) add s0 = pr1_3, acc0, 1
441 (p11) add s0 = pr1_3, acc0
442 xma.hu fp2a_2 = u_2, v1, fp1a_2
443 nop 1
444 ;; C 09
445 .pred.rel "mutex", p6, p7
446 .pred.rel "mutex", p10, p11
447 ldf8 u_2 = [up], 8
448 getfsig pr1_1 = fp1b_1
449 (p6) cmp.leu p8, p9 = acc0, pr0_0
450 (p7) cmp.ltu p8, p9 = acc0, pr0_0
451 (p10) cmp.leu p12, p13 = s0, pr1_3
452 (p11) cmp.ltu p12, p13 = s0, pr1_3
453 ;; C 10
454 .pred.rel "mutex", p8, p9
455 getfsig acc1_1 = fp2a_1
456 st8 [rp] = s0, 8
457 xma.l fp0b_0 = u_0, v0, f0
458 (p8) add acc0 = pr0_1, acc1_3, 1
459 (p9) add acc0 = pr0_1, acc1_3
460 xma.hu fp1a_0 = u_0, v0, f0
461 ;; C 11
462 L(10):
463 .pred.rel "mutex", p12, p13
464 getfsig pr0_3 = fp0b_3
465 xma.l fp1b_3 = u_3, v1, fp1a_3
466 (p12) add s0 = pr1_0, acc0, 1
467 (p13) add s0 = pr1_0, acc0
468 xma.hu fp2a_3 = u_3, v1, fp1a_3
469 br.cloop.dptk L(top)
470 ;;
471 C *** MAIN LOOP END ***
472
473 .pred.rel "mutex", p8, p9
474 .pred.rel "mutex", p12, p13
475 .mmi; getfsig pr1_2 = fp1b_2
476 st8 [rp] = s0, 8
477 (p8) cmp.leu p6, p7 = acc0, pr0_1
478 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
479 (p12) cmp.leu p10, p11 = s0, pr1_0
480 (p13) cmp.ltu p10, p11 = s0, pr1_0
481 ;;
482 .pred.rel "mutex", p6, p7
483 .mfi; getfsig acc1_2 = fp2a_2
484 xma.l fp0b_1 = u_1, v0, f0
485 nop 1
486 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1
487 (p7) add acc0 = pr0_2, acc1_0
488 xma.hu fp1a_1 = u_1, v0, f0
489 ;;
490 L(cj5):
491 .pred.rel "mutex", p10, p11
492 .mfi; getfsig pr0_0 = fp0b_0
493 xma.l fp1b_0 = u_0, v1, fp1a_0
494 (p10) add s0 = pr1_1, acc0, 1
495 .mfi; (p11) add s0 = pr1_1, acc0
496 xma.hu fp2a_0 = u_0, v1, fp1a_0
497 nop 1
498 ;;
499 .pred.rel "mutex", p6, p7
500 .pred.rel "mutex", p10, p11
501 .mmi; getfsig pr1_3 = fp1b_3
502 st8 [rp] = s0, 8
503 (p6) cmp.leu p8, p9 = acc0, pr0_2
504 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
505 (p10) cmp.leu p12, p13 = s0, pr1_1
506 (p11) cmp.ltu p12, p13 = s0, pr1_1
507 ;;
508 .pred.rel "mutex", p8, p9
509 .mfi; getfsig acc1_3 = fp2a_3
510 xma.l fp0b_2 = u_2, v0, f0
511 nop 1
512 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1
513 (p9) add acc0 = pr0_3, acc1_1
514 xma.hu fp1a_2 = u_2, v0, f0
515 ;;
516 L(cj4):
517 .pred.rel "mutex", p12, p13
518 .mfi; getfsig pr0_1 = fp0b_1
519 xma.l fp1b_1 = u_1, v1, fp1a_1
520 (p12) add s0 = pr1_2, acc0, 1
521 .mfi; (p13) add s0 = pr1_2, acc0
522 xma.hu fp2a_1 = u_1, v1, fp1a_1
523 nop 1
524 ;;
525 .pred.rel "mutex", p8, p9
526 .pred.rel "mutex", p12, p13
527 .mmi; getfsig pr1_0 = fp1b_0
528 st8 [rp] = s0, 8
529 (p8) cmp.leu p6, p7 = acc0, pr0_3
530 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
531 (p12) cmp.leu p10, p11 = s0, pr1_2
532 (p13) cmp.ltu p10, p11 = s0, pr1_2
533 ;;
534 .pred.rel "mutex", p6, p7
535 .mmi; getfsig acc1_0 = fp2a_0
536 (p6) add acc0 = pr0_0, acc1_2, 1
537 (p7) add acc0 = pr0_0, acc1_2
538 ;;
539 L(cj3):
540 .pred.rel "mutex", p10, p11
541 .mfi; getfsig pr0_2 = fp0b_2
542 xma.l fp1b_2 = u_2, v1, fp1a_2
543 (p10) add s0 = pr1_3, acc0, 1
544 .mfi; (p11) add s0 = pr1_3, acc0
545 xma.hu fp2a_2 = u_2, v1, fp1a_2
546 nop 1
547 ;;
548 .pred.rel "mutex", p6, p7
549 .pred.rel "mutex", p10, p11
550 .mmi; getfsig pr1_1 = fp1b_1
551 st8 [rp] = s0, 8
552 (p6) cmp.leu p8, p9 = acc0, pr0_0
553 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
554 (p10) cmp.leu p12, p13 = s0, pr1_3
555 (p11) cmp.ltu p12, p13 = s0, pr1_3
556 ;;
557 .pred.rel "mutex", p8, p9
558 .mmi; getfsig acc1_1 = fp2a_1
559 (p8) add acc0 = pr0_1, acc1_3, 1
560 (p9) add acc0 = pr0_1, acc1_3
561 ;;
562 .pred.rel "mutex", p12, p13
563 .mmi; (p12) add s0 = pr1_0, acc0, 1
564 (p13) add s0 = pr1_0, acc0
565 nop 1
566 ;;
567 .pred.rel "mutex", p8, p9
568 .pred.rel "mutex", p12, p13
569 .mmi; getfsig pr1_2 = fp1b_2
570 st8 [rp] = s0, 8
571 (p8) cmp.leu p6, p7 = acc0, pr0_1
572 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
573 (p12) cmp.leu p10, p11 = s0, pr1_0
574 (p13) cmp.ltu p10, p11 = s0, pr1_0
575 ;;
576 .pred.rel "mutex", p6, p7
577 .mmi; getfsig r8 = fp2a_2
578 (p6) add acc0 = pr0_2, acc1_0, 1
579 (p7) add acc0 = pr0_2, acc1_0
580 ;;
581 .pred.rel "mutex", p10, p11
582 .mmi; (p10) add s0 = pr1_1, acc0, 1
583 (p11) add s0 = pr1_1, acc0
584 (p6) cmp.leu p8, p9 = acc0, pr0_2
585 ;;
586 .pred.rel "mutex", p10, p11
587 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
588 (p10) cmp.leu p12, p13 = s0, pr1_1
589 (p11) cmp.ltu p12, p13 = s0, pr1_1
590 ;;
591 .pred.rel "mutex", p8, p9
592 .mmi; st8 [rp] = s0, 8
593 (p8) add acc0 = pr1_2, acc1_1, 1
594 (p9) add acc0 = pr1_2, acc1_1
595 ;;
596 .pred.rel "mutex", p8, p9
597 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
598 (p9) cmp.ltu p10, p11 = acc0, pr1_2
599 (p12) add acc0 = 1, acc0
600 ;;
601 .mmi; st8 [rp] = acc0, 8
602 (p12) cmpeqor p10, p0 = 0, acc0
603 nop 1
604 ;;
605 .mib; (p10) add r8 = 1, r8
606 mov ar.lc = r2
607 br.ret.sptk.many b0
608 EPILOGUE()
609 ASM_END()
610