mul_2.asm revision 1.1 1 1.1 mrg dnl IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and
2 1.1 mrg dnl store the result to a (n+1)-limb number.
3 1.1 mrg
4 1.1 mrg dnl Copyright 2004 Free Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg dnl This file is part of the GNU MP Library.
7 1.1 mrg
8 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
10 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
11 1.1 mrg dnl your option) any later version.
12 1.1 mrg
13 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 1.1 mrg dnl License for more details.
17 1.1 mrg
18 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
19 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 1.1 mrg
21 1.1 mrg include(`../config.m4')
22 1.1 mrg
23 1.1 mrg C cycles/limb
24 1.1 mrg C Itanium: 3.15
25 1.1 mrg C Itanium 2: 1.625
26 1.1 mrg
27 1.1 mrg C Note that this is very similar to addmul_2.asm. If you change this file,
28 1.1 mrg C please change that file too.
29 1.1 mrg
30 1.1 mrg C TODO
31 1.1 mrg C * Clean up variable names, and try to decrease the number of distinct
32 1.1 mrg C registers used.
33 1.1 mrg C * Clean up feed-in code to not require zeroing several registers.
34 1.1 mrg C * Make sure we don't depend on uninitialized predicate registers.
35 1.1 mrg C * We currently cross-jump very aggressively, at the expense of a few cycles
36 1.1 mrg C per operation. Consider changing that.
37 1.1 mrg C * Could perhaps save a few cycles by using 1 c/l carry propagation in
38 1.1 mrg C wind-down code.
39 1.1 mrg C * Ultimately rewrite. The problem with this code is that it first uses a
40 1.1 mrg C loaded u value in one xma pair, then leaves it live over several unrelated
41 1.1 mrg C xma pairs, before it uses it again. It should actually be quite possible
42 1.1 mrg C to just swap some aligned xma pairs around. But we should then schedule
43 1.1 mrg C u loads further from the first use.
44 1.1 mrg
45 1.1 mrg C INPUT PARAMETERS
C Incoming arguments in r32-r35: result pointer, source pointer, source limb
C count, and pointer to the two multiplier limbs.
46 1.1 mrg define(`rp',`r32')
47 1.1 mrg define(`up',`r33')
48 1.1 mrg define(`n',`r34')
49 1.1 mrg define(`vp',`r35')
50 1.1 mrg
C srp is defined here but is not referenced anywhere else in this file.
51 1.1 mrg define(`srp',`r3')
52 1.1 mrg
C v0 and v1 hold the two limbs loaded from vp.
53 1.1 mrg define(`v0',`f6')
54 1.1 mrg define(`v1',`f7')
55 1.1 mrg
C s0 is the sum limb that is stored through rp; acc0 is the running partial
C sum feeding it.  r14 is also used under its raw name by the entry code to
C hold n mod 4 before s0 is first written.
56 1.1 mrg define(`s0',`r14')
57 1.1 mrg define(`acc0',`r15')
58 1.1 mrg
C Integer copies (made with getf.sig) of the xma results, one register per
C unroll slot 0-3: pr0_i = fp0b_i, pr1_i = fp1b_i, acc1_i = fp2a_i.
59 1.1 mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
60 1.1 mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
61 1.1 mrg
62 1.1 mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
63 1.1 mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
64 1.1 mrg
65 1.1 mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
66 1.1 mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
67 1.1 mrg
C r28-r31 have no alias; r31 is used directly in the n = 2 code.
68 1.1 mrg dnl define(`',`r28')
69 1.1 mrg dnl define(`',`r29')
70 1.1 mrg dnl define(`',`r30')
71 1.1 mrg dnl define(`',`r31')
72 1.1 mrg
C Floating-point product registers per unroll slot i (u_i is the source limb):
C   fp0b_i = low(u_i*v0)
C   fp1a_i = high(u_i*v0)
C   fp1b_i = low(u_i*v1 + fp1a_i)
C   fp2a_i = high(u_i*v1 + fp1a_i)
73 1.1 mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
74 1.1 mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
75 1.1 mrg
76 1.1 mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
77 1.1 mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
78 1.1 mrg
79 1.1 mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
80 1.1 mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
81 1.1 mrg
82 1.1 mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
83 1.1 mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
84 1.1 mrg
C u_0..u_3 are source limbs loaded by the feed-in code and the main loop.
85 1.1 mrg define(`u_0',`f44') define(`u_1',`f45')
86 1.1 mrg define(`u_2',`f46') define(`u_3',`f47')
87 1.1 mrg
C ux and uy are the first two source limbs, loaded at function entry.
88 1.1 mrg define(`ux',`f49')
89 1.1 mrg define(`uy',`f51')
90 1.1 mrg
C mpn_mul_2 entry: load the first operand limbs, split n into an unroll
C residue (n mod 4) and a loop count ((n-2) >> 2), then dispatch to the
C feed-in block matching the residue.
C NOTE(review): n = 1 is dispatched to .Lb01 with a loop count derived from
C n-2 = -1, and the second source limb is loaded unconditionally; presumably
C callers guarantee n >= 2 -- confirm against the caller contract.
91 1.1 mrg ASM_START()
92 1.1 mrg PROLOGUE(mpn_mul_2)
93 1.1 mrg .prologue
94 1.1 mrg .save ar.lc, r2
95 1.1 mrg .body
96 1.1 mrg
C With HAVE_ABI_32 defined, the three pointer arguments go through addp4
C and n through zxt4 before use.
97 1.1 mrg ifdef(`HAVE_ABI_32',
98 1.1 mrg ` addp4 rp = 0, rp C M I
99 1.1 mrg addp4 up = 0, up C M I
100 1.1 mrg addp4 vp = 0, vp C M I
101 1.1 mrg zxt4 n = n C I
102 1.1 mrg ;;')
103 1.1 mrg
C Load the first source limb into ux and the first multiplier limb into v0
C (both pointers post-increment by 8), save ar.lc in r2, and compute
C r14 = n mod 4 and n = n - 2.
104 1.1 mrg {.mmi C 00
105 1.1 mrg ldf8 ux = [up], 8 C M
106 1.1 mrg ldf8 v0 = [vp], 8 C M
107 1.1 mrg mov.i r2 = ar.lc C I0
108 1.1 mrg }{.mmi
109 1.1 mrg nop 0 C M
110 1.1 mrg and r14 = 3, n C M I
111 1.1 mrg add n = -2, n C M I
112 1.1 mrg ;;
C Load the second source limb (uy) and the second multiplier limb (v1); n
C becomes (n-2) >> 2, the loop count.  p10 and p11 flag n mod 4 = 1 and 2.
113 1.1 mrg }{.mmi C 01
114 1.1 mrg ldf8 uy = [up], 8 C M
115 1.1 mrg ldf8 v1 = [vp] C M
116 1.1 mrg shr.u n = n, 2 C I
117 1.1 mrg }{.mmi
118 1.1 mrg nop 0 C M
119 1.1 mrg cmp.eq p10, p0 = 1, r14 C M I
120 1.1 mrg cmp.eq p11, p0 = 2, r14 C M I
121 1.1 mrg ;;
C p12 flags n mod 4 = 3.  Install the loop count in ar.lc and branch on the
C residue; the n mod 4 = 0 case falls through into .Lb00.
122 1.1 mrg }{.mmi C 02
123 1.1 mrg nop 0 C M
124 1.1 mrg cmp.eq p12, p0 = 3, r14 C M I
125 1.1 mrg mov.i ar.lc = n C I0
126 1.1 mrg }{.bbb
127 1.1 mrg (p10) br.dptk .Lb01 C B
128 1.1 mrg (p11) br.dptk .Lb10 C B
129 1.1 mrg (p12) br.dptk .Lb11 C B
130 1.1 mrg ;;
131 1.1 mrg }
132 1.1 mrg
C Feed-in for n mod 4 = 0.  Zero the integer terms acc1_2, pr1_2 and pr0_3
C and clear the carry predicates that the shared code reads before this path
C has produced them (cmp.ne r0, r0 is always false, so p8 = p12 = 0 and
C p9 = p13 = 1).  ux and uy take the roles of u_3 and u_0.  With a nonzero
C loop count continue at .grt4 and enter the main loop at .LL00; otherwise
C finish through the wind-down code at .Lcj4.
133 1.1 mrg ALIGN(32)
134 1.1 mrg .Lb00: ldf8 u_1 = [up], 8
135 1.1 mrg mov acc1_2 = 0
136 1.1 mrg mov pr1_2 = 0
137 1.1 mrg mov pr0_3 = 0
138 1.1 mrg cmp.ne p8, p9 = r0, r0
139 1.1 mrg ;;
140 1.1 mrg xma.l fp0b_3 = ux, v0, f0
141 1.1 mrg cmp.ne p12, p13 = r0, r0
142 1.1 mrg ldf8 u_2 = [up], 8
143 1.1 mrg xma.hu fp1a_3 = ux, v0, f0
C Taken when ar.lc is nonzero (br.cloop also decrements it).
144 1.1 mrg br.cloop.dptk .grt4
145 1.1 mrg
C Loop count was zero: no further source limbs are loaded on this path.
146 1.1 mrg xma.l fp0b_0 = uy, v0, f0
147 1.1 mrg xma.hu fp1a_0 = uy, v0, f0
148 1.1 mrg ;;
149 1.1 mrg getf.sig acc0 = fp0b_3
150 1.1 mrg xma.l fp1b_3 = ux, v1, fp1a_3
151 1.1 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
152 1.1 mrg ;;
153 1.1 mrg xma.l fp0b_1 = u_1, v0, f0
154 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0
155 1.1 mrg ;;
156 1.1 mrg getf.sig pr0_0 = fp0b_0
157 1.1 mrg xma.l fp1b_0 = uy, v1, fp1a_0
158 1.1 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
159 1.1 mrg ;;
160 1.1 mrg getf.sig pr1_3 = fp1b_3
161 1.1 mrg getf.sig acc1_3 = fp2a_3
162 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
163 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
164 1.1 mrg br .Lcj4
165 1.1 mrg
C Nonzero loop count: same product sequence, with the loads of u_3 and u_0
C for the main loop interleaved.
166 1.1 mrg .grt4: xma.l fp0b_0 = uy, v0, f0
167 1.1 mrg xma.hu fp1a_0 = uy, v0, f0
168 1.1 mrg ;;
169 1.1 mrg getf.sig acc0 = fp0b_3
170 1.1 mrg xma.l fp1b_3 = ux, v1, fp1a_3
171 1.1 mrg ldf8 u_3 = [up], 8
172 1.1 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
173 1.1 mrg ;;
174 1.1 mrg xma.l fp0b_1 = u_1, v0, f0
175 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0
176 1.1 mrg ;;
177 1.1 mrg getf.sig pr0_0 = fp0b_0
178 1.1 mrg xma.l fp1b_0 = uy, v1, fp1a_0
179 1.1 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
180 1.1 mrg ;;
181 1.1 mrg ldf8 u_0 = [up], 8
182 1.1 mrg getf.sig pr1_3 = fp1b_3
183 1.1 mrg ;;
184 1.1 mrg getf.sig acc1_3 = fp2a_3
185 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
186 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
187 1.1 mrg br .LL00
188 1.1 mrg
189 1.1 mrg
C Feed-in for n mod 4 = 1.  Zero the integer terms acc1_1, pr1_1 and pr0_2
C and clear the carry predicates p6 and p10 (p7 = p11 = 1, since
C cmp.ne r0, r0 is always false).  ux and uy take the roles of u_2 and u_3.
C With a nonzero loop count continue at .grt5 and enter the main loop at
C .LL01; otherwise finish through the wind-down code at .Lcj5.
190 1.1 mrg ALIGN(32)
191 1.1 mrg .Lb01: ldf8 u_0 = [up], 8 C M
192 1.1 mrg mov acc1_1 = 0 C M I
193 1.1 mrg mov pr1_1 = 0 C M I
194 1.1 mrg mov pr0_2 = 0 C M I
195 1.1 mrg cmp.ne p6, p7 = r0, r0 C M I
196 1.1 mrg ;;
197 1.1 mrg xma.l fp0b_2 = ux, v0, f0 C F
198 1.1 mrg cmp.ne p10, p11 = r0, r0 C M I
199 1.1 mrg ldf8 u_1 = [up], 8 C M
200 1.1 mrg xma.hu fp1a_2 = ux, v0, f0 C F
201 1.1 mrg ;;
202 1.1 mrg xma.l fp0b_3 = uy, v0, f0 C F
203 1.1 mrg xma.hu fp1a_3 = uy, v0, f0 C F
204 1.1 mrg ;;
205 1.1 mrg getf.sig acc0 = fp0b_2 C M
206 1.1 mrg xma.l fp1b_2 = ux, v1,fp1a_2 C F
207 1.1 mrg xma.hu fp2a_2 = ux, v1,fp1a_2 C F
208 1.1 mrg ldf8 u_2 = [up], 8 C M
C Taken when ar.lc is nonzero (br.cloop also decrements it).
209 1.1 mrg br.cloop.dptk .grt5
210 1.1 mrg
C Loop count was zero: no further source limbs are loaded on this path.
211 1.1 mrg xma.l fp0b_0 = u_0, v0, f0 C F
212 1.1 mrg xma.hu fp1a_0 = u_0, v0, f0 C F
213 1.1 mrg ;;
214 1.1 mrg getf.sig pr0_3 = fp0b_3 C M
215 1.1 mrg xma.l fp1b_3 = uy, v1,fp1a_3 C F
216 1.1 mrg xma.hu fp2a_3 = uy, v1,fp1a_3 C F
217 1.1 mrg ;;
218 1.1 mrg getf.sig pr1_2 = fp1b_2 C M
219 1.1 mrg getf.sig acc1_2 = fp2a_2 C M
220 1.1 mrg xma.l fp0b_1 = u_1, v0, f0 C F
221 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0 C F
222 1.1 mrg br .Lcj5
223 1.1 mrg
C Nonzero loop count: same product sequence, with the load of u_3 for the
C main loop interleaved.
224 1.1 mrg .grt5: xma.l fp0b_0 = u_0, v0, f0
225 1.1 mrg xma.hu fp1a_0 = u_0, v0, f0
226 1.1 mrg ;;
227 1.1 mrg getf.sig pr0_3 = fp0b_3
228 1.1 mrg xma.l fp1b_3 = uy, v1, fp1a_3
229 1.1 mrg xma.hu fp2a_3 = uy, v1, fp1a_3
230 1.1 mrg ;;
231 1.1 mrg ldf8 u_3 = [up], 8
232 1.1 mrg getf.sig pr1_2 = fp1b_2
233 1.1 mrg ;;
234 1.1 mrg getf.sig acc1_2 = fp2a_2
235 1.1 mrg xma.l fp0b_1 = u_1, v0, f0
236 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0
237 1.1 mrg br .LL01
238 1.1 mrg
239 1.1 mrg
240 1.1 mrg C We have two variants for n = 2. They turn out to run at exactly the same
241 1.1 mrg C speed. But the first, odd variant might allow one cycle to be trimmed.
C Feed-in for n mod 4 = 2.
C NOTE(review): the first variant below is wrapped in ifdef on an empty
C macro name, which presumably is never defined, so only the second .Lb10
C would be assembled -- confirm.  Both variants define the label .Lb10, so
C they cannot both be enabled as written.
242 1.1 mrg ALIGN(32)
243 1.1 mrg ifdef(`',`
244 1.1 mrg .Lb10: C 03
245 1.1 mrg br.cloop.dptk .grt2
246 1.1 mrg C 04
247 1.1 mrg C 05
248 1.1 mrg C 06
249 1.1 mrg xma.l fp0b_1 = ux, v0, f0 C 0
250 1.1 mrg xma.hu fp1a_1 = ux, v0, f0 C 1
251 1.1 mrg ;; C 07
252 1.1 mrg xma.l fp0b_2 = uy, v0, f0 C 1
253 1.1 mrg xma.l fp1b_1 = ux, v1, f0 C 1
254 1.1 mrg ;; C 08
255 1.1 mrg xma.hu fp1a_2 = uy, v0, f0 C 2
256 1.1 mrg xma.hu fp2a_1 = ux, v1, f0 C 2
257 1.1 mrg ;; C 09
258 1.1 mrg xma.l fp1b_2 = uy, v1, f0 C 2
259 1.1 mrg xma.hu fp2a_2 = uy, v1, f0 C 3
260 1.1 mrg ;; C 10
261 1.1 mrg getf.sig r16 = fp1a_1
262 1.1 mrg stf8 [rp] = fp0b_1, 8
263 1.1 mrg ;; C 11
264 1.1 mrg getf.sig r17 = fp0b_2
265 1.1 mrg C 12
266 1.1 mrg getf.sig r18 = fp1b_1
267 1.1 mrg C 13
268 1.1 mrg getf.sig r19 = fp1a_2
269 1.1 mrg C 14
270 1.1 mrg getf.sig r20 = fp2a_1
271 1.1 mrg C 15
272 1.1 mrg getf.sig r21 = fp1b_2
273 1.1 mrg ;; C 16
274 1.1 mrg getf.sig r8 = fp2a_2
275 1.1 mrg add r24 = r16, r17
276 1.1 mrg ;; C 17
277 1.1 mrg cmp.ltu p6, p7 = r24, r16
278 1.1 mrg add r26 = r24, r18
279 1.1 mrg ;; C 18
280 1.1 mrg cmp.ltu p8, p9 = r26, r24
281 1.1 mrg ;; C 19
282 1.1 mrg st8 [rp] = r26, 8
283 1.1 mrg (p6) add r25 = r19, r20, 1
284 1.1 mrg (p7) add r25 = r19, r20
285 1.1 mrg ;; C 20
286 1.1 mrg (p8) add r27 = r25, r21, 1
287 1.1 mrg (p9) add r27 = r25, r21
288 1.1 mrg (p6) cmp.leu p10, p0 = r25, r19
289 1.1 mrg (p7) cmp.ltu p10, p0 = r25, r19
290 1.1 mrg ;; C 21
291 1.1 mrg (p10) add r8 = 1, r8
292 1.1 mrg (p8) cmp.leu p12, p0 = r27, r25
293 1.1 mrg (p9) cmp.ltu p12, p0 = r27, r25
294 1.1 mrg ;; C 22
295 1.1 mrg st8 [rp] = r27, 8
296 1.1 mrg mov.i ar.lc = r2
297 1.1 mrg (p12) add r8 = 1, r8
298 1.1 mrg br.ret.sptk.many b0
299 1.1 mrg ')
300 1.1 mrg
C Second n = 2 variant.  With a nonzero loop count go to .grt2; otherwise
C compute the three stored limbs directly, leave the most significant limb
C in r8, restore ar.lc from r2 and return.
301 1.1 mrg .Lb10: C 03
302 1.1 mrg br.cloop.dptk .grt2
303 1.1 mrg C 04
304 1.1 mrg C 05
305 1.1 mrg C 06
306 1.1 mrg xma.l fp0b_1 = ux, v0, f0
307 1.1 mrg xma.hu fp1a_1 = ux, v0, f0
308 1.1 mrg ;; C 07
309 1.1 mrg xma.l fp0b_2 = uy, v0, f0
310 1.1 mrg xma.hu fp1a_2 = uy, v0, f0
311 1.1 mrg ;; C 08
312 1.1 mrg C 09
313 1.1 mrg C 10
C The lowest limb is stored straight from the floating-point register.
314 1.1 mrg stf8 [rp] = fp0b_1, 8
315 1.1 mrg xma.l fp1b_1 = ux, v1, fp1a_1
316 1.1 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
317 1.1 mrg ;; C 11
318 1.1 mrg getf.sig acc0 = fp0b_2
319 1.1 mrg xma.l fp1b_2 = uy, v1, fp1a_2
320 1.1 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
321 1.1 mrg ;; C 12
322 1.1 mrg C 13
323 1.1 mrg C 14
324 1.1 mrg getf.sig pr1_1 = fp1b_1
325 1.1 mrg C 15
326 1.1 mrg getf.sig acc1_1 = fp2a_1
327 1.1 mrg C 16
328 1.1 mrg getf.sig pr1_2 = fp1b_2
329 1.1 mrg C 17
330 1.1 mrg getf.sig r8 = fp2a_2
331 1.1 mrg ;; C 18
332 1.1 mrg C 19
333 1.1 mrg add s0 = pr1_1, acc0
334 1.1 mrg ;; C 20
C p8 = carry out of s0.  r31 = -1 - acc1_1, so comparing r31 against pr1_2
C detects the carry out of pr1_2 + acc1_1 (+1 under p8) without reading
C the sum itself.
335 1.1 mrg st8 [rp] = s0, 8
336 1.1 mrg cmp.ltu p8, p9 = s0, pr1_1
337 1.1 mrg sub r31 = -1, acc1_1
338 1.1 mrg ;; C 21
339 1.1 mrg .pred.rel "mutex", p8, p9
340 1.1 mrg (p8) add acc0 = pr1_2, acc1_1, 1
341 1.1 mrg (p9) add acc0 = pr1_2, acc1_1
342 1.1 mrg (p8) cmp.leu p10, p0 = r31, pr1_2
343 1.1 mrg (p9) cmp.ltu p10, p0 = r31, pr1_2
344 1.1 mrg ;; C 22
C p10 carries into the top limb held in r8.
345 1.1 mrg st8 [rp] = acc0, 8
346 1.1 mrg mov.i ar.lc = r2
347 1.1 mrg (p10) add r8 = 1, r8
348 1.1 mrg br.ret.sptk.many b0
349 1.1 mrg
350 1.1 mrg
C n mod 4 = 2 with a nonzero loop count.  Zero acc1_0, pr1_0 and pr0_1,
C start the first products with ux and uy in the roles of u_1 and u_2, clear
C p8 and p12 (p9 = p13 = 1), and join the main loop at .LL10, which is the
C loop-closing br.cloop.
351 1.1 mrg .grt2: ldf8 u_3 = [up], 8
352 1.1 mrg mov acc1_0 = 0
353 1.1 mrg mov pr1_0 = 0
354 1.1 mrg ;;
355 1.1 mrg mov pr0_1 = 0
356 1.1 mrg xma.l fp0b_1 = ux, v0, f0
357 1.1 mrg ldf8 u_0 = [up], 8
358 1.1 mrg xma.hu fp1a_1 = ux, v0, f0
359 1.1 mrg ;;
360 1.1 mrg xma.l fp0b_2 = uy, v0, f0
361 1.1 mrg xma.hu fp1a_2 = uy, v0, f0
362 1.1 mrg ;;
363 1.1 mrg getf.sig acc0 = fp0b_1
364 1.1 mrg xma.l fp1b_1 = ux, v1, fp1a_1
365 1.1 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
366 1.1 mrg ;;
367 1.1 mrg ldf8 u_1 = [up], 8
368 1.1 mrg xma.l fp0b_3 = u_3, v0, f0
369 1.1 mrg xma.hu fp1a_3 = u_3, v0, f0
370 1.1 mrg ;;
371 1.1 mrg getf.sig pr0_2 = fp0b_2
372 1.1 mrg xma.l fp1b_2 = uy, v1, fp1a_2
373 1.1 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
374 1.1 mrg ;;
375 1.1 mrg ldf8 u_2 = [up], 8
376 1.1 mrg getf.sig pr1_1 = fp1b_1
377 1.1 mrg ;;
378 1.1 mrg getf.sig acc1_1 = fp2a_1
379 1.1 mrg xma.l fp0b_0 = u_0, v0, f0
380 1.1 mrg cmp.ne p8, p9 = r0, r0
381 1.1 mrg cmp.ne p12, p13 = r0, r0
382 1.1 mrg xma.hu fp1a_0 = u_0, v0, f0
383 1.1 mrg br .LL10
384 1.1 mrg
385 1.1 mrg
C Feed-in for n mod 4 = 3.  Zero the integer terms acc1_3, pr1_3 and pr0_0
C and clear the carry predicates p6 and p10 (p7 = p11 = 1, since
C cmp.ne r0, r0 is always false).  ux and uy take the roles of u_0 and u_1.
C With a nonzero loop count continue at .grt3 and enter the main loop at
C .LL11; otherwise finish through the wind-down code at .Lcj3.
386 1.1 mrg ALIGN(32)
387 1.1 mrg .Lb11: mov acc1_3 = 0
388 1.1 mrg mov pr1_3 = 0
389 1.1 mrg mov pr0_0 = 0
390 1.1 mrg cmp.ne p6, p7 = r0, r0
391 1.1 mrg ;;
392 1.1 mrg ldf8 u_2 = [up], 8
C Taken when ar.lc is nonzero (br.cloop also decrements it).
393 1.1 mrg br.cloop.dptk .grt3
394 1.1 mrg ;;
C Loop count was zero: no further source limbs are loaded on this path.
395 1.1 mrg xma.l fp0b_0 = ux, v0, f0
396 1.1 mrg xma.hu fp1a_0 = ux, v0, f0
397 1.1 mrg ;;
398 1.1 mrg cmp.ne p10, p11 = r0, r0
399 1.1 mrg xma.l fp0b_1 = uy, v0, f0
400 1.1 mrg xma.hu fp1a_1 = uy, v0, f0
401 1.1 mrg ;;
402 1.1 mrg getf.sig acc0 = fp0b_0
403 1.1 mrg xma.l fp1b_0 = ux, v1, fp1a_0
404 1.1 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
405 1.1 mrg ;;
406 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
407 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
408 1.1 mrg ;;
409 1.1 mrg getf.sig pr0_1 = fp0b_1
410 1.1 mrg xma.l fp1b_1 = uy, v1, fp1a_1
411 1.1 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
412 1.1 mrg ;;
413 1.1 mrg getf.sig pr1_0 = fp1b_0
414 1.1 mrg getf.sig acc1_0 = fp2a_0
415 1.1 mrg br .Lcj3
416 1.1 mrg
C Nonzero loop count: same product sequence, with the loads of u_3, u_0 and
C u_1 for the main loop interleaved.
417 1.1 mrg .grt3: xma.l fp0b_0 = ux, v0, f0
418 1.1 mrg cmp.ne p10, p11 = r0, r0
419 1.1 mrg ldf8 u_3 = [up], 8
420 1.1 mrg xma.hu fp1a_0 = ux, v0, f0
421 1.1 mrg ;;
422 1.1 mrg xma.l fp0b_1 = uy, v0, f0
423 1.1 mrg xma.hu fp1a_1 = uy, v0, f0
424 1.1 mrg ;;
425 1.1 mrg getf.sig acc0 = fp0b_0
426 1.1 mrg xma.l fp1b_0 = ux, v1, fp1a_0
427 1.1 mrg ldf8 u_0 = [up], 8
428 1.1 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
429 1.1 mrg ;;
430 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
431 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
432 1.1 mrg ;;
433 1.1 mrg getf.sig pr0_1 = fp0b_1
434 1.1 mrg xma.l fp1b_1 = uy, v1, fp1a_1
435 1.1 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
436 1.1 mrg ;;
437 1.1 mrg ldf8 u_1 = [up], 8
438 1.1 mrg getf.sig pr1_0 = fp1b_0
439 1.1 mrg ;;
440 1.1 mrg getf.sig acc1_0 = fp2a_0
441 1.1 mrg xma.l fp0b_3 = u_3, v0, f0
442 1.1 mrg xma.hu fp1a_3 = u_3, v0, f0
443 1.1 mrg br .LL11
444 1.1 mrg
445 1.1 mrg
C Main loop, unrolled four ways; each pass stores four result limbs and
C loads four source limbs.  For unroll slot k (indices taken mod 4) the
C stored limb is
C   s0 = pr1_(k-1) + (pr0_k + acc1_(k-2)) + carries
C with the inner sum kept in acc0.  Carries travel in complementary
C predicate pairs: p6/p7 and p8/p9 alternate for the acc0 chain, p10/p11 and
C p12/p13 alternate for the s0 chain.  Each add is written twice, with and
C without the +1, and the matching cmp.leu/cmp.ltu pair derives the next
C carry from the sum and one of its addends.
C .LL01, .LL00, .LL11 and .LL10 are the entry points used by the feed-in
C code.
446 1.1 mrg C *** MAIN LOOP START ***
447 1.1 mrg ALIGN(32)
448 1.1 mrg .Loop: C 00
449 1.1 mrg .pred.rel "mutex", p12, p13
450 1.1 mrg getf.sig pr0_3 = fp0b_3
451 1.1 mrg xma.l fp1b_3 = u_3, v1, fp1a_3
452 1.1 mrg (p12) add s0 = pr1_0, acc0, 1
453 1.1 mrg (p13) add s0 = pr1_0, acc0
454 1.1 mrg xma.hu fp2a_3 = u_3, v1, fp1a_3
455 1.1 mrg ;; C 01
456 1.1 mrg .pred.rel "mutex", p8, p9
457 1.1 mrg .pred.rel "mutex", p12, p13
458 1.1 mrg ldf8 u_3 = [up], 8
459 1.1 mrg getf.sig pr1_2 = fp1b_2
460 1.1 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
461 1.1 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
462 1.1 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
463 1.1 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
464 1.1 mrg ;; C 02
465 1.1 mrg .pred.rel "mutex", p6, p7
466 1.1 mrg getf.sig acc1_2 = fp2a_2
467 1.1 mrg st8 [rp] = s0, 8
468 1.1 mrg xma.l fp0b_1 = u_1, v0, f0
469 1.1 mrg (p6) add acc0 = pr0_2, acc1_0, 1
470 1.1 mrg (p7) add acc0 = pr0_2, acc1_0
471 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0
472 1.1 mrg ;; C 03
C Entry point from the n mod 4 = 1 feed-in (.grt5).
473 1.1 mrg .LL01:
474 1.1 mrg .pred.rel "mutex", p10, p11
475 1.1 mrg getf.sig pr0_0 = fp0b_0
476 1.1 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
477 1.1 mrg (p10) add s0 = pr1_1, acc0, 1
478 1.1 mrg (p11) add s0 = pr1_1, acc0
479 1.1 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
480 1.1 mrg ;; C 04
481 1.1 mrg .pred.rel "mutex", p6, p7
482 1.1 mrg .pred.rel "mutex", p10, p11
483 1.1 mrg ldf8 u_0 = [up], 8
484 1.1 mrg getf.sig pr1_3 = fp1b_3
485 1.1 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
486 1.1 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
487 1.1 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
488 1.1 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
489 1.1 mrg ;; C 05
490 1.1 mrg .pred.rel "mutex", p8, p9
491 1.1 mrg getf.sig acc1_3 = fp2a_3
492 1.1 mrg st8 [rp] = s0, 8
493 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
494 1.1 mrg (p8) add acc0 = pr0_3, acc1_1, 1
495 1.1 mrg (p9) add acc0 = pr0_3, acc1_1
496 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
497 1.1 mrg ;; C 06
C Entry point from the n mod 4 = 0 feed-in (.grt4).
498 1.1 mrg .LL00:
499 1.1 mrg .pred.rel "mutex", p12, p13
500 1.1 mrg getf.sig pr0_1 = fp0b_1
501 1.1 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
502 1.1 mrg (p12) add s0 = pr1_2, acc0, 1
503 1.1 mrg (p13) add s0 = pr1_2, acc0
504 1.1 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
505 1.1 mrg ;; C 07
506 1.1 mrg .pred.rel "mutex", p8, p9
507 1.1 mrg .pred.rel "mutex", p12, p13
508 1.1 mrg ldf8 u_1 = [up], 8
509 1.1 mrg getf.sig pr1_0 = fp1b_0
510 1.1 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
511 1.1 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_3
512 1.1 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
513 1.1 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
514 1.1 mrg ;; C 08
515 1.1 mrg .pred.rel "mutex", p6, p7
516 1.1 mrg getf.sig acc1_0 = fp2a_0
517 1.1 mrg st8 [rp] = s0, 8
518 1.1 mrg xma.l fp0b_3 = u_3, v0, f0
519 1.1 mrg (p6) add acc0 = pr0_0, acc1_2, 1
520 1.1 mrg (p7) add acc0 = pr0_0, acc1_2
521 1.1 mrg xma.hu fp1a_3 = u_3, v0, f0
522 1.1 mrg ;; C 09
C Entry point from the n mod 4 = 3 feed-in (.grt3).
523 1.1 mrg .LL11:
524 1.1 mrg .pred.rel "mutex", p10, p11
525 1.1 mrg getf.sig pr0_2 = fp0b_2
526 1.1 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
527 1.1 mrg (p10) add s0 = pr1_3, acc0, 1
528 1.1 mrg (p11) add s0 = pr1_3, acc0
529 1.1 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
530 1.1 mrg ;; C 10
531 1.1 mrg .pred.rel "mutex", p6, p7
532 1.1 mrg .pred.rel "mutex", p10, p11
533 1.1 mrg ldf8 u_2 = [up], 8
534 1.1 mrg getf.sig pr1_1 = fp1b_1
535 1.1 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
536 1.1 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_0
537 1.1 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
538 1.1 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
539 1.1 mrg ;; C 11
540 1.1 mrg .pred.rel "mutex", p8, p9
541 1.1 mrg getf.sig acc1_1 = fp2a_1
542 1.1 mrg st8 [rp] = s0, 8
543 1.1 mrg xma.l fp0b_0 = u_0, v0, f0
544 1.1 mrg (p8) add acc0 = pr0_1, acc1_3, 1
545 1.1 mrg (p9) add acc0 = pr0_1, acc1_3
546 1.1 mrg xma.hu fp1a_0 = u_0, v0, f0
C Entry point from the n mod 4 = 2 feed-in (.grt2); also closes the loop.
547 1.1 mrg .LL10: br.cloop.dptk .Loop C 12
548 1.1 mrg ;;
549 1.1 mrg C *** MAIN LOOP END ***
550 1.1 mrg
C Wind-down: the same add/compare/store steps as the main loop, with no
C further source loads.  .Lcj6 is reached by falling out of the loop;
C .Lcj5, .Lcj4 and .Lcj3 are the targets of the feed-in paths taken when the
C loop count is zero.  The code drains the outstanding products, stores the
C remaining limbs, leaves the most significant limb in r8, restores ar.lc
C from r2 and returns.
551 1.1 mrg .Lcj6:
552 1.1 mrg .pred.rel "mutex", p12, p13
553 1.1 mrg getf.sig pr0_3 = fp0b_3
554 1.1 mrg xma.l fp1b_3 = u_3, v1, fp1a_3
555 1.1 mrg (p12) add s0 = pr1_0, acc0, 1
556 1.1 mrg (p13) add s0 = pr1_0, acc0
557 1.1 mrg xma.hu fp2a_3 = u_3, v1, fp1a_3
558 1.1 mrg ;;
559 1.1 mrg .pred.rel "mutex", p8, p9
560 1.1 mrg .pred.rel "mutex", p12, p13
561 1.1 mrg getf.sig pr1_2 = fp1b_2
562 1.1 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
563 1.1 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
564 1.1 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
565 1.1 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
566 1.1 mrg ;;
567 1.1 mrg .pred.rel "mutex", p6, p7
568 1.1 mrg getf.sig acc1_2 = fp2a_2
569 1.1 mrg st8 [rp] = s0, 8
570 1.1 mrg xma.l fp0b_1 = u_1, v0, f0
571 1.1 mrg (p6) add acc0 = pr0_2, acc1_0, 1
572 1.1 mrg (p7) add acc0 = pr0_2, acc1_0
573 1.1 mrg xma.hu fp1a_1 = u_1, v0, f0
574 1.1 mrg ;;
C Entered from .Lb01 when the loop count is zero.
575 1.1 mrg .Lcj5:
576 1.1 mrg .pred.rel "mutex", p10, p11
577 1.1 mrg getf.sig pr0_0 = fp0b_0
578 1.1 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
579 1.1 mrg (p10) add s0 = pr1_1, acc0, 1
580 1.1 mrg (p11) add s0 = pr1_1, acc0
581 1.1 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
582 1.1 mrg ;;
583 1.1 mrg .pred.rel "mutex", p6, p7
584 1.1 mrg .pred.rel "mutex", p10, p11
585 1.1 mrg getf.sig pr1_3 = fp1b_3
586 1.1 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
587 1.1 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
588 1.1 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
589 1.1 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
590 1.1 mrg ;;
591 1.1 mrg .pred.rel "mutex", p8, p9
592 1.1 mrg getf.sig acc1_3 = fp2a_3
593 1.1 mrg st8 [rp] = s0, 8
594 1.1 mrg xma.l fp0b_2 = u_2, v0, f0
595 1.1 mrg (p8) add acc0 = pr0_3, acc1_1, 1
596 1.1 mrg (p9) add acc0 = pr0_3, acc1_1
597 1.1 mrg xma.hu fp1a_2 = u_2, v0, f0
598 1.1 mrg ;;
C Entered from .Lb00 when the loop count is zero.
599 1.1 mrg .Lcj4:
600 1.1 mrg .pred.rel "mutex", p12, p13
601 1.1 mrg getf.sig pr0_1 = fp0b_1
602 1.1 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
603 1.1 mrg (p12) add s0 = pr1_2, acc0, 1
604 1.1 mrg (p13) add s0 = pr1_2, acc0
605 1.1 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
606 1.1 mrg ;;
607 1.1 mrg .pred.rel "mutex", p8, p9
608 1.1 mrg .pred.rel "mutex", p12, p13
609 1.1 mrg getf.sig pr1_0 = fp1b_0
610 1.1 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
611 1.1 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_3
612 1.1 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
613 1.1 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
614 1.1 mrg ;;
615 1.1 mrg .pred.rel "mutex", p6, p7
616 1.1 mrg getf.sig acc1_0 = fp2a_0
617 1.1 mrg st8 [rp] = s0, 8
618 1.1 mrg (p6) add acc0 = pr0_0, acc1_2, 1
619 1.1 mrg (p7) add acc0 = pr0_0, acc1_2
620 1.1 mrg ;;
C Entered from .Lb11 when the loop count is zero.  The last xma pair of the
C function is issued here.
621 1.1 mrg .Lcj3:
622 1.1 mrg .pred.rel "mutex", p10, p11
623 1.1 mrg getf.sig pr0_2 = fp0b_2
624 1.1 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
625 1.1 mrg (p10) add s0 = pr1_3, acc0, 1
626 1.1 mrg (p11) add s0 = pr1_3, acc0
627 1.1 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
628 1.1 mrg ;;
629 1.1 mrg .pred.rel "mutex", p6, p7
630 1.1 mrg .pred.rel "mutex", p10, p11
631 1.1 mrg getf.sig pr1_1 = fp1b_1
632 1.1 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
633 1.1 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_0
634 1.1 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
635 1.1 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
636 1.1 mrg ;;
637 1.1 mrg .pred.rel "mutex", p8, p9
638 1.1 mrg getf.sig acc1_1 = fp2a_1
639 1.1 mrg st8 [rp] = s0, 8
640 1.1 mrg (p8) add acc0 = pr0_1, acc1_3, 1
641 1.1 mrg (p9) add acc0 = pr0_1, acc1_3
642 1.1 mrg ;;
643 1.1 mrg .pred.rel "mutex", p12, p13
644 1.1 mrg (p12) add s0 = pr1_0, acc0, 1
645 1.1 mrg (p13) add s0 = pr1_0, acc0
646 1.1 mrg ;;
647 1.1 mrg .pred.rel "mutex", p8, p9
648 1.1 mrg .pred.rel "mutex", p12, p13
649 1.1 mrg getf.sig pr1_2 = fp1b_2
650 1.1 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
651 1.1 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
652 1.1 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
653 1.1 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
654 1.1 mrg ;;
655 1.1 mrg .pred.rel "mutex", p6, p7
656 1.1 mrg getf.sig acc1_2 = fp2a_2
657 1.1 mrg st8 [rp] = s0, 8
658 1.1 mrg (p6) add acc0 = pr0_2, acc1_0, 1
659 1.1 mrg (p7) add acc0 = pr0_2, acc1_0
660 1.1 mrg ;;
661 1.1 mrg .pred.rel "mutex", p10, p11
662 1.1 mrg (p10) add s0 = pr1_1, acc0, 1
663 1.1 mrg (p11) add s0 = pr1_1, acc0
664 1.1 mrg ;;
665 1.1 mrg .pred.rel "mutex", p6, p7
666 1.1 mrg .pred.rel "mutex", p10, p11
667 1.1 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
668 1.1 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
669 1.1 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
670 1.1 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
671 1.1 mrg ;;
C Last stored limb: acc0 = pr1_2 + acc1_1 plus the p8 carry, then one more
C under p12.  p10 collects the carry out of the first sum and, through
C cmp.eq.or, the wrap of acc0 to zero after the p12 increment.
672 1.1 mrg .pred.rel "mutex", p8, p9
673 1.1 mrg st8 [rp] = s0, 8
674 1.1 mrg (p8) add acc0 = pr1_2, acc1_1, 1
675 1.1 mrg (p9) add acc0 = pr1_2, acc1_1
676 1.1 mrg ;;
677 1.1 mrg .pred.rel "mutex", p8, p9
678 1.1 mrg (p8) cmp.leu p10, p11 = acc0, pr1_2
679 1.1 mrg (p9) cmp.ltu p10, p11 = acc0, pr1_2
680 1.1 mrg (p12) add acc0 = 1, acc0
681 1.1 mrg ;;
682 1.1 mrg st8 [rp] = acc0, 8
683 1.1 mrg (p12) cmp.eq.or p10, p0 = 0, acc0
684 1.1 mrg mov r8 = acc1_2
685 1.1 mrg ;;
C r8 = acc1_2 plus the final carry; ar.lc is restored from r2 before return.
686 1.1 mrg .pred.rel "mutex", p10, p11
687 1.1 mrg (p10) add r8 = 1, r8
688 1.1 mrg mov.i ar.lc = r2
689 1.1 mrg br.ret.sptk.many b0
690 1.1 mrg EPILOGUE()
691 1.1 mrg ASM_END()
692