mul_2.asm revision 1.1.1.2 1 1.1 mrg dnl IA-64 mpn_mul_2 -- Multiply a n-limb number with a 2-limb number and store
2 1.1 mrg dnl the result to a (n+1)-limb number.
3 1.1 mrg
4 1.1.1.2 mrg dnl Contributed to the GNU project by Torbjorn Granlund.
5 1.1.1.2 mrg
6 1.1.1.2 mrg dnl Copyright 2004, 2011 Free Software Foundation, Inc.
7 1.1 mrg
8 1.1 mrg dnl This file is part of the GNU MP Library.
9 1.1 mrg
10 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
12 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
13 1.1 mrg dnl your option) any later version.
14 1.1 mrg
15 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
16 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
18 1.1 mrg dnl License for more details.
19 1.1 mrg
20 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
21 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 1.1 mrg
23 1.1 mrg include(`../config.m4')
24 1.1 mrg
25 1.1 mrg C cycles/limb
26 1.1.1.2 mrg C Itanium: ?
27 1.1.1.2 mrg C Itanium 2: 1.5
28 1.1 mrg
29 1.1 mrg C TODO
30 1.1 mrg C * Clean up variable names, and try to decrease the number of distinct
31 1.1 mrg C registers used.
32 1.1.1.2 mrg C * Clean up feed-in code to not require zeroing several registers.
33 1.1 mrg C * Make sure we don't depend on uninitialized predicate registers.
34 1.1 mrg C * Could perhaps save a few cycles by using 1 c/l carry propagation in
35 1.1 mrg C wind-down code.
36 1.1 mrg C * Ultimately rewrite. The problem with this code is that it first uses a
37 1.1 mrg C loaded u value in one xma pair, then leaves it live over several unrelated
38 1.1 mrg C xma pairs, before it uses it again. It should actually be quite possible
39 1.1 mrg C to just swap some aligned xma pairs around. But we should then schedule
40 1.1 mrg C u loads further from the first use.
41 1.1 mrg
42 1.1 mrg C INPUT PARAMETERS
43 1.1 mrg define(`rp',`r32')
44 1.1 mrg define(`up',`r33')
45 1.1 mrg define(`n',`r34')
46 1.1 mrg define(`vp',`r35')
47 1.1 mrg
48 1.1 mrg define(`srp',`r3')
49 1.1 mrg
50 1.1 mrg define(`v0',`f6')
51 1.1 mrg define(`v1',`f7')
52 1.1 mrg
53 1.1 mrg define(`s0',`r14')
54 1.1 mrg define(`acc0',`r15')
55 1.1 mrg
56 1.1 mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
57 1.1 mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
58 1.1 mrg
59 1.1 mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
60 1.1 mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
61 1.1 mrg
62 1.1 mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
63 1.1 mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
64 1.1 mrg
65 1.1 mrg dnl define(`',`r28')
66 1.1 mrg dnl define(`',`r29')
67 1.1 mrg dnl define(`',`r30')
68 1.1 mrg dnl define(`',`r31')
69 1.1 mrg
70 1.1 mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
71 1.1 mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
72 1.1 mrg
73 1.1 mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
74 1.1 mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
75 1.1 mrg
76 1.1 mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
77 1.1 mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
78 1.1 mrg
79 1.1 mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
80 1.1 mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
81 1.1 mrg
82 1.1 mrg define(`u_0',`f44') define(`u_1',`f45')
83 1.1 mrg define(`u_2',`f46') define(`u_3',`f47')
84 1.1 mrg
85 1.1 mrg define(`ux',`f49')
86 1.1 mrg define(`uy',`f51')
87 1.1 mrg
88 1.1 mrg ASM_START()
89 1.1 mrg PROLOGUE(mpn_mul_2)
90 1.1 mrg .prologue
91 1.1 mrg .save ar.lc, r2
92 1.1 mrg .body
93 1.1 mrg
94 1.1.1.2 mrg ifdef(`HAVE_ABI_32',`
95 1.1.1.2 mrg .mmi; addp4 rp = 0, rp C M I
96 1.1.1.2 mrg addp4 up = 0, up C M I
97 1.1.1.2 mrg addp4 vp = 0, vp C M I
98 1.1.1.2 mrg .mmi; nop 1
99 1.1.1.2 mrg nop 1
100 1.1.1.2 mrg zxt4 n = n C I
101 1.1 mrg ;;')
102 1.1 mrg
103 1.1.1.2 mrg .mmi; ldf8 ux = [up], 8 C M
104 1.1.1.2 mrg ldf8 v0 = [vp], 8 C M
105 1.1.1.2 mrg mov r2 = ar.lc C I0
106 1.1.1.2 mrg .mmi; nop 1 C M
107 1.1.1.2 mrg and r14 = 3, n C M I
108 1.1.1.2 mrg add n = -2, n C M I
109 1.1.1.2 mrg ;;
110 1.1.1.2 mrg .mmi; ldf8 uy = [up], 8 C M
111 1.1.1.2 mrg ldf8 v1 = [vp] C M
112 1.1.1.2 mrg shr.u n = n, 2 C I
113 1.1.1.2 mrg .mmi; nop 1 C M
114 1.1.1.2 mrg cmp.eq p10, p0 = 1, r14 C M I
115 1.1.1.2 mrg cmp.eq p11, p0 = 2, r14 C M I
116 1.1.1.2 mrg ;;
117 1.1.1.2 mrg .mmi; nop 1 C M
118 1.1.1.2 mrg cmp.eq p12, p0 = 3, r14 C M I
119 1.1.1.2 mrg mov ar.lc = n C I0
120 1.1.1.2 mrg .bbb; (p10) br.dptk L(b01) C B
121 1.1.1.2 mrg (p11) br.dptk L(b10) C B
122 1.1.1.2 mrg (p12) br.dptk L(b11) C B
123 1.1 mrg ;;
124 1.1 mrg
125 1.1 mrg ALIGN(32)
126 1.1.1.2 mrg L(b00): ldf8 u_1 = [up], 8
127 1.1.1.2 mrg mov acc1_2 = 0
128 1.1.1.2 mrg mov pr1_2 = 0
129 1.1.1.2 mrg mov pr0_3 = 0
130 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
131 1.1.1.2 mrg ;;
132 1.1.1.2 mrg xma.l fp0b_3 = ux, v0, f0
133 1.1.1.2 mrg cmp.ne p12, p13 = r0, r0
134 1.1.1.2 mrg ldf8 u_2 = [up], 8
135 1.1.1.2 mrg xma.hu fp1a_3 = ux, v0, f0
136 1.1.1.2 mrg br.cloop.dptk L(gt4)
137 1.1.1.2 mrg
138 1.1.1.2 mrg xma.l fp0b_0 = uy, v0, f0
139 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
140 1.1.1.2 mrg ;;
141 1.1.1.2 mrg getfsig acc0 = fp0b_3
142 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
143 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
144 1.1.1.2 mrg ;;
145 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
146 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
147 1.1.1.2 mrg ;;
148 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
149 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
150 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
151 1.1.1.2 mrg ;;
152 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
153 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
154 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
155 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
156 1.1.1.2 mrg br L(cj4)
157 1.1.1.2 mrg
158 1.1.1.2 mrg L(gt4): xma.l fp0b_0 = uy, v0, f0
159 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
160 1.1.1.2 mrg ;;
161 1.1.1.2 mrg getfsig acc0 = fp0b_3
162 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
163 1.1.1.2 mrg ldf8 u_3 = [up], 8
164 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
165 1.1.1.2 mrg ;;
166 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
167 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
168 1.1.1.2 mrg ;;
169 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
170 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
171 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
172 1.1.1.2 mrg ;;
173 1.1.1.2 mrg ldf8 u_0 = [up], 8
174 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
175 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
176 1.1.1.2 mrg ;;
177 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
178 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
179 1.1.1.2 mrg br L(00)
180 1.1 mrg
181 1.1 mrg
182 1.1 mrg ALIGN(32)
183 1.1.1.2 mrg L(b01): ldf8 u_0 = [up], 8 C M
184 1.1.1.2 mrg mov acc1_1 = 0 C M I
185 1.1.1.2 mrg mov pr1_1 = 0 C M I
186 1.1.1.2 mrg mov pr0_2 = 0 C M I
187 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0 C M I
188 1.1.1.2 mrg ;;
189 1.1.1.2 mrg xma.l fp0b_2 = ux, v0, f0 C F
190 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0 C M I
191 1.1.1.2 mrg ldf8 u_1 = [up], 8 C M
192 1.1.1.2 mrg xma.hu fp1a_2 = ux, v0, f0 C F
193 1.1.1.2 mrg ;;
194 1.1.1.2 mrg xma.l fp0b_3 = uy, v0, f0 C F
195 1.1.1.2 mrg xma.hu fp1a_3 = uy, v0, f0 C F
196 1.1.1.2 mrg ;;
197 1.1.1.2 mrg getfsig acc0 = fp0b_2 C M
198 1.1.1.2 mrg xma.l fp1b_2 = ux, v1,fp1a_2 C F
199 1.1.1.2 mrg ldf8 u_2 = [up], 8 C M
200 1.1.1.2 mrg xma.hu fp2a_2 = ux, v1,fp1a_2 C F
201 1.1.1.2 mrg br.cloop.dptk L(gt5)
202 1.1.1.2 mrg
203 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0 C F
204 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0 C F
205 1.1.1.2 mrg ;;
206 1.1.1.2 mrg getfsig pr0_3 = fp0b_3 C M
207 1.1.1.2 mrg xma.l fp1b_3 = uy, v1,fp1a_3 C F
208 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1,fp1a_3 C F
209 1.1.1.2 mrg ;;
210 1.1.1.2 mrg getfsig pr1_2 = fp1b_2 C M
211 1.1.1.2 mrg getfsig acc1_2 = fp2a_2 C M
212 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0 C F
213 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0 C F
214 1.1.1.2 mrg br L(cj5)
215 1.1.1.2 mrg
216 1.1.1.2 mrg L(gt5): xma.l fp0b_0 = u_0, v0, f0
217 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
218 1.1.1.2 mrg ;;
219 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
220 1.1.1.2 mrg xma.l fp1b_3 = uy, v1, fp1a_3
221 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1, fp1a_3
222 1.1.1.2 mrg ;;
223 1.1.1.2 mrg ldf8 u_3 = [up], 8
224 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
225 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
226 1.1.1.2 mrg ;;
227 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
228 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
229 1.1.1.2 mrg br L(01)
230 1.1 mrg
231 1.1 mrg
232 1.1 mrg ALIGN(32)
233 1.1.1.2 mrg L(b10): br.cloop.dptk L(gt2)
234 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
235 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
236 1.1.1.2 mrg ;;
237 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
238 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
239 1.1.1.2 mrg ;;
240 1.1.1.2 mrg stf8 [rp] = fp0b_1, 8
241 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
242 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
243 1.1.1.2 mrg ;;
244 1.1.1.2 mrg getfsig acc0 = fp0b_2
245 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
246 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
247 1.1.1.2 mrg ;;
248 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
249 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
250 1.1.1.2 mrg mov ar.lc = r2
251 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
252 1.1.1.2 mrg getfsig r8 = fp2a_2
253 1.1.1.2 mrg ;;
254 1.1.1.2 mrg add s0 = pr1_1, acc0
255 1.1.1.2 mrg ;;
256 1.1.1.2 mrg st8 [rp] = s0, 8
257 1.1.1.2 mrg cmp.ltu p8, p9 = s0, pr1_1
258 1.1.1.2 mrg sub r31 = -1, acc1_1
259 1.1.1.2 mrg ;;
260 1.1.1.2 mrg .pred.rel "mutex", p8, p9
261 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
262 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
263 1.1.1.2 mrg (p8) cmp.leu p10, p0 = r31, pr1_2
264 1.1.1.2 mrg (p9) cmp.ltu p10, p0 = r31, pr1_2
265 1.1.1.2 mrg ;;
266 1.1.1.2 mrg st8 [rp] = acc0, 8
267 1.1.1.2 mrg (p10) add r8 = 1, r8
268 1.1.1.2 mrg br.ret.sptk.many b0
269 1.1.1.2 mrg
270 1.1.1.2 mrg L(gt2): ldf8 u_3 = [up], 8
271 1.1.1.2 mrg mov acc1_0 = 0
272 1.1.1.2 mrg mov pr1_0 = 0
273 1.1.1.2 mrg ;;
274 1.1.1.2 mrg mov pr0_1 = 0
275 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
276 1.1.1.2 mrg ldf8 u_0 = [up], 8
277 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
278 1.1.1.2 mrg ;;
279 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
280 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
281 1.1.1.2 mrg ;;
282 1.1.1.2 mrg getfsig acc0 = fp0b_1
283 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
284 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
285 1.1.1.2 mrg ;;
286 1.1.1.2 mrg ldf8 u_1 = [up], 8
287 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
288 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
289 1.1.1.2 mrg ;;
290 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
291 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
292 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
293 1.1.1.2 mrg ;;
294 1.1.1.2 mrg ldf8 u_2 = [up], 8
295 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
296 1.1.1.2 mrg ;;
297 1.1.1.2 mrg .mfi; getfsig acc1_1 = fp2a_1
298 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
299 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
300 1.1.1.2 mrg .mfb; cmp.ne p12, p13 = r0, r0
301 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
302 1.1.1.2 mrg br L(10)
303 1.1 mrg
304 1.1 mrg
305 1.1 mrg ALIGN(32)
306 1.1.1.2 mrg L(b11): mov acc1_3 = 0
307 1.1.1.2 mrg mov pr1_3 = 0
308 1.1.1.2 mrg mov pr0_0 = 0
309 1.1.1.2 mrg ldf8 u_2 = [up], 8
310 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0
311 1.1.1.2 mrg br.cloop.dptk L(gt3)
312 1.1.1.2 mrg ;;
313 1.1.1.2 mrg xma.l fp0b_0 = ux, v0, f0
314 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
315 1.1.1.2 mrg ;;
316 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
317 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
318 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
319 1.1.1.2 mrg ;;
320 1.1.1.2 mrg getfsig acc0 = fp0b_0
321 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
322 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
323 1.1.1.2 mrg ;;
324 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
325 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
326 1.1.1.2 mrg ;;
327 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
328 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
329 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
330 1.1.1.2 mrg ;;
331 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
332 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
333 1.1.1.2 mrg br L(cj3)
334 1.1.1.2 mrg
335 1.1.1.2 mrg L(gt3): xma.l fp0b_0 = ux, v0, f0
336 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
337 1.1.1.2 mrg ldf8 u_3 = [up], 8
338 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
339 1.1.1.2 mrg ;;
340 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
341 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
342 1.1.1.2 mrg ;;
343 1.1.1.2 mrg getfsig acc0 = fp0b_0
344 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
345 1.1.1.2 mrg ldf8 u_0 = [up], 8
346 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
347 1.1.1.2 mrg ;;
348 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
349 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
350 1.1.1.2 mrg ;;
351 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
352 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
353 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
354 1.1.1.2 mrg ;;
355 1.1.1.2 mrg ldf8 u_1 = [up], 8
356 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
357 1.1.1.2 mrg ;;
358 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
359 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
360 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
361 1.1.1.2 mrg br L(11)
362 1.1 mrg
363 1.1 mrg
364 1.1 mrg C *** MAIN LOOP START ***
365 1.1 mrg ALIGN(32)
366 1.1.1.2 mrg L(top): C 00
367 1.1.1.2 mrg .pred.rel "mutex", p8, p9
368 1.1.1.2 mrg .pred.rel "mutex", p12, p13
369 1.1.1.2 mrg ldf8 u_3 = [up], 8
370 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
371 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
372 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
373 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
374 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
375 1.1 mrg ;; C 01
376 1.1.1.2 mrg .pred.rel "mutex", p6, p7
377 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
378 1.1.1.2 mrg st8 [rp] = s0, 8
379 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
380 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
381 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
382 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
383 1.1 mrg ;; C 02
384 1.1.1.2 mrg L(01):
385 1.1.1.2 mrg .pred.rel "mutex", p10, p11
386 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
387 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
388 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
389 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
390 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
391 1.1.1.2 mrg nop 1
392 1.1 mrg ;; C 03
393 1.1.1.2 mrg .pred.rel "mutex", p6, p7
394 1.1.1.2 mrg .pred.rel "mutex", p10, p11
395 1.1.1.2 mrg ldf8 u_0 = [up], 8
396 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
397 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
398 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
399 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
400 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
401 1.1 mrg ;; C 04
402 1.1.1.2 mrg .pred.rel "mutex", p8, p9
403 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
404 1.1.1.2 mrg st8 [rp] = s0, 8
405 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
406 1.1.1.2 mrg (p8) add acc0 = pr0_3, acc1_1, 1
407 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
408 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
409 1.1 mrg ;; C 05
410 1.1.1.2 mrg L(00):
411 1.1.1.2 mrg .pred.rel "mutex", p12, p13
412 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
413 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
414 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
415 1.1.1.2 mrg (p13) add s0 = pr1_2, acc0
416 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
417 1.1.1.2 mrg nop 1
418 1.1 mrg ;; C 06
419 1.1.1.2 mrg .pred.rel "mutex", p8, p9
420 1.1.1.2 mrg .pred.rel "mutex", p12, p13
421 1.1.1.2 mrg ldf8 u_1 = [up], 8
422 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
423 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
424 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_3
425 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
426 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
427 1.1 mrg ;; C 07
428 1.1.1.2 mrg .pred.rel "mutex", p6, p7
429 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
430 1.1.1.2 mrg st8 [rp] = s0, 8
431 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
432 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
433 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
434 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
435 1.1 mrg ;; C 08
436 1.1.1.2 mrg L(11):
437 1.1.1.2 mrg .pred.rel "mutex", p10, p11
438 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
439 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
440 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
441 1.1.1.2 mrg (p11) add s0 = pr1_3, acc0
442 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
443 1.1.1.2 mrg nop 1
444 1.1 mrg ;; C 09
445 1.1.1.2 mrg .pred.rel "mutex", p6, p7
446 1.1.1.2 mrg .pred.rel "mutex", p10, p11
447 1.1.1.2 mrg ldf8 u_2 = [up], 8
448 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
449 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
450 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_0
451 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
452 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
453 1.1 mrg ;; C 10
454 1.1.1.2 mrg .pred.rel "mutex", p8, p9
455 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
456 1.1.1.2 mrg st8 [rp] = s0, 8
457 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
458 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
459 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
460 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
461 1.1 mrg ;; C 11
462 1.1.1.2 mrg L(10):
463 1.1.1.2 mrg .pred.rel "mutex", p12, p13
464 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
465 1.1.1.2 mrg xma.l fp1b_3 = u_3, v1, fp1a_3
466 1.1.1.2 mrg (p12) add s0 = pr1_0, acc0, 1
467 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
468 1.1.1.2 mrg xma.hu fp2a_3 = u_3, v1, fp1a_3
469 1.1.1.2 mrg br.cloop.dptk L(top)
470 1.1 mrg ;;
471 1.1 mrg C *** MAIN LOOP END ***
472 1.1 mrg
473 1.1.1.2 mrg .pred.rel "mutex", p8, p9
474 1.1.1.2 mrg .pred.rel "mutex", p12, p13
475 1.1.1.2 mrg .mmi; getfsig pr1_2 = fp1b_2
476 1.1.1.2 mrg st8 [rp] = s0, 8
477 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
478 1.1.1.2 mrg .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
479 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
480 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
481 1.1.1.2 mrg ;;
482 1.1.1.2 mrg .pred.rel "mutex", p6, p7
483 1.1.1.2 mrg .mfi; getfsig acc1_2 = fp2a_2
484 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
485 1.1.1.2 mrg nop 1
486 1.1.1.2 mrg .mmf; (p6) add acc0 = pr0_2, acc1_0, 1
487 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
488 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
489 1.1.1.2 mrg ;;
490 1.1.1.2 mrg L(cj5):
491 1.1.1.2 mrg .pred.rel "mutex", p10, p11
492 1.1.1.2 mrg .mfi; getfsig pr0_0 = fp0b_0
493 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
494 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
495 1.1.1.2 mrg .mfi; (p11) add s0 = pr1_1, acc0
496 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
497 1.1.1.2 mrg nop 1
498 1.1.1.2 mrg ;;
499 1.1.1.2 mrg .pred.rel "mutex", p6, p7
500 1.1.1.2 mrg .pred.rel "mutex", p10, p11
501 1.1.1.2 mrg .mmi; getfsig pr1_3 = fp1b_3
502 1.1.1.2 mrg st8 [rp] = s0, 8
503 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
504 1.1.1.2 mrg .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
505 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
506 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
507 1.1.1.2 mrg ;;
508 1.1.1.2 mrg .pred.rel "mutex", p8, p9
509 1.1.1.2 mrg .mfi; getfsig acc1_3 = fp2a_3
510 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
511 1.1.1.2 mrg nop 1
512 1.1.1.2 mrg .mmf; (p8) add acc0 = pr0_3, acc1_1, 1
513 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
514 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
515 1.1.1.2 mrg ;;
516 1.1.1.2 mrg L(cj4):
517 1.1.1.2 mrg .pred.rel "mutex", p12, p13
518 1.1.1.2 mrg .mfi; getfsig pr0_1 = fp0b_1
519 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
520 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
521 1.1.1.2 mrg .mfi; (p13) add s0 = pr1_2, acc0
522 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
523 1.1.1.2 mrg nop 1
524 1.1.1.2 mrg ;;
525 1.1.1.2 mrg .pred.rel "mutex", p8, p9
526 1.1.1.2 mrg .pred.rel "mutex", p12, p13
527 1.1.1.2 mrg .mmi; getfsig pr1_0 = fp1b_0
528 1.1.1.2 mrg st8 [rp] = s0, 8
529 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
530 1.1.1.2 mrg .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
531 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
532 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
533 1.1.1.2 mrg ;;
534 1.1.1.2 mrg .pred.rel "mutex", p6, p7
535 1.1.1.2 mrg .mmi; getfsig acc1_0 = fp2a_0
536 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
537 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
538 1.1.1.2 mrg ;;
539 1.1.1.2 mrg L(cj3):
540 1.1.1.2 mrg .pred.rel "mutex", p10, p11
541 1.1.1.2 mrg .mfi; getfsig pr0_2 = fp0b_2
542 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
543 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
544 1.1.1.2 mrg .mfi; (p11) add s0 = pr1_3, acc0
545 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
546 1.1.1.2 mrg nop 1
547 1.1.1.2 mrg ;;
548 1.1.1.2 mrg .pred.rel "mutex", p6, p7
549 1.1.1.2 mrg .pred.rel "mutex", p10, p11
550 1.1.1.2 mrg .mmi; getfsig pr1_1 = fp1b_1
551 1.1.1.2 mrg st8 [rp] = s0, 8
552 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
553 1.1.1.2 mrg .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
554 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
555 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
556 1.1.1.2 mrg ;;
557 1.1.1.2 mrg .pred.rel "mutex", p8, p9
558 1.1.1.2 mrg .mmi; getfsig acc1_1 = fp2a_1
559 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
560 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
561 1.1.1.2 mrg ;;
562 1.1.1.2 mrg .pred.rel "mutex", p12, p13
563 1.1.1.2 mrg .mmi; (p12) add s0 = pr1_0, acc0, 1
564 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
565 1.1.1.2 mrg nop 1
566 1.1.1.2 mrg ;;
567 1.1.1.2 mrg .pred.rel "mutex", p8, p9
568 1.1.1.2 mrg .pred.rel "mutex", p12, p13
569 1.1.1.2 mrg .mmi; getfsig pr1_2 = fp1b_2
570 1.1.1.2 mrg st8 [rp] = s0, 8
571 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
572 1.1.1.2 mrg .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
573 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
574 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
575 1.1.1.2 mrg ;;
576 1.1.1.2 mrg .pred.rel "mutex", p6, p7
577 1.1.1.2 mrg .mmi; getfsig r8 = fp2a_2
578 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
579 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
580 1.1.1.2 mrg ;;
581 1.1.1.2 mrg .pred.rel "mutex", p10, p11
582 1.1.1.2 mrg .mmi; (p10) add s0 = pr1_1, acc0, 1
583 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
584 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
585 1.1.1.2 mrg ;;
586 1.1.1.2 mrg .pred.rel "mutex", p10, p11
587 1.1.1.2 mrg .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
588 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
589 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
590 1.1.1.2 mrg ;;
591 1.1.1.2 mrg .pred.rel "mutex", p8, p9
592 1.1.1.2 mrg .mmi; st8 [rp] = s0, 8
593 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
594 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
595 1.1.1.2 mrg ;;
596 1.1.1.2 mrg .pred.rel "mutex", p8, p9
597 1.1.1.2 mrg .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
598 1.1.1.2 mrg (p9) cmp.ltu p10, p11 = acc0, pr1_2
599 1.1.1.2 mrg (p12) add acc0 = 1, acc0
600 1.1.1.2 mrg ;;
601 1.1.1.2 mrg .mmi; st8 [rp] = acc0, 8
602 1.1.1.2 mrg (p12) cmpeqor p10, p0 = 0, acc0
603 1.1.1.2 mrg nop 1
604 1.1.1.2 mrg ;;
605 1.1.1.2 mrg .mib; (p10) add r8 = 1, r8
606 1.1.1.2 mrg mov ar.lc = r2
607 1.1.1.2 mrg br.ret.sptk.many b0
608 1.1 mrg EPILOGUE()
609 1.1 mrg ASM_END()
610