mul_2.asm revision 1.1.1.3 1 1.1 mrg dnl IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and store
2 1.1 mrg dnl the result to a (n+1)-limb number.
3 1.1 mrg
4 1.1.1.2 mrg dnl Contributed to the GNU project by Torbjorn Granlund.
5 1.1.1.2 mrg
6 1.1.1.2 mrg dnl Copyright 2004, 2011 Free Software Foundation, Inc.
7 1.1 mrg
8 1.1 mrg dnl This file is part of the GNU MP Library.
9 1.1.1.3 mrg dnl
10 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 1.1.1.3 mrg dnl it under the terms of either:
12 1.1.1.3 mrg dnl
13 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free
14 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your
15 1.1.1.3 mrg dnl option) any later version.
16 1.1.1.3 mrg dnl
17 1.1.1.3 mrg dnl or
18 1.1.1.3 mrg dnl
19 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software
20 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any
21 1.1.1.3 mrg dnl later version.
22 1.1.1.3 mrg dnl
23 1.1.1.3 mrg dnl or both in parallel, as here.
24 1.1.1.3 mrg dnl
25 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 1.1.1.3 mrg dnl for more details.
29 1.1.1.3 mrg dnl
30 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the
31 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/.
33 1.1 mrg
34 1.1 mrg include(`../config.m4')
35 1.1 mrg
36 1.1 mrg C cycles/limb
37 1.1.1.2 mrg C Itanium: ?
38 1.1.1.2 mrg C Itanium 2: 1.5
39 1.1 mrg
40 1.1 mrg C TODO
41 1.1 mrg C * Clean up variable names, and try to decrease the number of distinct
42 1.1 mrg C registers used.
43 1.1.1.2 mrg C * Clean up feed-in code to not require zeroing several registers.
44 1.1 mrg C * Make sure we don't depend on uninitialized predicate registers.
45 1.1 mrg C * Could perhaps save a few cycles by using 1 c/l carry propagation in
46 1.1 mrg C wind-down code.
47 1.1 mrg C * Ultimately rewrite. The problem with this code is that it first uses a
48 1.1 mrg C loaded u value in one xma pair, then leaves it live over several unrelated
49 1.1 mrg C xma pairs, before it uses it again. It should actually be quite possible
50 1.1 mrg C to just swap some aligned xma pairs around. But we should then schedule
51 1.1 mrg C u loads further from the first use.
52 1.1 mrg
53 1.1 mrg C INPUT PARAMETERS
54 1.1 mrg define(`rp',`r32')
55 1.1 mrg define(`up',`r33')
56 1.1 mrg define(`n',`r34')
57 1.1 mrg define(`vp',`r35')
58 1.1 mrg
59 1.1 mrg define(`srp',`r3')
60 1.1 mrg
61 1.1 mrg define(`v0',`f6')
62 1.1 mrg define(`v1',`f7')
63 1.1 mrg
64 1.1 mrg define(`s0',`r14')
65 1.1 mrg define(`acc0',`r15')
66 1.1 mrg
67 1.1 mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
68 1.1 mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
69 1.1 mrg
70 1.1 mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
71 1.1 mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
72 1.1 mrg
73 1.1 mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
74 1.1 mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
75 1.1 mrg
76 1.1 mrg dnl define(`',`r28')
77 1.1 mrg dnl define(`',`r29')
78 1.1 mrg dnl define(`',`r30')
79 1.1 mrg dnl define(`',`r31')
80 1.1 mrg
81 1.1 mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
82 1.1 mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
83 1.1 mrg
84 1.1 mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
85 1.1 mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
86 1.1 mrg
87 1.1 mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
88 1.1 mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
89 1.1 mrg
90 1.1 mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
91 1.1 mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
92 1.1 mrg
93 1.1 mrg define(`u_0',`f44') define(`u_1',`f45')
94 1.1 mrg define(`u_2',`f46') define(`u_3',`f47')
95 1.1 mrg
96 1.1 mrg define(`ux',`f49')
97 1.1 mrg define(`uy',`f51')
98 1.1 mrg
99 1.1 mrg ASM_START()
100 1.1 mrg PROLOGUE(mpn_mul_2)
101 1.1 mrg .prologue
102 1.1 mrg .save ar.lc, r2
103 1.1 mrg .body
104 1.1 mrg
105 1.1.1.2 mrg ifdef(`HAVE_ABI_32',`
106 1.1.1.3 mrg {.mmi; addp4 rp = 0, rp C M I
107 1.1.1.2 mrg addp4 up = 0, up C M I
108 1.1.1.2 mrg addp4 vp = 0, vp C M I
109 1.1.1.3 mrg }{.mmi; nop 1
110 1.1.1.2 mrg nop 1
111 1.1.1.2 mrg zxt4 n = n C I
112 1.1.1.3 mrg ;;
113 1.1.1.3 mrg }')
114 1.1 mrg
115 1.1.1.3 mrg {.mmi; ldf8 ux = [up], 8 C M
116 1.1.1.2 mrg ldf8 v0 = [vp], 8 C M
117 1.1.1.2 mrg mov r2 = ar.lc C I0
118 1.1.1.3 mrg }{.mmi; nop 1 C M
119 1.1.1.2 mrg and r14 = 3, n C M I
120 1.1.1.2 mrg add n = -2, n C M I
121 1.1.1.2 mrg ;;
122 1.1.1.3 mrg }{.mmi; ldf8 uy = [up], 8 C M
123 1.1.1.2 mrg ldf8 v1 = [vp] C M
124 1.1.1.3 mrg shr.u n = n, 2 C I0
125 1.1.1.3 mrg }{.mmi; nop 1 C M
126 1.1.1.2 mrg cmp.eq p10, p0 = 1, r14 C M I
127 1.1.1.2 mrg cmp.eq p11, p0 = 2, r14 C M I
128 1.1.1.2 mrg ;;
129 1.1.1.3 mrg }{.mmi; nop 1 C M
130 1.1.1.2 mrg cmp.eq p12, p0 = 3, r14 C M I
131 1.1.1.2 mrg mov ar.lc = n C I0
132 1.1.1.3 mrg }{.bbb; (p10) br.dptk L(b01) C B
133 1.1.1.2 mrg (p11) br.dptk L(b10) C B
134 1.1.1.2 mrg (p12) br.dptk L(b11) C B
135 1.1 mrg ;;
136 1.1.1.3 mrg }
137 1.1 mrg ALIGN(32)
138 1.1.1.2 mrg L(b00): ldf8 u_1 = [up], 8
139 1.1.1.2 mrg mov acc1_2 = 0
140 1.1.1.2 mrg mov pr1_2 = 0
141 1.1.1.2 mrg mov pr0_3 = 0
142 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
143 1.1.1.2 mrg ;;
144 1.1.1.2 mrg xma.l fp0b_3 = ux, v0, f0
145 1.1.1.2 mrg cmp.ne p12, p13 = r0, r0
146 1.1.1.2 mrg ldf8 u_2 = [up], 8
147 1.1.1.2 mrg xma.hu fp1a_3 = ux, v0, f0
148 1.1.1.2 mrg br.cloop.dptk L(gt4)
149 1.1.1.2 mrg
150 1.1.1.2 mrg xma.l fp0b_0 = uy, v0, f0
151 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
152 1.1.1.2 mrg ;;
153 1.1.1.2 mrg getfsig acc0 = fp0b_3
154 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
155 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
156 1.1.1.2 mrg ;;
157 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
158 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
159 1.1.1.2 mrg ;;
160 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
161 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
162 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
163 1.1.1.2 mrg ;;
164 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
165 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
166 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
167 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
168 1.1.1.2 mrg br L(cj4)
169 1.1.1.2 mrg
170 1.1.1.2 mrg L(gt4): xma.l fp0b_0 = uy, v0, f0
171 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
172 1.1.1.2 mrg ;;
173 1.1.1.2 mrg getfsig acc0 = fp0b_3
174 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
175 1.1.1.2 mrg ldf8 u_3 = [up], 8
176 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
177 1.1.1.2 mrg ;;
178 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
179 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
180 1.1.1.2 mrg ;;
181 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
182 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
183 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
184 1.1.1.2 mrg ;;
185 1.1.1.2 mrg ldf8 u_0 = [up], 8
186 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
187 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
188 1.1.1.2 mrg ;;
189 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
190 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
191 1.1.1.2 mrg br L(00)
192 1.1 mrg
193 1.1 mrg
194 1.1 mrg ALIGN(32)
195 1.1.1.2 mrg L(b01): ldf8 u_0 = [up], 8 C M
196 1.1.1.2 mrg mov acc1_1 = 0 C M I
197 1.1.1.2 mrg mov pr1_1 = 0 C M I
198 1.1.1.2 mrg mov pr0_2 = 0 C M I
199 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0 C M I
200 1.1.1.2 mrg ;;
201 1.1.1.2 mrg xma.l fp0b_2 = ux, v0, f0 C F
202 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0 C M I
203 1.1.1.2 mrg ldf8 u_1 = [up], 8 C M
204 1.1.1.2 mrg xma.hu fp1a_2 = ux, v0, f0 C F
205 1.1.1.2 mrg ;;
206 1.1.1.2 mrg xma.l fp0b_3 = uy, v0, f0 C F
207 1.1.1.2 mrg xma.hu fp1a_3 = uy, v0, f0 C F
208 1.1.1.2 mrg ;;
209 1.1.1.2 mrg getfsig acc0 = fp0b_2 C M
210 1.1.1.2 mrg xma.l fp1b_2 = ux, v1,fp1a_2 C F
211 1.1.1.2 mrg ldf8 u_2 = [up], 8 C M
212 1.1.1.2 mrg xma.hu fp2a_2 = ux, v1,fp1a_2 C F
213 1.1.1.2 mrg br.cloop.dptk L(gt5)
214 1.1.1.2 mrg
215 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0 C F
216 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0 C F
217 1.1.1.2 mrg ;;
218 1.1.1.2 mrg getfsig pr0_3 = fp0b_3 C M
219 1.1.1.2 mrg xma.l fp1b_3 = uy, v1,fp1a_3 C F
220 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1,fp1a_3 C F
221 1.1.1.2 mrg ;;
222 1.1.1.2 mrg getfsig pr1_2 = fp1b_2 C M
223 1.1.1.2 mrg getfsig acc1_2 = fp2a_2 C M
224 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0 C F
225 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0 C F
226 1.1.1.2 mrg br L(cj5)
227 1.1.1.2 mrg
228 1.1.1.2 mrg L(gt5): xma.l fp0b_0 = u_0, v0, f0
229 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
230 1.1.1.2 mrg ;;
231 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
232 1.1.1.2 mrg xma.l fp1b_3 = uy, v1, fp1a_3
233 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1, fp1a_3
234 1.1.1.2 mrg ;;
235 1.1.1.2 mrg ldf8 u_3 = [up], 8
236 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
237 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
238 1.1.1.2 mrg ;;
239 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
240 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
241 1.1.1.2 mrg br L(01)
242 1.1 mrg
243 1.1 mrg
244 1.1 mrg ALIGN(32)
245 1.1.1.2 mrg L(b10): br.cloop.dptk L(gt2)
246 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
247 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
248 1.1.1.2 mrg ;;
249 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
250 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
251 1.1.1.2 mrg ;;
252 1.1.1.2 mrg stf8 [rp] = fp0b_1, 8
253 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
254 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
255 1.1.1.2 mrg ;;
256 1.1.1.2 mrg getfsig acc0 = fp0b_2
257 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
258 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
259 1.1.1.2 mrg ;;
260 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
261 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
262 1.1.1.2 mrg mov ar.lc = r2
263 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
264 1.1.1.2 mrg getfsig r8 = fp2a_2
265 1.1.1.2 mrg ;;
266 1.1.1.2 mrg add s0 = pr1_1, acc0
267 1.1.1.2 mrg ;;
268 1.1.1.2 mrg st8 [rp] = s0, 8
269 1.1.1.2 mrg cmp.ltu p8, p9 = s0, pr1_1
270 1.1.1.2 mrg sub r31 = -1, acc1_1
271 1.1.1.2 mrg ;;
272 1.1.1.3 mrg .pred.rel "mutex", p8, p9
273 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
274 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
275 1.1.1.2 mrg (p8) cmp.leu p10, p0 = r31, pr1_2
276 1.1.1.2 mrg (p9) cmp.ltu p10, p0 = r31, pr1_2
277 1.1.1.2 mrg ;;
278 1.1.1.2 mrg st8 [rp] = acc0, 8
279 1.1.1.2 mrg (p10) add r8 = 1, r8
280 1.1.1.2 mrg br.ret.sptk.many b0
281 1.1.1.2 mrg
282 1.1.1.2 mrg L(gt2): ldf8 u_3 = [up], 8
283 1.1.1.2 mrg mov acc1_0 = 0
284 1.1.1.2 mrg mov pr1_0 = 0
285 1.1.1.2 mrg ;;
286 1.1.1.2 mrg mov pr0_1 = 0
287 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
288 1.1.1.2 mrg ldf8 u_0 = [up], 8
289 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
290 1.1.1.2 mrg ;;
291 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
292 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
293 1.1.1.2 mrg ;;
294 1.1.1.2 mrg getfsig acc0 = fp0b_1
295 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
296 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
297 1.1.1.2 mrg ;;
298 1.1.1.2 mrg ldf8 u_1 = [up], 8
299 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
300 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
301 1.1.1.2 mrg ;;
302 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
303 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
304 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
305 1.1.1.2 mrg ;;
306 1.1.1.2 mrg ldf8 u_2 = [up], 8
307 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
308 1.1.1.2 mrg ;;
309 1.1.1.3 mrg {.mfi; getfsig acc1_1 = fp2a_1
310 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
311 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
312 1.1.1.3 mrg }{.mfb; cmp.ne p12, p13 = r0, r0
313 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
314 1.1.1.2 mrg br L(10)
315 1.1.1.3 mrg }
316 1.1 mrg
317 1.1 mrg ALIGN(32)
318 1.1.1.2 mrg L(b11): mov acc1_3 = 0
319 1.1.1.2 mrg mov pr1_3 = 0
320 1.1.1.2 mrg mov pr0_0 = 0
321 1.1.1.2 mrg ldf8 u_2 = [up], 8
322 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0
323 1.1.1.2 mrg br.cloop.dptk L(gt3)
324 1.1.1.2 mrg ;;
325 1.1.1.2 mrg xma.l fp0b_0 = ux, v0, f0
326 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
327 1.1.1.2 mrg ;;
328 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
329 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
330 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
331 1.1.1.2 mrg ;;
332 1.1.1.2 mrg getfsig acc0 = fp0b_0
333 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
334 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
335 1.1.1.2 mrg ;;
336 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
337 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
338 1.1.1.2 mrg ;;
339 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
340 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
341 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
342 1.1.1.2 mrg ;;
343 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
344 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
345 1.1.1.2 mrg br L(cj3)
346 1.1.1.2 mrg
347 1.1.1.2 mrg L(gt3): xma.l fp0b_0 = ux, v0, f0
348 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
349 1.1.1.2 mrg ldf8 u_3 = [up], 8
350 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
351 1.1.1.2 mrg ;;
352 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
353 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
354 1.1.1.2 mrg ;;
355 1.1.1.2 mrg getfsig acc0 = fp0b_0
356 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
357 1.1.1.2 mrg ldf8 u_0 = [up], 8
358 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
359 1.1.1.2 mrg ;;
360 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
361 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
362 1.1.1.2 mrg ;;
363 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
364 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
365 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
366 1.1.1.2 mrg ;;
367 1.1.1.2 mrg ldf8 u_1 = [up], 8
368 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
369 1.1.1.2 mrg ;;
370 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
371 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
372 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
373 1.1.1.2 mrg br L(11)
374 1.1 mrg
375 1.1 mrg
376 1.1 mrg C *** MAIN LOOP START ***
377 1.1 mrg ALIGN(32)
378 1.1.1.2 mrg L(top): C 00
379 1.1.1.3 mrg .pred.rel "mutex", p8, p9
380 1.1.1.3 mrg .pred.rel "mutex", p12, p13
381 1.1.1.2 mrg ldf8 u_3 = [up], 8
382 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
383 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
384 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
385 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
386 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
387 1.1 mrg ;; C 01
388 1.1.1.3 mrg .pred.rel "mutex", p6, p7
389 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
390 1.1.1.2 mrg st8 [rp] = s0, 8
391 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
392 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
393 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
394 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
395 1.1 mrg ;; C 02
396 1.1.1.2 mrg L(01):
397 1.1.1.3 mrg .pred.rel "mutex", p10, p11
398 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
399 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
400 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
401 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
402 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
403 1.1.1.2 mrg nop 1
404 1.1 mrg ;; C 03
405 1.1.1.3 mrg .pred.rel "mutex", p6, p7
406 1.1.1.3 mrg .pred.rel "mutex", p10, p11
407 1.1.1.2 mrg ldf8 u_0 = [up], 8
408 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
409 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
410 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
411 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
412 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
413 1.1 mrg ;; C 04
414 1.1.1.3 mrg .pred.rel "mutex", p8, p9
415 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
416 1.1.1.2 mrg st8 [rp] = s0, 8
417 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
418 1.1.1.2 mrg (p8) add acc0 = pr0_3, acc1_1, 1
419 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
420 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
421 1.1 mrg ;; C 05
422 1.1.1.2 mrg L(00):
423 1.1.1.3 mrg .pred.rel "mutex", p12, p13
424 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
425 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
426 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
427 1.1.1.2 mrg (p13) add s0 = pr1_2, acc0
428 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
429 1.1.1.2 mrg nop 1
430 1.1 mrg ;; C 06
431 1.1.1.3 mrg .pred.rel "mutex", p8, p9
432 1.1.1.3 mrg .pred.rel "mutex", p12, p13
433 1.1.1.2 mrg ldf8 u_1 = [up], 8
434 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
435 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
436 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_3
437 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
438 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
439 1.1 mrg ;; C 07
440 1.1.1.3 mrg .pred.rel "mutex", p6, p7
441 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
442 1.1.1.2 mrg st8 [rp] = s0, 8
443 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
444 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
445 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
446 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
447 1.1 mrg ;; C 08
448 1.1.1.2 mrg L(11):
449 1.1.1.3 mrg .pred.rel "mutex", p10, p11
450 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
451 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
452 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
453 1.1.1.2 mrg (p11) add s0 = pr1_3, acc0
454 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
455 1.1.1.2 mrg nop 1
456 1.1 mrg ;; C 09
457 1.1.1.3 mrg .pred.rel "mutex", p6, p7
458 1.1.1.3 mrg .pred.rel "mutex", p10, p11
459 1.1.1.2 mrg ldf8 u_2 = [up], 8
460 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
461 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
462 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_0
463 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
464 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
465 1.1 mrg ;; C 10
466 1.1.1.3 mrg .pred.rel "mutex", p8, p9
467 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
468 1.1.1.2 mrg st8 [rp] = s0, 8
469 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
470 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
471 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
472 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
473 1.1 mrg ;; C 11
474 1.1.1.2 mrg L(10):
475 1.1.1.3 mrg .pred.rel "mutex", p12, p13
476 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
477 1.1.1.2 mrg xma.l fp1b_3 = u_3, v1, fp1a_3
478 1.1.1.2 mrg (p12) add s0 = pr1_0, acc0, 1
479 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
480 1.1.1.2 mrg xma.hu fp2a_3 = u_3, v1, fp1a_3
481 1.1.1.2 mrg br.cloop.dptk L(top)
482 1.1 mrg ;;
483 1.1 mrg C *** MAIN LOOP END ***
484 1.1 mrg
485 1.1.1.3 mrg .pred.rel "mutex", p8, p9
486 1.1.1.3 mrg .pred.rel "mutex", p12, p13
487 1.1.1.3 mrg {.mmi; getfsig pr1_2 = fp1b_2
488 1.1.1.2 mrg st8 [rp] = s0, 8
489 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
490 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
491 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
492 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
493 1.1.1.2 mrg ;;
494 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
495 1.1.1.3 mrg {.mfi; getfsig acc1_2 = fp2a_2
496 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
497 1.1.1.2 mrg nop 1
498 1.1.1.3 mrg }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
499 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
500 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
501 1.1.1.2 mrg ;;
502 1.1.1.3 mrg }
503 1.1.1.2 mrg L(cj5):
504 1.1.1.3 mrg .pred.rel "mutex", p10, p11
505 1.1.1.3 mrg {.mfi; getfsig pr0_0 = fp0b_0
506 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
507 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
508 1.1.1.3 mrg }{.mfi; (p11) add s0 = pr1_1, acc0
509 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
510 1.1.1.2 mrg nop 1
511 1.1.1.2 mrg ;;
512 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
513 1.1.1.3 mrg .pred.rel "mutex", p10, p11
514 1.1.1.3 mrg {.mmi; getfsig pr1_3 = fp1b_3
515 1.1.1.2 mrg st8 [rp] = s0, 8
516 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
517 1.1.1.3 mrg }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
518 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
519 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
520 1.1.1.2 mrg ;;
521 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
522 1.1.1.3 mrg {.mfi; getfsig acc1_3 = fp2a_3
523 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
524 1.1.1.2 mrg nop 1
525 1.1.1.3 mrg }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
526 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
527 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
528 1.1.1.2 mrg ;;
529 1.1.1.3 mrg }
530 1.1.1.2 mrg L(cj4):
531 1.1.1.3 mrg .pred.rel "mutex", p12, p13
532 1.1.1.3 mrg {.mfi; getfsig pr0_1 = fp0b_1
533 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
534 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
535 1.1.1.3 mrg }{.mfi; (p13) add s0 = pr1_2, acc0
536 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
537 1.1.1.2 mrg nop 1
538 1.1.1.2 mrg ;;
539 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
540 1.1.1.3 mrg .pred.rel "mutex", p12, p13
541 1.1.1.3 mrg {.mmi; getfsig pr1_0 = fp1b_0
542 1.1.1.2 mrg st8 [rp] = s0, 8
543 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
544 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
545 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
546 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
547 1.1.1.2 mrg ;;
548 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
549 1.1.1.3 mrg {.mmi; getfsig acc1_0 = fp2a_0
550 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
551 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
552 1.1.1.2 mrg ;;
553 1.1.1.3 mrg }
554 1.1.1.2 mrg L(cj3):
555 1.1.1.3 mrg .pred.rel "mutex", p10, p11
556 1.1.1.3 mrg {.mfi; getfsig pr0_2 = fp0b_2
557 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
558 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
559 1.1.1.3 mrg }{.mfi; (p11) add s0 = pr1_3, acc0
560 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
561 1.1.1.2 mrg nop 1
562 1.1.1.2 mrg ;;
563 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
564 1.1.1.3 mrg .pred.rel "mutex", p10, p11
565 1.1.1.3 mrg {.mmi; getfsig pr1_1 = fp1b_1
566 1.1.1.2 mrg st8 [rp] = s0, 8
567 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
568 1.1.1.3 mrg }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
569 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
570 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
571 1.1.1.2 mrg ;;
572 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
573 1.1.1.3 mrg {.mmi; getfsig acc1_1 = fp2a_1
574 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
575 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
576 1.1.1.2 mrg ;;
577 1.1.1.3 mrg } .pred.rel "mutex", p12, p13
578 1.1.1.3 mrg {.mmi; (p12) add s0 = pr1_0, acc0, 1
579 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
580 1.1.1.2 mrg nop 1
581 1.1.1.2 mrg ;;
582 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
583 1.1.1.3 mrg .pred.rel "mutex", p12, p13
584 1.1.1.3 mrg {.mmi; getfsig pr1_2 = fp1b_2
585 1.1.1.2 mrg st8 [rp] = s0, 8
586 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
587 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
588 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
589 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
590 1.1.1.2 mrg ;;
591 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
592 1.1.1.3 mrg {.mmi; getfsig r8 = fp2a_2
593 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
594 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
595 1.1.1.2 mrg ;;
596 1.1.1.3 mrg } .pred.rel "mutex", p10, p11
597 1.1.1.3 mrg {.mmi; (p10) add s0 = pr1_1, acc0, 1
598 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
599 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
600 1.1.1.2 mrg ;;
601 1.1.1.3 mrg } .pred.rel "mutex", p10, p11
602 1.1.1.3 mrg {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
603 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
604 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
605 1.1.1.2 mrg ;;
606 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
607 1.1.1.3 mrg {.mmi; st8 [rp] = s0, 8
608 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
609 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
610 1.1.1.2 mrg ;;
611 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
612 1.1.1.3 mrg {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
613 1.1.1.2 mrg (p9) cmp.ltu p10, p11 = acc0, pr1_2
614 1.1.1.2 mrg (p12) add acc0 = 1, acc0
615 1.1.1.2 mrg ;;
616 1.1.1.3 mrg }{.mmi; st8 [rp] = acc0, 8
617 1.1.1.2 mrg (p12) cmpeqor p10, p0 = 0, acc0
618 1.1.1.2 mrg nop 1
619 1.1.1.2 mrg ;;
620 1.1.1.3 mrg }{.mib; (p10) add r8 = 1, r8
621 1.1.1.2 mrg mov ar.lc = r2
622 1.1.1.2 mrg br.ret.sptk.many b0
623 1.1.1.3 mrg }
624 1.1 mrg EPILOGUE()
625 1.1 mrg ASM_END()
626