addmul_1.asm revision 1.1 1 1.1 mrg dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 1.1 mrg dnl result to a second limb vector.
3 1.1 mrg
4 1.1 mrg dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
5 1.1 mrg dnl Foundation, Inc.
6 1.1 mrg
7 1.1 mrg dnl This file is part of the GNU MP Library.
8 1.1 mrg
9 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
11 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
12 1.1 mrg dnl your option) any later version.
13 1.1 mrg
14 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 1.1 mrg dnl License for more details.
18 1.1 mrg
19 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
20 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 1.1 mrg
22 1.1 mrg include(`../config.m4')
23 1.1 mrg
24 1.1 mrg C cycles/limb
25 1.1 mrg C Itanium: 3.0
26 1.1 mrg C Itanium 2: 2.0
27 1.1 mrg
28 1.1 mrg C TODO
29 1.1 mrg C * Further optimize feed-in and wind-down code, both for speed and code size.
30 1.1 mrg C * Handle low limb input and results specially, using a common stf8 in the
31 1.1 mrg C epilogue.
32 1.1 mrg C * Use 1 c/l carry propagation scheme in wind-down code.
33 1.1 mrg C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
34 1.1 mrg C * Work out final differences with mul_1.asm. That function is 300 bytes
35 1.1 mrg C smaller than this due to better loop scheduling and thus simpler feed-in
36 1.1 mrg C code.
37 1.1 mrg
38 1.1 mrg C INPUT PARAMETERS
C rp: limb vector that is read as the addend and overwritten with the result.
39 1.1 mrg define(`rp', `r32')
C up: limb vector that is multiplied by vl.
40 1.1 mrg define(`up', `r33')
C n: limb count; n mod 4 selects the feed-in path, (n-1)/4 seeds ar.lc.
41 1.1 mrg define(`n', `r34')
C vl: the single multiplier limb, moved to f6 with setf.sig.
42 1.1 mrg define(`vl', `r35')
43 1.1 mrg
44 1.1 mrg ASM_START()
C ---------------------------------------------------------------------------
C mpn_addmul_1 (rp, up, n, vl)
C
C For each of the n limbs, forms up[i] * vl + rp[i] as a 128-bit value with
C xma.l (low 64 bits) and xma.hu (high 64 bits).  The low half of limb i is
C added to the high half of limb i-1 plus the pending carry, and the sum is
C stored through r20, which is a copy of the incoming rp.  The high half of
C the last limb, plus the last carry, is left in r8 before the return.
C
C Register usage as seen in the code below:
C   f6            multiplier (vl)
C   f7, f8        first limb of up and of rp
C   f32-f35       later up limbs;  f44-f47  later rp limbs
C   f36-f39       low halves (xma.l);  f40-f43  high halves (xma.hu)
C   r24-r27       low halves moved to integer registers (from f36-f39)
C   r28-r31       high halves moved to integer registers (from f40-f43)
C   r14, r16      alternating sum registers that get stored
C   p6/p7, p8/p9  carry / no-carry predicate pairs, used alternately
C   r2            saved ar.lc, restored before each br.ret
C
C The code is unrolled four ways.  Four feed-in paths (.Lb01, .Lb10, .Lb11,
C .Lb00) are selected by n mod 4; each either finishes a short operand via
C one of the .LcjK wind-down entry points or jumps into the main loop.
C ---------------------------------------------------------------------------
45 1.1 mrg PROLOGUE(mpn_addmul_1)
46 1.1 mrg .prologue
47 1.1 mrg .save ar.lc, r2
48 1.1 mrg .body
49 1.1 mrg
C With HAVE_ABI_32 defined, the two pointers are passed through addp4 and n
C is zero-extended from 32 bits before use.
50 1.1 mrg ifdef(`HAVE_ABI_32',
51 1.1 mrg ` addp4 rp = 0, rp C M I
52 1.1 mrg addp4 up = 0, up C M I
53 1.1 mrg zxt4 n = n C I
54 1.1 mrg ;;
55 1.1 mrg ')
C r15 = n - 1, r20 = copy of rp used for all stores, r2 = saved ar.lc.
56 1.1 mrg {.mmi
57 1.1 mrg adds r15 = -1, n C M I
58 1.1 mrg mov r20 = rp C M I
59 1.1 mrg mov.i r2 = ar.lc C I0
60 1.1 mrg }
C Load the first limb pair (post-incrementing both pointers); r14 = n mod 4.
61 1.1 mrg {.mmi
62 1.1 mrg ldf8 f7 = [up], 8 C M
63 1.1 mrg ldf8 f8 = [rp], 8 C M
64 1.1 mrg and r14 = 3, n C M I
65 1.1 mrg ;;
66 1.1 mrg }
C f6 = multiplier.  r31 = (n - 1) >> 2 becomes the loop count.
C p10, p11, p12 flag n mod 4 equal to 0, 2, 3 respectively.
67 1.1 mrg {.mmi
68 1.1 mrg setf.sig f6 = vl C M2 M3
69 1.1 mrg cmp.eq p10, p0 = 0, r14 C M I
70 1.1 mrg shr.u r31 = r15, 2 C I0
71 1.1 mrg }
72 1.1 mrg {.mmi
73 1.1 mrg cmp.eq p11, p0 = 2, r14 C M I
74 1.1 mrg cmp.eq p12, p0 = 3, r14 C M I
75 1.1 mrg nop.i 0 C I
76 1.1 mrg ;;
77 1.1 mrg }
C Comparing r0 with itself for inequality is always false, so p6 and p8 are
C cleared and p7 and p9 are set: no carry is pending on entry.
78 1.1 mrg {.mii
79 1.1 mrg cmp.ne p6, p7 = r0, r0 C M I
80 1.1 mrg mov.i ar.lc = r31 C I0
81 1.1 mrg cmp.ne p8, p9 = r0, r0 C M I
82 1.1 mrg }
C Dispatch on n mod 4; the remaining case (1) falls through to .Lb01.
83 1.1 mrg {.bbb
84 1.1 mrg (p10) br.dptk .Lb00 C B
85 1.1 mrg (p11) br.dptk .Lb10 C B
86 1.1 mrg (p12) br.dptk .Lb11 C B
87 1.1 mrg ;;
88 1.1 mrg }
89 1.1 mrg
C --- Feed-in for n mod 4 = 1 ---
C When ar.lc is 0 (n = 1) the br.cloop falls through: one product, store the
C low half, leave the high half in r8, restore ar.lc and return.
90 1.1 mrg .Lb01: br.cloop.dptk .grt1 C B
91 1.1 mrg
92 1.1 mrg xma.l f39 = f7, f6, f8 C F
93 1.1 mrg xma.hu f43 = f7, f6, f8 C F
94 1.1 mrg ;;
95 1.1 mrg getf.sig r8 = f43 C M2
96 1.1 mrg stf8 [r20] = f39 C M2 M3
97 1.1 mrg mov.i ar.lc = r2 C I0
98 1.1 mrg br.ret.sptk.many b0 C B
99 1.1 mrg
C n >= 5: load four more limb pairs while the first product is formed.
100 1.1 mrg .grt1:
101 1.1 mrg ldf8 f32 = [up], 8
102 1.1 mrg ldf8 f44 = [rp], 8
103 1.1 mrg ;;
104 1.1 mrg ldf8 f33 = [up], 8
105 1.1 mrg ldf8 f45 = [rp], 8
106 1.1 mrg ;;
107 1.1 mrg ldf8 f34 = [up], 8
108 1.1 mrg xma.l f39 = f7, f6, f8
109 1.1 mrg ldf8 f46 = [rp], 8
110 1.1 mrg xma.hu f43 = f7, f6, f8
111 1.1 mrg ;;
112 1.1 mrg ldf8 f35 = [up], 8
113 1.1 mrg ldf8 f47 = [rp], 8
114 1.1 mrg br.cloop.dptk .grt5
115 1.1 mrg
C n = 5: the first low half is stored straight from f39; the other four
C products are formed here and the additions are done from .Lcj5 onwards.
116 1.1 mrg xma.l f36 = f32, f6, f44
117 1.1 mrg xma.hu f40 = f32, f6, f44
118 1.1 mrg ;;
119 1.1 mrg stf8 [r20] = f39, 8
120 1.1 mrg xma.l f37 = f33, f6, f45
121 1.1 mrg xma.hu f41 = f33, f6, f45
122 1.1 mrg ;;
123 1.1 mrg getf.sig r31 = f43
124 1.1 mrg getf.sig r24 = f36
125 1.1 mrg xma.l f38 = f34, f6, f46
126 1.1 mrg xma.hu f42 = f34, f6, f46
127 1.1 mrg ;;
128 1.1 mrg getf.sig r28 = f40
129 1.1 mrg getf.sig r25 = f37
130 1.1 mrg xma.l f39 = f35, f6, f47
131 1.1 mrg xma.hu f43 = f35, f6, f47
132 1.1 mrg ;;
133 1.1 mrg getf.sig r29 = f41
134 1.1 mrg getf.sig r26 = f38
135 1.1 mrg br .Lcj5
136 1.1 mrg
C n >= 9: r30 = 0 stands in for a previous high half, so the first stage at
C .Loop (or .Le0) stores the first low half (r27) unchanged via r14.
137 1.1 mrg .grt5:
138 1.1 mrg mov r30 = 0
139 1.1 mrg xma.l f36 = f32, f6, f44
140 1.1 mrg xma.hu f40 = f32, f6, f44
141 1.1 mrg ;;
142 1.1 mrg ldf8 f32 = [up], 8
143 1.1 mrg xma.l f37 = f33, f6, f45
144 1.1 mrg ldf8 f44 = [rp], 8
145 1.1 mrg xma.hu f41 = f33, f6, f45
146 1.1 mrg ;;
147 1.1 mrg ldf8 f33 = [up], 8
148 1.1 mrg getf.sig r27 = f39
149 1.1 mrg ;;
150 1.1 mrg getf.sig r31 = f43
151 1.1 mrg xma.l f38 = f34, f6, f46
152 1.1 mrg ldf8 f45 = [rp], 8
153 1.1 mrg xma.hu f42 = f34, f6, f46
154 1.1 mrg ;;
155 1.1 mrg ldf8 f34 = [up], 8
156 1.1 mrg getf.sig r24 = f36
157 1.1 mrg ;;
158 1.1 mrg getf.sig r28 = f40
159 1.1 mrg xma.l f39 = f35, f6, f47
160 1.1 mrg ldf8 f46 = [rp], 8
161 1.1 mrg xma.hu f43 = f35, f6, f47
162 1.1 mrg ;;
163 1.1 mrg ldf8 f35 = [up], 8
164 1.1 mrg getf.sig r25 = f37
C Enter the main loop if ar.lc is still non-zero, otherwise wind down.
165 1.1 mrg br.cloop.dptk .Loop
166 1.1 mrg br .Le0
167 1.1 mrg
168 1.1 mrg
C --- Feed-in for n mod 4 = 2 ---
C The second limb pair goes to f35/f47.  With ar.lc = 0 (n = 2) the br.cloop
C falls through: two products, first low half stored from f38, then .Lcj2.
169 1.1 mrg .Lb10: ldf8 f35 = [up], 8
170 1.1 mrg ldf8 f47 = [rp], 8
171 1.1 mrg br.cloop.dptk .grt2
172 1.1 mrg
173 1.1 mrg xma.l f38 = f7, f6, f8
174 1.1 mrg xma.hu f42 = f7, f6, f8
175 1.1 mrg ;;
176 1.1 mrg xma.l f39 = f35, f6, f47
177 1.1 mrg xma.hu f43 = f35, f6, f47
178 1.1 mrg ;;
179 1.1 mrg getf.sig r30 = f42
180 1.1 mrg stf8 [r20] = f38, 8
181 1.1 mrg getf.sig r27 = f39
182 1.1 mrg getf.sig r8 = f43
183 1.1 mrg br .Lcj2
184 1.1 mrg
C n >= 6: load four more limb pairs while the first two products are formed.
185 1.1 mrg .grt2:
186 1.1 mrg ldf8 f32 = [up], 8
187 1.1 mrg ldf8 f44 = [rp], 8
188 1.1 mrg ;;
189 1.1 mrg ldf8 f33 = [up], 8
190 1.1 mrg xma.l f38 = f7, f6, f8
191 1.1 mrg ldf8 f45 = [rp], 8
192 1.1 mrg xma.hu f42 = f7, f6, f8
193 1.1 mrg ;;
194 1.1 mrg ldf8 f34 = [up], 8
195 1.1 mrg xma.l f39 = f35, f6, f47
196 1.1 mrg ldf8 f46 = [rp], 8
197 1.1 mrg xma.hu f43 = f35, f6, f47
198 1.1 mrg ;;
199 1.1 mrg ldf8 f35 = [up], 8
200 1.1 mrg ldf8 f47 = [rp], 8
201 1.1 mrg br.cloop.dptk .grt6
202 1.1 mrg
C n = 6: first low half stored from f38; remaining sums are done from .Lcj6.
203 1.1 mrg stf8 [r20] = f38, 8
204 1.1 mrg xma.l f36 = f32, f6, f44
205 1.1 mrg xma.hu f40 = f32, f6, f44
206 1.1 mrg ;;
207 1.1 mrg getf.sig r30 = f42
208 1.1 mrg getf.sig r27 = f39
209 1.1 mrg xma.l f37 = f33, f6, f45
210 1.1 mrg xma.hu f41 = f33, f6, f45
211 1.1 mrg ;;
212 1.1 mrg getf.sig r31 = f43
213 1.1 mrg getf.sig r24 = f36
214 1.1 mrg xma.l f38 = f34, f6, f46
215 1.1 mrg xma.hu f42 = f34, f6, f46
216 1.1 mrg ;;
217 1.1 mrg getf.sig r28 = f40
218 1.1 mrg getf.sig r25 = f37
219 1.1 mrg xma.l f39 = f35, f6, f47
220 1.1 mrg xma.hu f43 = f35, f6, f47
221 1.1 mrg br .Lcj6
222 1.1 mrg
C n >= 10: r29 = 0 stands in for a previous high half, so the stage at .LL10
C stores the first low half (r26) unchanged via r16.
223 1.1 mrg .grt6:
224 1.1 mrg mov r29 = 0
225 1.1 mrg xma.l f36 = f32, f6, f44
226 1.1 mrg xma.hu f40 = f32, f6, f44
227 1.1 mrg ;;
228 1.1 mrg ldf8 f32 = [up], 8
229 1.1 mrg getf.sig r26 = f38
230 1.1 mrg ;;
231 1.1 mrg getf.sig r30 = f42
232 1.1 mrg xma.l f37 = f33, f6, f45
233 1.1 mrg ldf8 f44 = [rp], 8
234 1.1 mrg xma.hu f41 = f33, f6, f45
235 1.1 mrg ;;
236 1.1 mrg ldf8 f33 = [up], 8
237 1.1 mrg getf.sig r27 = f39
238 1.1 mrg ;;
239 1.1 mrg getf.sig r31 = f43
240 1.1 mrg xma.l f38 = f34, f6, f46
241 1.1 mrg ldf8 f45 = [rp], 8
242 1.1 mrg xma.hu f42 = f34, f6, f46
243 1.1 mrg ;;
244 1.1 mrg ldf8 f34 = [up], 8
245 1.1 mrg getf.sig r24 = f36
246 1.1 mrg br .LL10
247 1.1 mrg
248 1.1 mrg
C --- Feed-in for n mod 4 = 3 ---
C Limb pairs two and three go to f34/f46 and f35/f47.  With ar.lc = 0 (n = 3)
C the br.cloop falls through: three products, first low half stored from
C f37, then .Lcj3.
249 1.1 mrg .Lb11: ldf8 f34 = [up], 8
250 1.1 mrg ldf8 f46 = [rp], 8
251 1.1 mrg ;;
252 1.1 mrg ldf8 f35 = [up], 8
253 1.1 mrg ldf8 f47 = [rp], 8
254 1.1 mrg br.cloop.dptk .grt3
255 1.1 mrg ;;
256 1.1 mrg
257 1.1 mrg xma.l f37 = f7, f6, f8
258 1.1 mrg xma.hu f41 = f7, f6, f8
259 1.1 mrg xma.l f38 = f34, f6, f46
260 1.1 mrg xma.hu f42 = f34, f6, f46
261 1.1 mrg xma.l f39 = f35, f6, f47
262 1.1 mrg xma.hu f43 = f35, f6, f47
263 1.1 mrg ;;
264 1.1 mrg getf.sig r29 = f41
265 1.1 mrg stf8 [r20] = f37, 8
266 1.1 mrg getf.sig r26 = f38
267 1.1 mrg getf.sig r30 = f42
268 1.1 mrg getf.sig r27 = f39
269 1.1 mrg getf.sig r8 = f43
270 1.1 mrg br .Lcj3
271 1.1 mrg
C n >= 7: load four more limb pairs while the first three products are formed.
272 1.1 mrg .grt3:
273 1.1 mrg ldf8 f32 = [up], 8
274 1.1 mrg xma.l f37 = f7, f6, f8
275 1.1 mrg ldf8 f44 = [rp], 8
276 1.1 mrg xma.hu f41 = f7, f6, f8
277 1.1 mrg ;;
278 1.1 mrg ldf8 f33 = [up], 8
279 1.1 mrg xma.l f38 = f34, f6, f46
280 1.1 mrg ldf8 f45 = [rp], 8
281 1.1 mrg xma.hu f42 = f34, f6, f46
282 1.1 mrg ;;
283 1.1 mrg ldf8 f34 = [up], 8
284 1.1 mrg xma.l f39 = f35, f6, f47
285 1.1 mrg ldf8 f46 = [rp], 8
286 1.1 mrg xma.hu f43 = f35, f6, f47
287 1.1 mrg ;;
288 1.1 mrg ldf8 f35 = [up], 8
C r25 is consumed only on the .grt7 path (at .LL11).  On the n = 7 path the
C same limb is stored from f37 below and r25 is rewritten at .Lcj7 before it
C is read.
289 1.1 mrg getf.sig r25 = f37 C FIXME
290 1.1 mrg ldf8 f47 = [rp], 8
291 1.1 mrg br.cloop.dptk .grt7
292 1.1 mrg
C n = 7: remaining sums are done from .Lcj7.
293 1.1 mrg getf.sig r29 = f41
294 1.1 mrg stf8 [r20] = f37, 8 C FIXME
295 1.1 mrg xma.l f36 = f32, f6, f44
296 1.1 mrg getf.sig r26 = f38
297 1.1 mrg xma.hu f40 = f32, f6, f44
298 1.1 mrg ;;
299 1.1 mrg getf.sig r30 = f42
300 1.1 mrg xma.l f37 = f33, f6, f45
301 1.1 mrg getf.sig r27 = f39
302 1.1 mrg xma.hu f41 = f33, f6, f45
303 1.1 mrg ;;
304 1.1 mrg getf.sig r31 = f43
305 1.1 mrg xma.l f38 = f34, f6, f46
306 1.1 mrg getf.sig r24 = f36
307 1.1 mrg xma.hu f42 = f34, f6, f46
308 1.1 mrg br .Lcj7
309 1.1 mrg
C n >= 11: r28 = 0 stands in for a previous high half, so the stage at .LL11
C stores the first low half (r25) unchanged via r14.
310 1.1 mrg .grt7:
311 1.1 mrg getf.sig r29 = f41
312 1.1 mrg xma.l f36 = f32, f6, f44
313 1.1 mrg mov r28 = 0
314 1.1 mrg xma.hu f40 = f32, f6, f44
315 1.1 mrg ;;
316 1.1 mrg ldf8 f32 = [up], 8
317 1.1 mrg getf.sig r26 = f38
318 1.1 mrg ;;
319 1.1 mrg getf.sig r30 = f42
320 1.1 mrg xma.l f37 = f33, f6, f45
321 1.1 mrg ldf8 f44 = [rp], 8
322 1.1 mrg xma.hu f41 = f33, f6, f45
323 1.1 mrg ;;
324 1.1 mrg ldf8 f33 = [up], 8
325 1.1 mrg getf.sig r27 = f39
326 1.1 mrg br .LL11
327 1.1 mrg
328 1.1 mrg
C --- Feed-in for n mod 4 = 0 ---
C Limb pairs two to four go to f33/f45, f34/f46, f35/f47.  With ar.lc = 0
C (n = 4) the br.cloop falls through: four products, first low half stored
C from f36, then .Lcj4.
329 1.1 mrg .Lb00: ldf8 f33 = [up], 8
330 1.1 mrg ldf8 f45 = [rp], 8
331 1.1 mrg ;;
332 1.1 mrg ldf8 f34 = [up], 8
333 1.1 mrg ldf8 f46 = [rp], 8
334 1.1 mrg ;;
335 1.1 mrg ldf8 f35 = [up], 8
336 1.1 mrg xma.l f36 = f7, f6, f8
337 1.1 mrg ldf8 f47 = [rp], 8
338 1.1 mrg xma.hu f40 = f7, f6, f8
339 1.1 mrg br.cloop.dptk .grt4
340 1.1 mrg
341 1.1 mrg xma.l f37 = f33, f6, f45
342 1.1 mrg xma.hu f41 = f33, f6, f45
343 1.1 mrg xma.l f38 = f34, f6, f46
344 1.1 mrg xma.hu f42 = f34, f6, f46
345 1.1 mrg ;;
346 1.1 mrg getf.sig r28 = f40
347 1.1 mrg stf8 [r20] = f36, 8
348 1.1 mrg xma.l f39 = f35, f6, f47
349 1.1 mrg getf.sig r25 = f37
350 1.1 mrg xma.hu f43 = f35, f6, f47
351 1.1 mrg ;;
352 1.1 mrg getf.sig r29 = f41
353 1.1 mrg getf.sig r26 = f38
354 1.1 mrg getf.sig r30 = f42
355 1.1 mrg getf.sig r27 = f39
356 1.1 mrg br .Lcj4
357 1.1 mrg
C n >= 8: load four more limb pairs while products two to four are formed.
358 1.1 mrg .grt4:
359 1.1 mrg ldf8 f32 = [up], 8
360 1.1 mrg xma.l f37 = f33, f6, f45
361 1.1 mrg ldf8 f44 = [rp], 8
362 1.1 mrg xma.hu f41 = f33, f6, f45
363 1.1 mrg ;;
364 1.1 mrg ldf8 f33 = [up], 8
365 1.1 mrg xma.l f38 = f34, f6, f46
366 1.1 mrg ldf8 f45 = [rp], 8
367 1.1 mrg xma.hu f42 = f34, f6, f46
368 1.1 mrg ;;
369 1.1 mrg ldf8 f34 = [up], 8
C r24 is consumed only on the .grt8 path (at .LL00).  On the n = 8 path the
C same limb is stored from f36 below and r24 is rewritten at .Lcj8 before it
C is read.
370 1.1 mrg getf.sig r24 = f36 C FIXME
371 1.1 mrg xma.l f39 = f35, f6, f47
372 1.1 mrg ldf8 f46 = [rp], 8
373 1.1 mrg getf.sig r28 = f40
374 1.1 mrg xma.hu f43 = f35, f6, f47
375 1.1 mrg ;;
376 1.1 mrg ldf8 f35 = [up], 8
377 1.1 mrg getf.sig r25 = f37
378 1.1 mrg ldf8 f47 = [rp], 8
379 1.1 mrg br.cloop.dptk .grt8
380 1.1 mrg
C n = 8: the store below reads the old f36 in the same instruction group in
C which the next xma.l overwrites it.  Remaining sums are done from .Lcj8.
381 1.1 mrg getf.sig r29 = f41
382 1.1 mrg stf8 [r20] = f36, 8 C FIXME
383 1.1 mrg xma.l f36 = f32, f6, f44
384 1.1 mrg getf.sig r26 = f38
385 1.1 mrg getf.sig r30 = f42
386 1.1 mrg xma.hu f40 = f32, f6, f44
387 1.1 mrg ;;
388 1.1 mrg xma.l f37 = f33, f6, f45
389 1.1 mrg getf.sig r27 = f39
390 1.1 mrg xma.hu f41 = f33, f6, f45
391 1.1 mrg br .Lcj8
392 1.1 mrg
C n >= 12: r31 = 0 stands in for a previous high half, so the stage at .LL00
C stores the first low half (r24) unchanged via r16.
393 1.1 mrg .grt8:
394 1.1 mrg getf.sig r29 = f41
395 1.1 mrg xma.l f36 = f32, f6, f44
396 1.1 mrg mov r31 = 0
397 1.1 mrg xma.hu f40 = f32, f6, f44
398 1.1 mrg ;;
399 1.1 mrg ldf8 f32 = [up], 8
400 1.1 mrg getf.sig r26 = f38
401 1.1 mrg br .LL00
402 1.1 mrg
403 1.1 mrg
C Each pass through the loop retires four limbs in four stages.  A stage
C   - adds a high half (r28-r31) to the next low half (r24-r27), plus one if
C     the incoming carry predicate (p6 or p8) is set;
C   - stores the sum through r20;
C   - sets the next carry pair by unsigned compare of the sum against the low
C     half: with carry-in the add wrapped iff sum <= addend (cmp.leu), and
C     without carry-in iff sum < addend (cmp.ltu);
C   - starts the product for a limb loaded earlier and loads further limbs.
C .LL00, .LL11 and .LL10 are the mid-loop entry points used by the feed-in
C code for n mod 4 equal to 0, 3 and 2.
404 1.1 mrg C *** MAIN LOOP START ***
405 1.1 mrg ALIGN(32) C insn fed cycle #
406 1.1 mrg .Loop:
407 1.1 mrg .pred.rel "mutex", p6, p7 C num by i1 i2
408 1.1 mrg getf.sig r29 = f41 C 00 16 0 0
409 1.1 mrg xma.l f36 = f32, f6, f44 C 01 06,15 0 0
410 1.1 mrg (p6) add r14 = r30, r27, 1 C 02 0 0
411 1.1 mrg ldf8 f47 = [rp], 8 C 03 0 0
412 1.1 mrg xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
413 1.1 mrg (p7) add r14 = r30, r27 C 05 0 0
414 1.1 mrg ;;
415 1.1 mrg .pred.rel "mutex", p6, p7
416 1.1 mrg ldf8 f32 = [up], 8 C 06 1 1
417 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
418 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
419 1.1 mrg getf.sig r26 = f38 C 09 25 2 1
420 1.1 mrg st8 [r20] = r14, 8 C 10 2 1
421 1.1 mrg nop.b 0 C 11 2 1
422 1.1 mrg ;;
423 1.1 mrg .LL00:
424 1.1 mrg .pred.rel "mutex", p8, p9
425 1.1 mrg getf.sig r30 = f42 C 12 28 3 2
426 1.1 mrg xma.l f37 = f33, f6, f45 C 13 18,27 3 2
427 1.1 mrg (p8) add r16 = r31, r24, 1 C 14 3 2
428 1.1 mrg ldf8 f44 = [rp], 8 C 15 3 2
429 1.1 mrg xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
430 1.1 mrg (p9) add r16 = r31, r24 C 17 3 2
431 1.1 mrg ;;
432 1.1 mrg .pred.rel "mutex", p8, p9
433 1.1 mrg ldf8 f33 = [up], 8 C 18 4 3
434 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
435 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
436 1.1 mrg getf.sig r27 = f39 C 21 37 5 3
437 1.1 mrg st8 [r20] = r16, 8 C 22 5 3
438 1.1 mrg nop.b 0 C 23 5 3
439 1.1 mrg ;;
440 1.1 mrg .LL11:
441 1.1 mrg .pred.rel "mutex", p6, p7
442 1.1 mrg getf.sig r31 = f43 C 24 40 6 4
443 1.1 mrg xma.l f38 = f34, f6, f46 C 25 30,39 6 4
444 1.1 mrg (p6) add r14 = r28, r25, 1 C 26 6 4
445 1.1 mrg ldf8 f45 = [rp], 8 C 27 6 4
446 1.1 mrg xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
447 1.1 mrg (p7) add r14 = r28, r25 C 29 6 4
448 1.1 mrg ;;
449 1.1 mrg .pred.rel "mutex", p6, p7
450 1.1 mrg ldf8 f34 = [up], 8 C 30 7 5
451 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
452 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
453 1.1 mrg getf.sig r24 = f36 C 33 01 8 5
454 1.1 mrg st8 [r20] = r14, 8 C 34 8 5
455 1.1 mrg nop.b 0 C 35 8 5
456 1.1 mrg ;;
457 1.1 mrg .LL10:
458 1.1 mrg .pred.rel "mutex", p8, p9
459 1.1 mrg getf.sig r28 = f40 C 36 04 9 6
460 1.1 mrg xma.l f39 = f35, f6, f47 C 37 42,03 9 6
461 1.1 mrg (p8) add r16 = r29, r26, 1 C 38 9 6
462 1.1 mrg ldf8 f46 = [rp], 8 C 39 9 6
463 1.1 mrg xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
464 1.1 mrg (p9) add r16 = r29, r26 C 41 9 6
465 1.1 mrg ;;
466 1.1 mrg .pred.rel "mutex", p8, p9
467 1.1 mrg ldf8 f35 = [up], 8 C 42 10 7
468 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
469 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
470 1.1 mrg getf.sig r25 = f37 C 45 13 11 7
471 1.1 mrg st8 [r20] = r16, 8 C 46 11 7
472 1.1 mrg br.cloop.dptk .Loop C 47 11 7
473 1.1 mrg C *** MAIN LOOP END ***
474 1.1 mrg ;;
C --- Wind-down ---
C The same add / store / compare stages as in the loop, draining the limbs
C already loaded.  There are no further loads from up, and only one last
C load from rp (in the first stage of .Le0).  The .LcjK labels are the entry
C points taken by the feed-in code when n = K (K = 2..8); each later label
C has fewer products left to start and fewer halves left to fetch.
475 1.1 mrg .Le0:
476 1.1 mrg .pred.rel "mutex", p6, p7
477 1.1 mrg getf.sig r29 = f41 C
478 1.1 mrg xma.l f36 = f32, f6, f44 C
479 1.1 mrg (p6) add r14 = r30, r27, 1 C
480 1.1 mrg ldf8 f47 = [rp], 8 C
481 1.1 mrg xma.hu f40 = f32, f6, f44 C
482 1.1 mrg (p7) add r14 = r30, r27 C
483 1.1 mrg ;;
484 1.1 mrg .pred.rel "mutex", p6, p7
485 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
486 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
487 1.1 mrg getf.sig r26 = f38 C
488 1.1 mrg st8 [r20] = r14, 8 C
489 1.1 mrg ;;
490 1.1 mrg .pred.rel "mutex", p8, p9
491 1.1 mrg getf.sig r30 = f42 C
492 1.1 mrg xma.l f37 = f33, f6, f45 C
493 1.1 mrg (p8) add r16 = r31, r24, 1 C
494 1.1 mrg xma.hu f41 = f33, f6, f45 C
495 1.1 mrg (p9) add r16 = r31, r24 C
496 1.1 mrg ;;
497 1.1 mrg .pred.rel "mutex", p8, p9
498 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C
499 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C
500 1.1 mrg getf.sig r27 = f39 C
501 1.1 mrg st8 [r20] = r16, 8 C
502 1.1 mrg ;;
503 1.1 mrg .Lcj8:
504 1.1 mrg .pred.rel "mutex", p6, p7
505 1.1 mrg getf.sig r31 = f43 C
506 1.1 mrg xma.l f38 = f34, f6, f46 C
507 1.1 mrg (p6) add r14 = r28, r25, 1 C
508 1.1 mrg xma.hu f42 = f34, f6, f46 C
509 1.1 mrg (p7) add r14 = r28, r25 C
510 1.1 mrg ;;
511 1.1 mrg .pred.rel "mutex", p6, p7
512 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C
513 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C
514 1.1 mrg getf.sig r24 = f36 C
515 1.1 mrg st8 [r20] = r14, 8 C
516 1.1 mrg ;;
517 1.1 mrg .Lcj7:
518 1.1 mrg .pred.rel "mutex", p8, p9
519 1.1 mrg getf.sig r28 = f40 C
520 1.1 mrg xma.l f39 = f35, f6, f47 C
521 1.1 mrg (p8) add r16 = r29, r26, 1 C
522 1.1 mrg xma.hu f43 = f35, f6, f47 C
523 1.1 mrg (p9) add r16 = r29, r26 C
524 1.1 mrg ;;
525 1.1 mrg .pred.rel "mutex", p8, p9
526 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C
527 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C
528 1.1 mrg getf.sig r25 = f37 C
529 1.1 mrg st8 [r20] = r16, 8 C
530 1.1 mrg ;;
C From here on every product has been started; only transfers and sums remain.
531 1.1 mrg .Lcj6:
532 1.1 mrg .pred.rel "mutex", p6, p7
533 1.1 mrg getf.sig r29 = f41 C
534 1.1 mrg (p6) add r14 = r30, r27, 1 C
535 1.1 mrg (p7) add r14 = r30, r27 C
536 1.1 mrg ;;
537 1.1 mrg .pred.rel "mutex", p6, p7
538 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
539 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
540 1.1 mrg getf.sig r26 = f38 C
541 1.1 mrg st8 [r20] = r14, 8 C
542 1.1 mrg ;;
543 1.1 mrg .Lcj5:
544 1.1 mrg .pred.rel "mutex", p8, p9
545 1.1 mrg getf.sig r30 = f42 C
546 1.1 mrg (p8) add r16 = r31, r24, 1 C
547 1.1 mrg (p9) add r16 = r31, r24 C
548 1.1 mrg ;;
549 1.1 mrg .pred.rel "mutex", p8, p9
550 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C
551 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C
552 1.1 mrg getf.sig r27 = f39 C
553 1.1 mrg st8 [r20] = r16, 8 C
554 1.1 mrg ;;
C r8 receives the high half of the final product here.
555 1.1 mrg .Lcj4:
556 1.1 mrg .pred.rel "mutex", p6, p7
557 1.1 mrg getf.sig r8 = f43 C
558 1.1 mrg (p6) add r14 = r28, r25, 1 C
559 1.1 mrg (p7) add r14 = r28, r25 C
560 1.1 mrg ;;
561 1.1 mrg .pred.rel "mutex", p6, p7
562 1.1 mrg st8 [r20] = r14, 8 C
563 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C
564 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C
565 1.1 mrg ;;
566 1.1 mrg .Lcj3:
567 1.1 mrg .pred.rel "mutex", p8, p9
568 1.1 mrg (p8) add r16 = r29, r26, 1 C
569 1.1 mrg (p9) add r16 = r29, r26 C
570 1.1 mrg ;;
571 1.1 mrg .pred.rel "mutex", p8, p9
572 1.1 mrg st8 [r20] = r16, 8 C
573 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C
574 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C
575 1.1 mrg ;;
C Last stored limb: no post-increment on the store.
576 1.1 mrg .Lcj2:
577 1.1 mrg .pred.rel "mutex", p6, p7
578 1.1 mrg (p6) add r14 = r30, r27, 1 C
579 1.1 mrg (p7) add r14 = r30, r27 C
580 1.1 mrg ;;
581 1.1 mrg .pred.rel "mutex", p6, p7
582 1.1 mrg st8 [r20] = r14 C
583 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
584 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
585 1.1 mrg ;;
C Fold the final carry into r8, restore the saved ar.lc and return.
586 1.1 mrg (p8) add r8 = 1, r8 C M I
587 1.1 mrg mov.i ar.lc = r2 C I0
588 1.1 mrg br.ret.sptk.many b0 C B
589 1.1 mrg EPILOGUE()
590 1.1 mrg ASM_END()
591