addmul_1.asm revision 1.1.1.3 1 1.1 mrg dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 1.1 mrg dnl result to a second limb vector.
3 1.1 mrg
4 1.1.1.2 mrg dnl Contributed to the GNU project by Torbjorn Granlund.
5 1.1.1.2 mrg
6 1.1.1.3 mrg dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc.
7 1.1 mrg
8 1.1 mrg dnl This file is part of the GNU MP Library.
9 1.1.1.3 mrg dnl
10 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 1.1.1.3 mrg dnl it under the terms of either:
12 1.1.1.3 mrg dnl
13 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free
14 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your
15 1.1.1.3 mrg dnl option) any later version.
16 1.1.1.3 mrg dnl
17 1.1.1.3 mrg dnl or
18 1.1.1.3 mrg dnl
19 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software
20 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any
21 1.1.1.3 mrg dnl later version.
22 1.1.1.3 mrg dnl
23 1.1.1.3 mrg dnl or both in parallel, as here.
24 1.1.1.3 mrg dnl
25 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 1.1.1.3 mrg dnl for more details.
29 1.1.1.3 mrg dnl
30 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the
31 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/.
33 1.1 mrg
34 1.1 mrg include(`../config.m4')
35 1.1 mrg
36 1.1 mrg C cycles/limb
37 1.1 mrg C Itanium: 3.0
38 1.1 mrg C Itanium 2: 2.0
39 1.1 mrg
40 1.1 mrg C TODO
41 1.1 mrg C * Further optimize feed-in and wind-down code, both for speed and code size.
42 1.1 mrg C * Handle low limb input and results specially, using a common stf8 in the
43 1.1 mrg C epilogue.
44 1.1 mrg C * Use 1 c/l carry propagation scheme in wind-down code.
45 1.1 mrg C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
46 1.1 mrg C * Work out final differences with mul_1.asm. That function is 300 bytes
47 1.1 mrg C smaller than this due to better loop scheduling and thus simpler feed-in
48 1.1 mrg C code.
49 1.1 mrg
50 1.1 mrg C INPUT PARAMETERS
51 1.1 mrg define(`rp', `r32') C in/out: limb vector read as the addend and overwritten with the result
52 1.1 mrg define(`up', `r33') C in: limb vector whose limbs are multiplied by vl
53 1.1 mrg define(`n', `r34') C in: limb count; limb 0 is loaded and processed unconditionally, so n >= 1 is assumed
54 1.1 mrg define(`vl', `r35') C in: the single multiplier limb
55 1.1 mrg
56 1.1 mrg ASM_START()
57 1.1 mrg PROLOGUE(mpn_addmul_1) C for each i below n: rp[i] = low limb of rp[i] + up[i]*vl + carry; the leftover carry limb is left in r8
58 1.1 mrg .prologue
59 1.1 mrg .save ar.lc, r2 C unwind info: the incoming ar.lc is kept in r2
60 1.1 mrg .body
61 1.1 mrg
62 1.1 mrg ifdef(`HAVE_ABI_32',
63 1.1 mrg ` addp4 rp = 0, rp C M I -- 32-bit ABI: widen the 32-bit pointer to a 64-bit address
64 1.1 mrg addp4 up = 0, up C M I -- same for the source pointer
65 1.1 mrg zxt4 n = n C I -- zero-extend the 32-bit limb count
66 1.1 mrg ;;
67 1.1 mrg ')
68 1.1 mrg {.mmi
69 1.1 mrg adds r15 = -1, n C M I -- r15 = n - 1
70 1.1 mrg mov r20 = rp C M I -- r20 = store pointer; rp itself runs ahead as the load pointer
71 1.1 mrg mov.i r2 = ar.lc C I0 -- save ar.lc, restored before each br.ret
72 1.1 mrg }
73 1.1 mrg {.mmi
74 1.1 mrg ldf8 f7 = [up], 8 C M -- f7 = up[0], post-increment up by one limb
75 1.1 mrg ldf8 f8 = [rp], 8 C M -- f8 = rp[0], post-increment rp by one limb
76 1.1 mrg and r14 = 3, n C M I -- r14 = n mod 4, selects the feed-in variant
77 1.1 mrg ;;
78 1.1 mrg }
79 1.1 mrg {.mmi
80 1.1 mrg setf.sig f6 = vl C M2 M3 -- multiplier goes to an FP register for xma
81 1.1 mrg cmp.eq p10, p0 = 0, r14 C M I -- p10: n mod 4 == 0
82 1.1 mrg shr.u r31 = r15, 2 C I0 -- r31 = (n-1)/4, consumed by the br.cloop chain
83 1.1 mrg }
84 1.1 mrg {.mmi
85 1.1 mrg cmp.eq p11, p0 = 2, r14 C M I -- p11: n mod 4 == 2
86 1.1 mrg cmp.eq p12, p0 = 3, r14 C M I -- p12: n mod 4 == 3
87 1.1 mrg nop.i 0 C I
88 1.1 mrg ;;
89 1.1 mrg }
90 1.1 mrg {.mii
91 1.1 mrg cmp.ne p6, p7 = r0, r0 C M I -- p6 = 0, p7 = 1: no carry pending
92 1.1 mrg mov.i ar.lc = r31 C I0 -- loop counter = (n-1)/4
93 1.1 mrg cmp.ne p8, p9 = r0, r0 C M I -- p8 = 0, p9 = 1: no carry pending
94 1.1 mrg }
95 1.1 mrg {.bbb
96 1.1 mrg (p10) br.dptk .Lb00 C B -- n mod 4 == 0
97 1.1 mrg (p11) br.dptk .Lb10 C B -- n mod 4 == 2
98 1.1 mrg (p12) br.dptk .Lb11 C B -- n mod 4 == 3; otherwise fall into .Lb01
99 1.1 mrg ;;
100 1.1 mrg }
101 1.1 mrg
102 1.1 mrg .Lb01: br.cloop.dptk .grt1 C B -- n mod 4 == 1; falls through only when ar.lc == 0, i.e. n == 1
103 1.1 mrg
104 1.1 mrg xma.l f39 = f7, f6, f8 C F -- low limb of up[0]*vl + rp[0]
105 1.1 mrg xma.hu f43 = f7, f6, f8 C F -- high limb of the same sum
106 1.1 mrg ;;
107 1.1 mrg getf.sig r8 = f43 C M2 -- high limb is the carry-out left in r8
108 1.1 mrg stf8 [r20] = f39 C M2 M3 -- store the single result limb
109 1.1 mrg mov.i ar.lc = r2 C I0 -- restore the saved ar.lc
110 1.1 mrg br.ret.sptk.many b0 C B
111 1.1 mrg
112 1.1 mrg .grt1: C n >= 5: preload limbs 1..4 of both vectors
113 1.1 mrg ldf8 f32 = [up], 8
114 1.1 mrg ldf8 f44 = [rp], 8
115 1.1 mrg ;;
116 1.1 mrg ldf8 f33 = [up], 8
117 1.1 mrg ldf8 f45 = [rp], 8
118 1.1 mrg ;;
119 1.1 mrg ldf8 f34 = [up], 8
120 1.1 mrg xma.l f39 = f7, f6, f8 C limb 0 product, low half
121 1.1 mrg ldf8 f46 = [rp], 8
122 1.1 mrg xma.hu f43 = f7, f6, f8 C limb 0 product, high half
123 1.1 mrg ;;
124 1.1 mrg ldf8 f35 = [up], 8
125 1.1 mrg ldf8 f47 = [rp], 8
126 1.1 mrg br.cloop.dptk .grt5 C falls through when n == 5
127 1.1 mrg
128 1.1 mrg xma.l f36 = f32, f6, f44
129 1.1 mrg xma.hu f40 = f32, f6, f44
130 1.1 mrg ;;
131 1.1 mrg stf8 [r20] = f39, 8 C limb 0 has no carry-in, so it is stored straight from the FP register
132 1.1 mrg xma.l f37 = f33, f6, f45
133 1.1 mrg xma.hu f41 = f33, f6, f45
134 1.1 mrg ;;
135 1.1 mrg getf.sig r31 = f43 C r28-r31 receive high halves, r24-r27 receive low halves
136 1.1 mrg getf.sig r24 = f36
137 1.1 mrg xma.l f38 = f34, f6, f46
138 1.1 mrg xma.hu f42 = f34, f6, f46
139 1.1 mrg ;;
140 1.1 mrg getf.sig r28 = f40
141 1.1 mrg getf.sig r25 = f37
142 1.1 mrg xma.l f39 = f35, f6, f47
143 1.1 mrg xma.hu f43 = f35, f6, f47
144 1.1 mrg ;;
145 1.1 mrg getf.sig r29 = f41
146 1.1 mrg getf.sig r26 = f38
147 1.1 mrg br .Lcj5 C n == 5: join the wind-down carry chain
148 1.1 mrg
149 1.1 mrg .grt5: C n >= 9
150 1.1 mrg mov r30 = 0 C limb 0 has no high half below it; zero term for the first add at .Loop/.Le0
151 1.1 mrg xma.l f36 = f32, f6, f44
152 1.1 mrg xma.hu f40 = f32, f6, f44
153 1.1 mrg ;;
154 1.1 mrg ldf8 f32 = [up], 8
155 1.1 mrg xma.l f37 = f33, f6, f45
156 1.1 mrg ldf8 f44 = [rp], 8
157 1.1 mrg xma.hu f41 = f33, f6, f45
158 1.1 mrg ;;
159 1.1 mrg ldf8 f33 = [up], 8
160 1.1 mrg getf.sig r27 = f39 C low half of limb 0, paired with r30 = 0
161 1.1 mrg ;;
162 1.1 mrg getf.sig r31 = f43
163 1.1 mrg xma.l f38 = f34, f6, f46
164 1.1 mrg ldf8 f45 = [rp], 8
165 1.1 mrg xma.hu f42 = f34, f6, f46
166 1.1 mrg ;;
167 1.1 mrg ldf8 f34 = [up], 8
168 1.1 mrg getf.sig r24 = f36
169 1.1 mrg ;;
170 1.1 mrg getf.sig r28 = f40
171 1.1 mrg xma.l f39 = f35, f6, f47
172 1.1 mrg ldf8 f46 = [rp], 8
173 1.1 mrg xma.hu f43 = f35, f6, f47
174 1.1 mrg ;;
175 1.1 mrg ldf8 f35 = [up], 8
176 1.1 mrg getf.sig r25 = f37
177 1.1 mrg br.cloop.dptk .Loop C n >= 13: enter the main loop at its top
178 1.1 mrg br .Le0 C n == 9: skip the loop and go straight to the wind-down
179 1.1 mrg
180 1.1 mrg
181 1.1 mrg .Lb10: ldf8 f35 = [up], 8 C n mod 4 == 2: load limb 1 of up
182 1.1 mrg ldf8 f47 = [rp], 8
183 1.1 mrg br.cloop.dptk .grt2 C falls through when n == 2
184 1.1 mrg
185 1.1 mrg xma.l f38 = f7, f6, f8
186 1.1 mrg xma.hu f42 = f7, f6, f8
187 1.1 mrg ;;
188 1.1 mrg xma.l f39 = f35, f6, f47
189 1.1 mrg xma.hu f43 = f35, f6, f47
190 1.1 mrg ;;
191 1.1 mrg getf.sig r30 = f42 C high half of limb 0, added to the low half of limb 1 at .Lcj2
192 1.1 mrg stf8 [r20] = f38, 8 C limb 0 has no carry-in, stored straight from the FP register
193 1.1 mrg getf.sig r27 = f39
194 1.1 mrg getf.sig r8 = f43 C high half of the last product, basis of the value left in r8
195 1.1 mrg br .Lcj2 C n == 2: only the final wind-down step remains
196 1.1 mrg
197 1.1 mrg .grt2: C n >= 6
198 1.1 mrg ldf8 f32 = [up], 8
199 1.1 mrg ldf8 f44 = [rp], 8
200 1.1 mrg ;;
201 1.1 mrg ldf8 f33 = [up], 8
202 1.1 mrg xma.l f38 = f7, f6, f8
203 1.1 mrg ldf8 f45 = [rp], 8
204 1.1 mrg xma.hu f42 = f7, f6, f8
205 1.1 mrg ;;
206 1.1 mrg ldf8 f34 = [up], 8
207 1.1 mrg xma.l f39 = f35, f6, f47
208 1.1 mrg ldf8 f46 = [rp], 8
209 1.1 mrg xma.hu f43 = f35, f6, f47
210 1.1 mrg ;;
211 1.1 mrg ldf8 f35 = [up], 8
212 1.1 mrg ldf8 f47 = [rp], 8
213 1.1 mrg br.cloop.dptk .grt6 C falls through when n == 6
214 1.1 mrg
215 1.1 mrg stf8 [r20] = f38, 8 C limb 0 stored directly, no carry-in
216 1.1 mrg xma.l f36 = f32, f6, f44
217 1.1 mrg xma.hu f40 = f32, f6, f44
218 1.1 mrg ;;
219 1.1 mrg getf.sig r30 = f42
220 1.1 mrg getf.sig r27 = f39
221 1.1 mrg xma.l f37 = f33, f6, f45
222 1.1 mrg xma.hu f41 = f33, f6, f45
223 1.1 mrg ;;
224 1.1 mrg getf.sig r31 = f43
225 1.1 mrg getf.sig r24 = f36
226 1.1 mrg xma.l f38 = f34, f6, f46
227 1.1 mrg xma.hu f42 = f34, f6, f46
228 1.1 mrg ;;
229 1.1 mrg getf.sig r28 = f40
230 1.1 mrg getf.sig r25 = f37
231 1.1 mrg xma.l f39 = f35, f6, f47
232 1.1 mrg xma.hu f43 = f35, f6, f47
233 1.1 mrg br .Lcj6 C n == 6: join the wind-down carry chain
234 1.1 mrg
235 1.1 mrg .grt6: C n >= 10
236 1.1 mrg mov r29 = 0 C zero high-half term for limb 0, added to r26 at .LL10
237 1.1 mrg xma.l f36 = f32, f6, f44
238 1.1 mrg xma.hu f40 = f32, f6, f44
239 1.1 mrg ;;
240 1.1 mrg ldf8 f32 = [up], 8
241 1.1 mrg getf.sig r26 = f38 C low half of limb 0
242 1.1 mrg ;;
243 1.1 mrg getf.sig r30 = f42
244 1.1 mrg xma.l f37 = f33, f6, f45
245 1.1 mrg ldf8 f44 = [rp], 8
246 1.1 mrg xma.hu f41 = f33, f6, f45
247 1.1 mrg ;;
248 1.1 mrg ldf8 f33 = [up], 8
249 1.1 mrg getf.sig r27 = f39
250 1.1 mrg ;;
251 1.1 mrg getf.sig r31 = f43
252 1.1 mrg xma.l f38 = f34, f6, f46
253 1.1 mrg ldf8 f45 = [rp], 8
254 1.1 mrg xma.hu f42 = f34, f6, f46
255 1.1 mrg ;;
256 1.1 mrg ldf8 f34 = [up], 8
257 1.1 mrg getf.sig r24 = f36
258 1.1 mrg br .LL10 C enter the main loop at its last quarter
259 1.1 mrg
260 1.1 mrg
261 1.1 mrg .Lb11: ldf8 f34 = [up], 8 C n mod 4 == 3: load limbs 1 and 2
262 1.1 mrg ldf8 f46 = [rp], 8
263 1.1 mrg ;;
264 1.1 mrg ldf8 f35 = [up], 8
265 1.1 mrg ldf8 f47 = [rp], 8
266 1.1 mrg br.cloop.dptk .grt3 C falls through when n == 3
267 1.1 mrg ;;
268 1.1 mrg
269 1.1 mrg xma.l f37 = f7, f6, f8
270 1.1 mrg xma.hu f41 = f7, f6, f8
271 1.1 mrg xma.l f38 = f34, f6, f46
272 1.1 mrg xma.hu f42 = f34, f6, f46
273 1.1 mrg xma.l f39 = f35, f6, f47
274 1.1 mrg xma.hu f43 = f35, f6, f47
275 1.1 mrg ;;
276 1.1 mrg getf.sig r29 = f41
277 1.1 mrg stf8 [r20] = f37, 8 C limb 0 has no carry-in, stored straight from the FP register
278 1.1 mrg getf.sig r26 = f38
279 1.1 mrg getf.sig r30 = f42
280 1.1 mrg getf.sig r27 = f39
281 1.1 mrg getf.sig r8 = f43 C high half of the last product, basis of the value left in r8
282 1.1 mrg br .Lcj3 C n == 3: two wind-down steps remain
283 1.1 mrg
284 1.1 mrg .grt3: C n >= 7
285 1.1 mrg ldf8 f32 = [up], 8
286 1.1 mrg xma.l f37 = f7, f6, f8
287 1.1 mrg ldf8 f44 = [rp], 8
288 1.1 mrg xma.hu f41 = f7, f6, f8
289 1.1 mrg ;;
290 1.1 mrg ldf8 f33 = [up], 8
291 1.1 mrg xma.l f38 = f34, f6, f46
292 1.1 mrg ldf8 f45 = [rp], 8
293 1.1 mrg xma.hu f42 = f34, f6, f46
294 1.1 mrg ;;
295 1.1 mrg ldf8 f34 = [up], 8
296 1.1 mrg xma.l f39 = f35, f6, f47
297 1.1 mrg ldf8 f46 = [rp], 8
298 1.1 mrg xma.hu f43 = f35, f6, f47
299 1.1 mrg ;;
300 1.1 mrg ldf8 f35 = [up], 8
301 1.1 mrg getf.sig r25 = f37 C FIXME -- presumably because r25 is only consumed on the .grt7 path; for n == 7 it is overwritten in the wind-down before use
302 1.1 mrg ldf8 f47 = [rp], 8
303 1.1 mrg br.cloop.dptk .grt7 C falls through when n == 7
304 1.1 mrg
305 1.1 mrg getf.sig r29 = f41
306 1.1 mrg stf8 [r20] = f37, 8 C FIXME -- limb 0 stored from the FP register although r25 already holds the same value
307 1.1 mrg xma.l f36 = f32, f6, f44
308 1.1 mrg getf.sig r26 = f38
309 1.1 mrg xma.hu f40 = f32, f6, f44
310 1.1 mrg ;;
311 1.1 mrg getf.sig r30 = f42
312 1.1 mrg xma.l f37 = f33, f6, f45
313 1.1 mrg getf.sig r27 = f39
314 1.1 mrg xma.hu f41 = f33, f6, f45
315 1.1 mrg ;;
316 1.1 mrg getf.sig r31 = f43
317 1.1 mrg xma.l f38 = f34, f6, f46
318 1.1 mrg getf.sig r24 = f36
319 1.1 mrg xma.hu f42 = f34, f6, f46
320 1.1 mrg br .Lcj7 C n == 7: join the wind-down carry chain
321 1.1 mrg
322 1.1 mrg .grt7: C n >= 11
323 1.1 mrg getf.sig r29 = f41
324 1.1 mrg xma.l f36 = f32, f6, f44
325 1.1 mrg mov r28 = 0 C zero high-half term for limb 0, added to r25 at .LL11
326 1.1 mrg xma.hu f40 = f32, f6, f44
327 1.1 mrg ;;
328 1.1 mrg ldf8 f32 = [up], 8
329 1.1 mrg getf.sig r26 = f38
330 1.1 mrg ;;
331 1.1 mrg getf.sig r30 = f42
332 1.1 mrg xma.l f37 = f33, f6, f45
333 1.1 mrg ldf8 f44 = [rp], 8
334 1.1 mrg xma.hu f41 = f33, f6, f45
335 1.1 mrg ;;
336 1.1 mrg ldf8 f33 = [up], 8
337 1.1 mrg getf.sig r27 = f39
338 1.1 mrg br .LL11 C enter the main loop at its midpoint
339 1.1 mrg
340 1.1 mrg
341 1.1 mrg .Lb00: ldf8 f33 = [up], 8 C n mod 4 == 0: load limbs 1..3
342 1.1 mrg ldf8 f45 = [rp], 8
343 1.1 mrg ;;
344 1.1 mrg ldf8 f34 = [up], 8
345 1.1 mrg ldf8 f46 = [rp], 8
346 1.1 mrg ;;
347 1.1 mrg ldf8 f35 = [up], 8
348 1.1 mrg xma.l f36 = f7, f6, f8 C limb 0 product, low half
349 1.1 mrg ldf8 f47 = [rp], 8
350 1.1 mrg xma.hu f40 = f7, f6, f8 C limb 0 product, high half
351 1.1 mrg br.cloop.dptk .grt4 C falls through when n == 4
352 1.1 mrg
353 1.1 mrg xma.l f37 = f33, f6, f45
354 1.1 mrg xma.hu f41 = f33, f6, f45
355 1.1 mrg xma.l f38 = f34, f6, f46
356 1.1 mrg xma.hu f42 = f34, f6, f46
357 1.1 mrg ;;
358 1.1 mrg getf.sig r28 = f40
359 1.1 mrg stf8 [r20] = f36, 8 C limb 0 has no carry-in, stored straight from the FP register
360 1.1 mrg xma.l f39 = f35, f6, f47
361 1.1 mrg getf.sig r25 = f37
362 1.1 mrg xma.hu f43 = f35, f6, f47
363 1.1 mrg ;;
364 1.1 mrg getf.sig r29 = f41
365 1.1 mrg getf.sig r26 = f38
366 1.1 mrg getf.sig r30 = f42
367 1.1 mrg getf.sig r27 = f39
368 1.1 mrg br .Lcj4 C n == 4: three wind-down steps remain
369 1.1 mrg
370 1.1 mrg .grt4: C n >= 8
371 1.1 mrg ldf8 f32 = [up], 8
372 1.1 mrg xma.l f37 = f33, f6, f45
373 1.1 mrg ldf8 f44 = [rp], 8
374 1.1 mrg xma.hu f41 = f33, f6, f45
375 1.1 mrg ;;
376 1.1 mrg ldf8 f33 = [up], 8
377 1.1 mrg xma.l f38 = f34, f6, f46
378 1.1 mrg ldf8 f45 = [rp], 8
379 1.1 mrg xma.hu f42 = f34, f6, f46
380 1.1 mrg ;;
381 1.1 mrg ldf8 f34 = [up], 8
382 1.1 mrg getf.sig r24 = f36 C FIXME -- presumably because r24 is only consumed on the .grt8 path; for n == 8 it is overwritten in the wind-down before use
383 1.1 mrg xma.l f39 = f35, f6, f47
384 1.1 mrg ldf8 f46 = [rp], 8
385 1.1 mrg getf.sig r28 = f40
386 1.1 mrg xma.hu f43 = f35, f6, f47
387 1.1 mrg ;;
388 1.1 mrg ldf8 f35 = [up], 8
389 1.1 mrg getf.sig r25 = f37
390 1.1 mrg ldf8 f47 = [rp], 8
391 1.1 mrg br.cloop.dptk .grt8 C falls through when n == 8
392 1.1 mrg
393 1.1 mrg getf.sig r29 = f41
394 1.1 mrg stf8 [r20] = f36, 8 C FIXME -- limb 0 stored from the FP register although r24 already holds the same value
395 1.1 mrg xma.l f36 = f32, f6, f44
396 1.1 mrg getf.sig r26 = f38
397 1.1 mrg getf.sig r30 = f42
398 1.1 mrg xma.hu f40 = f32, f6, f44
399 1.1 mrg ;;
400 1.1 mrg xma.l f37 = f33, f6, f45
401 1.1 mrg getf.sig r27 = f39
402 1.1 mrg xma.hu f41 = f33, f6, f45
403 1.1 mrg br .Lcj8 C n == 8: join the wind-down carry chain
404 1.1 mrg
405 1.1 mrg .grt8: C n >= 12
406 1.1 mrg getf.sig r29 = f41
407 1.1 mrg xma.l f36 = f32, f6, f44
408 1.1 mrg mov r31 = 0 C zero high-half term for limb 0, added to r24 at .LL00
409 1.1 mrg xma.hu f40 = f32, f6, f44
410 1.1 mrg ;;
411 1.1 mrg ldf8 f32 = [up], 8
412 1.1 mrg getf.sig r26 = f38
413 1.1 mrg br .LL00 C enter the main loop after its first quarter
414 1.1 mrg
415 1.1 mrg
416 1.1 mrg C *** MAIN LOOP START ***
417 1.1 mrg ALIGN(32) C insn fed cycle #
418 1.1 mrg .Loop: C four result limbs per trip; each quarter adds low(cur) + high(prev) + carry, stores the sum and derives the next carry
419 1.1 mrg .pred.rel "mutex", p6, p7 C num by i1 i2
420 1.1 mrg getf.sig r29 = f41 C 00 16 0 0
421 1.1 mrg xma.l f36 = f32, f6, f44 C 01 06,15 0 0
422 1.1 mrg (p6) add r14 = r30, r27, 1 C 02 0 0 -- carry pending: sum + 1
423 1.1 mrg ldf8 f47 = [rp], 8 C 03 0 0
424 1.1 mrg xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
425 1.1 mrg (p7) add r14 = r30, r27 C 05 0 0 -- no carry pending: plain sum
426 1.1 mrg ;;
427 1.1 mrg .pred.rel "mutex", p6, p7
428 1.1 mrg ldf8 f32 = [up], 8 C 06 1 1
429 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1 -- with carry-in the add wrapped iff sum <= addend
430 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1 -- without carry-in the add wrapped iff sum < addend
431 1.1 mrg getf.sig r26 = f38 C 09 25 2 1
432 1.1 mrg st8 [r20] = r14, 8 C 10 2 1 -- store result limb, advance store pointer
433 1.1 mrg nop.b 0 C 11 2 1
434 1.1 mrg ;;
435 1.1 mrg .LL00: C mid-loop entry used by the n mod 4 == 0 feed-in (.grt8)
436 1.1 mrg .pred.rel "mutex", p8, p9
437 1.1 mrg getf.sig r30 = f42 C 12 28 3 2
438 1.1 mrg xma.l f37 = f33, f6, f45 C 13 18,27 3 2
439 1.1 mrg (p8) add r16 = r31, r24, 1 C 14 3 2
440 1.1 mrg ldf8 f44 = [rp], 8 C 15 3 2
441 1.1 mrg xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
442 1.1 mrg (p9) add r16 = r31, r24 C 17 3 2
443 1.1 mrg ;;
444 1.1 mrg .pred.rel "mutex", p8, p9
445 1.1 mrg ldf8 f33 = [up], 8 C 18 4 3
446 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
447 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
448 1.1 mrg getf.sig r27 = f39 C 21 37 5 3
449 1.1 mrg st8 [r20] = r16, 8 C 22 5 3
450 1.1 mrg nop.b 0 C 23 5 3
451 1.1 mrg ;;
452 1.1 mrg .LL11: C mid-loop entry used by the n mod 4 == 3 feed-in (.grt7)
453 1.1 mrg .pred.rel "mutex", p6, p7
454 1.1 mrg getf.sig r31 = f43 C 24 40 6 4
455 1.1 mrg xma.l f38 = f34, f6, f46 C 25 30,39 6 4
456 1.1 mrg (p6) add r14 = r28, r25, 1 C 26 6 4
457 1.1 mrg ldf8 f45 = [rp], 8 C 27 6 4
458 1.1 mrg xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
459 1.1 mrg (p7) add r14 = r28, r25 C 29 6 4
460 1.1 mrg ;;
461 1.1 mrg .pred.rel "mutex", p6, p7
462 1.1 mrg ldf8 f34 = [up], 8 C 30 7 5
463 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
464 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
465 1.1 mrg getf.sig r24 = f36 C 33 01 8 5
466 1.1 mrg st8 [r20] = r14, 8 C 34 8 5
467 1.1 mrg nop.b 0 C 35 8 5
468 1.1 mrg ;;
469 1.1 mrg .LL10: C mid-loop entry used by the n mod 4 == 2 feed-in (.grt6)
470 1.1 mrg .pred.rel "mutex", p8, p9
471 1.1 mrg getf.sig r28 = f40 C 36 04 9 6
472 1.1 mrg xma.l f39 = f35, f6, f47 C 37 42,03 9 6
473 1.1 mrg (p8) add r16 = r29, r26, 1 C 38 9 6
474 1.1 mrg ldf8 f46 = [rp], 8 C 39 9 6
475 1.1 mrg xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
476 1.1 mrg (p9) add r16 = r29, r26 C 41 9 6
477 1.1 mrg ;;
478 1.1 mrg .pred.rel "mutex", p8, p9
479 1.1 mrg ldf8 f35 = [up], 8 C 42 10 7
480 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
481 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
482 1.1 mrg getf.sig r25 = f37 C 45 13 11 7
483 1.1 mrg st8 [r20] = r16, 8 C 46 11 7
484 1.1 mrg br.cloop.dptk .Loop C 47 11 7 -- repeat while ar.lc is nonzero
485 1.1 mrg C *** MAIN LOOP END ***
486 1.1 mrg ;;
487 1.1 mrg .Le0: C wind-down: same carry chain as the loop while the remaining loads, multiplies and register moves drop out step by step
488 1.1 mrg .pred.rel "mutex", p6, p7
489 1.1 mrg getf.sig r29 = f41 C
490 1.1 mrg xma.l f36 = f32, f6, f44 C
491 1.1 mrg (p6) add r14 = r30, r27, 1 C
492 1.1 mrg ldf8 f47 = [rp], 8 C the only load left: the rp limb paired with f35 in the last product
493 1.1 mrg xma.hu f40 = f32, f6, f44 C
494 1.1 mrg (p7) add r14 = r30, r27 C
495 1.1 mrg ;;
496 1.1 mrg .pred.rel "mutex", p6, p7
497 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
498 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
499 1.1 mrg getf.sig r26 = f38 C
500 1.1 mrg st8 [r20] = r14, 8 C
501 1.1 mrg ;;
502 1.1 mrg .pred.rel "mutex", p8, p9
503 1.1 mrg getf.sig r30 = f42 C
504 1.1 mrg xma.l f37 = f33, f6, f45 C
505 1.1 mrg (p8) add r16 = r31, r24, 1 C
506 1.1 mrg xma.hu f41 = f33, f6, f45 C
507 1.1 mrg (p9) add r16 = r31, r24 C
508 1.1 mrg ;;
509 1.1 mrg .pred.rel "mutex", p8, p9
510 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C
511 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C
512 1.1 mrg getf.sig r27 = f39 C
513 1.1 mrg st8 [r20] = r16, 8 C
514 1.1 mrg ;;
515 1.1 mrg .Lcj8: C the n == 8 feed-in joins here
516 1.1 mrg .pred.rel "mutex", p6, p7
517 1.1 mrg getf.sig r31 = f43 C
518 1.1 mrg xma.l f38 = f34, f6, f46 C
519 1.1 mrg (p6) add r14 = r28, r25, 1 C
520 1.1 mrg xma.hu f42 = f34, f6, f46 C
521 1.1 mrg (p7) add r14 = r28, r25 C
522 1.1 mrg ;;
523 1.1 mrg .pred.rel "mutex", p6, p7
524 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C
525 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C
526 1.1 mrg getf.sig r24 = f36 C
527 1.1 mrg st8 [r20] = r14, 8 C
528 1.1 mrg ;;
529 1.1 mrg .Lcj7: C the n == 7 feed-in joins here
530 1.1 mrg .pred.rel "mutex", p8, p9
531 1.1 mrg getf.sig r28 = f40 C
532 1.1 mrg xma.l f39 = f35, f6, f47 C last multiply: f39/f43 hold the product for the final limb
533 1.1 mrg (p8) add r16 = r29, r26, 1 C
534 1.1 mrg xma.hu f43 = f35, f6, f47 C
535 1.1 mrg (p9) add r16 = r29, r26 C
536 1.1 mrg ;;
537 1.1 mrg .pred.rel "mutex", p8, p9
538 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C
539 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C
540 1.1 mrg getf.sig r25 = f37 C
541 1.1 mrg st8 [r20] = r16, 8 C
542 1.1 mrg ;;
543 1.1 mrg .Lcj6: C the n == 6 feed-in joins here; no multiplies remain from this point
544 1.1 mrg .pred.rel "mutex", p6, p7
545 1.1 mrg getf.sig r29 = f41 C
546 1.1 mrg (p6) add r14 = r30, r27, 1 C
547 1.1 mrg (p7) add r14 = r30, r27 C
548 1.1 mrg ;;
549 1.1 mrg .pred.rel "mutex", p6, p7
550 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
551 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
552 1.1 mrg getf.sig r26 = f38 C
553 1.1 mrg st8 [r20] = r14, 8 C
554 1.1 mrg ;;
555 1.1 mrg .Lcj5: C the n == 5 feed-in joins here
556 1.1 mrg .pred.rel "mutex", p8, p9
557 1.1 mrg getf.sig r30 = f42 C
558 1.1 mrg (p8) add r16 = r31, r24, 1 C
559 1.1 mrg (p9) add r16 = r31, r24 C
560 1.1 mrg ;;
561 1.1 mrg .pred.rel "mutex", p8, p9
562 1.1 mrg (p8) cmp.leu p6, p7 = r16, r24 C
563 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r24 C
564 1.1 mrg getf.sig r27 = f39 C
565 1.1 mrg st8 [r20] = r16, 8 C
566 1.1 mrg ;;
567 1.1 mrg .Lcj4: C the n == 4 feed-in joins here
568 1.1 mrg .pred.rel "mutex", p6, p7
569 1.1 mrg getf.sig r8 = f43 C high half of the last product, basis of the value left in r8
570 1.1 mrg (p6) add r14 = r28, r25, 1 C
571 1.1 mrg (p7) add r14 = r28, r25 C
572 1.1 mrg ;;
573 1.1 mrg .pred.rel "mutex", p6, p7
574 1.1 mrg st8 [r20] = r14, 8 C
575 1.1 mrg (p6) cmp.leu p8, p9 = r14, r25 C
576 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r25 C
577 1.1 mrg ;;
578 1.1 mrg .Lcj3: C the n == 3 feed-in joins here, with r8 already set
579 1.1 mrg .pred.rel "mutex", p8, p9
580 1.1 mrg (p8) add r16 = r29, r26, 1 C
581 1.1 mrg (p9) add r16 = r29, r26 C
582 1.1 mrg ;;
583 1.1 mrg .pred.rel "mutex", p8, p9
584 1.1 mrg st8 [r20] = r16, 8 C
585 1.1 mrg (p8) cmp.leu p6, p7 = r16, r26 C
586 1.1 mrg (p9) cmp.ltu p6, p7 = r16, r26 C
587 1.1 mrg ;;
588 1.1 mrg .Lcj2: C the n == 2 feed-in joins here, with r8 already set
589 1.1 mrg .pred.rel "mutex", p6, p7
590 1.1 mrg (p6) add r14 = r30, r27, 1 C
591 1.1 mrg (p7) add r14 = r30, r27 C
592 1.1 mrg ;;
593 1.1 mrg .pred.rel "mutex", p6, p7
594 1.1 mrg st8 [r20] = r14 C last result limb, so no post-increment of the store pointer
595 1.1 mrg (p6) cmp.leu p8, p9 = r14, r27 C
596 1.1 mrg (p7) cmp.ltu p8, p9 = r14, r27 C
597 1.1 mrg ;;
598 1.1 mrg (p8) add r8 = 1, r8 C M I -- fold the final carry into the high limb left in r8
599 1.1 mrg mov.i ar.lc = r2 C I0 -- restore the saved ar.lc
600 1.1 mrg br.ret.sptk.many b0 C B
601 1.1 mrg EPILOGUE()
602 1.1 mrg ASM_END()
603