dnl rsh1aors_n.asm revision 1.1.1.1.8.1
1 dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
2
3 dnl Contributed to the GNU project by Torbjorn Granlund.
4
5 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
13
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
18
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C cycles/limb
25 C Itanium: 2.5
26 C Itanium 2: 1.5
27
28 C TODO
29 C * Rewrite function entry code using aorslsh1_n.asm style.
30 C * Micro-optimize feed-in and wind-down code.
31
32 C INPUT PARAMETERS
C rp  address of the result limbs (every st8 in this file stores through rp)
C up  address of the first source operand (read with ld8, post-incremented by 8)
C vp  address of the second source operand (read with ld8, post-incremented by 8)
C n   number of limbs to process
33 define(`rp',`r32')
34 define(`up',`r33')
35 define(`vp',`r34')
36 define(`n',`r35')
37
C Operation selection. The macros below are only defined when the matching
C OPERATION_ symbol is defined; presumably the build defines exactly one of
C them per object file (TODO confirm against the build machinery).
C   ADDSUB  the limb operation: add or sub
C   PRED    unsigned compare applied as cmp.PRED p = w, u after w = u ADDSUB v;
C           w ltu u means the addition carried, w gtu u means the subtraction
C           borrowed
C   INCR    value added to a limb to apply an incoming carry (1) or borrow (-1)
C   LIM     limb value for which applying INCR wraps around, so that the
C           carry/borrow propagates further (-1 for add, 0 for sub)
C   func    name of the entry point passed to PROLOGUE
38 ifdef(`OPERATION_rsh1add_n',`
39 define(ADDSUB, add)
40 define(PRED, ltu)
41 define(INCR, 1)
42 define(LIM, -1)
43 define(func, mpn_rsh1add_n)
44 ')
45 ifdef(`OPERATION_rsh1sub_n',`
46 define(ADDSUB, sub)
47 define(PRED, gtu)
48 define(INCR, -1)
49 define(LIM, 0)
50 define(func, mpn_rsh1sub_n)
51 ')
52
53 C Some useful aliases for registers we use
C   u0-u3  limbs loaded from up
C   v0-v3  limbs loaded from vp
C   w0-w3  u ADDSUB v for one limb, later adjusted by INCR for an incoming carry
C   x0-x3  result limbs after the one-bit right shift, waiting to be stored
C Note that r14 (u0) is also used directly as scratch for n mod 4 at function
C entry, before the first load into u0.
54 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
55 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
56 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
57 define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
58
C NOTE(review): MULFUNC_PROLOGUE and ASM_START are defined outside this file;
C presumably MULFUNC_PROLOGUE announces which entry points this source can
C provide - confirm.
59 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
60
61 ASM_START()
C ---------------------------------------------------------------------------
C func: mpn_rsh1add_n or mpn_rsh1sub_n, depending on the OPERATION_ symbol.
C
C Computes rp[] = (up[] ADDSUB vp[]) >> 1 over n limbs (see the file header).
C In:    rp, up, vp, n in r32-r35
C Out:   r8 = bit 0 of the least significant sum/difference limb, that is the
C        bit shifted out at the bottom (presumably the return value - confirm
C        against the C prototype)
C Uses:  r2, r8-r11, r14-r26, r30, r31, p6-p9, p15, ar.lc
C        ar.lc is copied to r2 on entry and restored from r2 before returning.
C NOTE(review): n = 0 is not special-cased; it would take the n mod 4 == 0 path,
C which always processes four limbs. Callers presumably guarantee n >= 1.
C ---------------------------------------------------------------------------
62 PROLOGUE(func)
63 .prologue
64 .save ar.lc, r2
65 .body
C With HAVE_ABI_32 the three pointers are converted with addp4 and the count
C is zero-extended from 32 bits with zxt4 before anything else is done.
66 ifdef(`HAVE_ABI_32',`
67 addp4 rp = 0, rp C M I
68 addp4 up = 0, up C M I
69 addp4 vp = 0, vp C M I
70 zxt4 n = n C I
71 ;;
72 ')
C Load limb 0 of both operands (r10 from up, r11 from vp), advancing the
C pointers by 8, and keep the incoming ar.lc in r2.
73 {.mmi; ld8 r11 = [vp], 8 C M01
74 ld8 r10 = [up], 8 C M01
75 mov.i r2 = ar.lc C I0
C r14 = n mod 4, p15 = (n > 4), n = n - 4.
76 }{.mmi; and r14 = 3, n C M I
77 cmp.lt p15, p0 = 4, n C M I
78 add n = -4, n C M I
79 ;;
C Dispatch on n mod 4: 1 -> .Lb01, 2 -> .Lb10, 3 -> .Lb11; when none of the
C branches is taken (n mod 4 == 0) execution falls through to .Lb00.
80 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
81 cmp.eq p7, p0 = 2, r14 C M I
82 cmp.eq p8, p0 = 3, r14 C M I
83 }{.bbb
84 (p6) br.dptk .Lb01 C B
85 (p7) br.dptk .Lb10 C B
86 (p8) br.dptk .Lb11 C B
87 }
88
C .Lb00: feed-in code for n mod 4 == 0.
C Limb 0 (already in r10/r11) is combined into w3; limbs 1-3 are loaded into
C u0/v0, u1/v1 and u2/v2. n becomes (n - 4) >> 2, the subtraction having been
C done at function entry. p15 (n > 4) selects the looping variant .grt4.
C
C Carry idiom used here and in the rest of the function:
C   w = u ADDSUB v           raw result for one limb
C   cmp.PRED pX = w, u       pX = carry/borrow out of the raw result
C   (pY) cmp.eq.or pX = LIM, w   if the previous limb carried (pY), pX is also
C   (pY) add w = INCR, w         set when w == LIM, and w is adjusted by INCR
C The predicated compare and add sit in the same instruction group, so the
C compare tests w before the adjustment.
89 .Lb00: ld8 v0 = [vp], 8 C M01
90 ld8 u0 = [up], 8 C M01
91 shr.u n = n, 2 C I0
92 ;;
93 ld8 v1 = [vp], 8 C M01
94 ld8 u1 = [up], 8 C M01
95 ADDSUB w3 = r10, r11 C M I
96 ;;
97 ld8 v2 = [vp], 8 C M01
98 ld8 u2 = [up], 8 C M01
99 (p15) br.dpnt .grt4 C B
100 ;;
101
C n == 4: straight-line code, no loop. r8 receives bit 0 of limb 0.
102 cmp.PRED p7, p0 = w3, r10 C M I
103 and r8 = 1, w3 C M I
104 ADDSUB w0 = u0, v0 C M I
105 ;;
106 cmp.PRED p8, p0 = w0, u0 C M I
107 ADDSUB w1 = u1, v1 C M I
108 ;;
109 cmp.PRED p9, p0 = w1, u1 C M I
110 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
111 (p7) add w0 = INCR, w0 C M I
112 ;;
C x3 = low 64 bits of the 128-bit value w0:w3 shifted right by one, which is
C result limb 0. The remaining limbs are finished from .Lcj4.
113 shrp x3 = w0, w3, 1 C I0
114 ADDSUB w2 = u2, v2 C M I
115 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
116 (p8) add w1 = INCR, w1 C M I
117 br .Lcj4 C B
118
C .grt4: n > 4. Same computations as above, interleaved with loads of the
C following limbs. ar.lc is set to n - 1, where n already holds
C (original n - 4) >> 2, and the main loop is entered at .LL00.
119 .grt4: ld8 v3 = [vp], 8 C M01
120 cmp.PRED p7, p0 = w3, r10 C M I
121 ld8 u3 = [up], 8 C M01
122 and r8 = 1, w3 C M I
123 ;;
124 ADDSUB w0 = u0, v0 C M I
125 ld8 v0 = [vp], 8 C M01
126 add n = -1, n
127 ;;
128 cmp.PRED p8, p0 = w0, u0 C M I
129 ld8 u0 = [up], 8 C M01
130 ADDSUB w1 = u1, v1 C M I
131 ;;
132 ld8 v1 = [vp], 8 C M01
133 mov.i ar.lc = n C I0
134 cmp.PRED p9, p0 = w1, u1 C M I
135 ld8 u1 = [up], 8 C M01
136 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
137 (p7) add w0 = INCR, w0 C M I
138 ;;
139 ADDSUB w2 = u2, v2 C M I
140 ld8 v2 = [vp], 8 C M01
141 shrp x3 = w0, w3, 1 C I0
142 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
143 (p8) add w1 = INCR, w1 C M I
144 br .LL00 C B
145
146
C .Lb01: feed-in code for n mod 4 == 1.
C Limb 0 (r10/r11) is combined into w2. n becomes (n - 4) >> 2; the value is
C only used on the .grt1 path, which is taken when p15 (n > 4) is set.
147 .Lb01: ADDSUB w2 = r10, r11 C M I
148 shr.u n = n, 2 C I0
149 (p15) br.dpnt .grt1 C B
150 ;;
151
C n == 1: single limb. x2 = w2 >> 1, with bit 63 set when the operation
C carried/borrowed (p6); r8 = bit 0 of w2. p7 receives the complement of p6;
C NOTE(review): no reader of p7 is visible on this path.
152 cmp.PRED p6, p7 = w2, r10 C M I
153 shr.u x2 = w2, 1 C I0
154 and r8 = 1, w2 C M I
155 ;;
156 (p6) dep x2 = -1, x2, 63, 1 C I0
157 br .Lcj1 C B
158
C .grt1: n >= 5. Load limbs 1-4 into u3/v3, u0/v0, u1/v1, u2/v2 and set
C ar.lc = n. p6 = carry/borrow out of limb 0, r8 = bit 0 of limb 0.
C br.cloop branches to .grt5 (decrementing ar.lc) when ar.lc != 0; with
C ar.lc == 0 the code below it runs instead and joins the wind-down at .Lcj5.
159 .grt1: ld8 v3 = [vp], 8 C M01
160 ld8 u3 = [up], 8 C M01
161 ;;
162 ld8 v0 = [vp], 8 C M01
163 ld8 u0 = [up], 8 C M01
164 mov.i ar.lc = n C FIXME swap with next I0
165 ;;
166 ld8 v1 = [vp], 8 C M01
167 ld8 u1 = [up], 8 C M01
168 ;;
169 ld8 v2 = [vp], 8 C M01
170 ld8 u2 = [up], 8 C M01
171 cmp.PRED p6, p0 = w2, r10 C M I
172 and r8 = 1, w2 C M I
173 ADDSUB w3 = u3, v3 C M I
174 br.cloop.dptk .grt5 C B
175 ;;
176
C No further loads: propagate the carry into w3 and w0, form result limb 0 in
C x2 (w3:w2 shifted right by one) and continue at .Lcj5.
177 cmp.PRED p7, p0 = w3, u3 C M I
178 ;;
179 ADDSUB w0 = u0, v0 C M I
180 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
181 (p6) add w3 = INCR, w3 C M I
182 ;;
183 cmp.PRED p8, p0 = w0, u0 C M I
184 shrp x2 = w3, w2, 1 C I0
185 ADDSUB w1 = u1, v1 C M I
186 ;;
187 cmp.PRED p9, p0 = w1, u1 C M I
188 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
189 (p7) add w0 = INCR, w0 C M I
190 br .Lcj5 C B
191
C .grt5: same computations as the block above, interleaved with loads of the
C following limbs; the main loop is then entered at .LL01.
192 .grt5: ld8 v3 = [vp], 8 C M01
193 cmp.PRED p7, p0 = w3, u3 C M I
194 ld8 u3 = [up], 8 C M01
195 ;;
196 ADDSUB w0 = u0, v0 C M I
197 ld8 v0 = [vp], 8 C M01
198 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
199 (p6) add w3 = INCR, w3 C M I
200 ;;
201 cmp.PRED p8, p0 = w0, u0 C M I
202 shrp x2 = w3, w2, 1 C I0
203 ld8 u0 = [up], 8 C M01
204 ADDSUB w1 = u1, v1 C M I
205 ;;
206 ld8 v1 = [vp], 8 C M01
207 cmp.PRED p9, p0 = w1, u1 C M I
208 ld8 u1 = [up], 8 C M01
209 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
210 (p7) add w0 = INCR, w0 C M I
211 br .LL01 C B
212
213
C .Lb10: feed-in code for n mod 4 == 2.
C Limb 0 (r10/r11) is combined into w1; limb 1 is loaded into u2/v2.
C n becomes (n - 4) >> 2; p15 (n > 4) selects the looping variant .grt2.
214 .Lb10: ld8 v2 = [vp], 8 C M01
215 ld8 u2 = [up], 8 C M01
216 shr.u n = n, 2 C I0
217 ADDSUB w1 = r10, r11 C M I
218 (p15) br.dpnt .grt2 C B
219 ;;
220
C n == 2: p9 = carry/borrow out of limb 0, applied to w2; p6 = carry/borrow
C out of limb 1. x1 = w2:w1 shifted right by one, x2 = w2 >> 1. .Lcj2 stores
C both limbs and sets bit 63 of x2 when p6 is set.
221 cmp.PRED p9, p0 = w1, r10 C M I
222 and r8 = 1, w1 C M I
223 ADDSUB w2 = u2, v2 C M I
224 ;;
225 cmp.PRED p6, p0 = w2, u2 C M I
226 ;;
227 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
228 (p9) add w2 = INCR, w2 C M I
229 ;;
230 shrp x1 = w2, w1, 1 C I0
231 shr.u x2 = w2, 1 C I0
232 br .Lcj2 C B
233
C .grt2: n >= 6. Load limbs 2-5, set ar.lc = n, and start the carry chain.
C br.cloop branches to .grt6 (decrementing ar.lc) when ar.lc != 0; with
C ar.lc == 0 the code below it runs instead and joins the wind-down at .Lcj6.
234 .grt2: ld8 v3 = [vp], 8 C M01
235 ld8 u3 = [up], 8 C M01
236 ;;
237 ld8 v0 = [vp], 8 C M01
238 ld8 u0 = [up], 8 C M01
239 mov.i ar.lc = n C I0
240 ;;
241 ld8 v1 = [vp], 8 C M01
242 cmp.PRED p9, p0 = w1, r10 C M I
243 ld8 u1 = [up], 8 C M01
244 and r8 = 1, w1 C M I
245 ;;
246 ADDSUB w2 = u2, v2 C M I
247 ld8 v2 = [vp], 8 C M01
248 ;;
249 cmp.PRED p6, p0 = w2, u2 C M I
250 ld8 u2 = [up], 8 C M01
251 ADDSUB w3 = u3, v3 C M I
252 br.cloop.dptk .grt6 C B
253 ;;
254
C No further loads: result limb 0 goes to x1 (w2:w1 shifted right by one).
255 cmp.PRED p7, p0 = w3, u3 C M I
256 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
257 (p9) add w2 = INCR, w2 C M I
258 ;;
259 shrp x1 = w2, w1, 1 C I0
260 ADDSUB w0 = u0, v0 C M I
261 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
262 (p6) add w3 = INCR, w3 C M I
263 br .Lcj6 C B
264
C .grt6: same computations as the block above, interleaved with loads of the
C following limbs; the main loop is then entered at .LL10.
265 .grt6: ld8 v3 = [vp], 8 C M01
266 cmp.PRED p7, p0 = w3, u3 C M I
267 ld8 u3 = [up], 8 C M01
268 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
269 (p9) add w2 = INCR, w2 C M I
270 ;;
271 shrp x1 = w2, w1, 1 C I0
272 ADDSUB w0 = u0, v0 C M I
273 ld8 v0 = [vp], 8 C M01
274 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
275 (p6) add w3 = INCR, w3 C M I
276 br .LL10 C B
277
278
C .Lb11: feed-in code for n mod 4 == 3.
C Limb 0 (r10/r11) is combined into w0; limbs 1 and 2 are loaded into u1/v1
C and u2/v2. n becomes (n - 4) >> 2; p15 (n > 4) selects the looping variant
C .grt3.
279 .Lb11: ld8 v1 = [vp], 8 C M01
280 ld8 u1 = [up], 8 C M01
281 shr.u n = n, 2 C I0
282 ;;
283 ld8 v2 = [vp], 8 C M01
284 ld8 u2 = [up], 8 C M01
285 ADDSUB w0 = r10, r11 C M I
286 (p15) br.dpnt .grt3 C B
287 ;;
288
C n == 3: carry chain p8 (limb 0) -> p9 (limb 1) -> p6 (limb 2), r8 = bit 0 of
C limb 0, x0 = w1:w0 shifted right by one. The last two limbs are shifted and
C all three are stored from .Lcj3.
289 cmp.PRED p8, p0 = w0, r10 C M I
290 ADDSUB w1 = u1, v1 C M I
291 and r8 = 1, w0 C M I
292 ;;
293 cmp.PRED p9, p0 = w1, u1 C M I
294 ;;
295 ADDSUB w2 = u2, v2 C M I
296 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
297 (p8) add w1 = INCR, w1 C M I
298 ;;
299 cmp.PRED p6, p0 = w2, u2 C M I
300 shrp x0 = w1, w0, 1 C I0
301 ;;
302 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
303 (p9) add w2 = INCR, w2 C M I
304 br .Lcj3 C B
305
C .grt3: n >= 7. Load limbs 3-6, set ar.lc = n, and start the carry chain.
C br.cloop branches to .grt7 (decrementing ar.lc) when ar.lc != 0; with
C ar.lc == 0 the code below it runs instead and joins the wind-down at .Lcj7.
306 .grt3: ld8 v3 = [vp], 8 C M01
307 ld8 u3 = [up], 8 C M01
308 ;;
309 ld8 v0 = [vp], 8 C M01
310 mov.i ar.lc = n C I0
311 cmp.PRED p8, p0 = w0, r10 C M I
312 ld8 u0 = [up], 8 C M01
313 ADDSUB w1 = u1, v1 C M I
314 and r8 = 1, w0 C M I
315 ;;
316 ld8 v1 = [vp], 8 C M01
317 cmp.PRED p9, p0 = w1, u1 C M I
318 ld8 u1 = [up], 8 C M01
319 ;;
320 ADDSUB w2 = u2, v2 C M I
321 ld8 v2 = [vp], 8 C M01
322 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
323 (p8) add w1 = INCR, w1 C M I
324 ;;
325 cmp.PRED p6, p0 = w2, u2 C M I
326 shrp x0 = w1, w0, 1 C I0
327 ld8 u2 = [up], 8 C M01
328 ADDSUB w3 = u3, v3 C M I
329 br.cloop.dptk .grt7 C B
330 ;;
331
C No further loads: apply the carry to w2 and continue at .Lcj7.
332 cmp.PRED p7, p0 = w3, u3 C M I
333 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
334 (p9) add w2 = INCR, w2 C M I
335 br .Lcj7 C B
336
C .grt7: same computation, plus the load of the next limb into u3/v3; the main
C loop is then entered at .LL11.
337 .grt7: ld8 v3 = [vp], 8 C M01
338 cmp.PRED p7, p0 = w3, u3 C M I
339 ld8 u3 = [up], 8 C M01
340 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
341 (p9) add w2 = INCR, w2 C M I
342 br .LL11 C B
343
344
345 C *** MAIN LOOP START ***
C One full pass from .Loop to the br.cloop stores four result limbs
C (x3, x0, x1, x2) and loads four further limbs from each of up and vp.
C Work on neighbouring limbs is interleaved: while one limb is being stored,
C later limbs are being shifted (shrp), carry-adjusted (predicated cmp.eq.or
C and add of INCR), combined (ADDSUB) and loaded.
C   shrp xK = wHI, wLO, 1   result limb = low 64 bits of wHI:wLO >> 1
C   carry chain             p7 (w3) -> p8 (w0) -> p9 (w1) -> p6 (w2) -> p7 ...
C .LL11, .LL10, .LL01 and .LL00 are the entry points used by the feed-in code
C for n mod 4 == 3, 2, 1 and 0 respectively.
C br.cloop branches back to .Loop and decrements ar.lc while ar.lc != 0; when
C ar.lc == 0 execution falls through to the wind-down code that follows.
346 ALIGN(32)
347 .Loop: st8 [rp] = x3, 8 C M23
348 ld8 v3 = [vp], 8 C M01
349 cmp.PRED p7, p0 = w3, u3 C M I
350 ld8 u3 = [up], 8 C M01
351 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
352 (p9) add w2 = INCR, w2 C M I
353 ;;
354 .LL11: st8 [rp] = x0, 8 C M23
355 shrp x1 = w2, w1, 1 C I0
356 ADDSUB w0 = u0, v0 C M I
357 ld8 v0 = [vp], 8 C M01
358 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
359 (p6) add w3 = INCR, w3 C M I
360 ;;
361 .LL10: cmp.PRED p8, p0 = w0, u0 C M I
362 shrp x2 = w3, w2, 1 C I0
363 nop.b 0
364 ld8 u0 = [up], 8 C M01
365 ADDSUB w1 = u1, v1 C M I
366 nop.b 0
367 ;;
368 st8 [rp] = x1, 8 C M23
369 ld8 v1 = [vp], 8 C M01
370 cmp.PRED p9, p0 = w1, u1 C M I
371 ld8 u1 = [up], 8 C M01
372 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
373 (p7) add w0 = INCR, w0 C M I
374 ;;
375 .LL01: st8 [rp] = x2, 8 C M23
376 shrp x3 = w0, w3, 1 C I0
377 ADDSUB w2 = u2, v2 C M I
378 ld8 v2 = [vp], 8 C M01
379 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
380 (p8) add w1 = INCR, w1 C M I
381 ;;
382 .LL00: cmp.PRED p6, p0 = w2, u2 C M I
383 shrp x0 = w1, w0, 1 C I0
384 nop.b 0
385 ld8 u2 = [up], 8 C M01
386 ADDSUB w3 = u3, v3 C M I
387 br.cloop.dptk .Loop C B
388 ;;
389 C *** MAIN LOOP END ***
390
C Wind-down: the same instruction sequence as the main loop, without loads.
C It finishes the limbs that are already in registers.
C .Lskip is reached by falling out of the main loop (no branch to it is visible
C in this file) and stores eight result limbs before returning.
C .LcjN (N = 7..1) are join points for the feed-in code; entering at .LcjN
C leaves exactly N st8 instructions to execute before the return.
391 .Lskip: st8 [rp] = x3, 8 C M23
392 cmp.PRED p7, p0 = w3, u3 C M I
393 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
394 (p9) add w2 = INCR, w2 C M I
395 ;;
396 .Lcj7: st8 [rp] = x0, 8 C M23
397 shrp x1 = w2, w1, 1 C I0
398 ADDSUB w0 = u0, v0 C M I
399 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
400 (p6) add w3 = INCR, w3 C M I
401 ;;
402 .Lcj6: cmp.PRED p8, p0 = w0, u0 C M I
403 shrp x2 = w3, w2, 1 C I0
404 ADDSUB w1 = u1, v1 C M I
405 ;;
406 st8 [rp] = x1, 8 C M23
407 cmp.PRED p9, p0 = w1, u1 C M I
408 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
409 (p7) add w0 = INCR, w0 C M I
410 ;;
411 .Lcj5: st8 [rp] = x2, 8 C M23
412 shrp x3 = w0, w3, 1 C I0
413 ADDSUB w2 = u2, v2 C M I
414 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
415 (p8) add w1 = INCR, w1 C M I
416 ;;
417 .Lcj4: cmp.PRED p6, p0 = w2, u2 C M I
418 shrp x0 = w1, w0, 1 C I0
419 ;;
420 st8 [rp] = x3, 8 C M23
421 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
422 (p9) add w2 = INCR, w2 C M I
423 ;;
C w2 is the most significant limb: x2 = w2 >> 1 has no higher limb to take its
C top bit from, so bit 63 is set from the final carry/borrow (p6) below.
424 .Lcj3: st8 [rp] = x0, 8 C M23
425 shrp x1 = w2, w1, 1 C I0
426 shr.u x2 = w2, 1 C I0
427 ;;
428 .Lcj2: st8 [rp] = x1, 8 C M23
429 (p6) dep x2 = -1, x2, 63, 1 C I0
430 ;;
C Store the last limb (no pointer update), restore ar.lc from r2 and return.
431 .Lcj1: st8 [rp] = x2 C M23
432 mov.i ar.lc = r2 C I0
433 br.ret.sptk.many b0 C B
434 EPILOGUE()
435