Home | History | Annotate | Line # | Download | only in ia64
rsh1aors_n.asm revision 1.1.1.1.8.1
      1 dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
      2 
      3 dnl  Contributed to the GNU project by Torbjorn Granlund.
      4 
      5 dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of the GNU Lesser General Public License as published
     11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     12 dnl  your option) any later version.
     13 
     14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     17 dnl  License for more details.
     18 
     19 dnl  You should have received a copy of the GNU Lesser General Public License
     20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     21 
     22 include(`../config.m4')
     23 
     24 C         cycles/limb
     25 C Itanium:    2.5
     26 C Itanium 2:  1.5
     27 
     28 C TODO
     29 C  * Rewrite function entry code using aorslsh1_n.asm style.
     30 C  * Micro-optimize feed-in and wind-down code.
     31 
     32 C INPUT PARAMETERS
     33 define(`rp',`r32')
     34 define(`up',`r33')
     35 define(`vp',`r34')
     36 define(`n',`r35')
     37 
     38 ifdef(`OPERATION_rsh1add_n',`
     39   define(ADDSUB,       add)
     40   define(PRED,	       ltu)
     41   define(INCR,	       1)
     42   define(LIM,	       -1)
     43   define(func, mpn_rsh1add_n)
     44 ')
     45 ifdef(`OPERATION_rsh1sub_n',`
     46   define(ADDSUB,       sub)
     47   define(PRED,	       gtu)
     48   define(INCR,	       -1)
     49   define(LIM,	       0)
     50   define(func, mpn_rsh1sub_n)
     51 ')
     52 
     53 C Some useful aliases for registers we use
     54 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
     55 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
     56 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
     57 define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
     58 
     59 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
     60 
        C func computes rp[] = (up[] ADDSUB vp[]) >> 1 over n 64-bit limbs, where
        C ADDSUB, PRED, INCR and LIM are selected by the OPERATION_* defines above.
        C
        C r8, the integer return register of the IA-64 calling convention, receives
        C bit 0 of the least significant sum/difference limb, i.e. the bit that the
        C right shift discards.  The carry/borrow out of the most significant limb
        C is deposited into bit 63 of the last limb stored.
        C
        C Carry handling: after w = u ADDSUB v, cmp.PRED w,u sets the carry-out
        C predicate (w < u for add, w > u for sub).  When the previous limb carried,
        C cmp.eq.or additionally sets it if w equals LIM, and w is adjusted by INCR.
        C p7, p8, p9 and p6 hold the carry out of w3, w0, w1 and w2 respectively.
        C Each stored limb is formed with shrp from two adjacent w values.
        C
        C Structure: the entry code dispatches on n mod 4 to .Lb00, .Lb01, .Lb10 or
        C .Lb11.  With p15 clear (n <= 4) each of these finishes directly in the
        C wind-down code.  With p15 set (n > 4) a feed-in sequence (.grtK) runs and
        C then either enters the 4-way unrolled loop at the matching .LLxx label or,
        C if no loop trip remains, continues in the wind-down code (.LcjK).
        C
        C ar.lc is saved in r2 on entry and restored before returning.
     61 ASM_START()
     62 PROLOGUE(func)
     63 	.prologue
     64 	.save	ar.lc, r2
     65 	.body
        C With HAVE_ABI_32 the three pointer arguments are passed through addp4 and
        C n through zxt4 before any use.
     66 ifdef(`HAVE_ABI_32',`
     67 	addp4		rp = 0, rp		C			M I
     68 	addp4		up = 0, up		C			M I
     69 	addp4		vp = 0, vp		C			M I
     70 	zxt4		n = n			C			I
     71 	;;
     72 ')
        C Load limb 0 of both operands into r10/r11, save ar.lc, and classify n.
        C r14 = n mod 4 (r14 is also u0, which is not loaded until after the
        C dispatch).  p15 = (n > 4).  n is biased by -4 here and shifted right by 2
        C on each dispatch path, giving the value later written to ar.lc.
     73  {.mmi;	ld8		r11 = [vp], 8		C			M01
     74 	ld8		r10 = [up], 8		C			M01
     75 	mov.i		r2 = ar.lc		C			I0
     76 }{.mmi;	and		r14 = 3, n		C			M I
     77 	cmp.lt		p15, p0 = 4, n		C			M I
     78 	add		n = -4, n		C			M I
     79 	;;
     80 }{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
     81 	cmp.eq		p7, p0 = 2, r14		C			M I
     82 	cmp.eq		p8, p0 = 3, r14		C			M I
     83 }{.bbb
     84   (p6)	br.dptk		.Lb01			C			B
     85   (p7)	br.dptk		.Lb10			C			B
     86   (p8)	br.dptk		.Lb11			C			B
     87 }
     88 
        C n mod 4 = 0 (fall-through case): limb 0 is summed into w3.
     89 .Lb00:	ld8		v0 = [vp], 8		C			M01
     90 	ld8		u0 = [up], 8		C			M01
     91 	shr.u		n = n, 2		C			I0
     92 	;;
     93 	ld8		v1 = [vp], 8		C			M01
     94 	ld8		u1 = [up], 8		C			M01
     95 	ADDSUB		w3 = r10, r11		C			M I
     96 	;;
     97 	ld8		v2 = [vp], 8		C			M01
     98 	ld8		u2 = [up], 8		C			M01
     99   (p15)	br.dpnt		.grt4			C			B
    100 	;;
    101 
        C n = 4: all limbs are loaded, finish through .Lcj4.
    102 	cmp.PRED	p7, p0 = w3, r10	C			M I
    103 	and		r8 = 1, w3		C			M I
    104 	ADDSUB		w0 = u0, v0		C			M I
    105 	;;
    106 	cmp.PRED	p8, p0 = w0, u0		C			M I
    107 	ADDSUB		w1 = u1, v1		C			M I
    108 	;;
    109 	cmp.PRED	p9, p0 = w1, u1		C			M I
    110    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    111    (p7)	add		w0 = INCR, w0		C			M I
    112 	;;
    113 	shrp		x3 = w0, w3, 1		C			I0
    114 	ADDSUB		w2 = u2, v2		C			M I
    115    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    116    (p8)	add		w1 = INCR, w1		C			M I
    117 	br		.Lcj4			C			B
    118 
        C n = 8, 12, ...: feed-in, then enter the loop at its last stage, .LL00.
    119 .grt4:	ld8		v3 = [vp], 8		C			M01
    120 	cmp.PRED	p7, p0 = w3, r10	C			M I
    121 	ld8		u3 = [up], 8		C			M01
    122 	and		r8 = 1, w3		C			M I
    123 	;;
    124 	ADDSUB		w0 = u0, v0		C			M I
    125 	ld8		v0 = [vp], 8		C			M01
    126 	add		n = -1, n		C n = 0 mod 4 needs one trip less than (n-4)>>2
    127 	;;
    128 	cmp.PRED	p8, p0 = w0, u0		C			M I
    129 	ld8		u0 = [up], 8		C			M01
    130 	ADDSUB		w1 = u1, v1		C			M I
    131 	;;
    132 	ld8		v1 = [vp], 8		C			M01
    133 	mov.i		ar.lc = n		C			I0
    134 	cmp.PRED	p9, p0 = w1, u1		C			M I
    135 	ld8		u1 = [up], 8		C			M01
    136    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    137    (p7)	add		w0 = INCR, w0		C			M I
    138 	;;
    139 	ADDSUB		w2 = u2, v2		C			M I
    140 	ld8		v2 = [vp], 8		C			M01
    141 	shrp		x3 = w0, w3, 1		C			I0
    142    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    143    (p8)	add		w1 = INCR, w1		C			M I
    144 	br		.LL00			C			B
    145 
    146 
        C n mod 4 = 1: limb 0 is summed into w2.
    147 .Lb01:	ADDSUB		w2 = r10, r11		C			M I
    148 	shr.u		n = n, 2		C			I0
    149   (p15)	br.dpnt		.grt1			C			B
    150 	;;
    151 
        C n = 1: shift the single limb and put the carry/borrow (p6) in bit 63.
    152 	cmp.PRED	p6, p7 = w2, r10	C			M I
    153 	shr.u		x2 = w2, 1		C			I0
    154 	and		r8 = 1, w2		C			M I
    155 	;;
    156    (p6)	dep		x2 = -1, x2, 63, 1	C			I0
    157 	br		.Lcj1			C			B
    158 
        C n = 5, 9, ...: load four more limb pairs and set the loop count.
    159 .grt1:	ld8		v3 = [vp], 8		C			M01
    160 	ld8		u3 = [up], 8		C			M01
    161 	;;
    162 	ld8		v0 = [vp], 8		C			M01
    163 	ld8		u0 = [up], 8		C			M01
    164 	mov.i		ar.lc = n		C FIXME swap with next	I0
    165 	;;
    166 	ld8		v1 = [vp], 8		C			M01
    167 	ld8		u1 = [up], 8		C			M01
    168 	;;
    169 	ld8		v2 = [vp], 8		C			M01
    170 	ld8		u2 = [up], 8		C			M01
    171 	cmp.PRED	p6, p0 = w2, r10	C			M I
    172 	and		r8 = 1, w2		C			M I
    173 	ADDSUB		w3 = u3, v3		C			M I
    174 	br.cloop.dptk	.grt5			C			B
    175 	;;
    176 
        C n = 5 (ar.lc was zero): nothing more to load, finish through .Lcj5.
    177 	cmp.PRED	p7, p0 = w3, u3		C			M I
    178 	;;
    179 	ADDSUB		w0 = u0, v0		C			M I
    180    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    181    (p6)	add		w3 = INCR, w3		C			M I
    182 	;;
    183 	cmp.PRED	p8, p0 = w0, u0		C			M I
    184 	shrp		x2 = w3, w2, 1		C			I0
    185 	ADDSUB		w1 = u1, v1		C			M I
    186 	;;
    187 	cmp.PRED	p9, p0 = w1, u1		C			M I
    188    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    189    (p7)	add		w0 = INCR, w0		C			M I
    190 	br		.Lcj5			C			B
    191 
        C n = 9, 13, ...: same steps with loads interleaved, then enter at .LL01.
    192 .grt5:	ld8		v3 = [vp], 8		C			M01
    193 	cmp.PRED	p7, p0 = w3, u3		C			M I
    194 	ld8		u3 = [up], 8		C			M01
    195 	;;
    196 	ADDSUB		w0 = u0, v0		C			M I
    197 	ld8		v0 = [vp], 8		C			M01
    198    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    199    (p6)	add		w3 = INCR, w3		C			M I
    200 	;;
    201 	cmp.PRED	p8, p0 = w0, u0		C			M I
    202 	shrp		x2 = w3, w2, 1		C			I0
    203 	ld8		u0 = [up], 8		C			M01
    204 	ADDSUB		w1 = u1, v1		C			M I
    205 	;;
    206 	ld8		v1 = [vp], 8		C			M01
    207 	cmp.PRED	p9, p0 = w1, u1		C			M I
    208 	ld8		u1 = [up], 8		C			M01
    209    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    210    (p7)	add		w0 = INCR, w0		C			M I
    211 	br		.LL01			C			B
    212 
    213 
        C n mod 4 = 2: limb 0 is summed into w1.
    214 .Lb10:	ld8		v2 = [vp], 8		C			M01
    215 	ld8		u2 = [up], 8		C			M01
    216 	shr.u		n = n, 2		C			I0
    217 	ADDSUB		w1 = r10, r11		C			M I
    218   (p15)	br.dpnt		.grt2			C			B
    219 	;;
    220 
        C n = 2: finish through .Lcj2.
    221 	cmp.PRED	p9, p0 = w1, r10	C			M I
    222 	and		r8 = 1, w1		C			M I
    223 	ADDSUB		w2 = u2, v2		C			M I
    224 	;;
    225 	cmp.PRED	p6, p0 = w2, u2		C			M I
    226 	;;
    227    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    228    (p9)	add		w2 = INCR, w2		C			M I
    229 	;;
    230 	shrp		x1 = w2, w1, 1		C			I0
    231 	shr.u		x2 = w2, 1		C			I0
    232 	br		.Lcj2			C			B
    233 
        C n = 6, 10, ...: load four more limb pairs and set the loop count.
    234 .grt2:	ld8		v3 = [vp], 8		C			M01
    235 	ld8		u3 = [up], 8		C			M01
    236 	;;
    237 	ld8		v0 = [vp], 8		C			M01
    238 	ld8		u0 = [up], 8		C			M01
    239 	mov.i		ar.lc = n		C			I0
    240 	;;
    241 	ld8		v1 = [vp], 8		C			M01
    242 	cmp.PRED	p9, p0 = w1, r10	C			M I
    243 	ld8		u1 = [up], 8		C			M01
    244 	and		r8 = 1, w1		C			M I
    245 	;;
    246 	ADDSUB		w2 = u2, v2		C			M I
    247 	ld8		v2 = [vp], 8		C			M01
    248 	;;
    249 	cmp.PRED	p6, p0 = w2, u2		C			M I
    250 	ld8		u2 = [up], 8		C			M01
    251 	ADDSUB		w3 = u3, v3		C			M I
    252 	br.cloop.dptk	.grt6			C			B
    253 	;;
    254 
        C n = 6 (ar.lc was zero): finish through .Lcj6.
    255 	cmp.PRED	p7, p0 = w3, u3		C			M I
    256    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    257    (p9)	add		w2 = INCR, w2		C			M I
    258 	;;
    259 	shrp		x1 = w2, w1, 1		C			I0
    260 	ADDSUB		w0 = u0, v0		C			M I
    261    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    262    (p6)	add		w3 = INCR, w3		C			M I
    263 	br		.Lcj6			C			B
    264 
        C n = 10, 14, ...: same steps with loads interleaved, then enter at .LL10.
    265 .grt6:	ld8		v3 = [vp], 8		C			M01
    266 	cmp.PRED	p7, p0 = w3, u3		C			M I
    267 	ld8		u3 = [up], 8		C			M01
    268    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    269    (p9)	add		w2 = INCR, w2		C			M I
    270 	;;
    271 	shrp		x1 = w2, w1, 1		C			I0
    272 	ADDSUB		w0 = u0, v0		C			M I
    273 	ld8		v0 = [vp], 8		C			M01
    274    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    275    (p6)	add		w3 = INCR, w3		C			M I
    276 	br		.LL10			C			B
    277 
    278 
        C n mod 4 = 3: limb 0 is summed into w0.
    279 .Lb11:	ld8		v1 = [vp], 8		C			M01
    280 	ld8		u1 = [up], 8		C			M01
    281 	shr.u		n = n, 2		C			I0
    282 	;;
    283 	ld8		v2 = [vp], 8		C			M01
    284 	ld8		u2 = [up], 8		C			M01
    285 	ADDSUB		w0 = r10, r11		C			M I
    286   (p15)	br.dpnt		.grt3			C			B
    287 	;;
    288 
        C n = 3: finish through .Lcj3.
    289 	cmp.PRED	p8, p0 = w0, r10	C			M I
    290 	ADDSUB		w1 = u1, v1		C			M I
    291 	and		r8 = 1, w0		C			M I
    292 	;;
    293 	cmp.PRED	p9, p0 = w1, u1		C			M I
    294 	;;
    295 	ADDSUB		w2 = u2, v2		C			M I
    296    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    297    (p8)	add		w1 = INCR, w1		C			M I
    298 	;;
    299 	cmp.PRED	p6, p0 = w2, u2		C			M I
    300 	shrp		x0 = w1, w0, 1		C			I0
    301 	;;
    302    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    303    (p9)	add		w2 = INCR, w2		C			M I
    304 	br		.Lcj3			C			B
    305 
        C n = 7, 11, ...: load four more limb pairs and set the loop count.
    306 .grt3:	ld8		v3 = [vp], 8		C			M01
    307 	ld8		u3 = [up], 8		C			M01
    308 	;;
    309 	ld8		v0 = [vp], 8		C			M01
    310 	mov.i		ar.lc = n		C			I0
    311 	cmp.PRED	p8, p0 = w0, r10	C			M I
    312 	ld8		u0 = [up], 8		C			M01
    313 	ADDSUB		w1 = u1, v1		C			M I
    314 	and		r8 = 1, w0		C			M I
    315 	;;
    316 	ld8		v1 = [vp], 8		C			M01
    317 	cmp.PRED	p9, p0 = w1, u1		C			M I
    318 	ld8		u1 = [up], 8		C			M01
    319 	;;
    320 	ADDSUB		w2 = u2, v2		C			M I
    321 	ld8		v2 = [vp], 8		C			M01
    322    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    323    (p8)	add		w1 = INCR, w1		C			M I
    324 	;;
    325 	cmp.PRED	p6, p0 = w2, u2		C			M I
    326 	shrp		x0 = w1, w0, 1		C			I0
    327 	ld8		u2 = [up], 8		C			M01
    328 	ADDSUB		w3 = u3, v3		C			M I
    329 	br.cloop.dptk	.grt7			C			B
    330 	;;
    331 
        C n = 7 (ar.lc was zero): finish through .Lcj7.
    332 	cmp.PRED	p7, p0 = w3, u3		C			M I
    333    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    334    (p9)	add		w2 = INCR, w2		C			M I
    335 	br		.Lcj7			C			B
    336 
        C n = 11, 15, ...: same step with loads interleaved, then enter at .LL11.
    337 .grt7:	ld8		v3 = [vp], 8		C			M01
    338 	cmp.PRED	p7, p0 = w3, u3		C			M I
    339 	ld8		u3 = [up], 8		C			M01
    340    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    341    (p9)	add		w2 = INCR, w2		C			M I
    342 	br		.LL11			C			B
    343 
    344 
    345 C *** MAIN LOOP START ***
        C One trip stores x3, x0, x1, x2 (four result limbs) and loads the next four
        C limb pairs.  .LL11, .LL10, .LL01 and .LL00 are the entry points used by the
        C feed-in code for n mod 4 = 3, 2, 1 and 0.  br.cloop counts trips in ar.lc.
    346 	ALIGN(32)
    347 .Loop:	st8		[rp] = x3, 8		C			M23
    348 	ld8		v3 = [vp], 8		C			M01
    349 	cmp.PRED	p7, p0 = w3, u3		C			M I
    350 	ld8		u3 = [up], 8		C			M01
    351    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    352    (p9)	add		w2 = INCR, w2		C			M I
    353 	;;
    354 .LL11:	st8		[rp] = x0, 8		C			M23
    355 	shrp		x1 = w2, w1, 1		C			I0
    356 	ADDSUB		w0 = u0, v0		C			M I
    357 	ld8		v0 = [vp], 8		C			M01
    358    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    359    (p6)	add		w3 = INCR, w3		C			M I
    360 	;;
    361 .LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
    362 	shrp		x2 = w3, w2, 1		C			I0
    363 	nop.b		0
    364 	ld8		u0 = [up], 8		C			M01
    365 	ADDSUB		w1 = u1, v1		C			M I
    366 	nop.b		0
    367 	;;
    368 	st8		[rp] = x1, 8		C			M23
    369 	ld8		v1 = [vp], 8		C			M01
    370 	cmp.PRED	p9, p0 = w1, u1		C			M I
    371 	ld8		u1 = [up], 8		C			M01
    372    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    373    (p7)	add		w0 = INCR, w0		C			M I
    374 	;;
    375 .LL01:	st8		[rp] = x2, 8		C			M23
    376 	shrp		x3 = w0, w3, 1		C			I0
    377 	ADDSUB		w2 = u2, v2		C			M I
    378 	ld8		v2 = [vp], 8		C			M01
    379    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    380    (p8)	add		w1 = INCR, w1		C			M I
    381 	;;
    382 .LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
    383 	shrp		x0 = w1, w0, 1		C			I0
    384 	nop.b		0
    385 	ld8		u2 = [up], 8		C			M01
    386 	ADDSUB		w3 = u3, v3		C			M I
    387 	br.cloop.dptk	.Loop			C			B
    388 	;;
    389 C *** MAIN LOOP END ***
    390 
        C Wind-down: the loop body repeated without loads.  .LcjK is entered with K
        C result limbs still to be stored; .Lskip (loop fall-through) with eight.
    391 .Lskip:	st8		[rp] = x3, 8		C			M23
    392 	cmp.PRED	p7, p0 = w3, u3		C			M I
    393    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    394    (p9)	add		w2 = INCR, w2		C			M I
    395 	;;
    396 .Lcj7:	st8		[rp] = x0, 8		C			M23
    397 	shrp		x1 = w2, w1, 1		C			I0
    398 	ADDSUB		w0 = u0, v0		C			M I
    399    (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
    400    (p6)	add		w3 = INCR, w3		C			M I
    401 	;;
    402 .Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
    403 	shrp		x2 = w3, w2, 1		C			I0
    404 	ADDSUB		w1 = u1, v1		C			M I
    405 	;;
    406 	st8		[rp] = x1, 8		C			M23
    407 	cmp.PRED	p9, p0 = w1, u1		C			M I
    408    (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
    409    (p7)	add		w0 = INCR, w0		C			M I
    410 	;;
    411 .Lcj5:	st8		[rp] = x2, 8		C			M23
    412 	shrp		x3 = w0, w3, 1		C			I0
    413 	ADDSUB		w2 = u2, v2		C			M I
    414    (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
    415    (p8)	add		w1 = INCR, w1		C			M I
    416 	;;
    417 .Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
    418 	shrp		x0 = w1, w0, 1		C			I0
    419 	;;
    420 	st8		[rp] = x3, 8		C			M23
    421    (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
    422    (p9)	add		w2 = INCR, w2		C			M I
    423 	;;
        C w2 is the most significant limb: it is shifted logically and bit 63 is
        C then set from p6, the carry/borrow out of the whole operation.
    424 .Lcj3:	st8		[rp] = x0, 8		C			M23
    425 	shrp		x1 = w2, w1, 1		C			I0
    426 	shr.u		x2 = w2, 1		C			I0
    427 	;;
    428 .Lcj2:	st8		[rp] = x1, 8		C			M23
    429    (p6)	dep		x2 = -1, x2, 63, 1	C			I0
    430 	;;
        C Store the last limb, restore the caller value of ar.lc and return.
    431 .Lcj1:	st8		[rp] = x2		C			M23
    432 	mov.i		ar.lc = r2		C			I0
    433 	br.ret.sptk.many b0			C			B
    434 EPILOGUE()
    435