Home | History | Annotate | Line # | Download | only in ia64
      1 dnl  IA-64 mpn_lshift/mpn_rshift.
      2 
      3 dnl  Contributed to the GNU project by Torbjorn Granlund.
      4 
      5 dnl  Copyright 2000-2005 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C           cycles/limb
     36 C Itanium:      2
     37 C Itanium 2:    1
     38 
     39 C This code is scheduled deeply since the plain shift instructions shr and shl
     40 C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
     41 C these instructions causes a 10 cycle replay trap on Itanium.
     42 
     43 C The ld8 scheduling should probably be decreased to make the function smaller.
     44 C Good lfetch  will make sure we never stall anyway.
     45 
     46 C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
     47 C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
     48 C in the prologue.
     49 
     50 
     51 C INPUT PARAMETERS
     52 define(`rp', `r32')		C destination limb pointer (target of every st8)
     53 define(`up', `r33')		C source limb pointer (source of every ld8)
     54 define(`n',  `r34')		C number of 8-byte limbs; reused as the loop count after setup
     55 define(`cnt',`r35')		C shift count in bits
     56 
     57 define(`tnc',`r9')		C scratch register, set to 64 - cnt at function entry
     58 C FSH/BSH: main and complementary shift; UPD: pointer step per limb; POFF/PUPD: lfetch offset and step.
     59 ifdef(`OPERATION_lshift',`
     60 	define(`FSH',`shl')
     61 	define(`BSH',`shr.u')
     62 	define(`UPD',`-8')
     63 	define(`POFF',`-512')
     64 	define(`PUPD',`-32')
     65 	define(`func',`mpn_lshift')
     66 ')
     67 ifdef(`OPERATION_rshift',`
     68 	define(`FSH',`shr.u')
     69 	define(`BSH',`shl')
     70 	define(`UPD',`8')
     71 	define(`POFF',`512')
     72 	define(`PUPD',`32')
     73 	define(`func',`mpn_rshift')
     74 ')
     75 
     76 MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
     77 
     78 ASM_START()
     79 PROLOGUE(func)
     80 	.prologue
     81 	.save	ar.lc, r2		C r2 keeps the incoming ar.lc until .Lr1
     82 	.body
     83 ifdef(`HAVE_ABI_32',
     84 `	addp4	rp = 0, rp		C			M I
     85 	addp4	up = 0, up		C		M I
     86 	sxt4	n = n			C		M I
     87 	nop.m		0
     88 	nop.m		0
     89 	zxt4	cnt = cnt		C		I
     90 	;;
     91 ')
     92 C Setup: classify n, save ar.lc, form tnc and the loop count, load the first limb.
     93  {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I   p14: n > 4, p15: n <= 4
     94 	and	r14 = 3, n		C		M I   r14 = n mod 4
     95 	mov.i	r2 = ar.lc		C		I0    save ar.lc
     96 }{.mmi;	add	r15 = -1, n		C		M I   r15 = n - 1
     97 	sub	tnc = 64, cnt		C		M I   tnc = 64 - cnt
     98 	add	r16 = -5, n		C r16 = n - 5
     99 	;;
    100 }{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I   p6: n mod 4 = 1
    101 	cmp.eq	p7, p0 = 2, r14		C		M I   p7: n mod 4 = 2
    102 	shr.u	n = r16, 2		C		I0    n = (n - 5) >> 2, the loop count
    103 }{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I   p8: n mod 4 = 3
    104 ifdef(`OPERATION_lshift',
    105 `	shladd	up = r15, 3, up		C		M I
    106 	shladd	rp = r15, 3, rp')	C		M I   lshift only: up and rp start at limb n-1
    107 	;;
    108 }{.mmi;	add	r11 = POFF, up		C		M I   r11 = prefetch pointer, POFF bytes from up
    109 	ld8	r10 = [up], UPD		C		M01   r10 = first limb to process
    110 	mov.i	ar.lc = n		C		I0    ar.lc = (n - 5) >> 2
    111 }{.bbb;
    112    (p6)	br.dptk	.Lb01		C n mod 4 = 1
    113    (p7)	br.dptk	.Lb10		C n mod 4 = 2
    114    (p8)	br.dptk	.Lb11		C n mod 4 = 3
    115 	;; }
    116 C n mod 4 = 0: load three more limbs; n = 4 finishes via .Lr4, larger n goes to .grt4.
    117 .Lb00:	ld8	r19 = [up], UPD
    118 	;;
    119 	ld8	r16 = [up], UPD
    120 	;;
    121 	ld8	r17 = [up], UPD
    122 	BSH	r8 = r10, tnc		C function return value (bits shifted out of the first limb)
    123 	;;
    124 	FSH	r24 = r10, cnt
    125 	BSH	r25 = r19, tnc
    126   (p14)	br.cond.dptk	.grt4		C n > 4
    127 	;;
    128 	FSH	r26 = r19, cnt
    129 	BSH	r27 = r16, tnc
    130 	;;
    131 	FSH	r20 = r16, cnt
    132 	BSH	r21 = r17, tnc
    133 	;;
    134 	or	r14 = r25, r24
    135 	FSH	r22 = r17, cnt
    136 	BSH	r23 = r10, tnc		C NOTE(review): r23 is not read from .Lr4 onward
    137 	br	.Lr4
    138 C n mod 4 = 0, n >= 8: keep filling the pipeline, then .Ltop if ar.lc is nonzero, else .Lbot.
    139 .grt4:	ld8	r18 = [up], UPD
    140 	FSH	r26 = r19, cnt
    141 	BSH	r27 = r16, tnc
    142 	;;
    143 	ld8	r19 = [up], UPD
    144 	FSH	r20 = r16, cnt
    145 	BSH	r21 = r17, tnc
    146 	;;
    147 	ld8	r16 = [up], UPD
    148 	FSH	r22 = r17, cnt
    149 	BSH	r23 = r18, tnc
    150 	;;
    151 	or	r14 = r25, r24
    152 	ld8	r17 = [up], UPD
    153 	br.cloop.dpnt	.Ltop
    154 	br	.Lbot
    155 C n mod 4 = 1: when n = 1 (p15) shift the single limb and finish at .Lr1.
    156 .Lb01:
    157   (p15)	BSH	r8 = r10, tnc		C function return value	I
    158   (p15)	FSH	r22 = r10, cnt		C		I
    159   (p15)	br.cond.dptk	.Lr1		C return	B
    160 
    161 .grt1:	ld8	r18 = [up], UPD		C n mod 4 = 1, n >= 5
    162 	;;
    163 	ld8	r19 = [up], UPD
    164 	BSH	r8 = r10, tnc		C function return value
    165 	;;
    166 	ld8	r16 = [up], UPD
    167 	FSH	r22 = r10, cnt
    168 	BSH	r23 = r18, tnc
    169 	;;
    170 	ld8	r17 = [up], UPD
    171 	FSH	r24 = r18, cnt
    172 	BSH	r25 = r19, tnc
    173 	br.cloop.dpnt	.grt5		C ar.lc nonzero: n >= 9
    174 	;;
    175 	or	r15 = r23, r22
    176 	FSH	r26 = r19, cnt
    177 	BSH	r27 = r16, tnc
    178 	;;
    179 	FSH	r20 = r16, cnt
    180 	BSH	r21 = r17, tnc
    181 	br	.Lr5
    182 C n mod 4 = 1, n >= 9: finish filling the pipeline and join the loop at .LL01.
    183 .grt5:	ld8	r18 = [up], UPD
    184 	FSH	r26 = r19, cnt
    185 	BSH	r27 = r16, tnc
    186 	;;
    187 	ld8	r19 = [up], UPD
    188 	FSH	r20 = r16, cnt
    189 	BSH	r21 = r17, tnc
    190 	;;
    191 	or	r15 = r23, r22
    192 	ld8	r16 = [up], UPD
    193 	br	.LL01
    194 
    195 C n mod 4 = 2: n = 2 finishes via .Lr2, larger n continues at .grt2.
    196 .Lb10:	ld8	r17 = [up], UPD
    197   (p14)	br.cond.dptk	.grt2		C n > 4
    198 
    199 	BSH	r8 = r10, tnc		C function return value
    200 	;;
    201 	FSH	r20 = r10, cnt
    202 	BSH	r21 = r17, tnc
    203 	;;
    204 	or	r14 = r21, r20
    205 	FSH	r22 = r17, cnt
    206 	br	.Lr2			C return
    207 C n mod 4 = 2, n >= 6.
    208 .grt2:	ld8	r18 = [up], UPD
    209 	BSH	r8 = r10, tnc		C function return value
    210 	;;
    211 	ld8	r19 = [up], UPD
    212 	FSH	r20 = r10, cnt
    213 	BSH	r21 = r17, tnc
    214 	;;
    215 	ld8	r16 = [up], UPD
    216 	FSH	r22 = r17, cnt
    217 	BSH	r23 = r18, tnc
    218 	;;
    219  {.mmi;	ld8	r17 = [up], UPD
    220 	or	r14 = r21, r20
    221 	FSH	r24 = r18, cnt
    222 }{.mib;	nop	0
    223 	BSH	r25 = r19, tnc
    224 	br.cloop.dpnt	.grt6		C ar.lc nonzero: n >= 10
    225 	;; }
    226 C n = 6: finish via .Lr6.
    227 	FSH	r26 = r19, cnt
    228 	BSH	r27 = r16, tnc
    229 	br	.Lr6
    230 C n mod 4 = 2, n >= 10: join the loop at .LL10.
    231 .grt6:	ld8	r18 = [up], UPD
    232 	FSH	r26 = r19, cnt
    233 	BSH	r27 = r16, tnc
    234 	;;
    235 	ld8	r19 = [up], UPD
    236 	br	.LL10
    237 
    238 C n mod 4 = 3: n = 3 finishes via .Lr3, larger n continues at .grt3.
    239 .Lb11:	ld8	r16 = [up], UPD
    240 	;;
    241 	ld8	r17 = [up], UPD
    242 	BSH	r8 = r10, tnc		C function return value
    243   (p14)	br.cond.dptk	.grt3		C n > 4
    244 	;;
    245 
    246 	FSH	r26 = r10, cnt
    247 	BSH	r27 = r16, tnc
    248 	;;
    249 	FSH	r20 = r16, cnt
    250 	BSH	r21 = r17, tnc
    251 	;;
    252 	or	r15 = r27, r26
    253 	FSH	r22 = r17, cnt
    254 	br	.Lr3			C return
    255 C n mod 4 = 3, n >= 7.
    256 .grt3:	ld8	r18 = [up], UPD
    257 	FSH	r26 = r10, cnt
    258 	BSH	r27 = r16, tnc
    259 	;;
    260 	ld8	r19 = [up], UPD
    261 	FSH	r20 = r16, cnt
    262 	BSH	r21 = r17, tnc
    263 	;;
    264 	ld8	r16 = [up], UPD
    265 	FSH	r22 = r17, cnt
    266 	BSH	r23 = r18, tnc
    267 	;;
    268 	ld8	r17 = [up], UPD
    269 	br.cloop.dpnt	.grt7		C ar.lc nonzero: n >= 11
    270 C n = 7: finish via .Lr7.
    271 	or	r15 = r27, r26
    272 	FSH	r24 = r18, cnt
    273 	BSH	r25 = r19, tnc
    274 	br	.Lr7
    275 C n mod 4 = 3, n >= 11: join the loop at .LL11.
    276 .grt7:	or	r15 = r27, r26
    277 	FSH	r24 = r18, cnt
    278 	BSH	r25 = r19, tnc
    279 	ld8	r18 = [up], UPD
    280 	br	.LL11
    281 C Loop body: four steps per pass, each with one st8, one or, one FSH, one ld8 and one BSH.
    282 C *** MAIN LOOP START ***
    283 	ALIGN(32)
    284 .Ltop:
    285  {.mmi;	st8	[rp] = r14, UPD		C M2
    286 	or	r15 = r27, r26		C M3
    287 	FSH	r24 = r18, cnt		C I0
    288 }{.mmi;	ld8	r18 = [up], UPD		C M1
    289 	lfetch	[r11], PUPD		C prefetch through r11, advanced by PUPD once per pass
    290 	BSH	r25 = r19, tnc		C I1
    291 	;; }
    292 .LL11:
    293  {.mmi;	st8	[rp] = r15, UPD
    294 	or	r14 = r21, r20
    295 	FSH	r26 = r19, cnt
    296 }{.mmi;	ld8	r19 = [up], UPD
    297 	nop.m	0
    298 	BSH	r27 = r16, tnc
    299 	;; }
    300 .LL10:
    301  {.mmi;	st8	[rp] = r14, UPD
    302 	or	r15 = r23, r22
    303 	FSH	r20 = r16, cnt
    304 }{.mmi;	ld8	r16 = [up], UPD
    305 	nop.m	0
    306 	BSH	r21 = r17, tnc
    307 	;; }
    308 .LL01:
    309  {.mmi;	st8	[rp] = r15, UPD
    310 	or	r14 = r25, r24
    311 	FSH	r22 = r17, cnt
    312 }{.mib;	ld8	r17 = [up], UPD
    313 	BSH	r23 = r18, tnc
    314 	br.cloop.dptk	.Ltop
    315 	;; }
    316 C *** MAIN LOOP END ***
    317 C Wind-down: the same steps without loads; .Lr7 to .Lr1 are also the exits for short operands.
    318 .Lbot:
    319  {.mmi;	st8	[rp] = r14, UPD
    320 	or	r15 = r27, r26
    321 	FSH	r24 = r18, cnt
    322 }{.mib;	nop	0
    323 	BSH	r25 = r19, tnc
    324 	nop	0
    325 	;; }
    326 .Lr7:
    327  {.mmi;	st8	[rp] = r15, UPD
    328 	or	r14 = r21, r20
    329 	FSH	r26 = r19, cnt
    330 }{.mib;	nop	0
    331 	BSH	r27 = r16, tnc
    332 	nop	0
    333 	;; }
    334 .Lr6:
    335  {.mmi;	st8	[rp] = r14, UPD
    336 	or	r15 = r23, r22
    337 	FSH	r20 = r16, cnt
    338 }{.mib;	nop	0
    339 	BSH	r21 = r17, tnc
    340 	nop	0
    341 	;; }
    342 .Lr5:	st8	[rp] = r15, UPD
    343 	or	r14 = r25, r24
    344 	FSH	r22 = r17, cnt
    345 	;;
    346 .Lr4:	st8	[rp] = r14, UPD
    347 	or	r15 = r27, r26
    348 	;;
    349 .Lr3:	st8	[rp] = r15, UPD
    350 	or	r14 = r21, r20
    351 	;;
    352 .Lr2:	st8	[rp] = r14, UPD
    353 	;;
    354 .Lr1:	st8	[rp] = r22, UPD		C		M23   last limb: FSH result only, nothing to OR in
    355 	mov	ar.lc = r2		C		I0    restore the saved ar.lc
    356 	br.ret.sptk.many b0		C		B
    357 EPILOGUE(func)
    358 ASM_END()
    359