Home | History | Annotate | Line # | Download | only in nails
      1 dnl  Alpha ev6 nails mpn_mul_1.
      2 
      3 dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C      cycles/limb
     34 C EV4:    42
     35 C EV5:    18
     36 C EV6:     3.25
     37 
     38 C TODO
     39 C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
     40 C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
     41 C    umulh.
     42 C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
     43 C    and would work since the loop structure is really regular.
     44 
     45 C  INPUT PARAMETERS
        C  rp is the base register for every stq in the routine, up the base for
        C  every ldq, n the limb count and vl0 the multiplier limb.
     46 define(`rp',`r16')
     47 define(`up',`r17')
     48 define(`n', `r18')
     49 define(`vl0',`r19')
     50 
        C  Mask ANDed into every limb before it is stored; set at function entry to
        C  all ones shifted right by NAIL_BITS.
     51 define(`numb_mask',`r6')
     52 
        C  Product registers for the four pipeline slots 0..3: mXa receives the mulq
        C  result and mXb the umulh result of slot X.
     53 define(`m0a',`r0')
     54 define(`m0b',`r1')
     55 define(`m1a',`r2')
     56 define(`m1b',`r3')
     57 define(`m2a',`r20')
     58 define(`m2b',`r21')
     59 define(`m3a',`r22')
     60 define(`m3b',`r23')
     61 
        C  Two accumulators used alternately for successive result limbs.
     62 define(`acc0',`r25')
     63 define(`acc1',`r27')
     64 
        C  Source limbs loaded from up.  Only two registers are used: ul0/ul2 share r4
        C  and ul1/ul3 share r5, so a limb must be consumed by both of its multiplies
        C  before the register is reloaded.
     65 define(`ul0',`r4')
     66 define(`ul1',`r5')
     67 define(`ul2',`r4')
     68 define(`ul3',`r5')
     69 
        C  NOTE(review): rl0..rl3 all name r24 and are not referenced by the code of
        C  this routine; the routine never loads from rp.  Presumably kept for
        C  symmetry with sibling files -- confirm before removing.
     70 define(`rl0',`r24')
     71 define(`rl1',`r24')
     72 define(`rl2',`r24')
     73 define(`rl3',`r24')
     74 
        C  Scratch: t0 holds a mulq result shifted right by NAIL_BITS, t1 holds an
        C  accumulator shifted right by NUMB_BITS (the carry into the next limb).
     75 define(`t0',`r7')
     76 define(`t1',`r8')
     77 
        C  Short aliases for GMP_NAIL_BITS and GMP_NUMB_BITS, which are not defined
        C  in this file.
     78 define(`NAIL_BITS',`GMP_NAIL_BITS')
     79 define(`NUMB_BITS',`GMP_NUMB_BITS')
     80 
     81 dnl  This declaration is munged by configure
     82 NAILS_SUPPORT(1-63)
     83 
     84 ASM_START()
        C  mpn_mul_1 (nails variant): multiplies the n limbs at up by vl0, stores the
        C  n result limbs at rp and leaves the final carry limb in r0 just before
        C  returning through r26.
        C
        C  For each source limb, mulq and umulh give the low and high 64 bits of
        C  (vl0 << NAIL_BITS) * limb.  The low half shifted right by NAIL_BITS is
        C  added to the high half of the previous product and to the carry of the
        C  previous limb (its accumulator shifted right by NUMB_BITS); that sum
        C  masked with numb_mask is the stored limb.
        C  NOTE(review): this reads as if NAIL_BITS + NUMB_BITS = 64 -- confirm
        C  against the GMP configuration.
        C
        C  The code is software pipelined over four product slots, so there is one
        C  feed-in sequence per value of n mod 4.  Each feed-in biases rp so that the
        C  fixed store offsets used by the loop and the wind-down code hit the right
        C  limb, then enters the loop at an L(elX) label or, for short operands, the
        C  wind-down code at an L(taX) label.
        C  NOTE(review): n = 0 would take the L(0m4) path and still load and store
        C  four limbs; presumably callers guarantee n >= 1 -- confirm.
     85 PROLOGUE(mpn_mul_1)
        C  Pre-shift the multiplier by NAIL_BITS and build numb_mask as all ones
        C  shifted right by NAIL_BITS (r31 reads as zero on Alpha, so the lda
        C  yields -1).
     86 	sll	vl0, NAIL_BITS, vl0
     87 	lda	numb_mask, -1(r31)
     88 	srl	numb_mask, NAIL_BITS, numb_mask
     89 
        C  Dispatch on n mod 4, using r25 and r21 as scratch before any product is
        C  in flight.  Remainder 3 falls through to L(3m4).
     90 	and	n,	3,	r25
     91 	cmpeq	r25,	1,	r21
     92 	bne	r21,	L(1m4)
     93 	cmpeq	r25,	2,	r21
     94 	bne	r21,	L(2m4)
     95 	beq	r25,	L(0m4)
     96 
        C  n mod 4 = 3: load three limbs, bias rp by -8.  If n was 3 the products are
        C  finished from L(ta3); otherwise L(ge3) loads further limbs and joins the
        C  loop at L(el3).  addq with r31 is a plain register move: the first limb
        C  has no previous high half to add.
     97 L(3m4):	ldq	ul3,	0(up)
     98 	lda	n,	-4(n)
     99 	ldq	ul0,	8(up)
    100 	mulq	vl0,	ul3,	m3a
    101 	umulh	vl0,	ul3,	m3b
    102 	ldq	ul1,	16(up)
    103 	lda	up,	24(up)
    104 	lda	rp,	-8(rp)
    105 	mulq	vl0,	ul0,	m0a
    106 	umulh	vl0,	ul0,	m0b
    107 	bge	n,	L(ge3)
    108 
    109 	mulq	vl0,	ul1,	m1a
    110 	umulh	vl0,	ul1,	m1b
    111 	srl	m3a,NAIL_BITS,	t0
    112 	addq	t0,	r31,	acc1
    113 	srl	m0a,NAIL_BITS,	t0
    114 	addq	t0,	m3b,	acc0
    115 	srl	acc1,NUMB_BITS,	t1
    116 	br	r31,	L(ta3)
    117 
    118 L(ge3):	ldq	ul2,	0(up)
    119 	mulq	vl0,	ul1,	m1a
    120 	umulh	vl0,	ul1,	m1b
    121 	srl	m3a,NAIL_BITS,	t0
    122 	ldq	ul3,	8(up)
    123 	lda	n,	-4(n)
    124 	mulq	vl0,	ul2,	m2a
    125 	addq	t0,	r31,	acc1
    126 	umulh	vl0,	ul2,	m2b
    127 	srl	m0a,NAIL_BITS,	t0
    128 	ldq	ul0,	16(up)
    129 	mulq	vl0,	ul3,	m3a
    130 	addq	t0,	m3b,	acc0
    131 	srl	acc1,NUMB_BITS,	t1
    132 	br	r31,	L(el3)
    133 
        C  n mod 4 = 0: n is reduced by 8 up front and four limbs are loaded; rp
        C  needs no bias.  If n was 4 the products are finished from L(ta4);
        C  otherwise L(ge4) loads further limbs and joins the loop at L(el0).
    134 L(0m4):	lda	n,	-8(n)
    135 	ldq	ul2,	0(up)
    136 	ldq	ul3,	8(up)
    137 	mulq	vl0,	ul2,	m2a
    138 	umulh	vl0,	ul2,	m2b
    139 	ldq	ul0,	16(up)
    140 	mulq	vl0,	ul3,	m3a
    141 	umulh	vl0,	ul3,	m3b
    142 	ldq	ul1,	24(up)
    143 	lda	up,	32(up)
    144 	mulq	vl0,	ul0,	m0a
    145 	umulh	vl0,	ul0,	m0b
    146 	bge	n,	L(ge4)
    147 
    148 	srl	m2a,NAIL_BITS,	t0
    149 	mulq	vl0,	ul1,	m1a
    150 	addq	t0,	r31,	acc0
    151 	umulh	vl0,	ul1,	m1b
    152 	srl	m3a,NAIL_BITS,	t0
    153 	addq	t0,	m2b,	acc1
    154 	srl	acc0,NUMB_BITS,	t1
    155 	br	r31,	L(ta4)
    156 
    157 L(ge4):	srl	m2a,NAIL_BITS,	t0
    158 	ldq	ul2,	0(up)
    159 	mulq	vl0,	ul1,	m1a
    160 	addq	t0,	r31,	acc0
    161 	umulh	vl0,	ul1,	m1b
    162 	srl	m3a,NAIL_BITS,	t0
    163 	ldq	ul3,	8(up)
    164 	lda	n,	-4(n)
    165 	mulq	vl0,	ul2,	m2a
    166 	addq	t0,	m2b,	acc1
    167 	srl	acc0,NUMB_BITS,	t1
    168 	br	r31,	L(el0)
    169 
        C  n mod 4 = 2: load two limbs, bias rp by -16.  If n was 2 the products are
        C  finished from L(ta2).  Otherwise L(ge2) loads four more limbs, moves up
        C  and rp on by 32 bytes, and joins the loop at L(el2), or the wind-down code
        C  at L(ta6) when n was 6.
    170 L(2m4):	lda	n,	-4(n)
    171 	ldq	ul0,	0(up)
    172 	ldq	ul1,	8(up)
    173 	lda	up,	16(up)
    174 	lda	rp,	-16(rp)
    175 	mulq	vl0,	ul0,	m0a
    176 	umulh	vl0,	ul0,	m0b
    177 	bge	n,	L(ge2)
    178 
    179 	mulq	vl0,	ul1,	m1a
    180 	umulh	vl0,	ul1,	m1b
    181 	srl	m0a,NAIL_BITS,	t0
    182 	addq	t0,	r31,	acc0
    183 	srl	m1a,NAIL_BITS,	t0
    184 	addq	t0,	m0b,	acc1
    185 	srl	acc0,NUMB_BITS,	t1
    186 	br	r31,	L(ta2)
    187 
    188 L(ge2):	ldq	ul2,	0(up)
    189 	mulq	vl0,	ul1,	m1a
    190 	umulh	vl0,	ul1,	m1b
    191 	ldq	ul3,	8(up)
    192 	lda	n,	-4(n)
    193 	mulq	vl0,	ul2,	m2a
    194 	umulh	vl0,	ul2,	m2b
    195 	srl	m0a,NAIL_BITS,	t0
    196 	ldq	ul0,	16(up)
    197 	mulq	vl0,	ul3,	m3a
    198 	addq	t0,	r31,	acc0
    199 	umulh	vl0,	ul3,	m3b
    200 	srl	m1a,NAIL_BITS,	t0
    201 	ldq	ul1,	24(up)
    202 	lda	up,	32(up)
    203 	lda	rp,	32(rp)
    204 	mulq	vl0,	ul0,	m0a
    205 	addq	t0,	m0b,	acc1
    206 	srl	acc0,NUMB_BITS,	t1
    207 	bge	n,	L(el2)
    208 
    209 	br	r31,	L(ta6)
    210 
        C  n mod 4 = 1: load one limb, bias rp by -24.  If n was 1 the single product
        C  is completed inline: the masked limb goes to 24(rp), which is the original
        C  rp, and r0 gets carry plus high half before the return.  Otherwise L(ge1)
        C  loads four more limbs and either finishes from L(ta5) (n was 5) or joins
        C  the loop at L(el1) through L(ge5).
    211 L(1m4):	lda	n,	-4(n)
    212 	ldq	ul1,	0(up)
    213 	lda	up,	8(up)
    214 	lda	rp,	-24(rp)
    215 	bge	n,	L(ge1)
    216 
    217 	mulq	vl0,	ul1,	m1a
    218 	umulh	vl0,	ul1,	m1b
    219 	srl	m1a,NAIL_BITS,	t0
    220 	addq	t0,	r31,	acc1
    221 	and	acc1,numb_mask,	r28
    222 	srl	acc1,NUMB_BITS,	t1
    223 	stq	r28,	24(rp)
    224 	addq	t1,	m1b,	r0
    225 	ret	r31,	(r26),	1
    226 
    227 L(ge1):	ldq	ul2,	0(up)
    228 	mulq	vl0,	ul1,	m1a
    229 	umulh	vl0,	ul1,	m1b
    230 	ldq	ul3,	8(up)
    231 	lda	n,	-4(n)
    232 	mulq	vl0,	ul2,	m2a
    233 	umulh	vl0,	ul2,	m2b
    234 	ldq	ul0,	16(up)
    235 	mulq	vl0,	ul3,	m3a
    236 	umulh	vl0,	ul3,	m3b
    237 	srl	m1a,NAIL_BITS,	t0
    238 	ldq	ul1,	24(up)
    239 	lda	up,	32(up)
    240 	lda	rp,	32(rp)
    241 	mulq	vl0,	ul0,	m0a
    242 	addq	t0,	r31,	acc1
    243 	umulh	vl0,	ul0,	m0b
    244 	srl	m2a,NAIL_BITS,	t0
    245 	mulq	vl0,	ul1,	m1a
    246 	addq	t0,	m1b,	acc0
    247 	srl	acc1,NUMB_BITS,	t1
    248 	blt	n,	L(ta5)
    249 
    250 L(ge5):	ldq	ul2,	0(up)
    251 	br	r31,	L(el1)
    252 
        C  Main loop: four limbs per pass.  Each pass loads four limbs, issues four
        C  mulq/umulh pairs, stores four masked limbs at -24, -16, -8 and 0 from rp,
        C  subtracts 4 from n and advances up and rp by 32 bytes; it repeats while n
        C  stays >= 0.  L(el2), L(el1), L(el0) and L(el3) are the entry points used
        C  by the feed-in code.  Instructions are written in groups of four separated
        C  by a bare C line; the trailing U0/U1/L0/L1 tags are presumably the
        C  intended EV6 issue slots, with unop filling the unused ones --
        C  NOTE(review): confirm against the EV6 scheduling rules.
    253 	ALIGN(16)
    254 L(top):	mulq	vl0,	ul0,	m0a		C U1
    255 	addq	t0,	m0b,	acc1		C L0
    256 	srl	acc0,NUMB_BITS,	t1		C U0
    257 	stq	r28,	-24(rp)			C L1
    258 C
    259 L(el2):	umulh	vl0,	ul0,	m0b		C U1
    260 	and	acc0,numb_mask,	r28		C L0
    261 	unop					C U0
    262 	unop					C L1
    263 C
    264 	unop					C U1
    265 	addq	t1,	acc1,	acc1		C L0
    266 	srl	m2a,NAIL_BITS,	t0		C U0
    267 	ldq	ul2,	0(up)			C L1
    268 C
    269 	mulq	vl0,	ul1,	m1a		C U1
    270 	addq	t0,	m1b,	acc0		C L0
    271 	srl	acc1,NUMB_BITS,	t1		C U0
    272 	stq	r28,	-16(rp)			C L1
    273 C
    274 L(el1):	umulh	vl0,	ul1,	m1b		C U1
    275 	and	acc1,numb_mask,	r28		C L0
    276 	unop					C U0
    277 	lda	n,	-4(n)			C L1
    278 C
    279 	unop					C U1
    280 	addq	t1,	acc0,	acc0		C L0
    281 	srl	m3a,NAIL_BITS,	t0		C U0
    282 	ldq	ul3,	8(up)			C L1
    283 C
    284 	mulq	vl0,	ul2,	m2a		C U1
    285 	addq	t0,	m2b,	acc1		C L0
    286 	srl	acc0,NUMB_BITS,	t1		C U0
    287 	stq	r28,	-8(rp)			C L1
    288 C
    289 L(el0):	umulh	vl0,	ul2,	m2b		C U1
    290 	and	acc0,numb_mask,	r28		C L0
    291 	unop					C U0
    292 	unop					C L1
    293 C
    294 	unop					C U1
    295 	addq	t1,	acc1,	acc1		C L0
    296 	srl	m0a,NAIL_BITS,	t0		C U0
    297 	ldq	ul0,	16(up)			C L1
    298 C
    299 	mulq	vl0,	ul3,	m3a		C U1
    300 	addq	t0,	m3b,	acc0		C L0
    301 	srl	acc1,NUMB_BITS,	t1		C U0
    302 	stq	r28,	0(rp)			C L1
    303 C
    304 L(el3):	umulh	vl0,	ul3,	m3b		C U1
    305 	and	acc1,numb_mask,	r28		C L0
    306 	unop					C U0
    307 	unop					C L1
    308 C
    309 	unop					C U1
    310 	addq	t1,	acc0,	acc0		C L0
    311 	srl	m1a,NAIL_BITS,	t0		C U0
    312 	ldq	ul1,	24(up)			C L1
    313 C
    314 	lda	up,	32(up)			C L0
    315 	unop					C U1
    316 	lda	rp,	32(rp)			C L1
    317 	bge	n,	L(top)			C U0
    318 
        C  Wind-down: no further loads, only the products already in flight are
        C  completed and stored.  L(end) is reached by falling out of the loop;
        C  L(ta6), L(ta5), L(ta4), L(ta3) and L(ta2) are also entered directly from
        C  the feed-in code for operands too short for the loop.  The last limb is
        C  stored at 24(rp) and r0 receives the final carry plus the high half of the
        C  last product.
    319 L(end):	mulq	vl0,	ul0,	m0a
    320 	addq	t0,	m0b,	acc1
    321 	srl	acc0,NUMB_BITS,	t1
    322 	stq	r28,	-24(rp)
    323 L(ta6):	umulh	vl0,	ul0,	m0b
    324 	and	acc0,numb_mask,	r28
    325 	addq	t1,	acc1,	acc1
    326 	srl	m2a,NAIL_BITS,	t0
    327 	mulq	vl0,	ul1,	m1a
    328 	addq	t0,	m1b,	acc0
    329 	srl	acc1,NUMB_BITS,	t1
    330 	stq	r28,	-16(rp)
    331 L(ta5):	umulh	vl0,	ul1,	m1b
    332 	and	acc1,numb_mask,	r28
    333 	addq	t1,	acc0,	acc0
    334 	srl	m3a,NAIL_BITS,	t0
    335 	addq	t0,	m2b,	acc1
    336 	srl	acc0,NUMB_BITS,	t1
    337 	stq	r28,	-8(rp)
    338 	ALIGN(16)
    339 L(ta4):	and	acc0,numb_mask,	r28
    340 	addq	t1,	acc1,	acc1
    341 	srl	m0a,NAIL_BITS,	t0
    342 	addq	t0,	m3b,	acc0
    343 	srl	acc1,NUMB_BITS,	t1
    344 	stq	r28,	0(rp)
    345 	unop
    346 	ALIGN(16)
    347 L(ta3):	and	acc1,numb_mask,	r28
    348 	addq	t1,	acc0,	acc0
    349 	srl	m1a,NAIL_BITS,	t0
    350 	addq	t0,	m0b,	acc1
    351 	srl	acc0,NUMB_BITS,	t1
    352 	stq	r28,	8(rp)
    353 	unop
    354 	ALIGN(16)
    355 L(ta2):	and	acc0,numb_mask,	r28
    356 	addq	t1,	acc1,	acc1
    357 	srl	acc1,NUMB_BITS,	t1
    358 	stq	r28,	16(rp)
    359 	and	acc1,numb_mask,	r28
    360 	addq	t1,	m1b,	r0
    361 	stq	r28,	24(rp)
    362 	ret	r31,	(r26),	1
    363 EPILOGUE()
    364 ASM_END()
    365