Home | History | Annotate | Line # | Download | only in nails
      1 dnl  Alpha ev6 nails mpn_submul_1.
      2 
      3 dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C      cycles/limb
     34 C EV4:    42
     35 C EV5:    18
     36 C EV6:     4
     37 
     38 C TODO
     39 C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
     40 C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
     41 C    umulh.
     42 C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
     43 C    and would work since the loop structure is really regular.
     44 
     45 C  INPUT PARAMETERS
        C  Names used by the code below for the four incoming values: destination
        C  limb pointer, source limb pointer, limb count and the multiplier limb.
     46 define(`rp',`r16')
     47 define(`up',`r17')
     48 define(`n', `r18')
     49 define(`vl0',`r19')
     50 
        C  Mask used with "and" on every result limb before it is stored; the
        C  function sets it to -1 shifted right by NAIL_BITS.
     51 define(`numb_mask',`r6')
     52 
        C  Product registers for the four unrolled positions 0..3: m<i>a is the
        C  destination of mulq and m<i>b the destination of umulh for position i.
        C  m0a is r0, which is also written with the final result just before ret.
        C  m2b is r21, which the entry code also uses as a compare scratch register.
     53 define(`m0a',`r0')
     54 define(`m0b',`r1')
     55 define(`m1a',`r2')
     56 define(`m1b',`r3')
     57 define(`m2a',`r20')
     58 define(`m2b',`r21')
     59 define(`m3a',`r22')
     60 define(`m3b',`r23')
     61 
        C  Two accumulators used alternately for successive limbs.  acc0 is r25,
        C  which the entry code also uses to hold n mod 4 before acc0 is live.
     62 define(`acc0',`r25')
     63 define(`acc1',`r27')
     64 
        C  Source limbs loaded from up.  Only two physical registers are used:
        C  ul0 and ul2 share r4, ul1 and ul3 share r5.
     65 define(`ul0',`r4')
     66 define(`ul1',`r5')
     67 define(`ul2',`r4')
     68 define(`ul3',`r5')
     69 
        C  Destination limbs loaded from rp.  All four names map to r24, so each
        C  loaded value has to be consumed before the next rl load is issued.
     70 define(`rl0',`r24')
     71 define(`rl1',`r24')
     72 define(`rl2',`r24')
     73 define(`rl3',`r24')
     74 
        C  t0 holds a mulq result shifted right by NAIL_BITS; t1 holds an
        C  accumulator shifted right arithmetically by NUMB_BITS, which is added
        C  into the next limb.
     75 define(`t0',`r7')
     76 define(`t1',`r8')
     77 
        C  Short aliases for the GMP_NAIL_BITS and GMP_NUMB_BITS macros.
     78 define(`NAIL_BITS',`GMP_NAIL_BITS')
     79 define(`NUMB_BITS',`GMP_NUMB_BITS')
     80 
     81 dnl  This declaration is munged by configure
        C  NOTE(review): presumably states the range of nail bit counts this file
        C  supports -- confirm against the configure script.
     82 NAILS_SUPPORT(2-63)
     83 
     84 ASM_START()
     85 PROLOGUE(mpn_submul_1)
        C  mpn_submul_1: multiply the n limbs at up by the single limb vl0,
        C  subtract the product from the n limbs at rp, store the result back to
        C  rp and leave the final borrow limb in r0.  (Summary inferred from the
        C  function name and the load/multiply/subtract/store pattern below.)
        C
        C  vl0 is pre-shifted left by NAIL_BITS.  For each product, umulh then
        C  supplies the high part and mulq followed by srl NAIL_BITS the low part.
        C  NOTE(review): this assumes NAIL_BITS + NUMB_BITS = 64 -- confirm.
        C
        C  For limb k (the first limb has no previous umulh and no incoming t1):
        C      acc = rl[k] - ((mulq[k] >> NAIL_BITS) + umulh[k-1]) + t1
        C      store acc & numb_mask
        C      t1  = acc >> NUMB_BITS   (sra, so t1 is negative after a borrow)
        C  The value returned is umulh of the last product minus the last t1.
        C
        C  The code is 4-way unrolled and software pipelined.  Entry dispatches on
        C  n mod 4 to one of four feed-in sequences, which either enter the loop at
        C  L(el0)..L(el3) or, for short operands, enter the wind-down code at
        C  L(ta2)..L(ta6).  n = 1 is finished directly in L(1m4).
        C  NOTE(review): n = 0 is not handled specially; presumably callers
        C  guarantee n >= 1 -- confirm.
     86 	sll	vl0, NAIL_BITS, vl0		C pre-shift the multiplier
     87 	lda	numb_mask, -1(r31)		C all ones (r31 reads as zero)
     88 	srl	numb_mask, NAIL_BITS, numb_mask	C clear the top NAIL_BITS bits
     89 
     90 	and	n,	3,	r25		C r25 = n mod 4
     91 	cmpeq	r25,	1,	r21
     92 	bne	r21,	L(1m4)
     93 	cmpeq	r25,	2,	r21
     94 	bne	r21,	L(2m4)
     95 	beq	r25,	L(0m4)
     96 
        C  Feed-in for n mod 4 = 3.  rp is biased by -8 so that the first
        C  destination limb is addressed as 8(rp).
     97 L(3m4):	ldq	ul3,	0(up)
     98 	lda	n,	-4(n)
     99 	ldq	ul0,	8(up)
    100 	mulq	vl0,	ul3,	m3a
    101 	umulh	vl0,	ul3,	m3b
    102 	ldq	ul1,	16(up)
    103 	lda	up,	24(up)
    104 	lda	rp,	-8(rp)
    105 	mulq	vl0,	ul0,	m0a
    106 	umulh	vl0,	ul0,	m0b
    107 	bge	n,	L(ge3)			C taken when n was at least 4
    108 
        C  Only the three loaded limbs remain: start the first limb and finish in
        C  the wind-down code.
    109 	mulq	vl0,	ul1,	m1a
    110 	umulh	vl0,	ul1,	m1b
    111 	ldq	rl3,	8(rp)
    112 	srl	m3a,NAIL_BITS,	t0
    113 	addq	t0,	r31,	acc1		C first limb: no previous high part
    114 	subq	rl3,	acc1,	acc1
    115 	ldq	rl0,	16(rp)
    116 	srl	m0a,NAIL_BITS,	t0
    117 	addq	t0,	m3b,	acc0
    118 	sra	acc1,NUMB_BITS,	t1
    119 	br	r31,	L(ta3)
    120 
        C  More limbs follow: preload the next source limbs and enter the loop.
    121 L(ge3):	ldq	ul2,	0(up)
    122 	mulq	vl0,	ul1,	m1a
    123 	umulh	vl0,	ul1,	m1b
    124 	ldq	rl3,	8(rp)
    125 	srl	m3a,NAIL_BITS,	t0
    126 	ldq	ul3,	8(up)
    127 	lda	n,	-4(n)
    128 	mulq	vl0,	ul2,	m2a
    129 	addq	t0,	r31,	acc1
    130 	umulh	vl0,	ul2,	m2b
    131 	subq	rl3,	acc1,	acc1
    132 	ldq	rl0,	16(rp)
    133 	srl	m0a,NAIL_BITS,	t0
    134 	ldq	ul0,	16(up)
    135 	mulq	vl0,	ul3,	m3a
    136 	addq	t0,	m3b,	acc0
    137 	sra	acc1,NUMB_BITS,	t1
    138 	br	r31,	L(el3)
    139 
        C  Feed-in for n mod 4 = 0.  rp is not biased; n is reduced by 8 here.
    140 L(0m4):	lda	n,	-8(n)
    141 	ldq	ul2,	0(up)
    142 	ldq	ul3,	8(up)
    143 	mulq	vl0,	ul2,	m2a
    144 	umulh	vl0,	ul2,	m2b
    145 	ldq	ul0,	16(up)
    146 	mulq	vl0,	ul3,	m3a
    147 	umulh	vl0,	ul3,	m3b
    148 	ldq	ul1,	24(up)
    149 	lda	up,	32(up)
    150 	mulq	vl0,	ul0,	m0a
    151 	umulh	vl0,	ul0,	m0b
    152 	bge	n,	L(ge4)			C taken when n was at least 8
    153 
        C  No limbs beyond the four loaded ones: go to the wind-down code.
    154 	ldq	rl2,	0(rp)
    155 	srl	m2a,NAIL_BITS,	t0
    156 	mulq	vl0,	ul1,	m1a
    157 	addq	t0,	r31,	acc0		C first limb: no previous high part
    158 	umulh	vl0,	ul1,	m1b
    159 	subq	rl2,	acc0,	acc0
    160 	ldq	rl3,	8(rp)
    161 	srl	m3a,NAIL_BITS,	t0
    162 	addq	t0,	m2b,	acc1
    163 	sra	acc0,NUMB_BITS,	t1
    164 	br	r31,	L(ta4)
    165 
    166 L(ge4):	ldq	rl2,	0(rp)
    167 	srl	m2a,NAIL_BITS,	t0
    168 	ldq	ul2,	0(up)
    169 	mulq	vl0,	ul1,	m1a
    170 	addq	t0,	r31,	acc0
    171 	umulh	vl0,	ul1,	m1b
    172 	subq	rl2,	acc0,	acc0
    173 	ldq	rl3,	8(rp)
    174 	srl	m3a,NAIL_BITS,	t0
    175 	ldq	ul3,	8(up)
    176 	lda	n,	-4(n)
    177 	mulq	vl0,	ul2,	m2a
    178 	addq	t0,	m2b,	acc1
    179 	sra	acc0,NUMB_BITS,	t1
    180 	br	r31,	L(el0)
    181 
        C  Feed-in for n mod 4 = 2.  rp is biased by -16 so that the first
        C  destination limb is addressed as 16(rp).
    182 L(2m4):	lda	n,	-4(n)
    183 	ldq	ul0,	0(up)
    184 	ldq	ul1,	8(up)
    185 	lda	up,	16(up)
    186 	lda	rp,	-16(rp)
    187 	mulq	vl0,	ul0,	m0a
    188 	umulh	vl0,	ul0,	m0b
    189 	bge	n,	L(ge2)			C taken when n was at least 4
    190 
        C  Only the two loaded limbs remain: finish in the last wind-down stage.
    191 	mulq	vl0,	ul1,	m1a
    192 	umulh	vl0,	ul1,	m1b
    193 	ldq	rl0,	16(rp)
    194 	srl	m0a,NAIL_BITS,	t0
    195 	addq	t0,	r31,	acc0		C first limb: no previous high part
    196 	subq	rl0,	acc0,	acc0
    197 	ldq	rl1,	24(rp)
    198 	srl	m1a,NAIL_BITS,	t0
    199 	addq	t0,	m0b,	acc1
    200 	sra	acc0,NUMB_BITS,	t1
    201 	br	r31,	L(ta2)
    202 
    203 L(ge2):	ldq	ul2,	0(up)
    204 	mulq	vl0,	ul1,	m1a
    205 	umulh	vl0,	ul1,	m1b
    206 	ldq	ul3,	8(up)
    207 	lda	n,	-4(n)
    208 	mulq	vl0,	ul2,	m2a
    209 	umulh	vl0,	ul2,	m2b
    210 	ldq	rl0,	16(rp)
    211 	srl	m0a,NAIL_BITS,	t0
    212 	ldq	ul0,	16(up)
    213 	mulq	vl0,	ul3,	m3a
    214 	addq	t0,	r31,	acc0
    215 	umulh	vl0,	ul3,	m3b
    216 	subq	rl0,	acc0,	acc0
    217 	ldq	rl1,	24(rp)
    218 	srl	m1a,NAIL_BITS,	t0
    219 	ldq	ul1,	24(up)
    220 	lda	up,	32(up)
    221 	lda	rp,	32(rp)
    222 	mulq	vl0,	ul0,	m0a
    223 	addq	t0,	m0b,	acc1
    224 	sra	acc0,NUMB_BITS,	t1
    225 	bge	n,	L(el2)			C n was at least 8: enter the loop
    226 
    227 	br	r31,	L(ta6)			C otherwise wind down from L(ta6)
    228 
        C  Feed-in for n mod 4 = 1.  rp is biased by -24 so that the first
        C  destination limb is addressed as 24(rp).
    229 L(1m4):	lda	n,	-4(n)
    230 	ldq	ul1,	0(up)
    231 	lda	up,	8(up)
    232 	lda	rp,	-24(rp)
    233 	bge	n,	L(ge1)			C taken when n was at least 4
    234 
        C  Single limb: compute, store and return without entering the loop.
    235 	mulq	vl0,	ul1,	m1a
    236 	umulh	vl0,	ul1,	m1b
    237 	ldq	rl1,	24(rp)
    238 	srl	m1a,NAIL_BITS,	t0
    239 	subq	rl1,	t0,	acc1
    240 	and	acc1,numb_mask,	r28
    241 	sra	acc1,NUMB_BITS,	t1
    242 	stq	r28,	24(rp)
    243 	subq	m1b,	t1,	r0		C result = high part minus t1
    244 	ret	r31,	(r26),	1
    245 
    246 L(ge1):	ldq	ul2,	0(up)
    247 	mulq	vl0,	ul1,	m1a
    248 	umulh	vl0,	ul1,	m1b
    249 	ldq	ul3,	8(up)
    250 	lda	n,	-4(n)
    251 	mulq	vl0,	ul2,	m2a
    252 	umulh	vl0,	ul2,	m2b
    253 	ldq	ul0,	16(up)
    254 	mulq	vl0,	ul3,	m3a
    255 	umulh	vl0,	ul3,	m3b
    256 	ldq	rl1,	24(rp)
    257 	srl	m1a,NAIL_BITS,	t0
    258 	ldq	ul1,	24(up)
    259 	lda	up,	32(up)
    260 	lda	rp,	32(rp)
    261 	mulq	vl0,	ul0,	m0a
    262 	addq	t0,	r31,	acc1		C first limb: no previous high part
    263 	umulh	vl0,	ul0,	m0b
    264 	subq	rl1,	acc1,	acc1
    265 	ldq	rl2,	0(rp)
    266 	srl	m2a,NAIL_BITS,	t0
    267 	mulq	vl0,	ul1,	m1a
    268 	addq	t0,	m1b,	acc0
    269 	sra	acc1,NUMB_BITS,	t1
    270 	blt	n,	L(ta5)			C n was below 8: wind down from L(ta5)
    271 
    272 L(ge5):	ldq	ul2,	0(up)
    273 	br	r31,	L(el1)
    274 
        C  Main loop, four limbs per iteration, entered at L(el0)..L(el3) from the
        C  feed-in code.  Each group of four instructions handles one stage; the
        C  store in a group writes the limb completed by the previous groups.
        C  NOTE(review): the trailing U0/U1/L0/L1 tags presumably name the intended
        C  EV6 issue slots -- confirm.  Instruction order is part of the schedule.
    275 	ALIGN(16)
    276 L(top):	mulq	vl0,	ul0,	m0a		C U1
    277 	addq	t0,	m0b,	acc1		C L0
    278 	sra	acc0,NUMB_BITS,	t1		C U0
    279 	stq	r28,	-24(rp)			C L1
    280 C
    281 L(el2):	umulh	vl0,	ul0,	m0b		C U1
    282 	and	acc0,numb_mask,	r28		C L0
    283 	subq	rl1,	acc1,	acc1		C U0
    284 	ldq	rl2,	0(rp)			C L1
    285 C
    286 	unop					C U1
    287 	addq	t1,	acc1,	acc1		C L0
    288 	srl	m2a,NAIL_BITS,	t0		C U0
    289 	ldq	ul2,	0(up)			C L1
    290 C
    291 	mulq	vl0,	ul1,	m1a		C U1
    292 	addq	t0,	m1b,	acc0		C L0
    293 	sra	acc1,NUMB_BITS,	t1		C U0
    294 	stq	r28,	-16(rp)			C L1
    295 C
    296 L(el1):	umulh	vl0,	ul1,	m1b		C U1
    297 	and	acc1,numb_mask,	r28		C L0
    298 	subq	rl2,	acc0,	acc0		C U0
    299 	ldq	rl3,	8(rp)			C L1
    300 C
    301 	lda	n,	-4(n)			C L1
    302 	addq	t1,	acc0,	acc0		C L0
    303 	srl	m3a,NAIL_BITS,	t0		C U0
    304 	ldq	ul3,	8(up)			C L1
    305 C
    306 	mulq	vl0,	ul2,	m2a		C U1
    307 	addq	t0,	m2b,	acc1		C L0
    308 	sra	acc0,NUMB_BITS,	t1		C U0
    309 	stq	r28,	-8(rp)			C L1
    310 C
    311 L(el0):	umulh	vl0,	ul2,	m2b		C U1
    312 	and	acc0,numb_mask,	r28		C L0
    313 	subq	rl3,	acc1,	acc1		C U0
    314 	ldq	rl0,	16(rp)			C L1
    315 C
    316 	unop					C U1
    317 	addq	t1,	acc1,	acc1		C L0
    318 	srl	m0a,NAIL_BITS,	t0		C U0
    319 	ldq	ul0,	16(up)			C L1
    320 C
    321 	mulq	vl0,	ul3,	m3a		C U1
    322 	addq	t0,	m3b,	acc0		C L0
    323 	sra	acc1,NUMB_BITS,	t1		C U0
    324 	stq	r28,	0(rp)			C L1
    325 C
    326 L(el3):	umulh	vl0,	ul3,	m3b		C U1
    327 	and	acc1,numb_mask,	r28		C L0
    328 	subq	rl0,	acc0,	acc0		C U0
    329 	ldq	rl1,	24(rp)			C L1
    330 C
    331 	unop					C U1
    332 	addq	t1,	acc0,	acc0		C L0
    333 	srl	m1a,NAIL_BITS,	t0		C U0
    334 	ldq	ul1,	24(up)			C L1
    335 C
    336 	lda	up,	32(up)			C L0
    337 	unop					C U1
    338 	lda	rp,	32(rp)			C L1
    339 	bge	n,	L(top)			C U0
    340 
        C  Wind-down: same stages as the loop but with no further loads from up.
        C  Execution falls through L(ta6), L(ta5), L(ta4), L(ta3) and L(ta2); the
        C  feed-in code for short operands enters at the matching label.
    341 L(end):	mulq	vl0,	ul0,	m0a
    342 	addq	t0,	m0b,	acc1
    343 	sra	acc0,NUMB_BITS,	t1
    344 	stq	r28,	-24(rp)
    345 L(ta6):	umulh	vl0,	ul0,	m0b
    346 	and	acc0,numb_mask,	r28
    347 	subq	rl1,	acc1,	acc1
    348 	ldq	rl2,	0(rp)
    349 	addq	t1,	acc1,	acc1
    350 	srl	m2a,NAIL_BITS,	t0
    351 	mulq	vl0,	ul1,	m1a
    352 	addq	t0,	m1b,	acc0
    353 	sra	acc1,NUMB_BITS,	t1
    354 	stq	r28,	-16(rp)
    355 L(ta5):	umulh	vl0,	ul1,	m1b
    356 	and	acc1,numb_mask,	r28
    357 	subq	rl2,	acc0,	acc0
    358 	ldq	rl3,	8(rp)
    359 	addq	t1,	acc0,	acc0
    360 	srl	m3a,NAIL_BITS,	t0
    361 	addq	t0,	m2b,	acc1
    362 	sra	acc0,NUMB_BITS,	t1
    363 	stq	r28,	-8(rp)
    364 	unop
    365 	ALIGN(16)
    366 L(ta4):	and	acc0,numb_mask,	r28
    367 	subq	rl3,	acc1,	acc1
    368 	ldq	rl0,	16(rp)
    369 	addq	t1,	acc1,	acc1
    370 	srl	m0a,NAIL_BITS,	t0
    371 	addq	t0,	m3b,	acc0
    372 	sra	acc1,NUMB_BITS,	t1
    373 	stq	r28,	0(rp)
    374 	unop
    375 	ALIGN(16)
    376 L(ta3):	and	acc1,numb_mask,	r28
    377 	subq	rl0,	acc0,	acc0
    378 	ldq	rl1,	24(rp)
    379 	addq	t1,	acc0,	acc0
    380 	srl	m1a,NAIL_BITS,	t0
    381 	addq	t0,	m0b,	acc1
    382 	sra	acc0,NUMB_BITS,	t1
    383 	stq	r28,	8(rp)
    384 	unop
    385 	ALIGN(16)
        C  Last stage: store the final two result limbs and form the return value.
    386 L(ta2):	and	acc0,numb_mask,	r28
    387 	subq	rl1,	acc1,	acc1
    388 	addq	t1,	acc1,	acc1
    389 	sra	acc1,NUMB_BITS,	t1
    390 	stq	r28,	16(rp)
    391 	and	acc1,numb_mask,	r28
    392 	subq	m1b,	t1,	r0		C result = last high part minus t1
    393 	stq	r28,	24(rp)
    394 	ret	r31,	(r26),	1
    395 EPILOGUE()
    396 ASM_END()
    397