Home | History | Annotate | Line # | Download | only in nails
      1 dnl  Alpha ev6 nails mpn_addmul_4.
      2 
      3 dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C Runs at 2.5 cycles/limb.
     34 
     35 C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
     36 C to 3.24 insn/cycle.
     37 
     38 
C  ARGUMENT REGISTERS
define(`rp', `r16')		C limbs that are read, added to and stored back
define(`up', `r17')		C limbs that get multiplied
define(`n',  `r18')		C number of limbs at up still to be fetched
define(`vp', `r19')		C the four multiplier limbs, read once at entry

C  PER-LIMB WORKING REGISTERS
define(`ulimb',     `r25')	C limb most recently loaded from up
define(`rlimb',     `r27')	C limb most recently loaded from rp
define(`numb_mask', `r24')	C all ones shifted right by NAIL_BITS

C  ONE GROUP PER MULTIPLIER LIMB
C    vK    limb K from vp, shifted left by NAIL_BITS
C    mKa   mulq  result of vK and ulimb (low half)
C    mKb   umulh result of vK and ulimb (high half)
C    accK  running sum number K

define(`v0',   `r6')
define(`m0a',  `r0')
define(`m0b',  `r1')
define(`acc0', `r4')

define(`v1',   `r7')
define(`m1a',  `r2')
define(`m1b',  `r3')
define(`acc1', `r5')

define(`v2',   `r23')
define(`m2a',  `r20')
define(`m2b',  `r21')
define(`acc2', `r22')

define(`v3',   `r15')
define(`m3a',  `r12')
define(`m3b',  `r13')
define(`acc3', `r14')

C  Scratch registers without an alias: r8 r19 r28.  r19 is also vp; it is
C  cleared and reused only after all four limbs at vp have been loaded.

C  Short names for the configured limb layout
define(`NAIL_BITS', `GMP_NAIL_BITS')
define(`NUMB_BITS', `GMP_NUMB_BITS')

C  This declaration is munged by configure
NAILS_SUPPORT(4-63)
     76 
ASM_START()

C mpn_addmul_4 -- multiply the n limbs at up by the four limbs at vp and add
C the product into the limbs at rp (nails build).
C
C In:   rp (r16)  destination; limbs 0..n-1 are read and added into
C       up (r17)  source; n limbs are read
C       n  (r18)  limb count; it is decremented before it is first tested,
C                 so a count of zero is not handled
C       vp (r19)  four multiplier limbs, all loaded at entry
C Out:  n+3 limbs are stored at rp (the top three are written without being
C       read first); r0 holds the limb above those.
C
C r12-r15 are saved on the stack at entry and reloaded before returning.
C Other registers written: r0-r8, r16-r25, r27, r28.
C
C Method: the four multiplier limbs are pre-shifted left by NAIL_BITS.  With
C NAIL_BITS + NUMB_BITS = 64 (assumed, not checked here), the mulq result
C shifted right by NAIL_BITS is then the low NUMB_BITS bits of the product and
C the umulh result is everything above them.  The multiplies for one limb of
C up are issued one step before their results are summed, so the products for
C the first limb are started ahead of the loop and the products of the last
C limb are summed after it.

PROLOGUE(mpn_addmul_4)
	C Make a stack frame and save the registers aliased to m3a, m3b, acc3, v3.
	lda	r30,	-240(r30)
	stq	r12,	32(r30)
	stq	r13,	40(r30)
	stq	r14,	48(r30)
	stq	r15,	56(r30)

	C numb_mask = all ones with the top NAIL_BITS bits cleared.
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

	ldq	v0,	0(vp)
	ldq	v1,	8(vp)
	ldq	v2,	16(vp)
	ldq	v3,	24(vp)

	C Clear the running sums and pre-shift the multiplier limbs.
	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	acc2		C	zero acc2
	sll	v2,NAIL_BITS,	v2
	bis	r31,	r31,	acc3		C	zero acc3
	sll	v3,NAIL_BITS,	v3
	bis	r31,	r31,	r19		C	vp is done with; r19 now carries the nail

	C Start the four products for the first limb of up.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)
	mulq	v2,	ulimb,	m2a		C U1
	umulh	v2,	ulimb,	m2b		C U1
	mulq	v3,	ulimb,	m3a		C U1
	umulh	v3,	ulimb,	m3b		C U1
	beq	n,	L(end)			C U0

	C Main loop.  Each pass sums the products of the previous limb of up,
	C stores one finished limb at rp and starts the products for the next
	C limb.  On entry r19 holds the bits above NUMB_BITS left over from the
	C previous stored limb.  The instruction order and the nops are kept
	C exactly as scheduled; the U0/U1/L0/L1 tags are the original slotting
	C annotations.
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1
	addq	r19,	acc0,	acc0		C U0	propagate nail

	bis	r31,	r31,	r31		C L0	nop
	bis	r31,	r31,	r31		C U1	nop
	bis	r31,	r31,	r31		C L1	nop
	bis	r31,	r31,	r31		C U0	nop

	lda	rp,	8(rp)			C L0
	srl	m0a,NAIL_BITS,	r8		C U0
	lda	up,	8(up)			C L1
	mulq	v0,	ulimb,	m0a		C U1

	addq	r8,	acc0,	r19		C U0
	addq	m0b,	acc1,	acc0		C L0
	umulh	v0,	ulimb,	m0b		C U1
	bis	r31,	r31,	r31		C L1	nop

	addq	rlimb,	r19,	r19		C L0
	srl	m1a,NAIL_BITS,	r8		C U0
	bis	r31,	r31,	r31		C L1	nop
	mulq	v1,	ulimb,	m1a		C U1

	addq	r8,	acc0,	acc0		C U0
	addq	m1b,	acc2,	acc1		C L0
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L1	extract numb part

	bis	r31,	r31,	r31		C L0	nop
	srl	m2a,NAIL_BITS,	r8		C U0
	lda	n,	-1(n)			C L1
	mulq	v2,	ulimb,	m2a		C U1

	addq	r8,	acc1,	acc1		C L1
	addq	m2b,	acc3,	acc2		C L0
	umulh	v2,	ulimb,	m2b		C U1
	srl	r19,NUMB_BITS,	r19		C U0	extract nail part

	bis	r31,	r31,	r31		C L0	nop
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)			C L1
	mulq	v3,	ulimb,	m3a		C U1

	addq	r8,	acc2,	acc2		C L0
	bis	r31,	m3b,	acc3		C L1
	umulh	v3,	ulimb,	m3b		C U1
	bne	n,	L(top)			C U0

	C Wind down.  Sum the products of the last limb of up exactly as the
	C loop does, but without starting new multiplies.  rp is not advanced
	C here: nothing reads it after the stores below, so they address the
	C four remaining limbs as 0, 8, 16 and 24 from the current rp.
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	addq	m1b,	acc2,	acc1
	and	r19,numb_mask,	r28		C extract limb
	srl	m2a,NAIL_BITS,	r8		C U0
	addq	r8,	acc1,	acc1
	addq	m2b,	acc3,	acc2
	srl	r19,NUMB_BITS,	r19		C extract nail
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	0(rp)
	addq	r8,	acc2,	acc2
	bis	r31,	m3b,	acc3

	C Flush the running sums: each one is split into a stored numb part
	C and a nail part that is added into the next sum.
	addq	r19,	acc0,	acc0		C propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	8(rp)
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	acc1

	and	acc1,numb_mask,	r28
	stq	r28,	16(rp)
	srl	acc1,NUMB_BITS,	r19
	addq	r19,	acc2,	acc2

	and	acc2,numb_mask,	r28
	stq	r28,	24(rp)
	srl	acc2,NUMB_BITS,	r19
	addq	r19,	acc3,	r0		C return value: last nail plus acc3

	C Reload the saved registers and release the stack frame.
	ldq	r12,	32(r30)
	ldq	r13,	40(r30)
	ldq	r14,	48(r30)
	ldq	r15,	56(r30)
	lda	r30,	240(r30)
	ret	r31,	(r26),	1
EPILOGUE()
ASM_END()
    211