Home | History | Annotate | Line # | Download | only in ultrasparc1234
      1 dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
      2 dnl  store difference in a third limb vector.
      3 
      4 dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 C		   cycles/limb
     35 C UltraSPARC 1&2:     4
     36 C UltraSPARC 3:	      4.5
     37 
     38 C Compute carry-out from the most significant bits of u,v, and r, where
     39 C r=u-v-carry_in, using logic operations.
     40 
     41 C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
     42 C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
     43 C Therefore, it seems futile to try to optimize this any further...
     44 
     45 C INPUT PARAMETERS
     46 define(`rp',`%i0')	dnl  destination limb pointer (target of every stx)
     47 define(`up',`%i1')	dnl  minuend limb pointer (u in r = u - v - cy)
     48 define(`vp',`%i2')	dnl  subtrahend limb pointer (v in r = u - v - cy)
     49 define(`n',`%i3')	dnl  limb count, counted down as limbs are consumed
     50 
     51 define(`u0',`%l0')	dnl  u0..u3: the four limbs in flight loaded from up
     52 define(`u1',`%l2')
     53 define(`u2',`%l4')
     54 define(`u3',`%l6')
     55 define(`v0',`%l1')	dnl  v0..v3: the matching four limbs loaded from vp
     56 define(`v1',`%l3')
     57 define(`v2',`%l5')
     58 define(`v3',`%l7')
     59 
     60 define(`cy',`%i4')	dnl  running borrow; zeroed by mpn_sub_n, taken as supplied by mpn_sub_nc
     61 
     62 define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
     63 define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
     64 
     65 ASM_START()
     66 	REGISTER(%g2,#scratch)	C %g2 and %g3 serve below as temporaries for the borrow logic
     67 	REGISTER(%g3,#scratch)
     68 PROLOGUE(mpn_sub_nc)
        C mpn_sub_nc: entry point that performs the same subtraction as mpn_sub_n
        C but keeps whatever borrow-in is already in cy (%i4) instead of zeroing
        C it.  It contains no loop of its own: both exits jump into code that lives
        C inside mpn_sub_n (.Loop0 for n < 4, L(com) otherwise).
     69 	save	%sp,-160,%sp	C new register window and a 160-byte stack frame
     70 
     71 	fitod	%f0,%f0		C make sure f0 contains small, quiet number
     72 	subcc	n,4,%g0		C set flags from n - 4; the result itself is discarded
     73 	bl,pn	%xcc,.Loop0	C n < 4: go straight to the one-limb-per-pass loop
     74 	nop			C delay slot; unlike mpn_sub_n, cy is left untouched here
     75 	b,a	L(com)		C annulled branch (no delay slot executed) to the shared 4-way path
     76 EPILOGUE()
     77 
     78 PROLOGUE(mpn_sub_n)
        C mpn_sub_n: for each of the n limbs computes r = u - v - cy with
        C u from up, v from vp, stores r through rp, and carries the borrow to the
        C next limb.  The borrow-in is zero; the final borrow is placed in %i0
        C just before ret/restore.
        C n must be > 0: the .Loop0 tail only tests n after decrementing it.
        C mpn_sub_nc enters this code at L(com) and .Loop0 with its own borrow-in.
     79 	save	%sp,-160,%sp	C new register window and a 160-byte stack frame
     80 
     81 	fitod	%f0,%f0		C make sure f0 contains small, quiet number
     82 	subcc	n,4,%g0		C set flags from n - 4; the result itself is discarded
     83 	bl,pn	%xcc,.Loop0	C n < 4: only the one-limb-per-pass loop is needed
     84 	mov	0,cy		C delay slot (executed on both paths): borrow-in = 0
     85 L(com):
        C Preload the first four limb pairs and step up/vp past them (4 x 8 bytes).
     86 	ldx	[up+0],u0
     87 	ldx	[vp+0],v0
     88 	add	up,32,up
     89 	ldx	[up-24],u1
     90 	ldx	[vp+8],v1
     91 	add	vp,32,vp
     92 	ldx	[up-16],u2
     93 	ldx	[vp-16],v2
     94 	ldx	[up-8],u3
     95 	ldx	[vp-8],v3
     96 	subcc	n,8,n		C n -= 8: the 4 limbs just loaded plus the 4 a loop pass would load
     97 	sub	u0,v0,%g1	C main sub
     98 	sub	%g1,cy,%g5	C carry sub
     99 	orn	u0,v0,%g2	C %g2 = u0 | ~v0, first term of the borrow logic
    100 	bl,pn	%xcc,.Lend4567	C flags still from the subcc: fewer than 8 limbs, skip the main loop
    101 	fanop
    102 	b,a	.Loop		C annulled branch over the alignment padding
    103 
    104 	.align	16
    105 C START MAIN LOOP
        C Each pass finishes four limbs and loads the next four.  State on entry,
        C for the limb at the head of the pipeline:
        C   %g5 = r = u - v - cy   (difference, not yet stored)
        C   %g2 = u | ~v
        C Borrow out of a limb is bit 63 of  (r | (~u & v)) & ~(u & ~v), built as
        C   orn  r,%g2   -> r | (~u & v)
        C   andn u,v     -> u & ~v
        C   andn         -> combine the two
        C   srlx ..,63   -> cy
        C The "C --" lines separate groups of four instructions; fanop/fmnop fill
        C the floating-point slots (presumably one issue group per cycle, matching
        C the 4 cycles/limb figure in the file header).
    106 .Loop:	orn	%g5,%g2,%g2	C limb 0: r | (~u & v)
    107 	andn	u0,v0,%g3	C limb 0: u & ~v
    108 	ldx	[up+0],u0	C u0 is consumed; fetch its replacement for the next pass
    109 	fanop
    110 C --
    111 	andn	%g2,%g3,%g2	C limb 0: borrow-out is now bit 63 of %g2
    112 	ldx	[vp+0],v0
    113 	add	up,32,up
    114 	fanop
    115 C --
    116 	srlx	%g2,63,cy	C cy = borrow out of limb 0
    117 	sub	u1,v1,%g1	C limb 1: main sub
    118 	stx	%g5,[rp+0]	C store limb 0 difference (%g5 is overwritten in the next group)
    119 	fanop
    120 C --
    121 	sub	%g1,cy,%g5	C limb 1: carry sub
    122 	orn	u1,v1,%g2
    123 	fmnop
    124 	fanop
    125 C --
    126 	orn	%g5,%g2,%g2	C limb 1: same borrow sequence as limb 0
    127 	andn	u1,v1,%g3
    128 	ldx	[up-24],u1	C up was already advanced, hence the negative offset
    129 	fanop
    130 C --
    131 	andn	%g2,%g3,%g2
    132 	ldx	[vp+8],v1
    133 	add	vp,32,vp
    134 	fanop
    135 C --
    136 	srlx	%g2,63,cy	C cy = borrow out of limb 1
    137 	sub	u2,v2,%g1	C limb 2: main sub
    138 	stx	%g5,[rp+8]	C store limb 1 difference
    139 	fanop
    140 C --
    141 	sub	%g1,cy,%g5	C limb 2: carry sub
    142 	orn	u2,v2,%g2
    143 	fmnop
    144 	fanop
    145 C --
    146 	orn	%g5,%g2,%g2	C limb 2: same borrow sequence
    147 	andn	u2,v2,%g3
    148 	ldx	[up-16],u2
    149 	fanop
    150 C --
    151 	andn	%g2,%g3,%g2
    152 	ldx	[vp-16],v2
    153 	add	rp,32,rp	C rp advances mid-pass; the remaining two stores use -16 and -8
    154 	fanop
    155 C --
    156 	srlx	%g2,63,cy	C cy = borrow out of limb 2
    157 	sub	u3,v3,%g1	C limb 3: main sub
    158 	stx	%g5,[rp-16]	C store limb 2 difference
    159 	fanop
    160 C --
    161 	sub	%g1,cy,%g5	C limb 3: carry sub
    162 	orn	u3,v3,%g2
    163 	fmnop
    164 	fanop
    165 C --
    166 	orn	%g5,%g2,%g2	C limb 3: same borrow sequence
    167 	andn	u3,v3,%g3
    168 	ldx	[up-8],u3
    169 	fanop
    170 C --
    171 	andn	%g2,%g3,%g2
    172 	subcc	n,4,n		C n -= 4; these flags feed the bge at the bottom of the loop
    173 	ldx	[vp-8],v3
    174 	fanop
    175 C --
    176 	srlx	%g2,63,cy	C cy = borrow out of limb 3
    177 	sub	u0,v0,%g1	C next pass, limb 0: main sub on the freshly loaded pair
    178 	stx	%g5,[rp-8]	C store limb 3 difference
    179 	fanop
    180 C --
    181 	sub	%g1,cy,%g5	C next pass, limb 0: carry sub (re-establishes the entry state)
    182 	orn	u0,v0,%g2
    183 	bge,pt	%xcc,.Loop	C n >= 0: another full group of four lies beyond the one just loaded
    184 	fanop
    185 C END MAIN LOOP
    186 .Lend4567:
        C Wind-down: finish the four limb pairs already held in u0..u3/v0..v3 using
        C the same per-limb sequence as the main loop, but with no further loads.
    187 	orn	%g5,%g2,%g2
    188 	andn	u0,v0,%g3
    189 	andn	%g2,%g3,%g2
    190 	srlx	%g2,63,cy	C cy = borrow out of limb 0
    191 	sub	u1,v1,%g1
    192 	stx	%g5,[rp+0]
    193 	sub	%g1,cy,%g5
    194 	orn	u1,v1,%g2
    195 	orn	%g5,%g2,%g2
    196 	andn	u1,v1,%g3
    197 	andn	%g2,%g3,%g2
    198 	srlx	%g2,63,cy	C cy = borrow out of limb 1
    199 	sub	u2,v2,%g1
    200 	stx	%g5,[rp+8]
    201 	sub	%g1,cy,%g5
    202 	orn	u2,v2,%g2
    203 	orn	%g5,%g2,%g2
    204 	andn	u2,v2,%g3
    205 	andn	%g2,%g3,%g2
    206 	add	rp,32,rp
    207 	srlx	%g2,63,cy	C cy = borrow out of limb 2
    208 	sub	u3,v3,%g1
    209 	stx	%g5,[rp-16]
    210 	sub	%g1,cy,%g5
    211 	orn	u3,v3,%g2
    212 	orn	%g5,%g2,%g2
    213 	andn	u3,v3,%g3
    214 	andn	%g2,%g3,%g2
    215 	srlx	%g2,63,cy	C cy = borrow out of limb 3
    216 	stx	%g5,[rp-8]
    217 
    218 	addcc	n,4,n		C n was -4..-1 here; n + 4 = limbs still unprocessed (0..3)
    219 	bz,pn	%xcc,.Lret	C nothing left: return the borrow
    220 	fanop
    221 
    222 .Loop0:	ldx	[up],u0		C one limb per pass: the leftover limbs, or the whole job when n < 4
    223 	add	up,8,up
    224 	ldx	[vp],v0
    225 	add	vp,8,vp
    226 	add	rp,8,rp
    227 	subcc	n,1,n		C flags for the bnz below; nothing in between sets them
    228 	sub	u0,v0,%g1
    229 	orn	u0,v0,%g2
    230 	sub	%g1,cy,%g5	C r = u - v - cy
    231 	andn	u0,v0,%g3
    232 	orn	%g5,%g2,%g2
    233 	stx	%g5,[rp-8]	C rp was already advanced, hence -8
    234 	andn	%g2,%g3,%g2
    235 	bnz,pt	%xcc,.Loop0
    236 	srlx	%g2,63,cy	C delay slot: borrow out, computed on every pass including the last
    237 
    238 .Lret:	mov	cy,%i0		C return value = final borrow
    239 	ret
    240 	restore			C delay slot of ret: pop the register window
    241 EPILOGUE(mpn_sub_n)
    242