Home | History | Annotate | Line # | Download | only in ultrasparct3
      1 dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
      2 
      3 dnl  Contributed to the GNU project by Torbjörn Granlund.
      4 
      5 dnl  Copyright 2013 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 
     36 C		    cycles/limb      cycles/limb
     37 C		       mul_4           addmul_4
     38 C UltraSPARC T3:	21.5		22.0
     39 C UltraSPARC T4:	 2.625		 2.75
     40 
     41 
     42 C The code is well-scheduled and relies on OoO very little.  There is hope that
     43 C this will run at around 2.5 and 2.75 c/l respectively, on T4.
     44 
        C Symbolic names for the four values read from the in-registers
        C %i0-%i3 once the save at function entry has been executed.
     45 define(`rp', `%i0')
     46 define(`up', `%i1')
     47 define(`n',  `%i2')
     48 define(`vp', `%i3')
     49 
        C The four limbs loaded from vp[0..3] at function entry.  v3 shares %i3
        C with vp, so vp is no longer usable once v3 has been loaded (it is the
        C last of the four loads).
     50 define(`v0', `%g1')
     51 define(`v1', `%o7')
     52 define(`v2', `%g2')
     53 define(`v3', `%i3')
     54 
        C Running accumulator window.  w0 is the limb that gets stored next;
        C w4 receives the carry out of the low-product additions.
     55 define(`w0', `%o0')
     56 define(`w1', `%o1')
     57 define(`w2', `%o2')
     58 define(`w3', `%o3')
     59 define(`w4', `%o4')
     60 
        C Limb loaded from the rp area; only referenced from AM4 lines, i.e. only
        C in the addmul_4 build.
     61 define(`r0', `%o5')
     62 
        C The two limbs of up that are in flight at any time.
     63 define(`u0', `%i4')
     64 define(`u1', `%i5')
     65 
        C Pointer aliases.  rp0 and up0 reuse the rp and up registers themselves;
        C the function body sets rp1 = rp0 + 8, rp2 = rp0 + 16 and up1 = up0 + 8.
     66 define(`rp0', `rp')
     67 define(`rp1', `%g3')
     68 define(`rp2', `%g4')
     69 define(`up0', `up')
     70 define(`up1', `%g5')
     71 
        C mul_4 build: AM4 discards its argument, so every AM4 line vanishes, and
        C ADDX emits a plain addcc followed by the argument text as given.
     72 ifdef(`OPERATION_mul_4',`
     73       define(`AM4',      `')
     74       define(`ADDX',	 `addcc`'$1')
     75       define(`func',     `mpn_mul_4')
     76 ')
        C addmul_4 build: AM4 emits its argument unchanged, and ADDX becomes an
        C addxccc, so the add it performs also takes in the carry left by the
        C immediately preceding AM4 addcc of r0 (presumably addxccc is the
        C add-with-carry form that also sets the condition codes; its definition
        C is not in this file).
     77 ifdef(`OPERATION_addmul_4',`
     78       define(`AM4',      `$1')
     79       define(`ADDX',	 `addxccc($1,$2,$3)')
     80       define(`func',     `mpn_addmul_4')
     81 ')
     82 
     83 
     84 MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
     85 
     86 ASM_START()
     87 	REGISTER(%g2,#scratch)
     88 	REGISTER(%g3,#scratch)
        C func expands to mpn_mul_4 or mpn_addmul_4, depending on which
        C OPERATION_ symbol is defined.
        C
        C Inputs (after the save): rp = %i0, up = %i1, n = %i2, vp = %i3.
        C The n limbs at up are multiplied by the four limbs at vp.  Limbs
        C rp[0] .. rp[n+2] are stored and the most significant limb is left in
        C %i0, which the final restore hands back to the caller as %o0.
        C In the addmul_4 build the AM4 lines additionally load each of the first
        C n destination limbs into r0 and add it in before the store; the last
        C three stores write w0, w1, w2 with no r0 added.
        C
        C umulxhi, addxc and addxccc are invoked as m4 macros; their definitions
        C are not in this file (presumably they come in through config.m4).
        C
        C NOTE(review): a second limb of up is loaded unconditionally and the even
        C path always executes L(top) once, so very small n is presumably excluded
        C by the callers - confirm against the calling code.
     89 PROLOGUE(func)
     90 	save	%sp, -176, %sp
     91 
        C Setup.  n is turned into a byte count (8 bytes per limb) and biased by
        C -28; rp is moved back by 16; then up and rp are both advanced by the
        C biased n.  n is negated in the delay slot of the be below, so every
        C [pointer + n] operand walks forward through the operands as n grows.
        C The odd -28 bias cancels in each pointer + n sum, so all accesses stay
        C at multiples of 8 from the original pointers.
        C The condition codes set by the andcc are still valid at the be, since
        C none of the instructions in between is a cc-setting form.
     92 	ldx	[up + 0], u1		C load up[0] early
     93 	andcc	n, 1, %g0		C is n odd?
     94 	ldx	[vp + 0], v0
     95 	sllx	n, 3, n
     96 	ldx	[vp + 8], v1
     97 	add	n, -28, n
     98 	ldx	[vp + 16], v2
     99 	add	rp, -16, rp
    100 	ldx	[vp + 24], v3
    101 	add	up, n, up0
    102 	add	rp, n, rp0
    103 	add	up0, 8, up1
    104 	add	rp0, 8, rp1
    105 	add	rp0, 16, rp2
        C Low halves of up[0] times v0..v3, interleaved with clearing w0..w3.
    106 	mulx	u1, v0, %l0
    107 	mov	0, w0
    108 	mulx	u1, v1, %l1
    109 	mov	0, w1
    110 	mulx	u1, v2, %l2
    111 	mov	0, w2
    112 	mulx	u1, v3, %l3
    113 	mov	0, w3
    114 
    115 	be	L(evn)
    116 	 neg	n, n
    117 
        C Odd n: up[0] moves to u0, up[1] is fetched into u1, the high halves for
        C up[0] are started, and the loop is entered at its midpoint with n
        C advanced by one limb (delay slot of the b).
    118 L(odd):	mov	u1, u0
    119 	ldx	[up1 + n], u1
    120 AM4(`	ldx	[rp2 + n], r0')
    121 	umulxhi(u0, v0, %l4)
    122 	umulxhi(u0, v1, %l5)
    123 	umulxhi(u0, v2, %l6)
    124 	umulxhi(u0, v3, %l7)
    125 	b	L(mid)
    126 	 add	n, 8, n
    127 
        C Even n: up[0] stays in u1, up[1] is fetched into u0, the high halves for
        C up[0] are started, and execution falls into L(top) with n advanced by
        C two limbs.
    128 L(evn):	ldx	[up1 + n], u0
    129 AM4(`	ldx	[rp2 + n], r0')
    130 	umulxhi(u1, v0, %l4)
    131 	umulxhi(u1, v1, %l5)
    132 	umulxhi(u1, v2, %l6)
    133 	umulxhi(u1, v3, %l7)
    134 	add	n, 16, n
    135 
        C Main loop, two limbs of up per pass (u0 in the first half, u1 in the
        C second).  Each half does the same thing:
        C   - add the pending low products %l0-%l3 into w0-w3, carry out to w4,
        C     while issuing the low products for the next limb into %l0-%l3;
        C   - fetch the limb of up needed by the other half;
        C   - store the finished limb w0 (addmul_4: after adding r0 to it);
        C   - add the pending high products %l4-%l7 while moving the window down
        C     one limb (w1 to w0, w2 to w1, w3 to w2, w4 to w3), and issue the
        C     high products for the next limb into %l4-%l7.
    136 	ALIGN(16)
    137 L(top):	addcc	%l0, w0, w0
    138 	mulx	u0, v0, %l0	C w 0
    139 	addxccc(%l1, w1, w1)
    140 	mulx	u0, v1, %l1	C w 1
    141 	addxccc(%l2, w2, w2)
    142 	mulx	u0, v2, %l2	C w 2
    143 	addxccc(%l3, w3, w3)
    144 	mulx	u0, v3, %l3	C w 3
    145 	ldx	[up0 + n], u1
    146 	addxc(	%g0, %g0, w4)
    147 AM4(`	addcc	r0, w0, w0')
    148 	stx	w0, [rp0 + n]
    149 	ADDX(`	%l4, w1, w0')
    150 	umulxhi(u0, v0, %l4)	C w 1
    151 AM4(`	ldx	[rp1 + n], r0')
    152 	addxccc(%l5, w2, w1)
    153 	umulxhi(u0, v1, %l5)	C w 2
    154 	addxccc(%l6, w3, w2)
    155 	umulxhi(u0, v2, %l6)	C w 3
    156 	addxc(	%l7, w4, w3)
    157 	umulxhi(u0, v3, %l7)	C w 4
        C Second half of the pass; also the entry point for odd n.
    158 L(mid):	addcc	%l0, w0, w0
    159 	mulx	u1, v0, %l0	C w 1
    160 	addxccc(%l1, w1, w1)
    161 	mulx	u1, v1, %l1	C w 2
    162 	addxccc(%l2, w2, w2)
    163 	mulx	u1, v2, %l2	C w 3
    164 	addxccc(%l3, w3, w3)
    165 	mulx	u1, v3, %l3	C w 4
    166 	ldx	[up1 + n], u0
    167 	addxc(	%g0, %g0, w4)
    168 AM4(`	addcc	r0, w0, w0')
    169 	stx	w0, [rp1 + n]
    170 	ADDX(`	%l4, w1, w0')
    171 	umulxhi(u1, v0, %l4)	C w 2
    172 AM4(`	ldx	[rp2 + n], r0')
    173 	addxccc(%l5, w2, w1)
    174 	umulxhi(u1, v1, %l5)	C w 3
    175 	addxccc(%l6, w3, w2)
    176 	umulxhi(u1, v2, %l6)	C w 4
    177 	addxc(	%l7, w4, w3)
    178 	umulxhi(u1, v3, %l7)	C w 5
        C Loop while n is negative.  n is advanced by two limbs in the delay
        C slot, whether or not the branch is taken.
    179 	brlz	n, L(top)
    180 	 add	n, 16, n
    181 
        C Wind-down.  No further limbs of up are loaded: the products for the
        C last limb (already in u0) are issued, the two remaining pipeline stages
        C are drained, and the top of the accumulator window is written out.
    182 L(end):	addcc	%l0, w0, w0
    183 	mulx	u0, v0, %l0
    184 	addxccc(%l1, w1, w1)
    185 	mulx	u0, v1, %l1
    186 	addxccc(%l2, w2, w2)
    187 	mulx	u0, v2, %l2
    188 	addxccc(%l3, w3, w3)
    189 	mulx	u0, v3, %l3
    190 	addxc(	%g0, %g0, w4)
    191 AM4(`	addcc	r0, w0, w0')
    192 	stx	w0, [rp0 + n]
    193 	ADDX(`	%l4, w1, w0')
    194 	umulxhi(u0, v0, %l4)
    195 AM4(`	ldx	[rp1 + n], r0')
    196 	addxccc(%l5, w2, w1)
    197 	umulxhi(u0, v1, %l5)
    198 	addxccc(%l6, w3, w2)
    199 	umulxhi(u0, v2, %l6)
    200 	addxc(	%l7, w4, w3)
    201 	umulxhi(u0, v3, %l7)
        C Final limb of up: add its low products, store, then add its high
        C products.
    202 	addcc	%l0, w0, w0
    203 	addxccc(%l1, w1, w1)
    204 	addxccc(%l2, w2, w2)
    205 	addxccc(%l3, w3, w3)
    206 	addxc(	%g0, %g0, w4)
    207 AM4(`	addcc	r0, w0, w0')
    208 	stx	w0, [rp1 + n]
    209 	ADDX(`	%l4, w1, w0')
    210 	addxccc(%l5, w2, w1)
    211 	addxccc(%l6, w3, w2)
        C Write the next three limbs; n is stepped by 16 in between so that rp1
        C and rp2 can be reused for the last two of them.  The stx and add
        C instructions do not disturb the carry consumed by the addxc below.
    212 	stx	w0, [rp2 + n]
    213 	add	n, 16, n
    214 	stx	w1, [rp1 + n]
    215 	stx	w2, [rp2 + n]
        C Most significant limb goes to %i0, the return register as seen from
        C this window.
    216 	addxc(	%l7, w4, %i0)
    217 	ret
    218 	 restore
    219 EPILOGUE()
    220