Home | History | Annotate | Line # | Download | only in ultrasparct3
      1  1.1  mrg dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
      2  1.1  mrg 
      3  1.1  mrg dnl  Contributed to the GNU project by Torbjörn Granlund.
      4  1.1  mrg 
      5  1.1  mrg dnl  Copyright 2013 Free Software Foundation, Inc.
      6  1.1  mrg 
      7  1.1  mrg dnl  This file is part of the GNU MP Library.
      8  1.1  mrg dnl
      9  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1  mrg dnl  it under the terms of either:
     11  1.1  mrg dnl
     12  1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1  mrg dnl      option) any later version.
     15  1.1  mrg dnl
     16  1.1  mrg dnl  or
     17  1.1  mrg dnl
     18  1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1  mrg dnl      later version.
     21  1.1  mrg dnl
     22  1.1  mrg dnl  or both in parallel, as here.
     23  1.1  mrg dnl
     24  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1  mrg dnl  for more details.
     28  1.1  mrg dnl
     29  1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1  mrg dnl  see https://www.gnu.org/licenses/.
     32  1.1  mrg 
     33  1.1  mrg include(`../config.m4')
     34  1.1  mrg 
     35  1.1  mrg 
     36  1.1  mrg C		    cycles/limb      cycles/limb
     37  1.1  mrg C		       mul_4           addmul_4
     38  1.1  mrg C UltraSPARC T3:	21.5		22.0
     39  1.1  mrg C UltraSPARC T4:	 2.625		 2.75
     40  1.1  mrg 
     41  1.1  mrg 
     42  1.1  mrg C The code is well-scheduled and relies on out-of-order execution (OoO) very
     43  1.1  mrg C little.  There is hope that mul_4 and addmul_4 will run at around 2.5 and 2.75 c/l respectively, on T4.
     44  1.1  mrg 
     45  1.1  mrg define(`rp', `%i0')	C destination pointer (stores go through rp0/rp1/rp2)
     46  1.1  mrg define(`up', `%i1')	C source pointer for the u limbs
     47  1.1  mrg define(`n',  `%i2')	C limb count on entry, later a scaled and negated index
     48  1.1  mrg define(`vp', `%i3')	C pointer to the four v limbs
     49  1.1  mrg C The four limbs loaded from vp[0..3].
     50  1.1  mrg define(`v0', `%g1')
     51  1.1  mrg define(`v1', `%o7')
     52  1.1  mrg define(`v2', `%g2')
     53  1.1  mrg define(`v3', `%i3')	C same register as vp, so it must be loaded last
     54  1.1  mrg C Sliding window of partial sums; w4 receives the carry out of w0-w3.
     55  1.1  mrg define(`w0', `%o0')
     56  1.1  mrg define(`w1', `%o1')
     57  1.1  mrg define(`w2', `%o2')
     58  1.1  mrg define(`w3', `%o3')
     59  1.1  mrg define(`w4', `%o4')
     60  1.1  mrg C Limb read back from rp; only referenced inside AM4(...) lines.
     61  1.1  mrg define(`r0', `%o5')
     62  1.1  mrg C The two u limbs in flight; their roles swap between the two loop halves.
     63  1.1  mrg define(`u0', `%i4')
     64  1.1  mrg define(`u1', `%i5')
     65  1.1  mrg C Biased base pointers, 8 bytes apart, all indexed by n.
     66  1.1  mrg define(`rp0', `rp')
     67  1.1  mrg define(`rp1', `%g3')
     68  1.1  mrg define(`rp2', `%g4')
     69  1.1  mrg define(`up0', `up')
     70  1.1  mrg define(`up1', `%g5')
     71  1.1  mrg C Operation-specific macros: AM4 wraps the addmul-only lines, ADDX is the first add of each high-half chain.
     72  1.1  mrg ifdef(`OPERATION_mul_4',`
     73  1.1  mrg       define(`AM4',      `')
     74  1.1  mrg       define(`ADDX',	 `addcc`'$1')
     75  1.1  mrg       define(`func',     `mpn_mul_4')
     76  1.1  mrg ')	C mul_4: AM4(...) expands to nothing and ADDX is a plain addcc
     77  1.1  mrg ifdef(`OPERATION_addmul_4',`
     78  1.1  mrg       define(`AM4',      `$1')
     79  1.1  mrg       define(`ADDX',	 `addxccc($1,$2,$3)')
     80  1.1  mrg       define(`func',     `mpn_addmul_4')
     81  1.1  mrg ')	C addmul_4: AM4(...) is emitted and ADDX is addxccc, picking up the carry from adding r0
     82  1.1  mrg C func(rp, up, n, vp): multiply the n limbs at up by the 4 limbs at vp; addmul_4 also adds in the limbs already at rp[0..n-1].
     83  1.1  mrg C The most significant result limb is returned in %i0; the limbs below it are stored through rp.
     84  1.1  mrg MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
     85  1.1  mrg C NOTE(review): the entry paths plus L(end) always process at least three u limbs, so this looks like it needs n >= 3 - confirm with callers.
     86  1.1  mrg ASM_START()
     87  1.1  mrg 	REGISTER(%g2,#scratch)
     88  1.1  mrg 	REGISTER(%g3,#scratch)
     89  1.1  mrg PROLOGUE(func)
     90  1.1  mrg 	save	%sp, -176, %sp	C new register window; arguments are read from %i0-%i3
     91  1.1  mrg 
     92  1.1  mrg 	ldx	[up + 0], u1		C load up[0] early
     93  1.1  mrg 	andcc	n, 1, %g0		C is n odd?  (flags are consumed by the be below)
     94  1.1  mrg 	ldx	[vp + 0], v0
     95  1.1  mrg 	sllx	n, 3, n		C n *= 8, limb count to byte count
     96  1.1  mrg 	ldx	[vp + 8], v1
     97  1.1  mrg 	add	n, -28, n	C bias; it cancels once the negated n is used as the index
     98  1.1  mrg 	ldx	[vp + 16], v2
     99  1.1  mrg 	add	rp, -16, rp	C pre-bias rp so that rp2 + n addresses rp[0] after the neg below
    100  1.1  mrg 	ldx	[vp + 24], v3	C overwrites vp (same register); vp is not read again
    101  1.1  mrg 	add	up, n, up0	C up0 = up + 8n - 28
    102  1.1  mrg 	add	rp, n, rp0	C rp0 = rp - 16 + 8n - 28
    103  1.1  mrg 	add	up0, 8, up1
    104  1.1  mrg 	add	rp0, 8, rp1
    105  1.1  mrg 	add	rp0, 16, rp2
    106  1.1  mrg 	mulx	u1, v0, %l0	C %l0-%l3 = low halves of up[0] * v0..v3
    107  1.1  mrg 	mov	0, w0		C clear the accumulator window w0-w3
    108  1.1  mrg 	mulx	u1, v1, %l1
    109  1.1  mrg 	mov	0, w1
    110  1.1  mrg 	mulx	u1, v2, %l2
    111  1.1  mrg 	mov	0, w2
    112  1.1  mrg 	mulx	u1, v3, %l3
    113  1.1  mrg 	mov	0, w3
    114  1.1  mrg 
    115  1.1  mrg 	be	L(evn)		C n even: enter the loop at L(top)
    116  1.1  mrg 	 neg	n, n		C delay slot: n = 28 - 8n, the index that counts upward
    117  1.1  mrg 
    118  1.1  mrg L(odd):	mov	u1, u0		C n odd: up[0] plays the u0 role and the loop is entered at L(mid)
    119  1.1  mrg 	ldx	[up1 + n], u1	C u1 = up[1]
    120  1.1  mrg AM4(`	ldx	[rp2 + n], r0')	C r0 = rp[0]
    121  1.1  mrg 	umulxhi(u0, v0, %l4)	C %l4-%l7 = high halves of up[0] * v0..v3
    122  1.1  mrg 	umulxhi(u0, v1, %l5)
    123  1.1  mrg 	umulxhi(u0, v2, %l6)
    124  1.1  mrg 	umulxhi(u0, v3, %l7)
    125  1.1  mrg 	b	L(mid)
    126  1.1  mrg 	 add	n, 8, n		C delay slot: step the index by one limb
    127  1.1  mrg 
    128  1.1  mrg L(evn):	ldx	[up1 + n], u0	C u0 = up[1]
    129  1.1  mrg AM4(`	ldx	[rp2 + n], r0')	C r0 = rp[0]
    130  1.1  mrg 	umulxhi(u1, v0, %l4)	C %l4-%l7 = high halves of up[0] * v0..v3
    131  1.1  mrg 	umulxhi(u1, v1, %l5)
    132  1.1  mrg 	umulxhi(u1, v2, %l6)
    133  1.1  mrg 	umulxhi(u1, v3, %l7)
    134  1.1  mrg 	add	n, 16, n	C step the index by two limbs before falling into L(top)
    135  1.1  mrg 
    136  1.1  mrg 	ALIGN(16)
    137  1.1  mrg L(top):	addcc	%l0, w0, w0	C first half: add the pending low halves into the window
    138  1.1  mrg 	mulx	u0, v0, %l0	C w 0
    139  1.1  mrg 	addxccc(%l1, w1, w1)
    140  1.1  mrg 	mulx	u0, v1, %l1	C w 1
    141  1.1  mrg 	addxccc(%l2, w2, w2)
    142  1.1  mrg 	mulx	u0, v2, %l2	C w 2
    143  1.1  mrg 	addxccc(%l3, w3, w3)
    144  1.1  mrg 	mulx	u0, v3, %l3	C w 3
    145  1.1  mrg 	ldx	[up0 + n], u1	C fetch the u limb used by the second half
    146  1.1  mrg 	addxc(	%g0, %g0, w4)	C w4 = carry out of the low-half chain
    147  1.1  mrg AM4(`	addcc	r0, w0, w0')	C addmul only: add the limb read from rp
    148  1.1  mrg 	stx	w0, [rp0 + n]	C store one finished result limb
    149  1.1  mrg 	ADDX(`	%l4, w1, w0')	C slide the window down one limb while adding the high halves
    150  1.1  mrg 	umulxhi(u0, v0, %l4)	C w 1
    151  1.1  mrg AM4(`	ldx	[rp1 + n], r0')	C addmul only: rp limb for the second half
    152  1.1  mrg 	addxccc(%l5, w2, w1)
    153  1.1  mrg 	umulxhi(u0, v1, %l5)	C w 2
    154  1.1  mrg 	addxccc(%l6, w3, w2)
    155  1.1  mrg 	umulxhi(u0, v2, %l6)	C w 3
    156  1.1  mrg 	addxc(	%l7, w4, w3)	C new top of window = high half + w4 + carry
    157  1.1  mrg 	umulxhi(u0, v3, %l7)	C w 4
    158  1.1  mrg L(mid):	addcc	%l0, w0, w0	C second half: same steps with u0/u1 swapped, using up1, rp1 and rp2
    159  1.1  mrg 	mulx	u1, v0, %l0	C w 1
    160  1.1  mrg 	addxccc(%l1, w1, w1)
    161  1.1  mrg 	mulx	u1, v1, %l1	C w 2
    162  1.1  mrg 	addxccc(%l2, w2, w2)
    163  1.1  mrg 	mulx	u1, v2, %l2	C w 3
    164  1.1  mrg 	addxccc(%l3, w3, w3)
    165  1.1  mrg 	mulx	u1, v3, %l3	C w 4
    166  1.1  mrg 	ldx	[up1 + n], u0	C fetch the u limb used by the next first half
    167  1.1  mrg 	addxc(	%g0, %g0, w4)	C w4 = carry out of the low-half chain
    168  1.1  mrg AM4(`	addcc	r0, w0, w0')	C addmul only: add the limb read from rp
    169  1.1  mrg 	stx	w0, [rp1 + n]	C store one finished result limb
    170  1.1  mrg 	ADDX(`	%l4, w1, w0')	C slide the window down one limb while adding the high halves
    171  1.1  mrg 	umulxhi(u1, v0, %l4)	C w 2
    172  1.1  mrg AM4(`	ldx	[rp2 + n], r0')	C addmul only: rp limb for the next first half
    173  1.1  mrg 	addxccc(%l5, w2, w1)
    174  1.1  mrg 	umulxhi(u1, v1, %l5)	C w 3
    175  1.1  mrg 	addxccc(%l6, w3, w2)
    176  1.1  mrg 	umulxhi(u1, v2, %l6)	C w 4
    177  1.1  mrg 	addxc(	%l7, w4, w3)	C new top of window = high half + w4 + carry
    178  1.1  mrg 	umulxhi(u1, v3, %l7)	C w 5
    179  1.1  mrg 	brlz	n, L(top)	C loop while the index is still negative
    180  1.1  mrg 	 add	n, 16, n	C delay slot: advance the index by two limbs
    181  1.1  mrg 
    182  1.1  mrg L(end):	addcc	%l0, w0, w0	C tail: finish the last two u limbs; nothing more is loaded from up
    183  1.1  mrg 	mulx	u0, v0, %l0	C low halves for the final u limb
    184  1.1  mrg 	addxccc(%l1, w1, w1)
    185  1.1  mrg 	mulx	u0, v1, %l1
    186  1.1  mrg 	addxccc(%l2, w2, w2)
    187  1.1  mrg 	mulx	u0, v2, %l2
    188  1.1  mrg 	addxccc(%l3, w3, w3)
    189  1.1  mrg 	mulx	u0, v3, %l3
    190  1.1  mrg 	addxc(	%g0, %g0, w4)	C w4 = carry out of the low-half chain
    191  1.1  mrg AM4(`	addcc	r0, w0, w0')	C addmul only: add the limb read from rp
    192  1.1  mrg 	stx	w0, [rp0 + n]
    193  1.1  mrg 	ADDX(`	%l4, w1, w0')
    194  1.1  mrg 	umulxhi(u0, v0, %l4)	C high halves for the final u limb
    195  1.1  mrg AM4(`	ldx	[rp1 + n], r0')	C addmul only: last limb read from rp
    196  1.1  mrg 	addxccc(%l5, w2, w1)
    197  1.1  mrg 	umulxhi(u0, v1, %l5)
    198  1.1  mrg 	addxccc(%l6, w3, w2)
    199  1.1  mrg 	umulxhi(u0, v2, %l6)
    200  1.1  mrg 	addxc(	%l7, w4, w3)
    201  1.1  mrg 	umulxhi(u0, v3, %l7)
    202  1.1  mrg 	addcc	%l0, w0, w0	C final limb: add its low halves
    203  1.1  mrg 	addxccc(%l1, w1, w1)
    204  1.1  mrg 	addxccc(%l2, w2, w2)
    205  1.1  mrg 	addxccc(%l3, w3, w3)
    206  1.1  mrg 	addxc(	%g0, %g0, w4)
    207  1.1  mrg AM4(`	addcc	r0, w0, w0')
    208  1.1  mrg 	stx	w0, [rp1 + n]
    209  1.1  mrg 	ADDX(`	%l4, w1, w0')	C final limb: add its high halves
    210  1.1  mrg 	addxccc(%l5, w2, w1)
    211  1.1  mrg 	addxccc(%l6, w3, w2)
    212  1.1  mrg 	stx	w0, [rp2 + n]	C the stores and the add below leave the carry flag for the addxc alone
    213  1.1  mrg 	add	n, 16, n	C step the index so rp1/rp2 address the last two stored limbs
    214  1.1  mrg 	stx	w1, [rp1 + n]
    215  1.1  mrg 	stx	w2, [rp2 + n]
    216  1.1  mrg 	addxc(	%l7, w4, %i0)	C return value: the most significant limb, completing the carry chain
    217  1.1  mrg 	ret
    218  1.1  mrg 	 restore		C delay slot: pop the register window
    219  1.1  mrg EPILOGUE()
    220