Home | History | Annotate | Line # | Download | only in v9
      1      1.1  mrg dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
      2      1.1  mrg dnl  the result to a second limb vector.
      3      1.1  mrg 
      4      1.1  mrg dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1.1.2  mrg dnl
      8      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27  1.1.1.2  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg C Algorithm: We use two floating-point multiplies per limb product, with the
     35      1.1  mrg C invariant v operand split into two 16-bit pieces, and the u operand split
     36      1.1  mrg C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
     37      1.1  mrg C the integer unit.
     38      1.1  mrg 
     39      1.1  mrg C		   cycles/limb
     40      1.1  mrg C UltraSPARC 1&2:     6.5
     41      1.1  mrg C UltraSPARC 3:	      ?
     42      1.1  mrg 
     43      1.1  mrg C Possible optimizations:
     44      1.1  mrg C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
     45      1.1  mrg C      memory bandwidth limited, this could save 1.5 cycles/limb.
     46      1.1  mrg C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
     47      1.1  mrg C      it is very straightforward to unroll, using an exit branch midways.
     48      1.1  mrg C      Unrolling would allow deeper scheduling which could improve speed for L2
     49      1.1  mrg C      cache case.
     50      1.1  mrg C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
     51      1.1  mrg C      aren't sufficiently apart-scheduled with just two temp areas.
     52      1.1  mrg C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
     53      1.1  mrg C      could save many operations.
     54      1.1  mrg 
     55      1.1  mrg C INPUT PARAMETERS
     56      1.1  mrg C rp	o0
     57      1.1  mrg C up	o1
     58      1.1  mrg C n	o2
     59      1.1  mrg C v	o3
     60      1.1  mrg 
     61      1.1  mrg define(`FSIZE',224)		C stack bytes reserved by mpn_addmul_1
     62      1.1  mrg 
     63      1.1  mrg ASM_START()
                      C mpn_addmul_1 -- rp[0..n-1] += up[0..n-1] * v on 32-bit limbs.  The carry-out
                      C limb is returned in %o0.
                      C
                      C No save instruction is executed, so the arguments are used in place:
                      C %o0 = rp, %o1 = up, %o2 = n, %o3 = v.
                      C
                      C n = 0 is not handled: the first limb is loaded before any test of the count,
                      C and the count is only ever tested for reaching zero after a decrement.
                      C
                      C Register roles:
                      C   %f6   (double) (v & 0xffff)
                      C   %f8   (double) (v >> 16)
                      C   %f10  kept zero, so the pair %f10:%f11 holds up[i] as a 64-bit integer
                      C   %f2   (double) up[i]
                      C   %f4   up[i] * (v & 0xffff), converted to a 64-bit integer in %f12 (p0)
                      C   %f16  up[i] * (v >> 16), converted to a 64-bit integer in %f14 (p16)
                      C   %g1   p0 and %g2 p16, reloaded from the scratch area
                      C   %g3   carry limb, %g4 running sum for one limb, %g5 rp[i]
                      C   %o5   scratch pointer; xor with 16 alternates between two 16-byte slots
                      C
                      C The code is software pipelined.  The .L_*_or_more blocks start the pipeline
                      C and dispatch on n, .Loop is the steady state, and .L5 down to .L1 drain it.
     64      1.1  mrg PROLOGUE(mpn_addmul_1)
     65      1.1  mrg 	add	%sp, -FSIZE, %sp		C reserve FSIZE bytes of stack
     66      1.1  mrg 	sethi	%hi(0xffff), %g1
     67      1.1  mrg 	srl	%o3, 16, %g2		C g2 = v >> 16
     68      1.1  mrg 	or	%g1, %lo(0xffff), %g1	C g1 = 0xffff
     69      1.1  mrg 	and	%o3, %g1, %g1		C g1 = v & 0xffff
     70      1.1  mrg 	stx	%g1, [%sp+104]		C both halves of v go to the FPU through memory
     71      1.1  mrg 	stx	%g2, [%sp+112]
     72      1.1  mrg 	ldd	[%sp+104], %f6
     73      1.1  mrg 	ldd	[%sp+112], %f8
     74      1.1  mrg 	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
     75      1.1  mrg 	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
     76      1.1  mrg 	ld	[%sp+104], %f10		C zero f10
     77      1.1  mrg 
     78      1.1  mrg 	mov	0, %g3			C cy = 0
     79      1.1  mrg 
     80      1.1  mrg define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
     81      1.1  mrg 
     82      1.1  mrg 	add	%sp, 160, %o5		C point in scratch area
     83      1.1  mrg 	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
     84      1.1  mrg 
     85      1.1  mrg 	subcc	%o2, 1, %o2		C n--; zero here means n was 1
     86      1.1  mrg 	ld	[%o1], %f11		C read up[i]
     87      1.1  mrg 	add	%o1, 4, %o1		C up++
     88      1.1  mrg 	bne,pt	%icc, .L_two_or_more
     89      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
     90      1.1  mrg 
                      C n = 1: a single product, nothing to overlap.
     91      1.1  mrg 	fmuld	%f2, %f8, %f16		C f16 = up[i] * (v >> 16)
     92      1.1  mrg 	fmuld	%f2, %f6, %f4		C f4 = up[i] * (v & 0xffff)
     93      1.1  mrg 	fdtox	%f16, %f14		C convert both products to 64-bit integers
     94      1.1  mrg 	fdtox	%f4, %f12
     95      1.1  mrg 	std	%f14, [%o5+16]		C hand them to the integer unit through scratch
     96      1.1  mrg 	std	%f12, [%o5+24]
     97      1.1  mrg 	ldx	[%o5+16], %g2		C p16
     98      1.1  mrg 	ldx	[%o5+24], %g1		C p0
     99      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    100      1.1  mrg 	b	.L1
    101      1.1  mrg 	add	%o0, -16, %o0		C (delay slot) bias rp so [%o0+16] in .L1 is rp[0]
    102      1.1  mrg 
    103      1.1  mrg 	.align	16
                      C n >= 2.  The products for up[0] are started while up[1] is fetched; the
                      C fall-through path after the bne handles n = 2.
    104      1.1  mrg .L_two_or_more:
    105      1.1  mrg 	subcc	%o2, 1, %o2		C n--; zero here means n was 2
    106      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    107      1.1  mrg 	fmuld	%f2, %f8, %f16
    108      1.1  mrg 	fmuld	%f2, %f6, %f4
    109      1.1  mrg 	add	%o1, 4, %o1		C up++
    110      1.1  mrg 	bne,pt	%icc, .L_three_or_more
    111      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    112      1.1  mrg 
    113      1.1  mrg 	fdtox	%f16, %f14
    114      1.1  mrg 	fdtox	%f4, %f12
    115      1.1  mrg 	std	%f14, [%o5+16]
    116      1.1  mrg 	fmuld	%f2, %f8, %f16
    117      1.1  mrg 	std	%f12, [%o5+24]
    118      1.1  mrg 	fmuld	%f2, %f6, %f4
    119      1.1  mrg 	fdtox	%f16, %f14
    120      1.1  mrg 	fdtox	%f4, %f12
    121      1.1  mrg 	std	%f14, [%o5+0]
    122      1.1  mrg 	std	%f12, [%o5+8]
    123      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    124      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    125      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    126      1.1  mrg 	b	.L2
    127      1.1  mrg 	add	%o0, -12, %o0		C (delay slot) bias rp so [%o0+12] in .L2 is rp[0]
    128      1.1  mrg 
    129      1.1  mrg 	.align	16
                      C n >= 3.  The fall-through path after the bne handles n = 3.
    130      1.1  mrg .L_three_or_more:
    131      1.1  mrg 	subcc	%o2, 1, %o2		C n--; zero here means n was 3
    132      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    133      1.1  mrg 	fdtox	%f16, %f14
    134      1.1  mrg 	fdtox	%f4, %f12
    135      1.1  mrg 	std	%f14, [%o5+16]
    136      1.1  mrg 	fmuld	%f2, %f8, %f16
    137      1.1  mrg 	std	%f12, [%o5+24]
    138      1.1  mrg 	fmuld	%f2, %f6, %f4
    139      1.1  mrg 	add	%o1, 4, %o1		C up++
    140      1.1  mrg 	bne,pt	%icc, .L_four_or_more
    141      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    142      1.1  mrg 
    143      1.1  mrg 	fdtox	%f16, %f14
    144      1.1  mrg 	fdtox	%f4, %f12
    145      1.1  mrg 	std	%f14, [%o5+0]
    146      1.1  mrg 	fmuld	%f2, %f8, %f16
    147      1.1  mrg 	std	%f12, [%o5+8]
    148      1.1  mrg 	fmuld	%f2, %f6, %f4
    149      1.1  mrg 	fdtox	%f16, %f14
    150      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    151      1.1  mrg 	fdtox	%f4, %f12
    152      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    153      1.1  mrg 	std	%f14, [%o5+16]
    154      1.1  mrg 	std	%f12, [%o5+24]
    155      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    156      1.1  mrg 	b	.L3
    157      1.1  mrg 	add	%o0, -8, %o0		C (delay slot) bias rp so [%o0+8] in .L3 is rp[0]
    158      1.1  mrg 
    159      1.1  mrg 	.align	16
                      C n >= 4.  The fall-through path after the bne handles n = 4.
    160      1.1  mrg .L_four_or_more:
    161      1.1  mrg 	subcc	%o2, 1, %o2		C n--; zero here means n was 4
    162      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    163      1.1  mrg 	fdtox	%f16, %f14
    164      1.1  mrg 	fdtox	%f4, %f12
    165      1.1  mrg 	std	%f14, [%o5+0]
    166      1.1  mrg 	fmuld	%f2, %f8, %f16
    167      1.1  mrg 	std	%f12, [%o5+8]
    168      1.1  mrg 	fmuld	%f2, %f6, %f4
    169      1.1  mrg 	add	%o1, 4, %o1		C up++
    170      1.1  mrg 	bne,pt	%icc, .L_five_or_more
    171      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    172      1.1  mrg 
    173      1.1  mrg 	fdtox	%f16, %f14
    174      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    175      1.1  mrg 	fdtox	%f4, %f12
    176      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    177      1.1  mrg 	std	%f14, [%o5+16]
    178      1.1  mrg 	fmuld	%f2, %f8, %f16
    179      1.1  mrg 	std	%f12, [%o5+24]
    180      1.1  mrg 	fmuld	%f2, %f6, %f4
    181      1.1  mrg 	add	%o1, 4, %o1		C up++
    182      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    183      1.1  mrg 	b	.L4
    184      1.1  mrg 	add	%o0, -4, %o0		C (delay slot) bias rp so [%o0+4] in .L4 is rp[0]
    185      1.1  mrg 
    186      1.1  mrg 	.align	16
                      C n >= 5.  Enters .Loop when limbs remain, otherwise goes straight to .L5.
    187      1.1  mrg .L_five_or_more:
    188      1.1  mrg 	subcc	%o2, 1, %o2		C n--; zero here means n was 5
    189      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    190      1.1  mrg 	fdtox	%f16, %f14
    191      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    192      1.1  mrg 	fdtox	%f4, %f12
    193      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    194      1.1  mrg 	std	%f14, [%o5+16]
    195      1.1  mrg 	fmuld	%f2, %f8, %f16
    196      1.1  mrg 	std	%f12, [%o5+24]
    197      1.1  mrg 	fmuld	%f2, %f6, %f4
    198      1.1  mrg 	add	%o1, 4, %o1		C up++
    199      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    200      1.1  mrg 	bne,pt	%icc, .Loop
    201      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    202      1.1  mrg 	b,a	.L5			C n = 5: skip the loop; annulled, so no delay slot runs
    203      1.1  mrg 
    204      1.1  mrg C BEGIN MAIN LOOP
                      C One limb of rp is written per iteration.  The numbered markers presumably
                      C delimit instruction issue groups, with nop and fanop as padding -- TODO confirm.
    205      1.1  mrg 	.align 16
    206      1.1  mrg C -- 0
    207      1.1  mrg .Loop:	nop
    208      1.1  mrg 	subcc	%o2, 1, %o2
    209      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    210      1.1  mrg 	fdtox	%f16, %f14
    211      1.1  mrg C -- 1
    212      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    213      1.1  mrg 	add	%o0, 4, %o0		C rp++
    214      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    215      1.1  mrg 	fdtox	%f4, %f12
    216      1.1  mrg C -- 2
    217      1.1  mrg 	nop
    218      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    219      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    220      1.1  mrg 	fanop
    221      1.1  mrg C -- 3
    222      1.1  mrg 	nop
    223      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    224      1.1  mrg 	std	%f14, [%o5+0]
    225      1.1  mrg 	fmuld	%f2, %f8, %f16
    226      1.1  mrg C -- 4
    227      1.1  mrg 	nop
    228      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    229      1.1  mrg 	std	%f12, [%o5+8]
    230      1.1  mrg 	fmuld	%f2, %f6, %f4
    231      1.1  mrg C -- 5
    232      1.1  mrg 	xor	%o5, 16, %o5		C alternate scratch variables
    233      1.1  mrg 	add	%o1, 4, %o1		C up++
    234      1.1  mrg 	stw	%g4, [%o0-4]		C store low 32 bits of p; rp was already advanced
    235      1.1  mrg 	fanop
    236      1.1  mrg C -- 6
    237      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    238      1.1  mrg 	lduw	[%o0], %g5		C read rp[i]
    239      1.1  mrg 	bne,pt	%icc, .Loop
    240      1.1  mrg 	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    241      1.1  mrg C END MAIN LOOP
    242      1.1  mrg 
                      C Wind-down.  Entering at .Lk leaves k limbs of rp still to be written; control
                      C falls through from .L5 to .L1, with less pipeline work left at each label.
    243      1.1  mrg .L5:	fdtox	%f16, %f14
    244      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    245      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    246      1.1  mrg 	fdtox	%f4, %f12
    247      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    248      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    249      1.1  mrg 	add	%g4, %g3, %g4		C p += cy
    250      1.1  mrg 	std	%f14, [%o5+0]
    251      1.1  mrg 	fmuld	%f2, %f8, %f16
    252      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    253      1.1  mrg 	std	%f12, [%o5+8]
    254      1.1  mrg 	fmuld	%f2, %f6, %f4
    255      1.1  mrg 	xor	%o5, 16, %o5		C alternate scratch variables
    256      1.1  mrg 	stw	%g4, [%o0+0]
    257      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    258      1.1  mrg 	lduw	[%o0+4], %g5		C read rp[i]
    259      1.1  mrg 
    260      1.1  mrg .L4:	fdtox	%f16, %f14
    261      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    262      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    263      1.1  mrg 	fdtox	%f4, %f12
    264      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    265      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    266      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    267      1.1  mrg 	std	%f14, [%o5+0]
    268      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    269      1.1  mrg 	std	%f12, [%o5+8]
    270      1.1  mrg 	xor	%o5, 16, %o5		C alternate scratch variables
    271      1.1  mrg 	stw	%g4, [%o0+4]
    272      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    273      1.1  mrg 	lduw	[%o0+8], %g5		C read rp[i]
    274      1.1  mrg 
    275      1.1  mrg .L3:	sllx	%g2, 16, %g4		C (p16 << 16)
    276      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    277      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    278      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    279      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    280      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    281      1.1  mrg 	xor	%o5, 16, %o5		C alternate scratch variables
    282      1.1  mrg 	stw	%g4, [%o0+8]
    283      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    284      1.1  mrg 	lduw	[%o0+12], %g5		C read rp[i]
    285      1.1  mrg 
    286      1.1  mrg .L2:	sllx	%g2, 16, %g4		C (p16 << 16)
    287      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    288      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    289      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    290      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    291      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    292      1.1  mrg 	stw	%g4, [%o0+12]
    293      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    294      1.1  mrg 	lduw	[%o0+16], %g5		C read rp[i]
    295      1.1  mrg 
    296      1.1  mrg .L1:	sllx	%g2, 16, %g4		C (p16 << 16)
    297      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    298      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    299      1.1  mrg 	add	%g5, %g4, %g4		C p += rp[i]
    300      1.1  mrg 	stw	%g4, [%o0+16]		C last limb of rp
    301      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    302      1.1  mrg 
    303      1.1  mrg 	mov	%g3, %o0		C return the final carry limb
    304      1.1  mrg 	retl
    305      1.1  mrg 	sub	%sp, -FSIZE, %sp		C (delay slot) release the FSIZE bytes reserved on entry
    306      1.1  mrg EPILOGUE(mpn_addmul_1)
    307