Home | History | Annotate | Line # | Download | only in v9
      1      1.1  mrg dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
      2      1.1  mrg dnl  the result in a second limb vector.
      3      1.1  mrg 
      4      1.1  mrg dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1.1.2  mrg dnl
      8      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27  1.1.1.2  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg C Algorithm: We use two floating-point multiplies per limb product, with the
     35      1.1  mrg C invariant v operand split into two 16-bit pieces, and the u operand split
     36      1.1  mrg C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
     37      1.1  mrg C the integer unit.
     38      1.1  mrg 
     39      1.1  mrg C		   cycles/limb
     40      1.1  mrg C UltraSPARC 1&2:     6.5
     41      1.1  mrg C UltraSPARC 3:	      ?
     42      1.1  mrg 
     43      1.1  mrg C Possible optimizations:
     44      1.1  mrg C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
     45      1.1  mrg C      memory bandwidth limited, this could save 1.5 cycles/limb.
     46      1.1  mrg C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
     47      1.1  mrg C      it is very straightforward to unroll, using an exit branch midway.
     48      1.1  mrg C      Unrolling would allow deeper scheduling, which could improve speed for
     49      1.1  mrg C      the L2 cache case.
     50      1.1  mrg C   3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
     51      1.1  mrg C      instructions are not scheduled far enough apart with just two temp areas.
     52      1.1  mrg C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
     53      1.1  mrg C      could save many operations.
     54      1.1  mrg 
     55      1.1  mrg C INPUT PARAMETERS
     56      1.1  mrg C rp	o0
     57      1.1  mrg C up	o1
     58      1.1  mrg C n	o2
     59      1.1  mrg C v	o3
     60      1.1  mrg 
     61      1.1  mrg define(`FSIZE',224)		C bytes of stack reserved by the prologue
     62      1.1  mrg C mpn_mul_1: store the low 32 bits of each up[i] * v + carry at rp[i]; return the last carry.
     63      1.1  mrg ASM_START()
     64      1.1  mrg PROLOGUE(mpn_mul_1)
     65      1.1  mrg 	add	%sp, -FSIZE, %sp	C allocate the frame; there is no save, so the arguments stay in o0-o3
     66      1.1  mrg 	sethi	%hi(0xffff), %g1	C g1 = 0xffff, completed by the or below
     67      1.1  mrg 	srl	%o3, 16, %g2		C g2 = v >> 16, the upper 16-bit piece of v
     68      1.1  mrg 	or	%g1, %lo(0xffff), %g1
     69      1.1  mrg 	and	%o3, %g1, %g1		C g1 = v & 0xffff, the lower 16-bit piece of v
     70      1.1  mrg 	stx	%g1, [%sp+104]		C pass both pieces through the stack to the FP registers
     71      1.1  mrg 	stx	%g2, [%sp+112]
     72      1.1  mrg 	ldd	[%sp+104], %f6
     73      1.1  mrg 	ldd	[%sp+112], %f8
     74      1.1  mrg 	fxtod	%f6, %f6		C f6 = (v & 0xffff) as a double
     75      1.1  mrg 	fxtod	%f8, %f8		C f8 = (v >> 16) as a double
     76      1.1  mrg 	ld	[%sp+104], %f10		C zero f10
     77      1.1  mrg 
     78      1.1  mrg 	mov	0, %g3			C cy = 0
     79      1.1  mrg 
     80      1.1  mrg define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
     81      1.1  mrg 
     82      1.1  mrg 	add	%sp, 160, %o5		C point in scratch area
     83      1.1  mrg 	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
     84      1.1  mrg C Pipeline fill: each .L_*_or_more stage below starts one more limb before the main loop.
     85      1.1  mrg 	subcc	%o2, 1, %o2		C n--, Z is set when this was the last limb
     86      1.1  mrg 	ld	[%o1], %f11		C read up[i]
     87      1.1  mrg 	add	%o1, 4, %o1		C up++
     88      1.1  mrg 	bne,pt	%icc, .L_two_or_more
     89      1.1  mrg 	fxtod	%f10, %f2		C delay slot: f2 = up[i] as a double (the f10:f11 pair read as a 64-bit integer)
     90      1.1  mrg C n == 1: compute both products and go straight to the last store at .L1.
     91      1.1  mrg 	fmuld	%f2, %f8, %f16		C f16 = u * (v >> 16)
     92      1.1  mrg 	fmuld	%f2, %f6, %f4		C f4 = u * (v & 0xffff)
     93      1.1  mrg 	fdtox	%f16, %f14		C convert both products back to 64-bit integers
     94      1.1  mrg 	fdtox	%f4, %f12
     95      1.1  mrg 	std	%f14, [%o5+16]		C move the products to the integer unit through the scratch area
     96      1.1  mrg 	std	%f12, [%o5+24]
     97      1.1  mrg 	ldx	[%o5+16], %g2		C p16
     98      1.1  mrg 	ldx	[%o5+24], %g1		C p0
     99      1.1  mrg 	b	.L1
    100      1.1  mrg 	add	%o0, -16, %o0		C delay slot: bias rp so that [o0+16] in .L1 is rp[0]
    101      1.1  mrg 
    102      1.1  mrg 	.align	16
    103      1.1  mrg .L_two_or_more:
    104      1.1  mrg 	subcc	%o2, 1, %o2
    105      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    106      1.1  mrg 	fmuld	%f2, %f8, %f16		C start the two products for the first limb
    107      1.1  mrg 	fmuld	%f2, %f6, %f4
    108      1.1  mrg 	add	%o1, 4, %o1		C up++
    109      1.1  mrg 	bne,pt	%icc, .L_three_or_more
    110      1.1  mrg 	fxtod	%f10, %f2		C delay slot: the limb just read, as a double
    111      1.1  mrg C n == 2: finish both limbs, then drain through .L2 and .L1.
    112      1.1  mrg 	fdtox	%f16, %f14
    113      1.1  mrg 	fdtox	%f4, %f12
    114      1.1  mrg 	std	%f14, [%o5+16]
    115      1.1  mrg 	fmuld	%f2, %f8, %f16
    116      1.1  mrg 	std	%f12, [%o5+24]
    117      1.1  mrg 	fmuld	%f2, %f6, %f4
    118      1.1  mrg 	fdtox	%f16, %f14
    119      1.1  mrg 	fdtox	%f4, %f12
    120      1.1  mrg 	std	%f14, [%o5+0]
    121      1.1  mrg 	std	%f12, [%o5+8]
    122      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    123      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    124      1.1  mrg 	b	.L2
    125      1.1  mrg 	add	%o0, -12, %o0		C delay slot: bias rp so that [o0+12] in .L2 is rp[0]
    126      1.1  mrg 
    127      1.1  mrg 	.align	16
    128      1.1  mrg .L_three_or_more:
    129      1.1  mrg 	subcc	%o2, 1, %o2
    130      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    131      1.1  mrg 	fdtox	%f16, %f14
    132      1.1  mrg 	fdtox	%f4, %f12
    133      1.1  mrg 	std	%f14, [%o5+16]
    134      1.1  mrg 	fmuld	%f2, %f8, %f16
    135      1.1  mrg 	std	%f12, [%o5+24]
    136      1.1  mrg 	fmuld	%f2, %f6, %f4
    137      1.1  mrg 	add	%o1, 4, %o1		C up++
    138      1.1  mrg 	bne,pt	%icc, .L_four_or_more
    139      1.1  mrg 	fxtod	%f10, %f2
    140      1.1  mrg C n == 3: finish the remaining limbs, then drain through .L3, .L2 and .L1.
    141      1.1  mrg 	fdtox	%f16, %f14
    142      1.1  mrg 	fdtox	%f4, %f12
    143      1.1  mrg 	std	%f14, [%o5+0]
    144      1.1  mrg 	fmuld	%f2, %f8, %f16
    145      1.1  mrg 	std	%f12, [%o5+8]
    146      1.1  mrg 	fmuld	%f2, %f6, %f4
    147      1.1  mrg 	fdtox	%f16, %f14
    148      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    149      1.1  mrg 	fdtox	%f4, %f12
    150      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    151      1.1  mrg 	std	%f14, [%o5+16]
    152      1.1  mrg 	std	%f12, [%o5+24]
    153      1.1  mrg 	b	.L3
    154      1.1  mrg 	add	%o0, -8, %o0		C delay slot: bias rp so that [o0+8] in .L3 is rp[0]
    155      1.1  mrg 
    156      1.1  mrg 	.align	16
    157      1.1  mrg .L_four_or_more:
    158      1.1  mrg 	subcc	%o2, 1, %o2
    159      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    160      1.1  mrg 	fdtox	%f16, %f14
    161      1.1  mrg 	fdtox	%f4, %f12
    162      1.1  mrg 	std	%f14, [%o5+0]
    163      1.1  mrg 	fmuld	%f2, %f8, %f16
    164      1.1  mrg 	std	%f12, [%o5+8]
    165      1.1  mrg 	fmuld	%f2, %f6, %f4
    166      1.1  mrg 	add	%o1, 4, %o1		C up++
    167      1.1  mrg 	bne,pt	%icc, .L_five_or_more
    168      1.1  mrg 	fxtod	%f10, %f2
    169      1.1  mrg C n == 4: drain through .L4 down to .L1.
    170      1.1  mrg 	fdtox	%f16, %f14
    171      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    172      1.1  mrg 	fdtox	%f4, %f12
    173      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    174      1.1  mrg 	std	%f14, [%o5+16]
    175      1.1  mrg 	fmuld	%f2, %f8, %f16
    176      1.1  mrg 	std	%f12, [%o5+24]
    177      1.1  mrg 	fmuld	%f2, %f6, %f4
    178      1.1  mrg 	add	%o1, 4, %o1		C up++
    179      1.1  mrg 	b	.L4
    180      1.1  mrg 	add	%o0, -4, %o0		C delay slot: bias rp so that [o0+4] in .L4 is rp[0]
    181      1.1  mrg 
    182      1.1  mrg 	.align	16
    183      1.1  mrg .L_five_or_more:
    184      1.1  mrg 	subcc	%o2, 1, %o2
    185      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    186      1.1  mrg 	fdtox	%f16, %f14
    187      1.1  mrg 	ldx	[%o5+16], %g2		C p16
    188      1.1  mrg 	fdtox	%f4, %f12
    189      1.1  mrg 	ldx	[%o5+24], %g1		C p0
    190      1.1  mrg 	std	%f14, [%o5+16]
    191      1.1  mrg 	fmuld	%f2, %f8, %f16
    192      1.1  mrg 	std	%f12, [%o5+24]
    193      1.1  mrg 	fmuld	%f2, %f6, %f4
    194      1.1  mrg 	add	%o1, 4, %o1		C up++
    195      1.1  mrg 	bne,pt	%icc, .Loop
    196      1.1  mrg 	fxtod	%f10, %f2
    197      1.1  mrg 	b,a	.L5			C n == 5: annulled branch, so no delay slot instruction runs; rp is unbiased
    198      1.1  mrg C Main loop: each pass stores one result limb and starts the two products for a newly read limb.
    199      1.1  mrg C BEGIN MAIN LOOP
    200      1.1  mrg 	.align 16
    201      1.1  mrg C -- 0
    202      1.1  mrg .Loop:	nop
    203      1.1  mrg 	subcc	%o2, 1, %o2
    204      1.1  mrg 	ld	[%o1], %f11		C read up[i]
    205      1.1  mrg 	fdtox	%f16, %f14		C convert the pending u * (v >> 16) to an integer
    206      1.1  mrg C -- 1
    207      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    208      1.1  mrg 	add	%o0, 4, %o0		C rp++
    209      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    210      1.1  mrg 	fdtox	%f4, %f12		C convert the pending u * (v & 0xffff) to an integer
    211      1.1  mrg C -- 2
    212      1.1  mrg 	nop
    213      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    214      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    215      1.1  mrg 	fanop
    216      1.1  mrg C -- 3
    217      1.1  mrg 	nop
    218      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    219      1.1  mrg 	std	%f14, [%o5+0]		C refill the scratch slot that was just loaded
    220      1.1  mrg 	fmuld	%f2, %f8, %f16
    221      1.1  mrg C -- 4
    222      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    223      1.1  mrg 	add	%o1, 4, %o1		C up++
    224      1.1  mrg 	std	%f12, [%o5+8]
    225      1.1  mrg 	fmuld	%f2, %f6, %f4
    226      1.1  mrg C -- 5
    227      1.1  mrg 	xor	%o5, 16, %o5		C alternate scratch variables
    228      1.1  mrg 	stw	%g4, [%o0-4]		C store the low 32 bits of p; -4 undoes the rp++ above
    229      1.1  mrg 	bne,pt	%icc, .Loop
    230      1.1  mrg 	fxtod	%f10, %f2		C delay slot: the limb just read, as a double
    231      1.1  mrg C END MAIN LOOP
    232      1.1  mrg C Pipeline drain: .L5 down to .L1 each store one of the last result limbs.
    233      1.1  mrg .L5:	fdtox	%f16, %f14
    234      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    235      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    236      1.1  mrg 	fdtox	%f4, %f12
    237      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    238      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    239      1.1  mrg 	add	%g4, %g3, %g4		C p += cy
    240      1.1  mrg 	std	%f14, [%o5+0]
    241      1.1  mrg 	fmuld	%f2, %f8, %f16		C products for the final limb
    242      1.1  mrg 	std	%f12, [%o5+8]
    243      1.1  mrg 	fmuld	%f2, %f6, %f4
    244      1.1  mrg 	xor	%o5, 16, %o5
    245      1.1  mrg 	stw	%g4, [%o0+0]
    246      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    247      1.1  mrg 
    248      1.1  mrg .L4:	fdtox	%f16, %f14
    249      1.1  mrg 	sllx	%g2, 16, %g4		C (p16 << 16)
    250      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    251      1.1  mrg 	fdtox	%f4, %f12
    252      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    253      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    254      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    255      1.1  mrg 	std	%f14, [%o5+0]
    256      1.1  mrg 	std	%f12, [%o5+8]
    257      1.1  mrg 	xor	%o5, 16, %o5
    258      1.1  mrg 	stw	%g4, [%o0+4]
    259      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    260      1.1  mrg 
    261      1.1  mrg .L3:	sllx	%g2, 16, %g4		C (p16 << 16)
    262      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    263      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    264      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    265      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    266      1.1  mrg 	xor	%o5, 16, %o5
    267      1.1  mrg 	stw	%g4, [%o0+8]
    268      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    269      1.1  mrg 
    270      1.1  mrg .L2:	sllx	%g2, 16, %g4		C (p16 << 16)
    271      1.1  mrg 	ldx	[%o5+0], %g2		C p16
    272      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    273      1.1  mrg 	ldx	[%o5+8], %g1		C p0
    274      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    275      1.1  mrg 	stw	%g4, [%o0+12]
    276      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    277      1.1  mrg 
    278      1.1  mrg .L1:	sllx	%g2, 16, %g4		C (p16 << 16)
    279      1.1  mrg 	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
    280      1.1  mrg 	add	%g3, %g4, %g4		C p += cy
    281      1.1  mrg 	stw	%g4, [%o0+16]		C last result limb
    282      1.1  mrg 	srlx	%g4, 32, %g3		C new cy
    283      1.1  mrg C Return the final carry limb in o0.
    284      1.1  mrg 	mov	%g3, %o0
    285      1.1  mrg 	retl
    286      1.1  mrg 	sub	%sp, -FSIZE, %sp	C delay slot: release the stack frame
    287      1.1  mrg EPILOGUE(mpn_mul_1)
    288