Home | History | Annotate | Line # | Download | only in ultrasparc1234
addmul_2.asm revision 1.1.1.1.4.2
      1 dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
      2 dnl  number and add the result to an n-limb vector.
      3 
      4 dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of the GNU Lesser General Public License as published
     10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11 dnl  your option) any later version.
     12 
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16 dnl  License for more details.
     17 
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 C                  cycles/limb
     24 C UltraSPARC 1&2:      9
     25 C UltraSPARC 3:       10
     26 
     27 C Algorithm: We use 16 floating-point multiplies per limb product, with the
     28 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
     29 C split into 32-bit pieces.  We sum four 48-bit partial products using
     30 C floating-point add, then convert the resulting four 50-bit quantities and
     31 C transfer them to the integer unit.
     32 
     33 C Possible optimizations:
     34 C   1. Align the stack area where we transfer the four 50-bit product-sums
     35 C      to a 32-byte boundary.  That would minimize cache collisions.
     36 C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
     37 C      be to align the area to map to the area immediately before up?)
     38 C   2. Perform two of the fp->int conversions with integer instructions.  We
     39 C      can get almost ten free IEU slots, if we clean up bookkeeping and the
     40 C      silly carry-limb code.
     41 C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
     42 C      code.
     43 
     44 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
     45 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
     46 C FI	= 20
     47 C L	=  9 x un * vn
     48 C WDFI	= 10 x vn / 2
     49 C WD	= 4
     50 
     51 C Instruction classification (as per UltraSPARC functional units).
     52 C Assuming silly carry code is fixed.  Includes bookkeeping.
     53 C
     54 C               mpn_addmul_X     mpn_mul_X
     55 C                1       2       1       2
     56 C               ==========      ==========
     57 C      FM        8      16       8      16
     58 C      FA       10      18      10      18
     59 C     MEM       12      12      10      10
     60 C  ISHIFT        6       6       6       6
     61 C IADDLOG       11      11      10      10
     62 C  BRANCH        1       1       1       1
     63 C
     64 C TOTAL IEU     17      17      16      16
     65 C TOTAL         48      64      45      61
     66 C
     67 C IEU cycles     8.5     8.5     8       8
     68 C MEM cycles    12      12      10      10
     69 C ISSUE cycles  12      16      11.25   15.25
     70 C FPU cycles    10      18      10      18
     71 C cycles/loop   12      18      12      18
     72 C cycles/limb   12       9      12       9
     73 
     74 
     75 C INPUT PARAMETERS
     76 C rp[n + 1]	i0
     77 C up[n]		i1
     78 C n		i2
     79 C vp[2]		i3
     80 
     81 
     82 ASM_START()
     83 	REGISTER(%g2,#scratch)
     84 	REGISTER(%g3,#scratch)
        C NOTE(review): REGISTER presumably emits the assembler .register directive
        C so that %g2 and %g3 (aliased to rlimb below) may be used as scratch --
        C confirm in the included m4 definitions.
     85 
     86 C Combine registers:
     87 C u00_hi= u32_hi
     88 C u00_lo= u32_lo
     89 C a000  = out000
     90 C a016  = out016
     91 C Free: f52 f54
        C NOTE(review): the list above reads as candidates for register sharing; the
        C defines below still give each of those names its own register.
     92 
     93 
        C pNNN: fmuld destinations, the product of the current 32-bit half of u and
        C the v piece vNNN.  p096 and p112 exist in two copies: the a copy is written
        C by the u00 multiplies and the b copy by the u32 multiplies.
     94 define(`p000', `%f8')  define(`p016',`%f10')
     95 define(`p032',`%f12')  define(`p048',`%f14')
     96 define(`p064',`%f16')  define(`p080',`%f18')
     97 define(`p096a',`%f20') define(`p112a',`%f22')
     98 define(`p096b',`%f56') define(`p112b',`%f58')
     99 
        C outNNN: fdtox results (64-bit integers) that std writes to the stack.
    100 define(`out000',`%f0') define(`out016',`%f6')
    101 
        C vNNN: the 16-bit piece of the 2-limb v operand that starts at bit NNN,
        C converted to double during initialization.
    102 define(`v000',`%f24')  define(`v016',`%f26')
    103 define(`v032',`%f28')  define(`v048',`%f30')
    104 define(`v064',`%f44')  define(`v080',`%f46')
    105 define(`v096',`%f48')  define(`v112',`%f50')
    106 
        C u00, u32: low and high 32-bit halves of an up limb as doubles (fxtod output).
    107 define(`u00',`%f32')   define(`u32', `%f34')
    108 
        C aNNN: faddd accumulators holding the running partial-product sums.
    109 define(`a000',`%f36')  define(`a016',`%f38')
    110 define(`a032',`%f40')  define(`a048',`%f42')
    111 define(`a064',`%f60')  define(`a080',`%f62')
    112 
        C uXX_hi, uXX_lo: the two 32-bit halves of the fp register pairs f2 and f4.
        C The _lo half receives a 32-bit word of up via ld; the _hi half is zeroed
        C during initialization, and fxtod is applied to the pair named by _hi.
    113 define(`u00_hi',`%f2') define(`u32_hi',`%f4')
    114 define(`u00_lo',`%f3') define(`u32_lo',`%f5')
    115 
        C Integer side.  cy: carry into the next 32-bit result word.  rlimb: running
        C sum for the word being formed.  i00, i16: integer sums reloaded from the
        C stack.  r00, r32: low and high 32 bits of an rp limb.  xffffffff and xffff:
        C 32-bit and 16-bit masks (xffff is only set up on the non-VIS path).
    116 define(`cy',`%g1')
    117 define(`rlimb',`%g3')
    118 define(`i00',`%l0')    define(`i16',`%l1')
    119 define(`r00',`%l2')    define(`r32',`%l3')
    120 define(`xffffffff',`%l7')
    121 define(`xffff',`%o0')
    122 
    123 
    124 PROLOGUE(mpn_addmul_2)
        C Adds the product of the n-limb number at up and the 2-limb number at vp
        C to the n-limb vector at rp.  Arguments arrive as listed under INPUT
        C PARAMETERS: rp in i0, up in i1, n in i2, vp in i3.  The routine stores
        C n+1 limbs at rp (the limb past the last one read is written in wind-down
        C phase 3) and returns the remaining most significant limb in %o0.
        C NOTE(review): up[0] is processed before n is tested, so n >= 1 appears
        C to be assumed -- confirm against callers.
    125 
    126 C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
    127 C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
    128 C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
    129 C This code could be better scheduled.
    130 
    131 	save	%sp, -256, %sp
        C All stack scratch below is addressed as [%sp+2223+k].
        C NOTE(review): 2223 = 2047 + 176, presumably the V9 stack bias plus the
        C ABI-reserved part of the frame -- confirm.
    132 
    133 ifdef(`HAVE_VIS',
    134 `	mov	-1, %g4
    135 	wr	%g0, 0xD2, %asi		C ASI for the ldda below; presumably the VIS 16-bit short load -- confirm
    136 	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
    137 	ldda	[%i3+6] %asi, v000	C each ldda fetches one 16-bit piece of vp[0]
    138 	ldda	[%i3+4] %asi, v016
    139 	ldda	[%i3+2] %asi, v032
    140 	ldda	[%i3+0] %asi, v048
    141 	fxtod	v000, v000		C convert each piece from integer to double in place
    142 	ldda	[%i3+14] %asi, v064	C byte offsets 8..14 address vp[1]
    143 	fxtod	v016, v016
    144 	ldda	[%i3+12] %asi, v080
    145 	fxtod	v032, v032
    146 	ldda	[%i3+10] %asi, v096
    147 	fxtod	v048, v048
    148 	ldda	[%i3+8] %asi, v112
    149 	fxtod	v064, v064
    150 	fxtod	v080, v080
    151 	fxtod	v096, v096
    152 	fxtod	v112, v112
    153 	fzero	u00_hi			C clear the upper word of the u00 input pair
    154 	fzero	u32_hi			C clear the upper word of the u32 input pair
    155 ',
    156 `	mov	-1, %g4
    157 	ldx	[%i3+0], %l0		C vp[0]
    158 	srlx	%g4, 48, xffff		C store mask in register `xffff'
    159 	ldx	[%i3+8], %l1		C vp[1]
    160 
        C Split both limbs of v into 16-bit pieces with shift and mask, and park
        C each piece as a 64-bit integer in its own stack slot.
    161 	and	%l0, xffff, %g2
    162 	stx	%g2, [%sp+2223+0]	C vp[0] bits 0..15
    163 	srlx	%l0, 16, %g3
    164 	and	%g3, xffff, %g3
    165 	stx	%g3, [%sp+2223+8]	C vp[0] bits 16..31
    166 	srlx	%l0, 32, %g2
    167 	and	%g2, xffff, %g2
    168 	stx	%g2, [%sp+2223+16]	C vp[0] bits 32..47
    169 	srlx	%l0, 48, %g3
    170 	stx	%g3, [%sp+2223+24]	C vp[0] bits 48..63
    171 	and	%l1, xffff, %g2
    172 	stx	%g2, [%sp+2223+32]	C vp[1] bits 0..15
    173 	srlx	%l1, 16, %g3
    174 	and	%g3, xffff, %g3
    175 	stx	%g3, [%sp+2223+40]	C vp[1] bits 16..31
    176 	srlx	%l1, 32, %g2
    177 	and	%g2, xffff, %g2
    178 	stx	%g2, [%sp+2223+48]	C vp[1] bits 32..47
    179 	srlx	%l1, 48, %g3
    180 	stx	%g3, [%sp+2223+56]	C vp[1] bits 48..63
    181 
    182 	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
    183 
        C Reload the eight pieces into fp registers and convert each to double.
    184 	ldd	[%sp+2223+0], v000
    185 	ldd	[%sp+2223+8], v016
    186 	ldd	[%sp+2223+16], v032
    187 	ldd	[%sp+2223+24], v048
    188 	fxtod	v000, v000
    189 	ldd	[%sp+2223+32], v064
    190 	fxtod	v016, v016
    191 	ldd	[%sp+2223+40], v080
    192 	fxtod	v032, v032
    193 	ldd	[%sp+2223+48], v096
    194 	fxtod	v048, v048
    195 	ldd	[%sp+2223+56], v112
    196 	fxtod	v064, v064
    197 	ld	[%sp+2223+0], u00_hi	C zero u00_hi (first word of the slot holding a 16-bit value)
    198 	fxtod	v080, v080
    199 	ld	[%sp+2223+0], u32_hi	C zero u32_hi (same all-zero word)
    200 	fxtod	v096, v096
    201 	fxtod	v112, v112
    202 ')
    203 C Initialization done.
        C Clear the integer carry-chain state read by the first wind-down or loop step.
    204 	mov	0, %g2
    205 	mov	0, rlimb
    206 	mov	0, %g4
    207 	add	%i0, -8, %i0		C BOOKKEEPING: bias rp down one limb; it is advanced by 8 before each stw
    208 
    209 C Start software pipeline.
        C Prime the pipeline with up[0]: its low word is multiplied by all eight v
        C pieces here; its high word is loaded and converted but multiplied in the
        C next stage (.L1 or .L_2_or_more).
    210 
    211 	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    212 	fxtod	u00_hi, u00
    213 C mid
    214 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    215 	fmuld	u00, v000, a000
    216 	fmuld	u00, v016, a016
    217 	fmuld	u00, v032, a032
    218 	fmuld	u00, v048, a048
    219 	add	%i2, -1, %i2		C BOOKKEEPING
    220 	fmuld	u00, v064, p064
    221 	add	%i1, 8, %i1		C BOOKKEEPING
    222 	fxtod	u32_hi, u32
    223 	fmuld	u00, v080, p080
    224 	fmuld	u00, v096, p096a
    225 	brnz,pt	%i2, .L_2_or_more	C taken when at least one more limb of up remains
    226 	 fmuld	u00, v112, p112a
    227 
        C n == 1: multiply the high word of up[0], park both conversions on the
        C stack and join the wind-down at phase 2.  No further up loads happen here.
    228 .L1:	fdtox	a000, out000
    229 	fmuld	u32, v000, p000
    230 	fdtox	a016, out016
    231 	fmuld	u32, v016, p016
    232 	fmovd	p064, a064		C no p096 product exists yet, so a064 starts as a copy of p064
    233 	fmuld	u32, v032, p032
    234 	fmovd	p080, a080		C likewise a080 starts as a copy of p080
    235 	fmuld	u32, v048, p048
    236 	std	out000, [%sp+2223+16]
    237 	faddd	p000, a032, a000
    238 	fmuld	u32, v064, p064
    239 	std	out016, [%sp+2223+24]
    240 	fxtod	u00_hi, u00		C NOTE(review): u00 is not read again on this path
    241 	faddd	p016, a048, a016
    242 	fmuld	u32, v080, p080
    243 	faddd	p032, a064, a032
    244 	fmuld	u32, v096, p096b
    245 	faddd	p048, a080, a048
    246 	fmuld	u32, v112, p112b
    247 C mid
    248 	fdtox	a000, out000
    249 	fdtox	a016, out016
    250 	faddd	p064, p096a, a064
    251 	faddd	p080, p112a, a080
    252 	std	out000, [%sp+2223+0]
    253 	b	.L_wd2
    254 	 std	out016, [%sp+2223+8]
    255 
        C n >= 2: second pipeline stage.  Multiplies the high word of up[0] and then
        C the low word of up[1], parking conversions on the stack.  No rp word is
        C read or written yet.
    256 .L_2_or_more:
    257 	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    258 	fdtox	a000, out000
    259 	fmuld	u32, v000, p000
    260 	fdtox	a016, out016
    261 	fmuld	u32, v016, p016
    262 	fmovd	p064, a064		C no p096 product exists yet, so a064 starts as a copy of p064
    263 	fmuld	u32, v032, p032
    264 	fmovd	p080, a080		C likewise a080 starts as a copy of p080
    265 	fmuld	u32, v048, p048
    266 	std	out000, [%sp+2223+16]
    267 	faddd	p000, a032, a000
    268 	fmuld	u32, v064, p064
    269 	std	out016, [%sp+2223+24]
    270 	fxtod	u00_hi, u00
    271 	faddd	p016, a048, a016
    272 	fmuld	u32, v080, p080
    273 	faddd	p032, a064, a032
    274 	fmuld	u32, v096, p096b
    275 	faddd	p048, a080, a048
    276 	fmuld	u32, v112, p112b
    277 C mid
    278 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    279 	fdtox	a000, out000
    280 	fmuld	u00, v000, p000
    281 	fdtox	a016, out016
    282 	fmuld	u00, v016, p016
    283 	faddd	p064, p096a, a064
    284 	fmuld	u00, v032, p032
    285 	faddd	p080, p112a, a080
    286 	fmuld	u00, v048, p048
    287 	add	%i2, -1, %i2		C BOOKKEEPING
    288 	std	out000, [%sp+2223+0]
    289 	faddd	p000, a032, a000
    290 	fmuld	u00, v064, p064
    291 	add	%i1, 8, %i1		C BOOKKEEPING
    292 	std	out016, [%sp+2223+8]
    293 	fxtod	u32_hi, u32
    294 	faddd	p016, a048, a016
    295 	fmuld	u00, v080, p080
    296 	faddd	p032, a064, a032
    297 	fmuld	u00, v096, p096a
    298 	faddd	p048, a080, a048
    299 	brnz,pt	%i2, .L_3_or_more	C taken when a third limb of up exists
    300 	 fmuld	u00, v112, p112a
    301 
    302 	b	.Lend			C n == 2: skip the main loop
    303 	 nop
    304 
        C NOTE(review): the sketch below appears to show the terms summed for one
        C 32-bit result word, with each width in bits on the right and i16 offset
        C by 16 bits -- confirm.
    305 C  64      32       0
    306 C   .       .       .
    307 C   .       |__rXXX_|	32
    308 C   .      |___cy___|	34
    309 C   .  |_______i00__|	50
    310 C  |_______i16__|   .	50
    311 
    312 
    313 C BEGIN MAIN LOOP
        C Each pass has two halves separated by the midloop marker; bare C lines
        C separate groups of up to four instructions.
        C   First half:  loads the low word of the next up limb, multiplies u32
        C                (the high word loaded in the previous pass) by v000..v112,
        C                uses stack slots +16 and +24, stores the low word of an rp
        C                limb.
        C   Second half: loads the high word of that up limb, multiplies u00 by
        C                v000..v112, uses stack slots +0 and +8, stores the high
        C                word of the same rp limb.
        C In each half the fp side folds the products into a000..a080 with faddd,
        C converts a000 and a016 with fdtox and parks them on the stack.  The
        C integer side reloads i00 and i16 from those slots just before they are
        C overwritten, forms rlimb = i00 + rp word + cy, adds i16 shifted left by
        C 16, and stores the low 32 bits with stw.  The carry out of that sum is
        C only formed at the top of the following half, from the upper bits of
        C %l5 plus %g4 (i16 shifted right by 16).
    314 	.align	16
    315 .L_3_or_more:
    316 .Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    317 	and	%g2, xffffffff, %g2
    318 	fdtox	a000, out000
    319 	fmuld	u32, v000, p000
    320 C
    321 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    322 	add	%g2, rlimb, %l5
    323 	fdtox	a016, out016
    324 	fmuld	u32, v016, p016
    325 C
    326 	srlx	%l5, 32, cy
    327 	ldx	[%sp+2223+16], i00
    328 	faddd	p064, p096b, a064
    329 	fmuld	u32, v032, p032
    330 C
    331 	add	%g4, cy, cy		C new cy
    332 	ldx	[%sp+2223+24], i16
    333 	faddd	p080, p112b, a080
    334 	fmuld	u32, v048, p048
    335 C
    336 	nop
    337 	std	out000, [%sp+2223+16]
    338 	faddd	p000, a032, a000
    339 	fmuld	u32, v064, p064
    340 C
    341 	add	i00, r00, rlimb
    342 	add	%i0, 8, %i0		C BOOKKEEPING
    343 	std	out016, [%sp+2223+24]
    344 	fxtod	u00_hi, u00
    345 C
    346 	sllx	i16, 16, %g2
    347 	add	cy, rlimb, rlimb
    348 	faddd	p016, a048, a016
    349 	fmuld	u32, v080, p080
    350 C
    351 	srlx	i16, 16, %g4
    352 	add	%g2, rlimb, %l5
    353 	faddd	p032, a064, a032
    354 	fmuld	u32, v096, p096b
    355 C
    356 	stw	%l5, [%i0+4]
    357 	nop
    358 	faddd	p048, a080, a048
    359 	fmuld	u32, v112, p112b
    360 C midloop: second half, same pattern with u00, slots +0 and +8, and r32
    361 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    362 	and	%g2, xffffffff, %g2
    363 	fdtox	a000, out000
    364 	fmuld	u00, v000, p000
    365 C
    366 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    367 	add	%g2, rlimb, %l5
    368 	fdtox	a016, out016
    369 	fmuld	u00, v016, p016
    370 C
    371 	srlx	%l5, 32, cy
    372 	ldx	[%sp+2223+0], i00
    373 	faddd	p064, p096a, a064
    374 	fmuld	u00, v032, p032
    375 C
    376 	add	%g4, cy, cy		C new cy
    377 	ldx	[%sp+2223+8], i16
    378 	faddd	p080, p112a, a080
    379 	fmuld	u00, v048, p048
    380 C
    381 	add	%i2, -1, %i2		C BOOKKEEPING
    382 	std	out000, [%sp+2223+0]
    383 	faddd	p000, a032, a000
    384 	fmuld	u00, v064, p064
    385 C
    386 	add	i00, r32, rlimb
    387 	add	%i1, 8, %i1		C BOOKKEEPING
    388 	std	out016, [%sp+2223+8]
    389 	fxtod	u32_hi, u32
    390 C
    391 	sllx	i16, 16, %g2
    392 	add	cy, rlimb, rlimb
    393 	faddd	p016, a048, a016
    394 	fmuld	u00, v080, p080
    395 C
    396 	srlx	i16, 16, %g4
    397 	add	%g2, rlimb, %l5
    398 	faddd	p032, a064, a032
    399 	fmuld	u00, v096, p096a
    400 C
    401 	stw	%l5, [%i0+0]
    402 	faddd	p048, a080, a048
    403 	brnz,pt	%i2, .Loop
    404 	 fmuld	u00, v112, p112a
    405 C END MAIN LOOP
    406 
    407 C WIND-DOWN PHASE 1
        C No more loads from up.  The first half still multiplies the last high word
        C (u32); the second half only adds and converts.  The integer side keeps
        C reading rp words and storing result words as in the loop.
    408 .Lend:	and	%g2, xffffffff, %g2
    409 	fdtox	a000, out000
    410 	fmuld	u32, v000, p000
    411 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    412 	add	%g2, rlimb, %l5
    413 	fdtox	a016, out016
    414 	fmuld	u32, v016, p016
    415 	srlx	%l5, 32, cy
    416 	ldx	[%sp+2223+16], i00
    417 	faddd	p064, p096b, a064
    418 	fmuld	u32, v032, p032
    419 	add	%g4, cy, cy		C new cy
    420 	ldx	[%sp+2223+24], i16
    421 	faddd	p080, p112b, a080
    422 	fmuld	u32, v048, p048
    423 	std	out000, [%sp+2223+16]
    424 	faddd	p000, a032, a000
    425 	fmuld	u32, v064, p064
    426 	add	i00, r00, rlimb
    427 	add	%i0, 8, %i0		C BOOKKEEPING
    428 	std	out016, [%sp+2223+24]
    429 	sllx	i16, 16, %g2
    430 	add	cy, rlimb, rlimb
    431 	faddd	p016, a048, a016
    432 	fmuld	u32, v080, p080
    433 	srlx	i16, 16, %g4
    434 	add	%g2, rlimb, %l5
    435 	faddd	p032, a064, a032
    436 	fmuld	u32, v096, p096b
    437 	stw	%l5, [%i0+4]
    438 	faddd	p048, a080, a048
    439 	fmuld	u32, v112, p112b
    440 C mid
    441 	and	%g2, xffffffff, %g2
    442 	fdtox	a000, out000
    443 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    444 	add	%g2, rlimb, %l5
    445 	fdtox	a016, out016
    446 	srlx	%l5, 32, cy
    447 	ldx	[%sp+2223+0], i00
    448 	faddd	p064, p096a, a064
    449 	add	%g4, cy, cy		C new cy
    450 	ldx	[%sp+2223+8], i16
    451 	faddd	p080, p112a, a080
    452 	std	out000, [%sp+2223+0]
    453 	add	i00, r32, rlimb
    454 	std	out016, [%sp+2223+8]
    455 	sllx	i16, 16, %g2
    456 	add	cy, rlimb, rlimb
    457 	srlx	i16, 16, %g4
    458 	add	%g2, rlimb, %l5
    459 	stw	%l5, [%i0+0]
    460 
    461 C WIND-DOWN PHASE 2
        C No multiplies or adds remain on the fp side: a032 and a048, then a064 and
        C a080, are converted as they stand.  The integer side still reads rp words.
        C This is also the entry point for the n == 1 path from .L1.
    462 .L_wd2:	and	%g2, xffffffff, %g2
    463 	fdtox	a032, out000
    464 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    465 	add	%g2, rlimb, %l5
    466 	fdtox	a048, out016
    467 	srlx	%l5, 32, cy
    468 	ldx	[%sp+2223+16], i00
    469 	add	%g4, cy, cy		C new cy
    470 	ldx	[%sp+2223+24], i16
    471 	std	out000, [%sp+2223+16]
    472 	add	i00, r00, rlimb
    473 	add	%i0, 8, %i0		C BOOKKEEPING
    474 	std	out016, [%sp+2223+24]
    475 	sllx	i16, 16, %g2
    476 	add	cy, rlimb, rlimb
    477 	srlx	i16, 16, %g4
    478 	add	%g2, rlimb, %l5
    479 	stw	%l5, [%i0+4]
    480 C mid
    481 	and	%g2, xffffffff, %g2
    482 	fdtox	a064, out000
    483 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    484 	add	%g2, rlimb, %l5
    485 	fdtox	a080, out016
    486 	srlx	%l5, 32, cy
    487 	ldx	[%sp+2223+0], i00
    488 	add	%g4, cy, cy		C new cy
    489 	ldx	[%sp+2223+8], i16
    490 	std	out000, [%sp+2223+0]
    491 	add	i00, r32, rlimb
    492 	std	out016, [%sp+2223+8]
    493 	sllx	i16, 16, %g2
    494 	add	cy, rlimb, rlimb
    495 	srlx	i16, 16, %g4
    496 	add	%g2, rlimb, %l5
    497 	stw	%l5, [%i0+0]
    498 
    499 C WIND-DOWN PHASE 3
        C Converts the last p096b and p112b products.  rp is no longer read: the
        C stack value is loaded straight into rlimb, and the two stw instructions
        C write the limb that follows the last one read from rp.
    500 .L_wd3:	and	%g2, xffffffff, %g2
    501 	fdtox	p096b, out000
    502 	add	%g2, rlimb, %l5
    503 	fdtox	p112b, out016
    504 	srlx	%l5, 32, cy
    505 	ldx	[%sp+2223+16], rlimb
    506 	add	%g4, cy, cy		C new cy
    507 	ldx	[%sp+2223+24], i16
    508 	std	out000, [%sp+2223+16]
    509 	add	%i0, 8, %i0		C BOOKKEEPING
    510 	std	out016, [%sp+2223+24]
    511 	sllx	i16, 16, %g2
    512 	add	cy, rlimb, rlimb
    513 	srlx	i16, 16, %g4
    514 	add	%g2, rlimb, %l5
    515 	stw	%l5, [%i0+4]
    516 C mid
    517 	and	%g2, xffffffff, %g2
    518 	add	%g2, rlimb, %l5
    519 	srlx	%l5, 32, cy
    520 	ldx	[%sp+2223+0], rlimb
    521 	add	%g4, cy, cy		C new cy
    522 	ldx	[%sp+2223+8], i16
    523 	sllx	i16, 16, %g2
    524 	add	cy, rlimb, rlimb
    525 	srlx	i16, 16, %g4
    526 	add	%g2, rlimb, %l5
    527 	stw	%l5, [%i0+0]
    528 
        C Form the return limb: the final carry plus the two sums parked at slots
        C +16 and +24 in phase 3, the second one shifted left by 16.
    529 	and	%g2, xffffffff, %g2
    530 	add	%g2, rlimb, %l5
    531 	srlx	%l5, 32, cy
    532 	ldx	[%sp+2223+16], i00
    533 	add	%g4, cy, cy		C new cy
    534 	ldx	[%sp+2223+24], i16
    535 
    536 	sllx	i16, 16, %g2
    537 	add	i00, cy, cy
    538 	return	%i7+8			C returns and restores the caller register window
    539 	add	%g2, cy, %o0		C delay slot: runs in the caller window, so %o0 carries the return limb
    540 EPILOGUE(mpn_addmul_2)
    541