Home | History | Annotate | Line # | Download | only in ultrasparc1234
addmul_2.asm revision 1.1.1.2
      1 dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
      2 dnl  number and add the result to an n-limb vector.
      3 
      4 dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 C                  cycles/limb
     35 C UltraSPARC 1&2:      9
     36 C UltraSPARC 3:       10
     37 
     38 C Algorithm: We use 16 floating-point multiplies per limb product, with the
     39 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
     40 C split into 32-bit pieces.  We sum four 48-bit partial products using
     41 C floating-point add, then convert the resulting four 50-bit quantities and
     42 C transfer them to the integer unit.
     43 
     44 C Possible optimizations:
     45 C   1. Align the stack area where we transfer the four 50-bit product-sums
     46 C      to a 32-byte boundary.  That would minimize the cache collision.
     47 C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
     48 C      be to align the area to map to the area immediately before up?)
     49 C   2. Perform two of the fp->int conversions with integer instructions.  We
     50 C      can get almost ten free IEU slots, if we clean up bookkeeping and the
     51 C      silly carry-limb code.
     52 C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
     53 C      code.
     54 
     55 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
     56 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
     57 C FI	= 20
     58 C L	=  9 x un * vn
     59 C WDFI	= 10 x vn / 2
     60 C WD	= 4
     61 
     62 C Instruction classification (as per UltraSPARC functional units).
     63 C Assuming silly carry code is fixed.  Includes bookkeeping.
     64 C
     65 C               mpn_addmul_X     mpn_mul_X
     66 C                1       2       1       2
     67 C               ==========      ==========
     68 C      FM        8      16       8      16
     69 C      FA       10      18      10      18
     70 C     MEM       12      12      10      10
     71 C  ISHIFT        6       6       6       6
     72 C IADDLOG       11      11      10      10
     73 C  BRANCH        1       1       1       1
     74 C
     75 C TOTAL IEU     17      17      16      16
     76 C TOTAL         48      64      45      61
     77 C
     78 C IEU cycles     8.5     8.5     8       8
     79 C MEM cycles    12      12      10      10
     80 C ISSUE cycles  12      16      11.25   15.25
     81 C FPU cycles    10      18      10      18
     82 C cycles/loop   12      18      12      18
     83 C cycles/limb   12       9      12       9
     84 
     85 
     86 C INPUT PARAMETERS
     87 C rp[n + 1]	i0
     88 C up[n]		i1
     89 C n		i2
     90 C vp[2]		i3
     91 
     92 
     93 ASM_START()
     94 	REGISTER(%g2,#scratch)		C mark %g2/%g3 as #scratch; presumably emits the assembler .register directive -- TODO confirm
     95 	REGISTER(%g3,#scratch)
     96 
     97 C Combine registers:
     98 C u00_hi= u32_hi
     99 C u00_lo= u32_lo
    100 C a000  = out000
    101 C a016  = out016
    102 C Free: f52 f54
    103 C pNNN: partial products written by fmuld (a 32-bit half of a u limb times the 16-bit piece vNNN).
    104 C p096/p112 exist in two copies: the a copy is written by the u00 stage, the b copy by the u32 stage.
    105 define(`p000', `%f8')  define(`p016',`%f10')
    106 define(`p032',`%f12')  define(`p048',`%f14')
    107 define(`p064',`%f16')  define(`p080',`%f18')
    108 define(`p096a',`%f20') define(`p112a',`%f22')
    109 define(`p096b',`%f56') define(`p112b',`%f58')
    110 C outNNN: fdtox results (sums converted to 64-bit integers) that are spilled to the stack with std.
    111 define(`out000',`%f0') define(`out016',`%f6')
    112 C vNNN: the 16-bit piece of the 2-limb v operand that starts at bit NNN, held as a double.
    113 define(`v000',`%f24')  define(`v016',`%f26')
    114 define(`v032',`%f28')  define(`v048',`%f30')
    115 define(`v064',`%f44')  define(`v080',`%f46')
    116 define(`v096',`%f48')  define(`v112',`%f50')
    117 C u00/u32: low/high 32 bits of the current u limb, converted to double by fxtod.
    118 define(`u00',`%f32')   define(`u32', `%f34')
    119 C aNNN: running sums built with faddd (at pipeline start a000..a048 are written directly by fmuld).
    120 define(`a000',`%f36')  define(`a016',`%f38')
    121 define(`a032',`%f40')  define(`a048',`%f42')
    122 define(`a064',`%f60')  define(`a080',`%f62')
    123 C Word halves of the f2 and f4 pairs: the _lo word receives a 32-bit load of u, the _hi word is kept zero.
    124 define(`u00_hi',`%f2') define(`u32_hi',`%f4')
    125 define(`u00_lo',`%f3') define(`u32_lo',`%f5')
    126 C Integer-side aliases.
    127 define(`cy',`%g1')		C carry passed from one 32-bit result chunk to the next
    128 define(`rlimb',`%g3')		C running sum for the current 32-bit result chunk
    129 define(`i00',`%l0')    define(`i16',`%l1')	C converted sums reloaded from the stack scratch slots
    130 define(`r00',`%l2')    define(`r32',`%l3')	C low / high 32 bits of rp[i]
    131 define(`xffffffff',`%l7')	C mask: all ones shifted right by 32
    132 define(`xffff',`%o0')		C mask: all ones shifted right by 48; set up and used only by the non-VIS initialization
    133 
    134 
    135 PROLOGUE(mpn_addmul_2)
    136 C Entry point.  After the save below: %i0 = rp, %i1 = up, %i2 = n, %i3 = vp (see INPUT PARAMETERS).
    137 C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
    138 C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
    139 C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
    140 C This code could be better scheduled.
    141 C Stack scratch is addressed as %sp+2223+k (2223 = 2047 + 176; presumably the V9 stack bias plus the ABI save area -- TODO confirm).
    142 	save	%sp, -256, %sp		C new register window and a 256-byte stack frame
    143 
    144 ifdef(`HAVE_VIS',
    145 `	mov	-1, %g4			C all ones, shifted below to form the mask
    146 	wr	%g0, 0xD2, %asi		C ASI used by the ldda loads below (presumably the VIS 16-bit fp load ASI -- TODO confirm)
    147 	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
    148 	ldda	[%i3+6] %asi, v000	C v000 <- 16-bit piece at byte offset 6 (bits 0-15 of vp[0])
    149 	ldda	[%i3+4] %asi, v016	C bits 16-31 of vp[0]
    150 	ldda	[%i3+2] %asi, v032	C bits 32-47 of vp[0]
    151 	ldda	[%i3+0] %asi, v048	C bits 48-63 of vp[0]
    152 	fxtod	v000, v000		C each piece is converted from integer to double in place
    153 	ldda	[%i3+14] %asi, v064	C bits 0-15 of vp[1]
    154 	fxtod	v016, v016
    155 	ldda	[%i3+12] %asi, v080	C bits 16-31 of vp[1]
    156 	fxtod	v032, v032
    157 	ldda	[%i3+10] %asi, v096	C bits 32-47 of vp[1]
    158 	fxtod	v048, v048
    159 	ldda	[%i3+8] %asi, v112	C bits 48-63 of vp[1]
    160 	fxtod	v064, v064
    161 	fxtod	v080, v080
    162 	fxtod	v096, v096
    163 	fxtod	v112, v112
    164 	fzero	u00_hi			C upper word of the f2 pair stays zero; only u00_lo is loaded later
    165 	fzero	u32_hi			C upper word of the f4 pair stays zero; only u32_lo is loaded later
    166 ',
    167 `	mov	-1, %g4			C all ones, shifted below to form both masks
    168 	ldx	[%i3+0], %l0		C vp[0]
    169 	srlx	%g4, 48, xffff		C store mask in register `xffff'
    170 	ldx	[%i3+8], %l1		C vp[1]
    171 C Cut both limbs into 16-bit pieces and park each piece in its own 8-byte stack slot.
    172 	and	%l0, xffff, %g2		C bits 0-15 of vp[0]
    173 	stx	%g2, [%sp+2223+0]
    174 	srlx	%l0, 16, %g3
    175 	and	%g3, xffff, %g3		C bits 16-31 of vp[0]
    176 	stx	%g3, [%sp+2223+8]
    177 	srlx	%l0, 32, %g2
    178 	and	%g2, xffff, %g2		C bits 32-47 of vp[0]
    179 	stx	%g2, [%sp+2223+16]
    180 	srlx	%l0, 48, %g3		C bits 48-63 of vp[0] (the shift alone isolates them)
    181 	stx	%g3, [%sp+2223+24]
    182 	and	%l1, xffff, %g2		C bits 0-15 of vp[1]
    183 	stx	%g2, [%sp+2223+32]
    184 	srlx	%l1, 16, %g3
    185 	and	%g3, xffff, %g3		C bits 16-31 of vp[1]
    186 	stx	%g3, [%sp+2223+40]
    187 	srlx	%l1, 32, %g2
    188 	and	%g2, xffff, %g2		C bits 32-47 of vp[1]
    189 	stx	%g2, [%sp+2223+48]
    190 	srlx	%l1, 48, %g3		C bits 48-63 of vp[1]
    191 	stx	%g3, [%sp+2223+56]
    192 
    193 	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
    194 C Reload the eight pieces into fp registers and convert each one to double.
    195 	ldd	[%sp+2223+0], v000
    196 	ldd	[%sp+2223+8], v016
    197 	ldd	[%sp+2223+16], v032
    198 	ldd	[%sp+2223+24], v048
    199 	fxtod	v000, v000
    200 	ldd	[%sp+2223+32], v064
    201 	fxtod	v016, v016
    202 	ldd	[%sp+2223+40], v080
    203 	fxtod	v032, v032
    204 	ldd	[%sp+2223+48], v096
    205 	fxtod	v048, v048
    206 	ldd	[%sp+2223+56], v112
    207 	fxtod	v064, v064
    208 	ld	[%sp+2223+0], u00_hi	C zero u00_hi (this word is the all-zero upper half of the first slot)
    209 	fxtod	v080, v080
    210 	ld	[%sp+2223+0], u32_hi	C zero u32_hi (same all-zero word)
    211 	fxtod	v096, v096
    212 	fxtod	v112, v112
    213 ')
    214 C Initialization done.
    215 	mov	0, %g2			C %g2, rlimb and %g4 feed the integer carry chain; start them at zero
    216 	mov	0, rlimb
    217 	mov	0, %g4
    218 	add	%i0, -8, %i0		C BOOKKEEPING: rp - 8; later code reads [%i0+4+8] and then adds 8
    219 
    220 C Start software pipeline.
    221 C Form all eight products of the low half of up[0]; the first four are written directly to a000..a048.
    222 	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    223 	fxtod	u00_hi, u00		C u00 = low half of the limb as a double
    224 C mid
    225 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    226 	fmuld	u00, v000, a000
    227 	fmuld	u00, v016, a016
    228 	fmuld	u00, v032, a032
    229 	fmuld	u00, v048, a048
    230 	add	%i2, -1, %i2		C BOOKKEEPING
    231 	fmuld	u00, v064, p064
    232 	add	%i1, 8, %i1		C BOOKKEEPING
    233 	fxtod	u32_hi, u32		C u32 = high half of the limb as a double
    234 	fmuld	u00, v080, p080
    235 	fmuld	u00, v096, p096a
    236 	brnz,pt	%i2, .L_2_or_more	C taken when n > 1 (n was decremented before this test, so n >= 1 is assumed)
    237 	 fmuld	u00, v112, p112a	C (delay slot)
    238 C n == 1 path: form the products of the high half of up[0], then drain through .L_wd2.
    239 .L1:	fdtox	a000, out000		C convert the finished sums to 64-bit integers
    240 	fmuld	u32, v000, p000
    241 	fdtox	a016, out016
    242 	fmuld	u32, v016, p016
    243 	fmovd	p064, a064		C plain copy: no p096 product has been formed yet
    244 	fmuld	u32, v032, p032
    245 	fmovd	p080, a080		C plain copy: no p112 product has been formed yet
    246 	fmuld	u32, v048, p048
    247 	std	out000, [%sp+2223+16]	C spill; .L_wd2 reloads this slot as i00
    248 	faddd	p000, a032, a000	C fold the new product into the sum carried over from the previous stage
    249 	fmuld	u32, v064, p064
    250 	std	out016, [%sp+2223+24]	C spill; .L_wd2 reloads this slot as i16
    251 	fxtod	u00_hi, u00		C NOTE(review): u00 is not read again on this path
    252 	faddd	p016, a048, a016
    253 	fmuld	u32, v080, p080
    254 	faddd	p032, a064, a032
    255 	fmuld	u32, v096, p096b
    256 	faddd	p048, a080, a048
    257 	fmuld	u32, v112, p112b
    258 C mid
    259 	fdtox	a000, out000
    260 	fdtox	a016, out016
    261 	faddd	p064, p096a, a064
    262 	faddd	p080, p112a, a080
    263 	std	out000, [%sp+2223+0]	C the second pair of sums goes to the other two scratch slots
    264 	b	.L_wd2
    265 	 std	out016, [%sp+2223+8]	C (delay slot)
    266 C n >= 2 path: first half matches .L1 plus a load of the next u limb; second half starts that limb's low-half products.
    267 .L_2_or_more:
    268 	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    269 	fdtox	a000, out000		C convert the finished sums to 64-bit integers
    270 	fmuld	u32, v000, p000
    271 	fdtox	a016, out016
    272 	fmuld	u32, v016, p016
    273 	fmovd	p064, a064		C plain copy: no p096 product has been formed yet
    274 	fmuld	u32, v032, p032
    275 	fmovd	p080, a080		C plain copy: no p112 product has been formed yet
    276 	fmuld	u32, v048, p048
    277 	std	out000, [%sp+2223+16]	C spill for the integer side (reloaded later as i00)
    278 	faddd	p000, a032, a000
    279 	fmuld	u32, v064, p064
    280 	std	out016, [%sp+2223+24]	C spill for the integer side (reloaded later as i16)
    281 	fxtod	u00_hi, u00		C convert the low half loaded at the top of this stage
    282 	faddd	p016, a048, a016
    283 	fmuld	u32, v080, p080
    284 	faddd	p032, a064, a032
    285 	fmuld	u32, v096, p096b
    286 	faddd	p048, a080, a048
    287 	fmuld	u32, v112, p112b
    288 C mid
    289 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    290 	fdtox	a000, out000
    291 	fmuld	u00, v000, p000
    292 	fdtox	a016, out016
    293 	fmuld	u00, v016, p016
    294 	faddd	p064, p096a, a064
    295 	fmuld	u00, v032, p032
    296 	faddd	p080, p112a, a080
    297 	fmuld	u00, v048, p048
    298 	add	%i2, -1, %i2		C BOOKKEEPING
    299 	std	out000, [%sp+2223+0]
    300 	faddd	p000, a032, a000
    301 	fmuld	u00, v064, p064
    302 	add	%i1, 8, %i1		C BOOKKEEPING
    303 	std	out016, [%sp+2223+8]
    304 	fxtod	u32_hi, u32		C convert the high half loaded after the mid marker
    305 	faddd	p016, a048, a016
    306 	fmuld	u00, v080, p080
    307 	faddd	p032, a064, a032
    308 	fmuld	u00, v096, p096a
    309 	faddd	p048, a080, a048
    310 	brnz,pt	%i2, .L_3_or_more	C limbs remain (n >= 3): enter the main loop
    311 	 fmuld	u00, v112, p112a	C (delay slot)
    312 
    313 	b	.Lend			C n == 2: skip the main loop and start winding down
    314 	 nop
    315 C Integer quantities summed for each 32-bit result chunk (numbers at right are presumably widths in bits):
    316 C  64      32       0
    317 C   .       .       .
    318 C   .       |__rXXX_|	32
    319 C   .      |___cy___|	34
    320 C   .  |_______i00__|	50
    321 C  |_______i16__|   .	50
    322 C Each iteration runs a u32 stage and then a u00 stage, loads one new u limb and stores one result limb.
    323 C The integer instructions retire sums that the fp pipeline spilled to the stack scratch slots earlier.
    324 C BEGIN MAIN LOOP
    325 	.align	16
    326 .L_3_or_more:
    327 .Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
    328 	and	%g2, xffffffff, %g2	C low 32 bits of the previous (i16 << 16); 0 on first entry
    329 	fdtox	a000, out000
    330 	fmuld	u32, v000, p000
    331 C
    332 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    333 	add	%g2, rlimb, %l5		C redo the previous chunk sum with the masked term to expose its carry
    334 	fdtox	a016, out016
    335 	fmuld	u32, v016, p016
    336 C
    337 	srlx	%l5, 32, cy		C carry out of the previous 32-bit chunk
    338 	ldx	[%sp+2223+16], i00	C reload the sum spilled to this slot earlier
    339 	faddd	p064, p096b, a064
    340 	fmuld	u32, v032, p032
    341 C
    342 	add	%g4, cy, cy		C new cy
    343 	ldx	[%sp+2223+24], i16	C reload the companion sum (weighted 16 bits higher)
    344 	faddd	p080, p112b, a080
    345 	fmuld	u32, v048, p048
    346 C
    347 	nop				C filler (presumably pads the 4-instruction group -- TODO confirm)
    348 	std	out000, [%sp+2223+16]	C refill the slot just read with the newly converted sum
    349 	faddd	p000, a032, a000
    350 	fmuld	u32, v064, p064
    351 C
    352 	add	i00, r00, rlimb		C rlimb = i00 + low 32 bits of rp[i]
    353 	add	%i0, 8, %i0		C BOOKKEEPING
    354 	std	out016, [%sp+2223+24]
    355 	fxtod	u00_hi, u00		C convert the low half loaded at the top of the loop
    356 C
    357 	sllx	i16, 16, %g2		C move i16 up to its 16-bit weight
    358 	add	cy, rlimb, rlimb	C add the incoming carry
    359 	faddd	p016, a048, a016
    360 	fmuld	u32, v080, p080
    361 C
    362 	srlx	i16, 16, %g4		C part of i16 that lies above this 32-bit chunk
    363 	add	%g2, rlimb, %l5		C chunk value; its low 32 bits are the result word
    364 	faddd	p032, a064, a032
    365 	fmuld	u32, v096, p096b
    366 C
    367 	stw	%l5, [%i0+4]		C store low 32 bits of the result limb
    368 	nop				C filler (presumably pads the 4-instruction group -- TODO confirm)
    369 	faddd	p048, a080, a048
    370 	fmuld	u32, v112, p112b
    371 C midloop
    372 	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
    373 	and	%g2, xffffffff, %g2	C low 32 bits of (i16 << 16) from the first half
    374 	fdtox	a000, out000
    375 	fmuld	u00, v000, p000
    376 C
    377 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    378 	add	%g2, rlimb, %l5		C redo the chunk sum with the masked term to expose its carry
    379 	fdtox	a016, out016
    380 	fmuld	u00, v016, p016
    381 C
    382 	srlx	%l5, 32, cy		C carry out of the low 32-bit chunk
    383 	ldx	[%sp+2223+0], i00	C reload the sum spilled to this slot earlier
    384 	faddd	p064, p096a, a064
    385 	fmuld	u00, v032, p032
    386 C
    387 	add	%g4, cy, cy		C new cy
    388 	ldx	[%sp+2223+8], i16	C reload the companion sum (weighted 16 bits higher)
    389 	faddd	p080, p112a, a080
    390 	fmuld	u00, v048, p048
    391 C
    392 	add	%i2, -1, %i2		C BOOKKEEPING
    393 	std	out000, [%sp+2223+0]	C refill the slot just read with the newly converted sum
    394 	faddd	p000, a032, a000
    395 	fmuld	u00, v064, p064
    396 C
    397 	add	i00, r32, rlimb		C rlimb = i00 + high 32 bits of rp[i]
    398 	add	%i1, 8, %i1		C BOOKKEEPING
    399 	std	out016, [%sp+2223+8]
    400 	fxtod	u32_hi, u32		C convert the high half loaded at midloop
    401 C
    402 	sllx	i16, 16, %g2		C move i16 up to its 16-bit weight
    403 	add	cy, rlimb, rlimb	C add the incoming carry
    404 	faddd	p016, a048, a016
    405 	fmuld	u00, v080, p080
    406 C
    407 	srlx	i16, 16, %g4		C part of i16 that lies above this 32-bit chunk
    408 	add	%g2, rlimb, %l5		C chunk value; its low 32 bits are the result word
    409 	faddd	p032, a064, a032
    410 	fmuld	u00, v096, p096a
    411 C
    412 	stw	%l5, [%i0+0]		C store high 32 bits of the result limb
    413 	faddd	p048, a080, a048
    414 	brnz,pt	%i2, .Loop		C repeat while u limbs remain
    415 	 fmuld	u00, v112, p112a	C (delay slot)
    416 C END MAIN LOOP
    417 
    418 C WIND-DOWN PHASE 1: same shape as a loop iteration but with no u loads; only the u32 products are still issued.
    419 .Lend:	and	%g2, xffffffff, %g2	C low 32 bits of the previous (i16 << 16); 0 when reached with n == 2
    420 	fdtox	a000, out000
    421 	fmuld	u32, v000, p000
    422 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    423 	add	%g2, rlimb, %l5		C redo the previous chunk sum with the masked term to expose its carry
    424 	fdtox	a016, out016
    425 	fmuld	u32, v016, p016
    426 	srlx	%l5, 32, cy		C carry out of the previous 32-bit chunk
    427 	ldx	[%sp+2223+16], i00	C reload the sum spilled to this slot earlier
    428 	faddd	p064, p096b, a064
    429 	fmuld	u32, v032, p032
    430 	add	%g4, cy, cy		C new cy
    431 	ldx	[%sp+2223+24], i16
    432 	faddd	p080, p112b, a080
    433 	fmuld	u32, v048, p048
    434 	std	out000, [%sp+2223+16]	C refill the slot just read
    435 	faddd	p000, a032, a000
    436 	fmuld	u32, v064, p064
    437 	add	i00, r00, rlimb		C rlimb = i00 + low 32 bits of rp[i]
    438 	add	%i0, 8, %i0		C BOOKKEEPING
    439 	std	out016, [%sp+2223+24]
    440 	sllx	i16, 16, %g2		C move i16 up to its 16-bit weight
    441 	add	cy, rlimb, rlimb	C add the incoming carry
    442 	faddd	p016, a048, a016
    443 	fmuld	u32, v080, p080
    444 	srlx	i16, 16, %g4		C part of i16 that lies above this 32-bit chunk
    445 	add	%g2, rlimb, %l5
    446 	faddd	p032, a064, a032
    447 	fmuld	u32, v096, p096b
    448 	stw	%l5, [%i0+4]		C store low 32 bits of the result limb
    449 	faddd	p048, a080, a048
    450 	fmuld	u32, v112, p112b	C last multiply issued by the function
    451 C mid
    452 	and	%g2, xffffffff, %g2
    453 	fdtox	a000, out000
    454 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    455 	add	%g2, rlimb, %l5
    456 	fdtox	a016, out016
    457 	srlx	%l5, 32, cy		C carry out of the low 32-bit chunk
    458 	ldx	[%sp+2223+0], i00
    459 	faddd	p064, p096a, a064
    460 	add	%g4, cy, cy		C new cy
    461 	ldx	[%sp+2223+8], i16
    462 	faddd	p080, p112a, a080
    463 	std	out000, [%sp+2223+0]
    464 	add	i00, r32, rlimb		C rlimb = i00 + high 32 bits of rp[i]
    465 	std	out016, [%sp+2223+8]
    466 	sllx	i16, 16, %g2
    467 	add	cy, rlimb, rlimb
    468 	srlx	i16, 16, %g4
    469 	add	%g2, rlimb, %l5
    470 	stw	%l5, [%i0+0]		C store high 32 bits of the result limb
    471 
    472 C WIND-DOWN PHASE 2: no multiplies left; convert a032/a048 and then a064/a080 while retiring one more limb.  Also the target of the n == 1 path.
    473 .L_wd2:	and	%g2, xffffffff, %g2	C low 32 bits of the previous (i16 << 16); 0 when reached from .L1
    474 	fdtox	a032, out000
    475 	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
    476 	add	%g2, rlimb, %l5		C redo the previous chunk sum with the masked term to expose its carry
    477 	fdtox	a048, out016
    478 	srlx	%l5, 32, cy		C carry out of the previous 32-bit chunk
    479 	ldx	[%sp+2223+16], i00	C reload the sum spilled to this slot earlier
    480 	add	%g4, cy, cy		C new cy
    481 	ldx	[%sp+2223+24], i16
    482 	std	out000, [%sp+2223+16]	C refill the slot just read
    483 	add	i00, r00, rlimb		C rlimb = i00 + low 32 bits of rp[i]
    484 	add	%i0, 8, %i0		C BOOKKEEPING
    485 	std	out016, [%sp+2223+24]
    486 	sllx	i16, 16, %g2		C move i16 up to its 16-bit weight
    487 	add	cy, rlimb, rlimb	C add the incoming carry
    488 	srlx	i16, 16, %g4		C part of i16 that lies above this 32-bit chunk
    489 	add	%g2, rlimb, %l5
    490 	stw	%l5, [%i0+4]		C store low 32 bits of the result limb
    491 C mid
    492 	and	%g2, xffffffff, %g2
    493 	fdtox	a064, out000
    494 	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
    495 	add	%g2, rlimb, %l5
    496 	fdtox	a080, out016
    497 	srlx	%l5, 32, cy		C carry out of the low 32-bit chunk
    498 	ldx	[%sp+2223+0], i00
    499 	add	%g4, cy, cy		C new cy
    500 	ldx	[%sp+2223+8], i16
    501 	std	out000, [%sp+2223+0]
    502 	add	i00, r32, rlimb		C rlimb = i00 + high 32 bits of rp[i]
    503 	std	out016, [%sp+2223+8]
    504 	sllx	i16, 16, %g2
    505 	add	cy, rlimb, rlimb
    506 	srlx	i16, 16, %g4
    507 	add	%g2, rlimb, %l5
    508 	stw	%l5, [%i0+0]		C store high 32 bits of the result limb
    509 
    510 C WIND-DOWN PHASE 3: convert the last products (p096b/p112b), store one more limb without adding any rp word, then return the top limb.
    511 .L_wd3:	and	%g2, xffffffff, %g2
    512 	fdtox	p096b, out000
    513 	add	%g2, rlimb, %l5		C redo the previous chunk sum with the masked term to expose its carry
    514 	fdtox	p112b, out016
    515 	srlx	%l5, 32, cy		C carry out of the previous 32-bit chunk
    516 	ldx	[%sp+2223+16], rlimb	C loaded straight into rlimb: no rp word is read or added in this phase
    517 	add	%g4, cy, cy		C new cy
    518 	ldx	[%sp+2223+24], i16
    519 	std	out000, [%sp+2223+16]	C these two spills are reloaded below to form the return value
    520 	add	%i0, 8, %i0		C BOOKKEEPING
    521 	std	out016, [%sp+2223+24]
    522 	sllx	i16, 16, %g2		C move i16 up to its 16-bit weight
    523 	add	cy, rlimb, rlimb	C add the incoming carry
    524 	srlx	i16, 16, %g4		C part of i16 that lies above this 32-bit chunk
    525 	add	%g2, rlimb, %l5
    526 	stw	%l5, [%i0+4]		C store low 32 bits of the final stored limb (rp[n] by the stage count)
    527 C mid
    528 	and	%g2, xffffffff, %g2
    529 	add	%g2, rlimb, %l5
    530 	srlx	%l5, 32, cy
    531 	ldx	[%sp+2223+0], rlimb
    532 	add	%g4, cy, cy		C new cy
    533 	ldx	[%sp+2223+8], i16
    534 	sllx	i16, 16, %g2
    535 	add	cy, rlimb, rlimb
    536 	srlx	i16, 16, %g4
    537 	add	%g2, rlimb, %l5
    538 	stw	%l5, [%i0+0]		C store high 32 bits of the final stored limb
    539 C Nothing more is stored; what follows builds the return value i00 + cy + (i16 << 16).
    540 	and	%g2, xffffffff, %g2
    541 	add	%g2, rlimb, %l5
    542 	srlx	%l5, 32, cy
    543 	ldx	[%sp+2223+16], i00	C converted p096b, spilled at the top of this phase
    544 	add	%g4, cy, cy		C new cy
    545 	ldx	[%sp+2223+24], i16	C converted p112b, spilled at the top of this phase
    546 
    547 	sllx	i16, 16, %g2
    548 	add	i00, cy, cy
    549 	return	%i7+8			C return to the caller (per SPARC V9 this also restores the caller register window)
    550 	add	%g2, cy, %o0		C (delay slot) result into %o0; runs in the restored window, and %g2/cy are globals so they remain readable
    551 EPILOGUE(mpn_addmul_2)
    552