Home | History | Annotate | Line # | Download | only in pa64
mul_1.asm revision 1.1
      1  1.1  mrg dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
      2  1.1  mrg dnl  the result in a second limb vector.
      3  1.1  mrg 
      4  1.1  mrg dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
      5  1.1  mrg 
      6  1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1  mrg 
      8  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1  mrg dnl  it under the terms of the GNU Lesser General Public License as published
     10  1.1  mrg dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11  1.1  mrg dnl  your option) any later version.
     12  1.1  mrg 
     13  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16  1.1  mrg dnl  License for more details.
     17  1.1  mrg 
     18  1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     19  1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20  1.1  mrg 
     21  1.1  mrg include(`../config.m4')
     22  1.1  mrg 
     23  1.1  mrg C		    cycles/limb
     24  1.1  mrg C 8000,8200:		6.5
     25  1.1  mrg C 8500,8600,8700:	5.625
     26  1.1  mrg 
     27  1.1  mrg C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     28  1.1  mrg C  could be saved there per call.
     29  1.1  mrg 
     30  1.1  mrg C  DESCRIPTION:
     31  1.1  mrg C  The main loop "BIG" is 4-way unrolled, mainly to allow
     32  1.1  mrg C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     33  1.1  mrg C  registers to the IU registers, have demanded a deep software pipeline, and
     34  1.1  mrg C  a lot of stack slots for partial products in flight.
     35  1.1  mrg C
     36  1.1  mrg C  CODE STRUCTURE:
     37  1.1  mrg C  save-some-registers
     38  1.1  mrg C  do 0, 1, 2, or 3 limbs
     39  1.1  mrg C  if done, restore-some-regs and return
     40  1.1  mrg C  save-many-regs
     41  1.1  mrg C  do 4, 8, ... limbs
     42  1.1  mrg C  restore-all-regs
     43  1.1  mrg 
     44  1.1  mrg C  STACK LAYOUT:
     45  1.1  mrg C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     46  1.1  mrg C  slots marked FREE, as well as some slots in the caller's "frame marker".
     47  1.1  mrg C
     48  1.1  mrg C -00 <- r30
     49  1.1  mrg C -08  FREE
     50  1.1  mrg C -10  tmp
     51  1.1  mrg C -18  tmp
     52  1.1  mrg C -20  tmp
     53  1.1  mrg C -28  tmp
     54  1.1  mrg C -30  tmp
     55  1.1  mrg C -38  tmp
     56  1.1  mrg C -40  tmp
     57  1.1  mrg C -48  tmp
     58  1.1  mrg C -50  tmp
     59  1.1  mrg C -58  tmp
     60  1.1  mrg C -60  tmp
     61  1.1  mrg C -68  tmp
     62  1.1  mrg C -70  tmp
     63  1.1  mrg C -78  tmp
     64  1.1  mrg C -80  tmp
     65  1.1  mrg C -88  tmp
     66  1.1  mrg C -90  FREE
     67  1.1  mrg C -98  FREE
     68  1.1  mrg C -a0  FREE
     69  1.1  mrg C -a8  FREE
     70  1.1  mrg C -b0  r13
     71  1.1  mrg C -b8  r12
     72  1.1  mrg C -c0  r11
     73  1.1  mrg C -c8  r10
     74  1.1  mrg C -d0  r9
     75  1.1  mrg C -d8  r8
     76  1.1  mrg C -e0  r7
     77  1.1  mrg C -e8  r6
     78  1.1  mrg C -f0  r5
     79  1.1  mrg C -f8  r4
     80  1.1  mrg C -100 r3
     81  1.1  mrg C  Previous frame:
     82  1.1  mrg C  [unused area]
     83  1.1  mrg C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     84  1.1  mrg 
     85  1.1  mrg 
     86  1.1  mrg include(`../config.m4')
     87  1.1  mrg 
     88  1.1  mrg C INPUT PARAMETERS:
     89  1.1  mrg define(`rp',`%r26')	C destination pointer; result limbs are stored through it
     90  1.1  mrg define(`up',`%r25')	C source pointer; limbs are loaded through it
     91  1.1  mrg define(`n',`%r24')	C limb count
     92  1.1  mrg define(`vlimb',`%r23')	C the single multiplier limb
     93  1.1  mrg 
     94  1.1  mrg define(`climb',`%r23')	C carry limb; shares the vlimb register, cleared after vlimb is in its home slot
     95  1.1  mrg 
     96  1.1  mrg ifdef(`HAVE_ABI_2_0w',
     97  1.1  mrg `	.level	2.0w
     98  1.1  mrg ',`	.level	2.0
     99  1.1  mrg ')
    100  1.1  mrg PROLOGUE(mpn_mul_1)
    101  1.1  mrg 
    102  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    103  1.1  mrg `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    104  1.1  mrg ')
    105  1.1  mrg 	std,ma		%r3, 0x100(%r30)	C save r3 at the old sp, then bump sp by 0x100
    106  1.1  mrg 	std		%r4, -0xf8(%r30)	C save r4
    107  1.1  mrg 	std		%r5, -0xf0(%r30)	C save r5
    108  1.1  mrg 	ldo		0(%r0), climb		C clear climb
    109  1.1  mrg 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register (home slot -0x38 is now at -0x138)
    110  1.1  mrg 
    111  1.1  mrg define(`p032a1',`%r1')	C mid product reloaded from slot -0x78
    112  1.1  mrg define(`p032a2',`%r19')	C mid product reloaded from slot -0x70
    113  1.1  mrg 
    114  1.1  mrg define(`m032',`%r20')	C sum of the two mid products
    115  1.1  mrg define(`m096',`%r21')	C carry out of that sum
    116  1.1  mrg 
    117  1.1  mrg define(`p000a',`%r22')	C low product reloaded from slot -0x80
    118  1.1  mrg define(`p064a',`%r29')	C high product reloaded from slot -0x68
    119  1.1  mrg 
    120  1.1  mrg define(`s000',`%r31')	C result limb under construction
    121  1.1  mrg 
    122  1.1  mrg define(`ma000',`%r4')	C low half of m032 moved into the upper 32 bits
    123  1.1  mrg define(`ma064',`%r20')	C upper half of m032 with the m096 carry above it; same register as m032
    124  1.1  mrg 
    125  1.1  mrg C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
    126  1.1  mrg 
    127  1.1  mrg 	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n (n mod 4)
    128  1.1  mrg 	cmpb,=		%r5, %r0, L(BIG)	C no feed-in limbs: go straight to the 4-way code
    129  1.1  mrg 	nop	C branch delay slot
    130  1.1  mrg 
    131  1.1  mrg 	fldd		0(up), %fr4	C load the first feed-in limb
    132  1.1  mrg 	ldo		8(up), up
    133  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    134  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    135  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    136  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    137  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    138  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    139  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    140  1.1  mrg 	addib,<>	-1, %r5, L(two_or_more)	C r5 -= 1; branch if more feed-in limbs remain
    141  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    142  1.1  mrg LDEF(one)	C exactly one feed-in limb: reload its four partial products
    143  1.1  mrg 	ldd		-0x78(%r30), p032a1
    144  1.1  mrg 	ldd		-0x70(%r30), p032a2
    145  1.1  mrg 	ldd		-0x80(%r30), p000a
    146  1.1  mrg 	b		L(0_one_out)
    147  1.1  mrg 	ldd		-0x68(%r30), p064a	C (delay slot)
    148  1.1  mrg 
    149  1.1  mrg LDEF(two_or_more)	C multiply the second limb while reloading the first limb products
    150  1.1  mrg 	fldd		0(up), %fr4
    151  1.1  mrg 	ldo		8(up), up
    152  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    153  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    154  1.1  mrg 	ldd		-0x78(%r30), p032a1
    155  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    156  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    157  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    158  1.1  mrg 	ldd		-0x70(%r30), p032a2
    159  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    160  1.1  mrg 	ldd		-0x80(%r30), p000a
    161  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    162  1.1  mrg 	ldd		-0x68(%r30), p064a
    163  1.1  mrg 	addib,<>	-1, %r5, L(three_or_more)	C r5 -= 1; branch if a third feed-in limb remains
    164  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    165  1.1  mrg LDEF(two)	C exactly two feed-in limbs
    166  1.1  mrg 	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
    167  1.1  mrg 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    168  1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 moved to the upper 32 bits
    169  1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = upper half of m032
    170  1.1  mrg 	b		L(0_two_out)
    171  1.1  mrg 	depd		m096, 31, 32, ma064	C (delay slot) deposit the carry into the upper 32 bits of ma064
    172  1.1  mrg 
    173  1.1  mrg LDEF(three_or_more)	C r5 is a 2-bit count, so this is the three-limb case
    174  1.1  mrg 	fldd		0(up), %fr4
    175  1.1  mrg 	add		p032a1, p032a2, m032
    176  1.1  mrg 	add,dc		%r0, %r0, m096
    177  1.1  mrg 	depd,z		m032, 31, 32, ma000
    178  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    179  1.1  mrg C	addib,=		-1, %r5, L(0_out)
    180  1.1  mrg 	depd		m096, 31, 32, ma064
    181  1.1  mrg LDEF(loop0)	C loop body below is commented out; execution falls through to 0_out
    182  1.1  mrg C	xmpyu		%fr8R, %fr4L, %fr22
    183  1.1  mrg C	xmpyu		%fr8L, %fr4R, %fr23
    184  1.1  mrg C	ldd		-0x78(%r30), p032a1
    185  1.1  mrg C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    186  1.1  mrg C
    187  1.1  mrg C	xmpyu		%fr8R, %fr4R, %fr24
    188  1.1  mrg C	xmpyu		%fr8L, %fr4L, %fr25
    189  1.1  mrg C	ldd		-0x70(%r30), p032a2
    190  1.1  mrg C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    191  1.1  mrg C
    192  1.1  mrg C	ldo		8(rp), rp
    193  1.1  mrg C	add		climb, p000a, s000
    194  1.1  mrg C	ldd		-0x80(%r30), p000a
    195  1.1  mrg C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    196  1.1  mrg C
    197  1.1  mrg C	add,dc		p064a, %r0, climb
    198  1.1  mrg C	ldo		8(up), up
    199  1.1  mrg C	ldd		-0x68(%r30), p064a
    200  1.1  mrg C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    201  1.1  mrg C
    202  1.1  mrg C	add		ma000, s000, s000
    203  1.1  mrg C	add,dc		ma064, climb, climb
    204  1.1  mrg C	fldd		0(up), %fr4
    205  1.1  mrg C
    206  1.1  mrg C	std		s000, -8(rp)
    207  1.1  mrg C
    208  1.1  mrg C	add		p032a1, p032a2, m032
    209  1.1  mrg C	add,dc		%r0, %r0, m096
    210  1.1  mrg C
    211  1.1  mrg C	depd,z		m032, 31, 32, ma000
    212  1.1  mrg C	extrd,u		m032, 31, 32, ma064
    213  1.1  mrg C	addib,<>	-1, %r5, L(loop0)
    214  1.1  mrg C	depd		m096, 31, 32, ma064
    215  1.1  mrg LDEF(0_out)	C multiply the last feed-in limb; finish and store the oldest pending limb
    216  1.1  mrg 	ldo		8(up), up
    217  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    218  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    219  1.1  mrg 	ldd		-0x78(%r30), p032a1
    220  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    221  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    222  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    223  1.1  mrg 	ldd		-0x70(%r30), p032a2
    224  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    225  1.1  mrg 	ldo		8(rp), rp
    226  1.1  mrg 	add		climb, p000a, s000
    227  1.1  mrg 	ldd		-0x80(%r30), p000a
    228  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    229  1.1  mrg 	add,dc		p064a, %r0, climb
    230  1.1  mrg 	ldd		-0x68(%r30), p064a
    231  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    232  1.1  mrg 	add		ma000, s000, s000
    233  1.1  mrg 	add,dc		ma064, climb, climb
    234  1.1  mrg 	std		s000, -8(rp)	C rp was already advanced, hence the -8 offset
    235  1.1  mrg 	add		p032a1, p032a2, m032
    236  1.1  mrg 	add,dc		%r0, %r0, m096
    237  1.1  mrg 	depd,z		m032, 31, 32, ma000
    238  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    239  1.1  mrg 	depd		m096, 31, 32, ma064
    240  1.1  mrg LDEF(0_two_out)	C two limbs pending: reload the newest products, finish and store the older limb
    241  1.1  mrg 	ldd		-0x78(%r30), p032a1
    242  1.1  mrg 	ldd		-0x70(%r30), p032a2
    243  1.1  mrg 	ldo		8(rp), rp
    244  1.1  mrg 	add		climb, p000a, s000
    245  1.1  mrg 	ldd		-0x80(%r30), p000a
    246  1.1  mrg 	add,dc		p064a, %r0, climb
    247  1.1  mrg 	ldd		-0x68(%r30), p064a
    248  1.1  mrg 	add		ma000, s000, s000
    249  1.1  mrg 	add,dc		ma064, climb, climb
    250  1.1  mrg 	std		s000, -8(rp)
    251  1.1  mrg LDEF(0_one_out)	C one limb pending: combine its products with climb and store it
    252  1.1  mrg 	add		p032a1, p032a2, m032
    253  1.1  mrg 	add,dc		%r0, %r0, m096
    254  1.1  mrg 	depd,z		m032, 31, 32, ma000
    255  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    256  1.1  mrg 	depd		m096, 31, 32, ma064
    257  1.1  mrg 
    258  1.1  mrg 	add		climb, p000a, s000	C low product plus incoming carry limb
    259  1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product plus carry
    260  1.1  mrg 	add		ma000, s000, s000	C add in the shifted mid sum
    261  1.1  mrg 	add,dc		ma064, climb, climb	C fold the rest of the mid sum into climb
    262  1.1  mrg 	std		s000, 0(rp)	C store the last feed-in result limb
    263  1.1  mrg 
    264  1.1  mrg 	cmpib,>=	4, n, L(done)	C branch to done when 4 >= n, i.e. no 4-limb group follows
    265  1.1  mrg 	ldo		8(rp), rp	C (delay slot) step rp past the limb just stored
    266  1.1  mrg 
    267  1.1  mrg C 4-way unrolled code.
    268  1.1  mrg 
    269  1.1  mrg LDEF(BIG)	C register names are redefined from here on for the 4-way code
    270  1.1  mrg 
    271  1.1  mrg define(`p032a1',`%r1')	C mid products of limb a (slots -0x78, -0x70)
    272  1.1  mrg define(`p032a2',`%r19')	C
    273  1.1  mrg define(`p096b1',`%r20')	C mid products of limb b (slots -0x38, -0x30)
    274  1.1  mrg define(`p096b2',`%r21')	C
    275  1.1  mrg define(`p160c1',`%r22')	C mid products of limb c (slots -0x58, -0x50)
    276  1.1  mrg define(`p160c2',`%r29')	C
    277  1.1  mrg define(`p224d1',`%r31')	C mid products of limb d (slots -0x18, -0x10)
    278  1.1  mrg define(`p224d2',`%r3')	C
    279  1.1  mrg 			C
    280  1.1  mrg define(`m032',`%r4')	C sums of the mid product pairs, linked by a carry chain
    281  1.1  mrg define(`m096',`%r5')	C
    282  1.1  mrg define(`m160',`%r6')	C
    283  1.1  mrg define(`m224',`%r7')	C
    284  1.1  mrg define(`m288',`%r8')	C final carry out of the mid product sums
    285  1.1  mrg 			C
    286  1.1  mrg define(`p000a',`%r1')	C low and high products; these reuse the mid product registers
    287  1.1  mrg define(`p064a',`%r19')	C
    288  1.1  mrg define(`p064b',`%r20')	C
    289  1.1  mrg define(`p128b',`%r21')	C
    290  1.1  mrg define(`p128c',`%r22')	C
    291  1.1  mrg define(`p192c',`%r29')	C
    292  1.1  mrg define(`p192d',`%r31')	C
    293  1.1  mrg define(`p256d',`%r3')	C
    294  1.1  mrg 			C
    295  1.1  mrg define(`s000',`%r10')	C the four result limbs of a group
    296  1.1  mrg define(`s064',`%r11')	C
    297  1.1  mrg define(`s128',`%r12')	C
    298  1.1  mrg define(`s192',`%r13')	C
    299  1.1  mrg 			C
    300  1.1  mrg define(`ma000',`%r9')	C mid sums realigned by 32 bits before being added to s000..s192 and climb
    301  1.1  mrg define(`ma064',`%r4')	C same register as m032
    302  1.1  mrg define(`ma128',`%r5')	C same register as m096
    303  1.1  mrg define(`ma192',`%r6')	C same register as m160
    304  1.1  mrg define(`ma256',`%r7')	C same register as m224
    305  1.1  mrg 
    306  1.1  mrg 	std		%r6, -0xe8(%r30)	C save r6 through r13, which the 4-way code uses
    307  1.1  mrg 	std		%r7, -0xe0(%r30)
    308  1.1  mrg 	std		%r8, -0xd8(%r30)
    309  1.1  mrg 	std		%r9, -0xd0(%r30)
    310  1.1  mrg 	std		%r10, -0xc8(%r30)
    311  1.1  mrg 	std		%r11, -0xc0(%r30)
    312  1.1  mrg 	std		%r12, -0xb8(%r30)
    313  1.1  mrg 	std		%r13, -0xb0(%r30)
    314  1.1  mrg 
    315  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    316  1.1  mrg `	extrd,u		n, 61, 62, n		C right shift 2
    317  1.1  mrg ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    318  1.1  mrg ')
    319  1.1  mrg 
    320  1.1  mrg LDEF(4_or_more)	C first group: load limbs a-d into fr4-fr7 and form their 16 partial products
    321  1.1  mrg 	fldd		0(up), %fr4
    322  1.1  mrg 	fldd		8(up), %fr5
    323  1.1  mrg 	fldd		16(up), %fr6
    324  1.1  mrg 	fldd		24(up), %fr7
    325  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    326  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    327  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    328  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    329  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    330  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    331  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    332  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    333  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    334  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    335  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    336  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    337  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    338  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    339  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    340  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    341  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    342  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    343  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    344  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    345  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    346  1.1  mrg 	addib,<>	-1, n, L(8_or_more)	C n -= 1 (n counts 4-limb groups); branch if another group follows
    347  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
    348  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    349  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    350  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    351  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    352  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    353  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    354  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    355  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    356  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    357  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    358  1.1  mrg 	ldd		-0x78(%r30), p032a1
    359  1.1  mrg 	ldd		-0x70(%r30), p032a2
    360  1.1  mrg 	ldd		-0x38(%r30), p096b1
    361  1.1  mrg 	ldd		-0x30(%r30), p096b2
    362  1.1  mrg 	ldd		-0x58(%r30), p160c1
    363  1.1  mrg 	ldd		-0x50(%r30), p160c2
    364  1.1  mrg 	ldd		-0x18(%r30), p224d1
    365  1.1  mrg 	ldd		-0x10(%r30), p224d2
    366  1.1  mrg 	b		L(end1)	C only one group: sum and store it in end1
    367  1.1  mrg 	nop	C branch delay slot
    368  1.1  mrg 
    369  1.1  mrg LDEF(8_or_more)	C at least two groups: spill the first group products, start multiplying the next
    370  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    371  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    372  1.1  mrg 	ldo		32(up), up	C advance to the next four source limbs
    373  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    374  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    375  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    376  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    377  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    378  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    379  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    380  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    381  1.1  mrg 	fldd		0(up), %fr4
    382  1.1  mrg 	fldd		8(up), %fr5
    383  1.1  mrg 	fldd		16(up), %fr6
    384  1.1  mrg 	fldd		24(up), %fr7
    385  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    386  1.1  mrg 	ldd		-0x78(%r30), p032a1
    387  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    388  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    389  1.1  mrg 	ldd		-0x70(%r30), p032a2
    390  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    391  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    392  1.1  mrg 	ldd		-0x38(%r30), p096b1
    393  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    394  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    395  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    396  1.1  mrg 	ldd		-0x30(%r30), p096b2
    397  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    398  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    399  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    400  1.1  mrg 	ldd		-0x58(%r30), p160c1
    401  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    402  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    403  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    404  1.1  mrg 	ldd		-0x50(%r30), p160c2
    405  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    406  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    407  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    408  1.1  mrg 	ldd		-0x18(%r30), p224d1
    409  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    410  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    411  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    412  1.1  mrg 	ldd		-0x10(%r30), p224d2
    413  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    414  1.1  mrg 	addib,=		-1, n, L(end2)	C n -= 1; if this was the last group, skip the loop
    415  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
    416  1.1  mrg LDEF(loop)	C steady state: sum and store the previous group while multiplying the next one
    417  1.1  mrg 	add		p032a1, p032a2, m032
    418  1.1  mrg 	ldd		-0x80(%r30), p000a
    419  1.1  mrg 	add,dc		p096b1, p096b2, m096
    420  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    421  1.1  mrg 
    422  1.1  mrg 	add,dc		p160c1, p160c2, m160
    423  1.1  mrg 	ldd		-0x68(%r30), p064a
    424  1.1  mrg 	add,dc		p224d1, p224d2, m224
    425  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    426  1.1  mrg 
    427  1.1  mrg 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid product sums
    428  1.1  mrg 	ldd		-0x40(%r30), p064b
    429  1.1  mrg 	ldo		32(up), up	C advance to the next four source limbs
    430  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    431  1.1  mrg 
    432  1.1  mrg 	depd,z		m032, 31, 32, ma000
    433  1.1  mrg 	ldd		-0x28(%r30), p128b
    434  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    435  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    436  1.1  mrg 
    437  1.1  mrg 	depd		m096, 31, 32, ma064
    438  1.1  mrg 	ldd		-0x60(%r30), p128c
    439  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    440  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    441  1.1  mrg 
    442  1.1  mrg 	depd		m160, 31, 32, ma128
    443  1.1  mrg 	ldd		-0x48(%r30), p192c
    444  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    445  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    446  1.1  mrg 
    447  1.1  mrg 	depd		m224, 31, 32, ma192
    448  1.1  mrg 	ldd		-0x20(%r30), p192d
    449  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    450  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    451  1.1  mrg 
    452  1.1  mrg 	depd		m288, 31, 32, ma256
    453  1.1  mrg 	ldd		-0x88(%r30), p256d
    454  1.1  mrg 	add		climb, p000a, s000	C start the low/high product chain with the incoming carry limb
    455  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    456  1.1  mrg 
    457  1.1  mrg 	add,dc		p064a, p064b, s064
    458  1.1  mrg 	add,dc		p128b, p128c, s128
    459  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    460  1.1  mrg 
    461  1.1  mrg 	add,dc		p192c, p192d, s192
    462  1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product plus carry
    463  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    464  1.1  mrg 
    465  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    466  1.1  mrg 	fldd		0(up), %fr4
    467  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    468  1.1  mrg 	std		s000, 0(rp)
    469  1.1  mrg 
    470  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    471  1.1  mrg 	fldd		8(up), %fr5
    472  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    473  1.1  mrg 	std		s064, 8(rp)
    474  1.1  mrg 
    475  1.1  mrg 	add,dc		ma256, climb, climb	C fold the top of the mid sums into the outgoing carry limb
    476  1.1  mrg 	fldd		16(up), %fr6
    477  1.1  mrg 	std		s128, 16(rp)
    478  1.1  mrg 
    479  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    480  1.1  mrg 	ldd		-0x78(%r30), p032a1
    481  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    482  1.1  mrg 	fldd		24(up), %fr7
    483  1.1  mrg 
    484  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    485  1.1  mrg 	ldd		-0x70(%r30), p032a2
    486  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    487  1.1  mrg 	std		s192, 24(rp)
    488  1.1  mrg 
    489  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    490  1.1  mrg 	ldd		-0x38(%r30), p096b1
    491  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    492  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    493  1.1  mrg 
    494  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    495  1.1  mrg 	ldd		-0x30(%r30), p096b2
    496  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    497  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    498  1.1  mrg 
    499  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    500  1.1  mrg 	ldd		-0x58(%r30), p160c1
    501  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    502  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    503  1.1  mrg 
    504  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    505  1.1  mrg 	ldd		-0x50(%r30), p160c2
    506  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    507  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    508  1.1  mrg 
    509  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    510  1.1  mrg 	ldd		-0x18(%r30), p224d1
    511  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    512  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    513  1.1  mrg 
    514  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    515  1.1  mrg 	ldd		-0x10(%r30), p224d2
    516  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    517  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27
    518  1.1  mrg 
    519  1.1  mrg 	addib,<>	-1, n, L(loop)	C n -= 1; repeat while groups remain
    520  1.1  mrg 	ldo		32(rp), rp	C (delay slot) step rp past the four limbs stored above
    521  1.1  mrg 
    522  1.1  mrg LDEF(end2)	C wind-down: sum and store the previous group while spilling the last group products
    523  1.1  mrg 	add		p032a1, p032a2, m032
    524  1.1  mrg 	ldd		-0x80(%r30), p000a
    525  1.1  mrg 	add,dc		p096b1, p096b2, m096
    526  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    527  1.1  mrg 	add,dc		p160c1, p160c2, m160
    528  1.1  mrg 	ldd		-0x68(%r30), p064a
    529  1.1  mrg 	add,dc		p224d1, p224d2, m224
    530  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    531  1.1  mrg 	add,dc		%r0, %r0, m288
    532  1.1  mrg 	ldd		-0x40(%r30), p064b
    533  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    534  1.1  mrg 	depd,z		m032, 31, 32, ma000
    535  1.1  mrg 	ldd		-0x28(%r30), p128b
    536  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    537  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    538  1.1  mrg 	depd		m096, 31, 32, ma064
    539  1.1  mrg 	ldd		-0x60(%r30), p128c
    540  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    541  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    542  1.1  mrg 	depd		m160, 31, 32, ma128
    543  1.1  mrg 	ldd		-0x48(%r30), p192c
    544  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    545  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    546  1.1  mrg 	depd		m224, 31, 32, ma192
    547  1.1  mrg 	ldd		-0x20(%r30), p192d
    548  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    549  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    550  1.1  mrg 	depd		m288, 31, 32, ma256
    551  1.1  mrg 	ldd		-0x88(%r30), p256d
    552  1.1  mrg 	add		climb, p000a, s000
    553  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    554  1.1  mrg 	add,dc		p064a, p064b, s064
    555  1.1  mrg 	add,dc		p128b, p128c, s128
    556  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    557  1.1  mrg 	add,dc		p192c, p192d, s192
    558  1.1  mrg 	add,dc		p256d, %r0, climb
    559  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    560  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    561  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    562  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    563  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    564  1.1  mrg 	add,dc		ma256, climb, climb
    565  1.1  mrg 	std		s000, 0(rp)
    566  1.1  mrg 	std		s064, 8(rp)
    567  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the last group mid products for end1
    568  1.1  mrg 	std		s128, 16(rp)
    569  1.1  mrg 	ldd		-0x70(%r30), p032a2
    570  1.1  mrg 	std		s192, 24(rp)
    571  1.1  mrg 	ldd		-0x38(%r30), p096b1
    572  1.1  mrg 	ldd		-0x30(%r30), p096b2
    573  1.1  mrg 	ldd		-0x58(%r30), p160c1
    574  1.1  mrg 	ldd		-0x50(%r30), p160c2
    575  1.1  mrg 	ldd		-0x18(%r30), p224d1
    576  1.1  mrg 	ldd		-0x10(%r30), p224d2
    577  1.1  mrg 	ldo		32(rp), rp	C step rp past the four limbs just stored
    578  1.1  mrg 
    579  1.1  mrg LDEF(end1)	C final group: mid products are in registers, low/high products in the stack slots
    580  1.1  mrg 	add		p032a1, p032a2, m032
    581  1.1  mrg 	ldd		-0x80(%r30), p000a
    582  1.1  mrg 	add,dc		p096b1, p096b2, m096
    583  1.1  mrg 	add,dc		p160c1, p160c2, m160
    584  1.1  mrg 	ldd		-0x68(%r30), p064a
    585  1.1  mrg 	add,dc		p224d1, p224d2, m224
    586  1.1  mrg 	add,dc		%r0, %r0, m288
    587  1.1  mrg 	ldd		-0x40(%r30), p064b
    588  1.1  mrg 	depd,z		m032, 31, 32, ma000
    589  1.1  mrg 	ldd		-0x28(%r30), p128b
    590  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    591  1.1  mrg 	depd		m096, 31, 32, ma064
    592  1.1  mrg 	ldd		-0x60(%r30), p128c
    593  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    594  1.1  mrg 	depd		m160, 31, 32, ma128
    595  1.1  mrg 	ldd		-0x48(%r30), p192c
    596  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    597  1.1  mrg 	depd		m224, 31, 32, ma192
    598  1.1  mrg 	ldd		-0x20(%r30), p192d
    599  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    600  1.1  mrg 	depd		m288, 31, 32, ma256
    601  1.1  mrg 	ldd		-0x88(%r30), p256d
    602  1.1  mrg 	add		climb, p000a, s000
    603  1.1  mrg 	add,dc		p064a, p064b, s064
    604  1.1  mrg 	add,dc		p128b, p128c, s128
    605  1.1  mrg 	add,dc		p192c, p192d, s192
    606  1.1  mrg 	add,dc		p256d, %r0, climb
    607  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    608  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    609  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    610  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    611  1.1  mrg 	add,dc		ma256, climb, climb
    612  1.1  mrg 	std		s000, 0(rp)
    613  1.1  mrg 	std		s064, 8(rp)
    614  1.1  mrg 	std		s128, 16(rp)
    615  1.1  mrg 	std		s192, 24(rp)
    616  1.1  mrg 
    617  1.1  mrg 	ldd		-0xb0(%r30), %r13	C restore r13 through r6, saved at BIG
    618  1.1  mrg 	ldd		-0xb8(%r30), %r12
    619  1.1  mrg 	ldd		-0xc0(%r30), %r11
    620  1.1  mrg 	ldd		-0xc8(%r30), %r10
    621  1.1  mrg 	ldd		-0xd0(%r30), %r9
    622  1.1  mrg 	ldd		-0xd8(%r30), %r8
    623  1.1  mrg 	ldd		-0xe0(%r30), %r7
    624  1.1  mrg 	ldd		-0xe8(%r30), %r6
    625  1.1  mrg LDEF(done)	C hand back climb: whole in r28 for 2.0w, else high half in r28 and low half in r29
    626  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    627  1.1  mrg `	copy		climb, %r28
    628  1.1  mrg ',`	extrd,u		climb, 63, 32, %r29
    629  1.1  mrg 	extrd,u		climb, 31, 32, %r28
    630  1.1  mrg ')
    631  1.1  mrg 	ldd		-0xf0(%r30), %r5	C restore r5
    632  1.1  mrg 	ldd		-0xf8(%r30), %r4	C restore r4
    633  1.1  mrg 	bve		(%r2)	C return through r2
    634  1.1  mrg 	ldd,mb		-0x100(%r30), %r3	C (delay slot) drop sp by 0x100, then restore r3 from there
    635  1.1  mrg EPILOGUE(mpn_mul_1)
    636