Home | History | Annotate | Line # | Download | only in pa64
submul_1.asm revision 1.1
      1  1.1  mrg dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
      2  1.1  mrg dnl  subtract the result from a second limb vector.
      3  1.1  mrg 
      4  1.1  mrg dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
      5  1.1  mrg 
      6  1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1  mrg 
      8  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1  mrg dnl  it under the terms of the GNU Lesser General Public License as published
     10  1.1  mrg dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11  1.1  mrg dnl  your option) any later version.
     12  1.1  mrg 
     13  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16  1.1  mrg dnl  License for more details.
     17  1.1  mrg 
     18  1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     19  1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20  1.1  mrg 
     21  1.1  mrg include(`../config.m4')
     22  1.1  mrg 
     23  1.1  mrg C		    cycles/limb
     24  1.1  mrg C 8000,8200:		7
     25  1.1  mrg C 8500,8600,8700:	6.5
     26  1.1  mrg 
     27  1.1  mrg C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     28  1.1  mrg C  could be saved there per call.
     29  1.1  mrg 
     30  1.1  mrg C  DESCRIPTION:
     31  1.1  mrg C  The main loop "BIG" is 4-way unrolled, mainly to allow
     32  1.1  mrg C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     33  1.1  mrg C  registers to the IU registers, have demanded a deep software pipeline, and
     34  1.1  mrg C  a lot of stack slots for partial products in flight.
     35  1.1  mrg C
     36  1.1  mrg C  CODE STRUCTURE:
     37  1.1  mrg C  save-some-registers
     38  1.1  mrg C  do 0, 1, 2, or 3 limbs
     39  1.1  mrg C  if done, restore-some-regs and return
     40  1.1  mrg C  save-many-regs
     41  1.1  mrg C  do 4, 8, ... limbs
     42  1.1  mrg C  restore-all-regs
     43  1.1  mrg 
     44  1.1  mrg C  STACK LAYOUT:
     45  1.1  mrg C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     46  1.1  mrg C  slots marked FREE, as well as some slots in the caller's "frame marker".
     47  1.1  mrg C
     48  1.1  mrg C -00 <- r30
     49  1.1  mrg C -08  FREE
     50  1.1  mrg C -10  tmp
     51  1.1  mrg C -18  tmp
     52  1.1  mrg C -20  tmp
     53  1.1  mrg C -28  tmp
     54  1.1  mrg C -30  tmp
     55  1.1  mrg C -38  tmp
     56  1.1  mrg C -40  tmp
     57  1.1  mrg C -48  tmp
     58  1.1  mrg C -50  tmp
     59  1.1  mrg C -58  tmp
     60  1.1  mrg C -60  tmp
     61  1.1  mrg C -68  tmp
     62  1.1  mrg C -70  tmp
     63  1.1  mrg C -78  tmp
     64  1.1  mrg C -80  tmp
     65  1.1  mrg C -88  tmp
     66  1.1  mrg C -90  FREE
     67  1.1  mrg C -98  FREE
     68  1.1  mrg C -a0  FREE
     69  1.1  mrg C -a8  FREE
     70  1.1  mrg C -b0  r13
     71  1.1  mrg C -b8  r12
     72  1.1  mrg C -c0  r11
     73  1.1  mrg C -c8  r10
     74  1.1  mrg C -d0  r9
     75  1.1  mrg C -d8  r8
     76  1.1  mrg C -e0  r7
     77  1.1  mrg C -e8  r6
     78  1.1  mrg C -f0  r5
     79  1.1  mrg C -f8  r4
     80  1.1  mrg C -100 r3
     81  1.1  mrg C  Previous frame:
     82  1.1  mrg C  [unused area]
     83  1.1  mrg C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     84  1.1  mrg 
     85  1.1  mrg 
     86  1.1  mrg include(`../config.m4')
     87  1.1  mrg 
     88  1.1  mrg C INPUT PARAMETERS:
     89  1.1  mrg define(`rp',`%r26')	C pointer to the limb vector that is loaded, subtracted from and stored back
     90  1.1  mrg define(`up',`%r25')	C pointer to the limb vector that is multiplied by vlimb
     91  1.1  mrg define(`n',`%r24')	C limb count
     92  1.1  mrg define(`vlimb',`%r23')	C multiplier limb
     93  1.1  mrg 
     94  1.1  mrg define(`climb',`%r23')	C carry limb; shares %r23 with vlimb, which is read via its home slot
     95  1.1  mrg 
     96  1.1  mrg ifdef(`HAVE_ABI_2_0w',
     97  1.1  mrg `	.level	2.0w
     98  1.1  mrg ',`	.level	2.0
     99  1.1  mrg ')
    100  1.1  mrg PROLOGUE(mpn_submul_1)	C subtract up[] * vlimb from rp[] over n limbs; climb carries between limbs
    101  1.1  mrg 
    102  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    103  1.1  mrg `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    104  1.1  mrg ')
    105  1.1  mrg 	std,ma		%r3, 0x100(%r30)	C save r3 at the old r30, then advance r30 by 0x100
    106  1.1  mrg 	std		%r4, -0xf8(%r30)	C r3, r4 and r5 are overwritten by the 0..3 limb code
    107  1.1  mrg 	std		%r5, -0xf0(%r30)
    108  1.1  mrg 	ldo		0(%r0), climb		C clear climb
    109  1.1  mrg 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
    110  1.1  mrg 
    111  1.1  mrg define(`p032a1',`%r1')	C first mid product (vlimb low half times limb high half)
    112  1.1  mrg define(`p032a2',`%r19')	C second mid product (vlimb high half times limb low half)
    113  1.1  mrg 
    114  1.1  mrg define(`m032',`%r20')	C sum of the two mid products
    115  1.1  mrg define(`m096',`%r21')	C carry out of that sum
    116  1.1  mrg 
    117  1.1  mrg define(`p000a',`%r22')	C low product
    118  1.1  mrg define(`p064a',`%r29')	C high product
    119  1.1  mrg 
    120  1.1  mrg define(`s000',`%r31')	C product sum, then the limb stored to rp
    121  1.1  mrg 
    122  1.1  mrg define(`ma000',`%r4')	C m032 shifted left 32 bits
    123  1.1  mrg define(`ma064',`%r20')	C m032 shifted right 32 bits with m096 on top; same register as m032
    124  1.1  mrg 
    125  1.1  mrg define(`r000',`%r3')	C limb loaded from rp
    126  1.1  mrg 
    127  1.1  mrg 	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n, i.e. n mod 4
    128  1.1  mrg 	cmpb,=		%r5, %r0, L(BIG)	C n mod 4 is zero: go straight to the 4-way code
    129  1.1  mrg 	nop					C branch delay slot
    130  1.1  mrg 
    131  1.1  mrg 	fldd		0(up), %fr4		C first limb of the 1..3 limb feed-in
    132  1.1  mrg 	ldo		8(up), up
    133  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22	C four 32x32 partial products of vlimb and the limb
    134  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    135  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    136  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    137  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    138  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    139  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    140  1.1  mrg 	addib,<>	-1, %r5, L(two_or_more)	C decrement r5 and branch if it is still nonzero
    141  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    142  1.1  mrg LDEF(one)
    143  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the products into integer registers
    144  1.1  mrg 	ldd		-0x70(%r30), p032a2
    145  1.1  mrg 	ldd		-0x80(%r30), p000a
    146  1.1  mrg 	b		L(0_one_out)
    147  1.1  mrg 	ldd		-0x68(%r30), p064a	C sits in the branch delay slot
    148  1.1  mrg 
    149  1.1  mrg LDEF(two_or_more)
    150  1.1  mrg 	fldd		0(up), %fr4		C second limb
    151  1.1  mrg 	ldo		8(up), up
    152  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    153  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    154  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the previous product before its slot is overwritten
    155  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    156  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    157  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    158  1.1  mrg 	ldd		-0x70(%r30), p032a2
    159  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    160  1.1  mrg 	ldd		-0x80(%r30), p000a
    161  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    162  1.1  mrg 	ldd		-0x68(%r30), p064a
    163  1.1  mrg 	addib,<>	-1, %r5, L(three_or_more)	C decrement r5 and branch if it is still nonzero
    164  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    165  1.1  mrg LDEF(two)
    166  1.1  mrg 	add		p032a1, p032a2, m032	C sum the two mid products
    167  1.1  mrg 	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
    168  1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 moved to the high half, rest zero
    169  1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = high half of m032, zero extended
    170  1.1  mrg 	ldd		0(rp), r000
    171  1.1  mrg 	b		L(0_two_out)
    172  1.1  mrg 	depd		m096, 31, 32, ma064	C delay slot: m096 goes into the high half of ma064
    173  1.1  mrg 
    174  1.1  mrg LDEF(three_or_more)
    175  1.1  mrg 	fldd		0(up), %fr4		C third limb; up is advanced at 0_out
    176  1.1  mrg 	add		p032a1, p032a2, m032	C sum the two mid products
    177  1.1  mrg 	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
    178  1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 moved to the high half, rest zero
    179  1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = high half of m032, zero extended
    180  1.1  mrg 	ldd		0(rp), r000
    181  1.1  mrg C	addib,=		-1, %r5, L(0_out)
    182  1.1  mrg 	depd		m096, 31, 32, ma064	C m096 goes into the high half of ma064
    183  1.1  mrg LDEF(loop0)
    184  1.1  mrg C	xmpyu		%fr8R, %fr4L, %fr22
    185  1.1  mrg C	xmpyu		%fr8L, %fr4R, %fr23
    186  1.1  mrg C	ldd		-0x78(%r30), p032a1
    187  1.1  mrg C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    188  1.1  mrg C
    189  1.1  mrg C	xmpyu		%fr8R, %fr4R, %fr24
    190  1.1  mrg C	xmpyu		%fr8L, %fr4L, %fr25
    191  1.1  mrg C	ldd		-0x70(%r30), p032a2
    192  1.1  mrg C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    193  1.1  mrg C
    194  1.1  mrg C	ldo		8(rp), rp
    195  1.1  mrg C	add		climb, p000a, s000
    196  1.1  mrg C	ldd		-0x80(%r30), p000a
    197  1.1  mrg C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    198  1.1  mrg C
    199  1.1  mrg C	add,dc		p064a, %r0, climb
    200  1.1  mrg C	ldo		8(up), up
    201  1.1  mrg C	ldd		-0x68(%r30), p064a
    202  1.1  mrg C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    203  1.1  mrg C
    204  1.1  mrg C	add		ma000, s000, s000
    205  1.1  mrg C	add,dc		ma064, climb, climb
    206  1.1  mrg C	fldd		0(up), %fr4
    207  1.1  mrg C
    208  1.1  mrg C	sub		r000, s000, s000
    209  1.1  mrg C	sub,db		%r0, climb, climb
    210  1.1  mrg C	sub		%r0, climb, climb
    211  1.1  mrg C	std		s000, -8(rp)
    212  1.1  mrg C
    213  1.1  mrg C	add		p032a1, p032a2, m032
    214  1.1  mrg C	add,dc		%r0, %r0, m096
    215  1.1  mrg C
    216  1.1  mrg C	depd,z		m032, 31, 32, ma000
    217  1.1  mrg C	extrd,u		m032, 31, 32, ma064
    218  1.1  mrg C	ldd		0(rp), r000
    219  1.1  mrg C	addib,<>	-1, %r5, L(loop0)
    220  1.1  mrg C	depd		m096, 31, 32, ma064
    221  1.1  mrg LDEF(0_out)
    222  1.1  mrg 	ldo		8(up), up		C step past the limb already loaded into fr4
    223  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22	C partial products of the last feed-in limb
    224  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    225  1.1  mrg 	ldd		-0x78(%r30), p032a1
    226  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    227  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    228  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    229  1.1  mrg 	ldd		-0x70(%r30), p032a2
    230  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    231  1.1  mrg 	ldo		8(rp), rp
    232  1.1  mrg 	add		climb, p000a, s000	C s000 = carry-in + low product
    233  1.1  mrg 	ldd		-0x80(%r30), p000a
    234  1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    235  1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    236  1.1  mrg 	ldd		-0x68(%r30), p064a
    237  1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    238  1.1  mrg 	add		ma000, s000, s000	C add the mid contribution to the low limb
    239  1.1  mrg 	add,dc		ma064, climb, climb	C and to the carry limb
    240  1.1  mrg 	sub		r000, s000, s000	C s000 = rp limb minus the product sum
    241  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    242  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    243  1.1  mrg 	std		s000, -8(rp)		C rp was already advanced, hence -8
    244  1.1  mrg 	add		p032a1, p032a2, m032
    245  1.1  mrg 	add,dc		%r0, %r0, m096
    246  1.1  mrg 	depd,z		m032, 31, 32, ma000
    247  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    248  1.1  mrg 	ldd		0(rp), r000
    249  1.1  mrg 	depd		m096, 31, 32, ma064
    250  1.1  mrg LDEF(0_two_out)
    251  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload products of the final feed-in limb
    252  1.1  mrg 	ldd		-0x70(%r30), p032a2
    253  1.1  mrg 	ldo		8(rp), rp
    254  1.1  mrg 	add		climb, p000a, s000	C s000 = carry-in + low product
    255  1.1  mrg 	ldd		-0x80(%r30), p000a
    256  1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    257  1.1  mrg 	ldd		-0x68(%r30), p064a
    258  1.1  mrg 	add		ma000, s000, s000
    259  1.1  mrg 	add,dc		ma064, climb, climb
    260  1.1  mrg 	sub		r000, s000, s000	C s000 = rp limb minus the product sum
    261  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    262  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    263  1.1  mrg 	std		s000, -8(rp)
    264  1.1  mrg LDEF(0_one_out)
    265  1.1  mrg 	add		p032a1, p032a2, m032	C sum the two mid products
    266  1.1  mrg 	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
    267  1.1  mrg 	depd,z		m032, 31, 32, ma000
    268  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    269  1.1  mrg 	ldd		0(rp), r000
    270  1.1  mrg 	depd		m096, 31, 32, ma064
    271  1.1  mrg 
    272  1.1  mrg 	add		climb, p000a, s000
    273  1.1  mrg 	add,dc		p064a, %r0, climb
    274  1.1  mrg 	add		ma000, s000, s000
    275  1.1  mrg 	add,dc		ma064, climb, climb
    276  1.1  mrg 	sub		r000, s000, s000	C s000 = rp limb minus the product sum
    277  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    278  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    279  1.1  mrg 	std		s000, 0(rp)
    280  1.1  mrg 
    281  1.1  mrg 	cmpib,>=	4, n, L(done)		C branch if 4 >= n: nothing left for the 4-way code
    282  1.1  mrg 	ldo		8(rp), rp		C delay slot: step rp past the limb just stored
    283  1.1  mrg 
    284  1.1  mrg C 4-way unrolled code.
    285  1.1  mrg 
    286  1.1  mrg LDEF(BIG)
    287  1.1  mrg 
    288  1.1  mrg define(`p032a1',`%r1')	C mid products, two per limb, for the four limbs a, b, c, d
    289  1.1  mrg define(`p032a2',`%r19')	C
    290  1.1  mrg define(`p096b1',`%r20')	C
    291  1.1  mrg define(`p096b2',`%r21')	C
    292  1.1  mrg define(`p160c1',`%r22')	C
    293  1.1  mrg define(`p160c2',`%r29')	C
    294  1.1  mrg define(`p224d1',`%r31')	C
    295  1.1  mrg define(`p224d2',`%r3')	C
    296  1.1  mrg 			C
    297  1.1  mrg define(`m032',`%r4')	C sums of the mid product pairs
    298  1.1  mrg define(`m096',`%r5')	C
    299  1.1  mrg define(`m160',`%r6')	C
    300  1.1  mrg define(`m224',`%r7')	C
    301  1.1  mrg define(`m288',`%r8')	C carry out of the last mid sum
    302  1.1  mrg 			C
    303  1.1  mrg define(`p000a',`%r1')	C low and high products; these reuse the mid product registers
    304  1.1  mrg define(`p064a',`%r19')	C
    305  1.1  mrg define(`p064b',`%r20')	C
    306  1.1  mrg define(`p128b',`%r21')	C
    307  1.1  mrg define(`p128c',`%r22')	C
    308  1.1  mrg define(`p192c',`%r29')	C
    309  1.1  mrg define(`p192d',`%r31')	C
    310  1.1  mrg define(`p256d',`%r3')	C
    311  1.1  mrg 			C
    312  1.1  mrg define(`s000',`%r10')	C product sums, then the four limbs stored to rp
    313  1.1  mrg define(`s064',`%r11')	C
    314  1.1  mrg define(`s128',`%r12')	C
    315  1.1  mrg define(`s192',`%r13')	C
    316  1.1  mrg 			C
    317  1.1  mrg define(`ma000',`%r9')	C mid sums realigned by 32 bits; ma064..ma256 reuse m032..m224
    318  1.1  mrg define(`ma064',`%r4')	C
    319  1.1  mrg define(`ma128',`%r5')	C
    320  1.1  mrg define(`ma192',`%r6')	C
    321  1.1  mrg define(`ma256',`%r7')	C
    322  1.1  mrg 			C
    323  1.1  mrg define(`r000',`%r1')	C limbs loaded from rp
    324  1.1  mrg define(`r064',`%r19')	C
    325  1.1  mrg define(`r128',`%r20')	C
    326  1.1  mrg define(`r192',`%r21')	C
    327  1.1  mrg 
    328  1.1  mrg 	std		%r6, -0xe8(%r30)	C save r6..r13; they are reloaded after end1
    329  1.1  mrg 	std		%r7, -0xe0(%r30)
    330  1.1  mrg 	std		%r8, -0xd8(%r30)
    331  1.1  mrg 	std		%r9, -0xd0(%r30)
    332  1.1  mrg 	std		%r10, -0xc8(%r30)
    333  1.1  mrg 	std		%r11, -0xc0(%r30)
    334  1.1  mrg 	std		%r12, -0xb8(%r30)
    335  1.1  mrg 	std		%r13, -0xb0(%r30)
    336  1.1  mrg 
    337  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    338  1.1  mrg `	extrd,u		n, 61, 62, n		C right shift 2
    339  1.1  mrg ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    340  1.1  mrg ')
    341  1.1  mrg 
    342  1.1  mrg LDEF(4_or_more)
    343  1.1  mrg 	fldd		0(up), %fr4		C load the first group of four limbs
    344  1.1  mrg 	fldd		8(up), %fr5
    345  1.1  mrg 	fldd		16(up), %fr6
    346  1.1  mrg 	fldd		24(up), %fr7
    347  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22	C eight mid products first
    348  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    349  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    350  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    351  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    352  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    353  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    354  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    355  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    356  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    357  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30	C then the low and high products
    358  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    359  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    360  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    361  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    362  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    363  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    364  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    365  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    366  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    367  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    368  1.1  mrg 	addib,<>	-1, n, L(8_or_more)	C decrement the group count, branch if groups remain
    369  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C delay slot: last high product
    370  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    371  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    372  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    373  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    374  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    375  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    376  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    377  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    378  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    379  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    380  1.1  mrg 	ldd		-0x78(%r30), p032a1	C single group: reload the mid products for end1
    381  1.1  mrg 	ldd		-0x70(%r30), p032a2
    382  1.1  mrg 	ldd		-0x38(%r30), p096b1
    383  1.1  mrg 	ldd		-0x30(%r30), p096b2
    384  1.1  mrg 	ldd		-0x58(%r30), p160c1
    385  1.1  mrg 	ldd		-0x50(%r30), p160c2
    386  1.1  mrg 	ldd		-0x18(%r30), p224d1
    387  1.1  mrg 	ldd		-0x10(%r30), p224d2
    388  1.1  mrg 	b		L(end1)
    389  1.1  mrg 	nop					C branch delay slot
    390  1.1  mrg 
    391  1.1  mrg LDEF(8_or_more)
    392  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    393  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    394  1.1  mrg 	ldo		32(up), up		C advance up to the second group of four limbs
    395  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    396  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    397  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    398  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    399  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    400  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    401  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    402  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    403  1.1  mrg 	fldd		0(up), %fr4		C load the second group of four limbs
    404  1.1  mrg 	fldd		8(up), %fr5
    405  1.1  mrg 	fldd		16(up), %fr6
    406  1.1  mrg 	fldd		24(up), %fr7
    407  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    408  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload first-group mid products before the slots are reused
    409  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    410  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    411  1.1  mrg 	ldd		-0x70(%r30), p032a2
    412  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    413  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    414  1.1  mrg 	ldd		-0x38(%r30), p096b1
    415  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    416  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    417  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    418  1.1  mrg 	ldd		-0x30(%r30), p096b2
    419  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    420  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    421  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    422  1.1  mrg 	ldd		-0x58(%r30), p160c1
    423  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    424  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    425  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    426  1.1  mrg 	ldd		-0x50(%r30), p160c2
    427  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    428  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    429  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    430  1.1  mrg 	ldd		-0x18(%r30), p224d1
    431  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    432  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    433  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    434  1.1  mrg 	ldd		-0x10(%r30), p224d2
    435  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    436  1.1  mrg 	addib,=		-1, n, L(end2)		C decrement the group count, branch to end2 when it hits zero
    437  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C delay slot: last high product
    438  1.1  mrg LDEF(loop)
    439  1.1  mrg 	add		p032a1, p032a2, m032	C sum the mid product pairs, carry chained through all four
    440  1.1  mrg 	ldd		-0x80(%r30), p000a
    441  1.1  mrg 	add,dc		p096b1, p096b2, m096
    442  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    443  1.1  mrg 
    444  1.1  mrg 	add,dc		p160c1, p160c2, m160
    445  1.1  mrg 	ldd		-0x68(%r30), p064a
    446  1.1  mrg 	add,dc		p224d1, p224d2, m224
    447  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    448  1.1  mrg 
    449  1.1  mrg 	add,dc		%r0, %r0, m288		C m288 = carry out of the last mid sum
    450  1.1  mrg 	ldd		-0x40(%r30), p064b
    451  1.1  mrg 	ldo		32(up), up		C advance up to the next group of four limbs
    452  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    453  1.1  mrg 
    454  1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits into ma000..ma256
    455  1.1  mrg 	ldd		-0x28(%r30), p128b
    456  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    457  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    458  1.1  mrg 
    459  1.1  mrg 	depd		m096, 31, 32, ma064
    460  1.1  mrg 	ldd		-0x60(%r30), p128c
    461  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    462  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    463  1.1  mrg 
    464  1.1  mrg 	depd		m160, 31, 32, ma128
    465  1.1  mrg 	ldd		-0x48(%r30), p192c
    466  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    467  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    468  1.1  mrg 
    469  1.1  mrg 	depd		m224, 31, 32, ma192
    470  1.1  mrg 	ldd		-0x20(%r30), p192d
    471  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    472  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    473  1.1  mrg 
    474  1.1  mrg 	depd		m288, 31, 32, ma256
    475  1.1  mrg 	ldd		-0x88(%r30), p256d
    476  1.1  mrg 	add		climb, p000a, s000	C carry-in plus lowest product starts the limb sums
    477  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    478  1.1  mrg 
    479  1.1  mrg 	add,dc		p064a, p064b, s064	C high product of one limb + low product of the next
    480  1.1  mrg 	ldd		0(rp), r000
    481  1.1  mrg 	add,dc		p128b, p128c, s128
    482  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    483  1.1  mrg 
    484  1.1  mrg 	add,dc		p192c, p192d, s192
    485  1.1  mrg 	ldd		8(rp), r064
    486  1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    487  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    488  1.1  mrg 
    489  1.1  mrg 	ldd		16(rp), r128
    490  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    491  1.1  mrg 	ldd		24(rp), r192
    492  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    493  1.1  mrg 
    494  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    495  1.1  mrg 	fldd		0(up), %fr4		C fetch the next group of four limbs
    496  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    497  1.1  mrg 	fldd		8(up), %fr5
    498  1.1  mrg 
    499  1.1  mrg 	add,dc		ma256, climb, climb
    500  1.1  mrg 	fldd		16(up), %fr6
    501  1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    502  1.1  mrg 	fldd		24(up), %fr7
    503  1.1  mrg 
    504  1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    505  1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    506  1.1  mrg 	std		s000, 0(rp)
    507  1.1  mrg 
    508  1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    509  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    510  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    511  1.1  mrg 	std		s064, 8(rp)
    512  1.1  mrg 
    513  1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22	C start the products for the limbs just fetched
    514  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload stored mid products before the slots are reused
    515  1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    516  1.1  mrg 	std		s128, 16(rp)
    517  1.1  mrg 
    518  1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    519  1.1  mrg 	ldd		-0x70(%r30), p032a2
    520  1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    521  1.1  mrg 	std		s192, 24(rp)
    522  1.1  mrg 
    523  1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    524  1.1  mrg 	ldd		-0x38(%r30), p096b1
    525  1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    526  1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    527  1.1  mrg 
    528  1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    529  1.1  mrg 	ldd		-0x30(%r30), p096b2
    530  1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    531  1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    532  1.1  mrg 
    533  1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    534  1.1  mrg 	ldd		-0x58(%r30), p160c1
    535  1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    536  1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    537  1.1  mrg 
    538  1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    539  1.1  mrg 	ldd		-0x50(%r30), p160c2
    540  1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    541  1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    542  1.1  mrg 
    543  1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    544  1.1  mrg 	ldd		-0x18(%r30), p224d1
    545  1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    546  1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    547  1.1  mrg 
    548  1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    549  1.1  mrg 	ldd		-0x10(%r30), p224d2
    550  1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    551  1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27
    552  1.1  mrg 
    553  1.1  mrg 	addib,<>	-1, n, L(loop)		C decrement the group count, loop while it is nonzero
    554  1.1  mrg 	ldo		32(rp), rp		C delay slot: advance rp past the four limbs stored
    555  1.1  mrg 
    556  1.1  mrg LDEF(end2)
    557  1.1  mrg 	add		p032a1, p032a2, m032	C same steps as the loop body, but no new limbs are loaded or multiplied
    558  1.1  mrg 	ldd		-0x80(%r30), p000a
    559  1.1  mrg 	add,dc		p096b1, p096b2, m096
    560  1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    561  1.1  mrg 	add,dc		p160c1, p160c2, m160
    562  1.1  mrg 	ldd		-0x68(%r30), p064a
    563  1.1  mrg 	add,dc		p224d1, p224d2, m224
    564  1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    565  1.1  mrg 	add,dc		%r0, %r0, m288		C m288 = carry out of the last mid sum
    566  1.1  mrg 	ldd		-0x40(%r30), p064b
    567  1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    568  1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits into ma000..ma256
    569  1.1  mrg 	ldd		-0x28(%r30), p128b
    570  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    571  1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    572  1.1  mrg 	depd		m096, 31, 32, ma064
    573  1.1  mrg 	ldd		-0x60(%r30), p128c
    574  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    575  1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    576  1.1  mrg 	depd		m160, 31, 32, ma128
    577  1.1  mrg 	ldd		-0x48(%r30), p192c
    578  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    579  1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    580  1.1  mrg 	depd		m224, 31, 32, ma192
    581  1.1  mrg 	ldd		-0x20(%r30), p192d
    582  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    583  1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    584  1.1  mrg 	depd		m288, 31, 32, ma256
    585  1.1  mrg 	ldd		-0x88(%r30), p256d
    586  1.1  mrg 	add		climb, p000a, s000	C carry-in plus lowest product starts the limb sums
    587  1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    588  1.1  mrg 	add,dc		p064a, p064b, s064
    589  1.1  mrg 	ldd		0(rp), r000
    590  1.1  mrg 	add,dc		p128b, p128c, s128
    591  1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    592  1.1  mrg 	add,dc		p192c, p192d, s192
    593  1.1  mrg 	ldd		8(rp), r064
    594  1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    595  1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    596  1.1  mrg 	ldd		16(rp), r128
    597  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    598  1.1  mrg 	ldd		24(rp), r192
    599  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    600  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    601  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    602  1.1  mrg 	add,dc		ma256, climb, climb
    603  1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    604  1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    605  1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    606  1.1  mrg 	std		s000, 0(rp)
    607  1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    608  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    609  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    610  1.1  mrg 	std		s064, 8(rp)
    611  1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the mid products of the final group for end1
    612  1.1  mrg 	std		s128, 16(rp)
    613  1.1  mrg 	ldd		-0x70(%r30), p032a2
    614  1.1  mrg 	std		s192, 24(rp)
    615  1.1  mrg 	ldd		-0x38(%r30), p096b1
    616  1.1  mrg 	ldd		-0x30(%r30), p096b2
    617  1.1  mrg 	ldd		-0x58(%r30), p160c1
    618  1.1  mrg 	ldd		-0x50(%r30), p160c2
    619  1.1  mrg 	ldd		-0x18(%r30), p224d1
    620  1.1  mrg 	ldd		-0x10(%r30), p224d2
    621  1.1  mrg 	ldo		32(rp), rp		C advance rp past the four limbs stored
    622  1.1  mrg 
    623  1.1  mrg LDEF(end1)
    624  1.1  mrg 	add		p032a1, p032a2, m032	C final group of four: every product is read back from the stack
    625  1.1  mrg 	ldd		-0x80(%r30), p000a
    626  1.1  mrg 	add,dc		p096b1, p096b2, m096
    627  1.1  mrg 	add,dc		p160c1, p160c2, m160
    628  1.1  mrg 	ldd		-0x68(%r30), p064a
    629  1.1  mrg 	add,dc		p224d1, p224d2, m224
    630  1.1  mrg 	add,dc		%r0, %r0, m288		C m288 = carry out of the last mid sum
    631  1.1  mrg 	ldd		-0x40(%r30), p064b
    632  1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits into ma000..ma256
    633  1.1  mrg 	ldd		-0x28(%r30), p128b
    634  1.1  mrg 	extrd,u		m032, 31, 32, ma064
    635  1.1  mrg 	depd		m096, 31, 32, ma064
    636  1.1  mrg 	ldd		-0x60(%r30), p128c
    637  1.1  mrg 	extrd,u		m096, 31, 32, ma128
    638  1.1  mrg 	depd		m160, 31, 32, ma128
    639  1.1  mrg 	ldd		-0x48(%r30), p192c
    640  1.1  mrg 	extrd,u		m160, 31, 32, ma192
    641  1.1  mrg 	depd		m224, 31, 32, ma192
    642  1.1  mrg 	ldd		-0x20(%r30), p192d
    643  1.1  mrg 	extrd,u		m224, 31, 32, ma256
    644  1.1  mrg 	depd		m288, 31, 32, ma256
    645  1.1  mrg 	ldd		-0x88(%r30), p256d
    646  1.1  mrg 	add		climb, p000a, s000	C carry-in plus lowest product starts the limb sums
    647  1.1  mrg 	add,dc		p064a, p064b, s064
    648  1.1  mrg 	ldd		0(rp), r000
    649  1.1  mrg 	add,dc		p128b, p128c, s128
    650  1.1  mrg 	add,dc		p192c, p192d, s192
    651  1.1  mrg 	ldd		8(rp), r064
    652  1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    653  1.1  mrg 	ldd		16(rp), r128
    654  1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    655  1.1  mrg 	ldd		24(rp), r192
    656  1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    657  1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    658  1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    659  1.1  mrg 	add,dc		ma256, climb, climb
    660  1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    661  1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    662  1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    663  1.1  mrg 	std		s000, 0(rp)
    664  1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    665  1.1  mrg 	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
    666  1.1  mrg 	sub		%r0, climb, climb	C negate again: net effect is climb += borrow
    667  1.1  mrg 	std		s064, 8(rp)
    668  1.1  mrg 	std		s128, 16(rp)
    669  1.1  mrg 	std		s192, 24(rp)
    670  1.1  mrg 
    671  1.1  mrg 	ldd		-0xb0(%r30), %r13	C reload r13..r6 from the slots written at BIG
    672  1.1  mrg 	ldd		-0xb8(%r30), %r12
    673  1.1  mrg 	ldd		-0xc0(%r30), %r11
    674  1.1  mrg 	ldd		-0xc8(%r30), %r10
    675  1.1  mrg 	ldd		-0xd0(%r30), %r9
    676  1.1  mrg 	ldd		-0xd8(%r30), %r8
    677  1.1  mrg 	ldd		-0xe0(%r30), %r7
    678  1.1  mrg 	ldd		-0xe8(%r30), %r6
    679  1.1  mrg LDEF(done)
    680  1.1  mrg ifdef(`HAVE_ABI_2_0w',
    681  1.1  mrg `	copy		climb, %r28		C hand climb back in r28
    682  1.1  mrg ',`	extrd,u		climb, 63, 32, %r29	C low 32 bits of climb go to r29
    683  1.1  mrg 	extrd,u		climb, 31, 32, %r28	C high 32 bits of climb go to r28
    684  1.1  mrg ')
    685  1.1  mrg 	ldd		-0xf0(%r30), %r5	C reload r5 and r4 from the slots written at entry
    686  1.1  mrg 	ldd		-0xf8(%r30), %r4
    687  1.1  mrg 	bve		(%r2)			C branch to the address held in r2
    688  1.1  mrg 	ldd,mb		-0x100(%r30), %r3	C delay slot: move r30 back by 0x100, then reload r3
    689  1.1  mrg EPILOGUE(mpn_submul_1)
    690