Home | History | Annotate | Line # | Download | only in pa64
submul_1.asm revision 1.1.1.2
      1      1.1  mrg dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
      2      1.1  mrg dnl  subtract the result from a second limb vector.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1.1.2  mrg dnl
      8      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27  1.1.1.2  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg C		    cycles/limb
     35      1.1  mrg C 8000,8200:		7
     36      1.1  mrg C 8500,8600,8700:	6.5
     37      1.1  mrg 
     38      1.1  mrg C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     39      1.1  mrg C  could be saved there per call.
     40      1.1  mrg 
     41      1.1  mrg C  DESCRIPTION:
     42      1.1  mrg C  The main loop "BIG" is 4-way unrolled, mainly to allow
     43      1.1  mrg C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     44      1.1  mrg C  registers to the IU registers, have demanded a deep software pipeline, and
     45      1.1  mrg C  a lot of stack slots for partial products in flight.
     46      1.1  mrg C
     47      1.1  mrg C  CODE STRUCTURE:
     48      1.1  mrg C  save-some-registers
     49      1.1  mrg C  do 0, 1, 2, or 3 limbs
     50      1.1  mrg C  if done, restore-some-regs and return
     51      1.1  mrg C  save-many-regs
     52      1.1  mrg C  do 4, 8, ... limbs
     53      1.1  mrg C  restore-all-regs
     54      1.1  mrg 
     55      1.1  mrg C  STACK LAYOUT:
     56      1.1  mrg C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     57      1.1  mrg C  slots marked FREE, as well as some slots in the caller's "frame marker".
     58      1.1  mrg C
     59      1.1  mrg C -00 <- r30
     60      1.1  mrg C -08  FREE
     61      1.1  mrg C -10  tmp
     62      1.1  mrg C -18  tmp
     63      1.1  mrg C -20  tmp
     64      1.1  mrg C -28  tmp
     65      1.1  mrg C -30  tmp
     66      1.1  mrg C -38  tmp
     67      1.1  mrg C -40  tmp
     68      1.1  mrg C -48  tmp
     69      1.1  mrg C -50  tmp
     70      1.1  mrg C -58  tmp
     71      1.1  mrg C -60  tmp
     72      1.1  mrg C -68  tmp
     73      1.1  mrg C -70  tmp
     74      1.1  mrg C -78  tmp
     75      1.1  mrg C -80  tmp
     76      1.1  mrg C -88  tmp
     77      1.1  mrg C -90  FREE
     78      1.1  mrg C -98  FREE
     79      1.1  mrg C -a0  FREE
     80      1.1  mrg C -a8  FREE
     81      1.1  mrg C -b0  r13
     82      1.1  mrg C -b8  r12
     83      1.1  mrg C -c0  r11
     84      1.1  mrg C -c8  r10
     85      1.1  mrg C -d0  r9
     86      1.1  mrg C -d8  r8
     87      1.1  mrg C -e0  r7
     88      1.1  mrg C -e8  r6
     89      1.1  mrg C -f0  r5
     90      1.1  mrg C -f8  r4
     91      1.1  mrg C -100 r3
     92      1.1  mrg C  Previous frame:
     93      1.1  mrg C  [unused area]
     94      1.1  mrg C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     95      1.1  mrg 
     96      1.1  mrg 
     97      1.1  mrg include(`../config.m4')
     98      1.1  mrg 
     99      1.1  mrg C INPUT PARAMETERS:
    100      1.1  mrg define(`rp',`%r26')	C
    101      1.1  mrg define(`up',`%r25')	C
    102      1.1  mrg define(`n',`%r24')	C
    103      1.1  mrg define(`vlimb',`%r23')	C
    104      1.1  mrg 
    105      1.1  mrg define(`climb',`%r23')	C
    106      1.1  mrg 
    107      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    108      1.1  mrg `	.level	2.0w
    109      1.1  mrg ',`	.level	2.0
    110      1.1  mrg ')
    111      1.1  mrg PROLOGUE(mpn_submul_1)
    112      1.1  mrg C Entry: save r3-r5, clear the carry limb and move vlimb into an fp register.
    113      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    114      1.1  mrg `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    115      1.1  mrg ')
    116      1.1  mrg 	std,ma		%r3, 0x100(%r30)	C save r3 at old sp, then advance sp by 0x100
    117      1.1  mrg 	std		%r4, -0xf8(%r30)	C save r4
    118      1.1  mrg 	std		%r5, -0xf0(%r30)	C save r5
    119      1.1  mrg 	ldo		0(%r0), climb		C clear climb (shares %r23 with vlimb)
    120      1.1  mrg 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register (home slot is -0x38 below the old sp)
    121      1.1  mrg 
    122      1.1  mrg define(`p032a1',`%r1')	C mid product, reloaded from slot -0x78
    123      1.1  mrg define(`p032a2',`%r19')	C mid product, reloaded from slot -0x70
    124      1.1  mrg 
    125      1.1  mrg define(`m032',`%r20')	C sum of the two mid products
    126      1.1  mrg define(`m096',`%r21')	C carry out of that sum
    127      1.1  mrg 
    128      1.1  mrg define(`p000a',`%r22')	C low product, reloaded from slot -0x80
    129      1.1  mrg define(`p064a',`%r29')	C high product, reloaded from slot -0x68
    130      1.1  mrg 
    131      1.1  mrg define(`s000',`%r31')	C result limb being assembled
    132      1.1  mrg 
    133      1.1  mrg define(`ma000',`%r4')	C low 32 bits of m032 moved to the upper half
    134      1.1  mrg define(`ma064',`%r20')	C high 32 bits of m032, with m096 in the upper half (reuses the m032 register)
    135      1.1  mrg 
    136      1.1  mrg define(`r000',`%r3')	C limb loaded from rp
    137      1.1  mrg 
    138      1.1  mrg 	extrd,u		n, 63, 2, %r5		C r5 = n mod 4
    139      1.1  mrg 	cmpb,=		%r5, %r0, L(BIG)	C no leftover limbs: go to the 4-way code
    140      1.1  mrg 	nop					C branch delay slot
    141      1.1  mrg C First leftover limb: form the four 32x32 partial products of vlimb and the limb at up.
    142      1.1  mrg 	fldd		0(up), %fr4
    143      1.1  mrg 	ldo		8(up), up
    144      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    145      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    146      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    147      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    148      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    149      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    150      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    151      1.1  mrg 	addib,<>	-1, %r5, L(two_or_more)	C decrement r5, branch if leftover limbs remain
    152      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    153      1.1  mrg LDEF(one)
    154      1.1  mrg 	ldd		-0x78(%r30), p032a1	C exactly one leftover limb: reload its products
    155      1.1  mrg 	ldd		-0x70(%r30), p032a2
    156      1.1  mrg 	ldd		-0x80(%r30), p000a
    157      1.1  mrg 	b		L(0_one_out)
    158      1.1  mrg 	ldd		-0x68(%r30), p064a	C executed in the branch delay slot
    159      1.1  mrg 
    160      1.1  mrg LDEF(two_or_more)
    161      1.1  mrg 	fldd		0(up), %fr4		C second leftover limb
    162      1.1  mrg 	ldo		8(up), up
    163      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    164      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    165      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload previous limb product before its slot is overwritten
    166      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    167      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    168      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    169      1.1  mrg 	ldd		-0x70(%r30), p032a2
    170      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    171      1.1  mrg 	ldd		-0x80(%r30), p000a
    172      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    173      1.1  mrg 	ldd		-0x68(%r30), p064a
    174      1.1  mrg 	addib,<>	-1, %r5, L(three_or_more)	C decrement r5, branch if a third leftover limb exists
    175      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    176      1.1  mrg LDEF(two)
    177      1.1  mrg 	add		p032a1, p032a2, m032	C m032 = sum of the mid products
    178      1.1  mrg 	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
    179      1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 placed in the upper 32 bits
    180      1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = upper half of m032
    181      1.1  mrg 	ldd		0(rp), r000
    182      1.1  mrg 	b		L(0_two_out)
    183      1.1  mrg 	depd		m096, 31, 32, ma064	C delay slot: put the carry into the upper half of ma064
    184      1.1  mrg 
    185      1.1  mrg LDEF(three_or_more)
    186      1.1  mrg 	fldd		0(up), %fr4		C third leftover limb (n mod 4 was 3)
    187      1.1  mrg 	add		p032a1, p032a2, m032
    188      1.1  mrg 	add,dc		%r0, %r0, m096
    189      1.1  mrg 	depd,z		m032, 31, 32, ma000
    190      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    191      1.1  mrg 	ldd		0(rp), r000
    192      1.1  mrg C	addib,=		-1, %r5, L(0_out)
    193      1.1  mrg 	depd		m096, 31, 32, ma064	C execution continues at 0_out; the loop0 body below is commented out
    194      1.1  mrg LDEF(loop0)
    195      1.1  mrg C	xmpyu		%fr8R, %fr4L, %fr22
    196      1.1  mrg C	xmpyu		%fr8L, %fr4R, %fr23
    197      1.1  mrg C	ldd		-0x78(%r30), p032a1
    198      1.1  mrg C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    199      1.1  mrg C
    200      1.1  mrg C	xmpyu		%fr8R, %fr4R, %fr24
    201      1.1  mrg C	xmpyu		%fr8L, %fr4L, %fr25
    202      1.1  mrg C	ldd		-0x70(%r30), p032a2
    203      1.1  mrg C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    204      1.1  mrg C
    205      1.1  mrg C	ldo		8(rp), rp
    206      1.1  mrg C	add		climb, p000a, s000
    207      1.1  mrg C	ldd		-0x80(%r30), p000a
    208      1.1  mrg C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    209      1.1  mrg C
    210      1.1  mrg C	add,dc		p064a, %r0, climb
    211      1.1  mrg C	ldo		8(up), up
    212      1.1  mrg C	ldd		-0x68(%r30), p064a
    213      1.1  mrg C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    214      1.1  mrg C
    215      1.1  mrg C	add		ma000, s000, s000
    216      1.1  mrg C	add,dc		ma064, climb, climb
    217      1.1  mrg C	fldd		0(up), %fr4
    218      1.1  mrg C
    219      1.1  mrg C	sub		r000, s000, s000
    220      1.1  mrg C	sub,db		%r0, climb, climb
    221      1.1  mrg C	sub		%r0, climb, climb
    222      1.1  mrg C	std		s000, -8(rp)
    223      1.1  mrg C
    224      1.1  mrg C	add		p032a1, p032a2, m032
    225      1.1  mrg C	add,dc		%r0, %r0, m096
    226      1.1  mrg C
    227      1.1  mrg C	depd,z		m032, 31, 32, ma000
    228      1.1  mrg C	extrd,u		m032, 31, 32, ma064
    229      1.1  mrg C	ldd		0(rp), r000
    230      1.1  mrg C	addib,<>	-1, %r5, L(loop0)
    231      1.1  mrg C	depd		m096, 31, 32, ma064
    232      1.1  mrg LDEF(0_out)
    233      1.1  mrg 	ldo		8(up), up		C step past the limb already loaded into fr4
    234      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    235      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    236      1.1  mrg 	ldd		-0x78(%r30), p032a1
    237      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    238      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    239      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    240      1.1  mrg 	ldd		-0x70(%r30), p032a2
    241      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    242      1.1  mrg 	ldo		8(rp), rp
    243      1.1  mrg 	add		climb, p000a, s000	C s000 = low product + incoming carry limb
    244      1.1  mrg 	ldd		-0x80(%r30), p000a
    245      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    246      1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    247      1.1  mrg 	ldd		-0x68(%r30), p064a
    248      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    249      1.1  mrg 	add		ma000, s000, s000	C add the shifted mid sum, low part
    250      1.1  mrg 	add,dc		ma064, climb, climb	C add the shifted mid sum, high part
    251      1.1  mrg 	sub		r000, s000, s000	C s000 = rp limb - product limb
    252      1.1  mrg 	sub,db		%r0, climb, climb	C climb = -climb - borrow
    253      1.1  mrg 	sub		%r0, climb, climb	C negate again: climb = climb + borrow
    254      1.1  mrg 	std		s000, -8(rp)		C rp was already advanced, hence -8
    255      1.1  mrg 	add		p032a1, p032a2, m032
    256      1.1  mrg 	add,dc		%r0, %r0, m096
    257      1.1  mrg 	depd,z		m032, 31, 32, ma000
    258      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    259      1.1  mrg 	ldd		0(rp), r000
    260      1.1  mrg 	depd		m096, 31, 32, ma064
    261      1.1  mrg LDEF(0_two_out)
    262      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the products of the last leftover limb
    263      1.1  mrg 	ldd		-0x70(%r30), p032a2
    264      1.1  mrg 	ldo		8(rp), rp
    265      1.1  mrg 	add		climb, p000a, s000
    266      1.1  mrg 	ldd		-0x80(%r30), p000a
    267      1.1  mrg 	add,dc		p064a, %r0, climb
    268      1.1  mrg 	ldd		-0x68(%r30), p064a
    269      1.1  mrg 	add		ma000, s000, s000
    270      1.1  mrg 	add,dc		ma064, climb, climb
    271      1.1  mrg 	sub		r000, s000, s000
    272      1.1  mrg 	sub,db		%r0, climb, climb
    273      1.1  mrg 	sub		%r0, climb, climb
    274      1.1  mrg 	std		s000, -8(rp)
    275      1.1  mrg LDEF(0_one_out)
    276      1.1  mrg 	add		p032a1, p032a2, m032
    277      1.1  mrg 	add,dc		%r0, %r0, m096
    278      1.1  mrg 	depd,z		m032, 31, 32, ma000
    279      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    280      1.1  mrg 	ldd		0(rp), r000
    281      1.1  mrg 	depd		m096, 31, 32, ma064
    282      1.1  mrg C Last leftover limb: combine products and carry, subtract from the rp limb, fold the borrow into climb.
    283      1.1  mrg 	add		climb, p000a, s000
    284      1.1  mrg 	add,dc		p064a, %r0, climb
    285      1.1  mrg 	add		ma000, s000, s000
    286      1.1  mrg 	add,dc		ma064, climb, climb
    287      1.1  mrg 	sub		r000, s000, s000
    288      1.1  mrg 	sub,db		%r0, climb, climb
    289      1.1  mrg 	sub		%r0, climb, climb
    290      1.1  mrg 	std		s000, 0(rp)
    291      1.1  mrg C n mod 4 is nonzero on this path, so 4 >= n means no 4-limb group is left.
    292      1.1  mrg 	cmpib,>=	4, n, L(done)
    293      1.1  mrg 	ldo		8(rp), rp		C delay slot: advance rp past the limb just stored
    294      1.1  mrg 
    295      1.1  mrg C 4-way unrolled code.
    296      1.1  mrg 
    297      1.1  mrg LDEF(BIG)
    298      1.1  mrg C Register names for the 4-way code.  Several names share a register; each is loaded only after the previous user of that register has been consumed.
    299      1.1  mrg define(`p032a1',`%r1')	C mid product of limb a, slot -0x78
    300      1.1  mrg define(`p032a2',`%r19')	C mid product of limb a, slot -0x70
    301      1.1  mrg define(`p096b1',`%r20')	C mid product of limb b, slot -0x38
    302      1.1  mrg define(`p096b2',`%r21')	C mid product of limb b, slot -0x30
    303      1.1  mrg define(`p160c1',`%r22')	C mid product of limb c, slot -0x58
    304      1.1  mrg define(`p160c2',`%r29')	C mid product of limb c, slot -0x50
    305      1.1  mrg define(`p224d1',`%r31')	C mid product of limb d, slot -0x18
    306      1.1  mrg define(`p224d2',`%r3')	C mid product of limb d, slot -0x10
    307      1.1  mrg 			C
    308      1.1  mrg define(`m032',`%r4')	C sum of the limb a mid products
    309      1.1  mrg define(`m096',`%r5')	C sum of the limb b mid products plus carry
    310      1.1  mrg define(`m160',`%r6')	C sum of the limb c mid products plus carry
    311      1.1  mrg define(`m224',`%r7')	C sum of the limb d mid products plus carry
    312      1.1  mrg define(`m288',`%r8')	C final carry of the mid sum chain
    313      1.1  mrg 			C
    314      1.1  mrg define(`p000a',`%r1')	C low product of limb a, slot -0x80
    315      1.1  mrg define(`p064a',`%r19')	C high product of limb a, slot -0x68
    316      1.1  mrg define(`p064b',`%r20')	C low product of limb b, slot -0x40
    317      1.1  mrg define(`p128b',`%r21')	C high product of limb b, slot -0x28
    318      1.1  mrg define(`p128c',`%r22')	C low product of limb c, slot -0x60
    319      1.1  mrg define(`p192c',`%r29')	C high product of limb c, slot -0x48
    320      1.1  mrg define(`p192d',`%r31')	C low product of limb d, slot -0x20
    321      1.1  mrg define(`p256d',`%r3')	C high product of limb d, slot -0x88
    322      1.1  mrg 			C
    323      1.1  mrg define(`s000',`%r10')	C result limb 0
    324      1.1  mrg define(`s064',`%r11')	C result limb 1
    325      1.1  mrg define(`s128',`%r12')	C result limb 2
    326      1.1  mrg define(`s192',`%r13')	C result limb 3
    327      1.1  mrg 			C
    328      1.1  mrg define(`ma000',`%r9')	C low half of m032 placed in the upper 32 bits
    329      1.1  mrg define(`ma064',`%r4')	C upper half of m032 with low half of m096 above it
    330      1.1  mrg define(`ma128',`%r5')	C upper half of m096 with low half of m160 above it
    331      1.1  mrg define(`ma192',`%r6')	C upper half of m160 with low half of m224 above it
    332      1.1  mrg define(`ma256',`%r7')	C upper half of m224 with low half of m288 above it
    333      1.1  mrg 			C
    334      1.1  mrg define(`r000',`%r1')	C limb loaded from 0(rp)
    335      1.1  mrg define(`r064',`%r19')	C limb loaded from 8(rp)
    336      1.1  mrg define(`r128',`%r20')	C limb loaded from 16(rp)
    337      1.1  mrg define(`r192',`%r21')	C limb loaded from 24(rp)
    338      1.1  mrg 
    339      1.1  mrg 	std		%r6, -0xe8(%r30)	C save r6-r13, which the 4-way code uses
    340      1.1  mrg 	std		%r7, -0xe0(%r30)
    341      1.1  mrg 	std		%r8, -0xd8(%r30)
    342      1.1  mrg 	std		%r9, -0xd0(%r30)
    343      1.1  mrg 	std		%r10, -0xc8(%r30)
    344      1.1  mrg 	std		%r11, -0xc0(%r30)
    345      1.1  mrg 	std		%r12, -0xb8(%r30)
    346      1.1  mrg 	std		%r13, -0xb0(%r30)
    347      1.1  mrg C From here on n counts 4-limb groups.
    348      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    349      1.1  mrg `	extrd,u		n, 61, 62, n		C right shift 2
    350      1.1  mrg ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    351      1.1  mrg ')
    352      1.1  mrg 
    353      1.1  mrg LDEF(4_or_more)
    354      1.1  mrg 	fldd		0(up), %fr4		C load the first group of four limbs
    355      1.1  mrg 	fldd		8(up), %fr5
    356      1.1  mrg 	fldd		16(up), %fr6
    357      1.1  mrg 	fldd		24(up), %fr7
    358      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    359      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    360      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    361      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    362      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    363      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    364      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    365      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    366      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    367      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    368      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    369      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    370      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    371      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    372      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    373      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    374      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    375      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    376      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    377      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    378      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    379      1.1  mrg 	addib,<>	-1, n, L(8_or_more)	C decrement group count, branch if more groups follow
    380      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C executed in the branch delay slot
    381      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    382      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    383      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    384      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    385      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    386      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    387      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    388      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    389      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    390      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    391      1.1  mrg 	ldd		-0x78(%r30), p032a1	C single group: reload the mid products for end1
    392      1.1  mrg 	ldd		-0x70(%r30), p032a2
    393      1.1  mrg 	ldd		-0x38(%r30), p096b1
    394      1.1  mrg 	ldd		-0x30(%r30), p096b2
    395      1.1  mrg 	ldd		-0x58(%r30), p160c1
    396      1.1  mrg 	ldd		-0x50(%r30), p160c2
    397      1.1  mrg 	ldd		-0x18(%r30), p224d1
    398      1.1  mrg 	ldd		-0x10(%r30), p224d2
    399      1.1  mrg 	b		L(end1)
    400      1.1  mrg 	nop					C branch delay slot
    401      1.1  mrg 
    402      1.1  mrg LDEF(8_or_more)
    403      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    404      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    405      1.1  mrg 	ldo		32(up), up		C advance up to the second group
    406      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    407      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    408      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    409      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    410      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    411      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    412      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    413      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    414      1.1  mrg 	fldd		0(up), %fr4		C load the second group of four limbs
    415      1.1  mrg 	fldd		8(up), %fr5
    416      1.1  mrg 	fldd		16(up), %fr6
    417      1.1  mrg 	fldd		24(up), %fr7
    418      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    419      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload first-group mid products before their slots are reused
    420      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    421      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    422      1.1  mrg 	ldd		-0x70(%r30), p032a2
    423      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    424      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    425      1.1  mrg 	ldd		-0x38(%r30), p096b1
    426      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    427      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    428      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    429      1.1  mrg 	ldd		-0x30(%r30), p096b2
    430      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    431      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    432      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    433      1.1  mrg 	ldd		-0x58(%r30), p160c1
    434      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    435      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    436      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    437      1.1  mrg 	ldd		-0x50(%r30), p160c2
    438      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    439      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    440      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    441      1.1  mrg 	ldd		-0x18(%r30), p224d1
    442      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    443      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    444      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    445      1.1  mrg 	ldd		-0x10(%r30), p224d2
    446      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    447      1.1  mrg 	addib,=		-1, n, L(end2)		C decrement group count, branch if no third group exists
    448      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C executed in the branch delay slot
    449      1.1  mrg LDEF(loop)
    450      1.1  mrg 	add		p032a1, p032a2, m032	C start the mid sum chain; each carry feeds the next add,dc
    451      1.1  mrg 	ldd		-0x80(%r30), p000a
    452      1.1  mrg 	add,dc		p096b1, p096b2, m096
    453      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    454      1.1  mrg 
    455      1.1  mrg 	add,dc		p160c1, p160c2, m160
    456      1.1  mrg 	ldd		-0x68(%r30), p064a
    457      1.1  mrg 	add,dc		p224d1, p224d2, m224
    458      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    459      1.1  mrg 
    460      1.1  mrg 	add,dc		%r0, %r0, m288		C m288 = final carry of the mid sum chain
    461      1.1  mrg 	ldd		-0x40(%r30), p064b
    462      1.1  mrg 	ldo		32(up), up		C advance up to the group loaded later in this iteration
    463      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    464      1.1  mrg 
    465      1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits into ma000..ma256
    466      1.1  mrg 	ldd		-0x28(%r30), p128b
    467      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    468      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    469      1.1  mrg 
    470      1.1  mrg 	depd		m096, 31, 32, ma064
    471      1.1  mrg 	ldd		-0x60(%r30), p128c
    472      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    473      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    474      1.1  mrg 
    475      1.1  mrg 	depd		m160, 31, 32, ma128
    476      1.1  mrg 	ldd		-0x48(%r30), p192c
    477      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    478      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    479      1.1  mrg 
    480      1.1  mrg 	depd		m224, 31, 32, ma192
    481      1.1  mrg 	ldd		-0x20(%r30), p192d
    482      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    483      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    484      1.1  mrg 
    485      1.1  mrg 	depd		m288, 31, 32, ma256
    486      1.1  mrg 	ldd		-0x88(%r30), p256d
    487      1.1  mrg 	add		climb, p000a, s000	C add low/high products and the incoming carry limb
    488      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    489      1.1  mrg 
    490      1.1  mrg 	add,dc		p064a, p064b, s064
    491      1.1  mrg 	ldd		0(rp), r000
    492      1.1  mrg 	add,dc		p128b, p128c, s128
    493      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    494      1.1  mrg 
    495      1.1  mrg 	add,dc		p192c, p192d, s192
    496      1.1  mrg 	ldd		8(rp), r064
    497      1.1  mrg 	add,dc		p256d, %r0, climb
    498      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    499      1.1  mrg 
    500      1.1  mrg 	ldd		16(rp), r128
    501      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    502      1.1  mrg 	ldd		24(rp), r192
    503      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    504      1.1  mrg 
    505      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    506      1.1  mrg 	fldd		0(up), %fr4
    507      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    508      1.1  mrg 	fldd		8(up), %fr5
    509      1.1  mrg 
    510      1.1  mrg 	add,dc		ma256, climb, climb
    511      1.1  mrg 	fldd		16(up), %fr6
    512      1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    513      1.1  mrg 	fldd		24(up), %fr7
    514      1.1  mrg 
    515      1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    516      1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    517      1.1  mrg 	std		s000, 0(rp)
    518      1.1  mrg 
    519      1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    520      1.1  mrg 	sub,db		%r0, climb, climb	C climb = -climb - borrow
    521      1.1  mrg 	sub		%r0, climb, climb	C negate again: climb = climb + borrow
    522      1.1  mrg 	std		s064, 8(rp)
    523      1.1  mrg 
    524      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    525      1.1  mrg 	ldd		-0x78(%r30), p032a1
    526      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    527      1.1  mrg 	std		s128, 16(rp)
    528      1.1  mrg 
    529      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    530      1.1  mrg 	ldd		-0x70(%r30), p032a2
    531      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    532      1.1  mrg 	std		s192, 24(rp)
    533      1.1  mrg 
    534      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    535      1.1  mrg 	ldd		-0x38(%r30), p096b1
    536      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    537      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    538      1.1  mrg 
    539      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    540      1.1  mrg 	ldd		-0x30(%r30), p096b2
    541      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    542      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    543      1.1  mrg 
    544      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    545      1.1  mrg 	ldd		-0x58(%r30), p160c1
    546      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    547      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    548      1.1  mrg 
    549      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    550      1.1  mrg 	ldd		-0x50(%r30), p160c2
    551      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    552      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    553      1.1  mrg 
    554      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    555      1.1  mrg 	ldd		-0x18(%r30), p224d1
    556      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    557      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    558      1.1  mrg 
    559      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    560      1.1  mrg 	ldd		-0x10(%r30), p224d2
    561      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    562      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27
    563      1.1  mrg 
    564      1.1  mrg 	addib,<>	-1, n, L(loop)		C decrement group count, loop while groups remain
    565      1.1  mrg 	ldo		32(rp), rp		C delay slot: advance rp past the four limbs just stored
    566      1.1  mrg 
    567      1.1  mrg LDEF(end2)
    568      1.1  mrg 	add		p032a1, p032a2, m032	C same arithmetic as the loop body, without loading further up limbs
    569      1.1  mrg 	ldd		-0x80(%r30), p000a
    570      1.1  mrg 	add,dc		p096b1, p096b2, m096
    571      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    572      1.1  mrg 	add,dc		p160c1, p160c2, m160
    573      1.1  mrg 	ldd		-0x68(%r30), p064a
    574      1.1  mrg 	add,dc		p224d1, p224d2, m224
    575      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    576      1.1  mrg 	add,dc		%r0, %r0, m288
    577      1.1  mrg 	ldd		-0x40(%r30), p064b
    578      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    579      1.1  mrg 	depd,z		m032, 31, 32, ma000
    580      1.1  mrg 	ldd		-0x28(%r30), p128b
    581      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    582      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    583      1.1  mrg 	depd		m096, 31, 32, ma064
    584      1.1  mrg 	ldd		-0x60(%r30), p128c
    585      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    586      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    587      1.1  mrg 	depd		m160, 31, 32, ma128
    588      1.1  mrg 	ldd		-0x48(%r30), p192c
    589      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    590      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    591      1.1  mrg 	depd		m224, 31, 32, ma192
    592      1.1  mrg 	ldd		-0x20(%r30), p192d
    593      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    594      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    595      1.1  mrg 	depd		m288, 31, 32, ma256
    596      1.1  mrg 	ldd		-0x88(%r30), p256d
    597      1.1  mrg 	add		climb, p000a, s000
    598      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    599      1.1  mrg 	add,dc		p064a, p064b, s064
    600      1.1  mrg 	ldd		0(rp), r000
    601      1.1  mrg 	add,dc		p128b, p128c, s128
    602      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    603      1.1  mrg 	add,dc		p192c, p192d, s192
    604      1.1  mrg 	ldd		8(rp), r064
    605      1.1  mrg 	add,dc		p256d, %r0, climb
    606      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    607      1.1  mrg 	ldd		16(rp), r128
    608      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    609      1.1  mrg 	ldd		24(rp), r192
    610      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    611      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    612      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    613      1.1  mrg 	add,dc		ma256, climb, climb
    614      1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    615      1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    616      1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    617      1.1  mrg 	std		s000, 0(rp)
    618      1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    619      1.1  mrg 	sub,db		%r0, climb, climb	C climb = -climb - borrow
    620      1.1  mrg 	sub		%r0, climb, climb	C negate again: climb = climb + borrow
    621      1.1  mrg 	std		s064, 8(rp)
    622      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload the mid products of the final group for end1
    623      1.1  mrg 	std		s128, 16(rp)
    624      1.1  mrg 	ldd		-0x70(%r30), p032a2
    625      1.1  mrg 	std		s192, 24(rp)
    626      1.1  mrg 	ldd		-0x38(%r30), p096b1
    627      1.1  mrg 	ldd		-0x30(%r30), p096b2
    628      1.1  mrg 	ldd		-0x58(%r30), p160c1
    629      1.1  mrg 	ldd		-0x50(%r30), p160c2
    630      1.1  mrg 	ldd		-0x18(%r30), p224d1
    631      1.1  mrg 	ldd		-0x10(%r30), p224d2
    632      1.1  mrg 	ldo		32(rp), rp		C advance rp past the four limbs just stored
    633      1.1  mrg 
    634      1.1  mrg LDEF(end1)
    635      1.1  mrg 	add		p032a1, p032a2, m032	C final group: all products come from the stack, no fp work
    636      1.1  mrg 	ldd		-0x80(%r30), p000a
    637      1.1  mrg 	add,dc		p096b1, p096b2, m096
    638      1.1  mrg 	add,dc		p160c1, p160c2, m160
    639      1.1  mrg 	ldd		-0x68(%r30), p064a
    640      1.1  mrg 	add,dc		p224d1, p224d2, m224
    641      1.1  mrg 	add,dc		%r0, %r0, m288
    642      1.1  mrg 	ldd		-0x40(%r30), p064b
    643      1.1  mrg 	depd,z		m032, 31, 32, ma000
    644      1.1  mrg 	ldd		-0x28(%r30), p128b
    645      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    646      1.1  mrg 	depd		m096, 31, 32, ma064
    647      1.1  mrg 	ldd		-0x60(%r30), p128c
    648      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    649      1.1  mrg 	depd		m160, 31, 32, ma128
    650      1.1  mrg 	ldd		-0x48(%r30), p192c
    651      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    652      1.1  mrg 	depd		m224, 31, 32, ma192
    653      1.1  mrg 	ldd		-0x20(%r30), p192d
    654      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    655      1.1  mrg 	depd		m288, 31, 32, ma256
    656      1.1  mrg 	ldd		-0x88(%r30), p256d
    657      1.1  mrg 	add		climb, p000a, s000
    658      1.1  mrg 	add,dc		p064a, p064b, s064
    659      1.1  mrg 	ldd		0(rp), r000
    660      1.1  mrg 	add,dc		p128b, p128c, s128
    661      1.1  mrg 	add,dc		p192c, p192d, s192
    662      1.1  mrg 	ldd		8(rp), r064
    663      1.1  mrg 	add,dc		p256d, %r0, climb
    664      1.1  mrg 	ldd		16(rp), r128
    665      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    666      1.1  mrg 	ldd		24(rp), r192
    667      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    668      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    669      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    670      1.1  mrg 	add,dc		ma256, climb, climb
    671      1.1  mrg 	sub		r000, s000, s000	C accum rlimb 0
    672      1.1  mrg 	sub,db		r064, s064, s064	C accum rlimb 1
    673      1.1  mrg 	sub,db		r128, s128, s128	C accum rlimb 2
    674      1.1  mrg 	std		s000, 0(rp)
    675      1.1  mrg 	sub,db		r192, s192, s192	C accum rlimb 3
    676      1.1  mrg 	sub,db		%r0, climb, climb	C climb = -climb - borrow
    677      1.1  mrg 	sub		%r0, climb, climb	C negate again: climb = climb + borrow
    678      1.1  mrg 	std		s064, 8(rp)
    679      1.1  mrg 	std		s128, 16(rp)
    680      1.1  mrg 	std		s192, 24(rp)
    681      1.1  mrg 
    682      1.1  mrg 	ldd		-0xb0(%r30), %r13	C restore r6-r13, saved on entry to the 4-way code
    683      1.1  mrg 	ldd		-0xb8(%r30), %r12
    684      1.1  mrg 	ldd		-0xc0(%r30), %r11
    685      1.1  mrg 	ldd		-0xc8(%r30), %r10
    686      1.1  mrg 	ldd		-0xd0(%r30), %r9
    687      1.1  mrg 	ldd		-0xd8(%r30), %r8
    688      1.1  mrg 	ldd		-0xe0(%r30), %r7
    689      1.1  mrg 	ldd		-0xe8(%r30), %r6
    690      1.1  mrg LDEF(done)
    691      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    692      1.1  mrg `	copy		climb, %r28
    693      1.1  mrg ',`	extrd,u		climb, 63, 32, %r29
    694      1.1  mrg 	extrd,u		climb, 31, 32, %r28
    695      1.1  mrg ')
    696      1.1  mrg 	ldd		-0xf0(%r30), %r5	C restore r5
    697      1.1  mrg 	ldd		-0xf8(%r30), %r4	C restore r4
    698      1.1  mrg 	bve		(%r2)			C branch to the address in r2
    699      1.1  mrg 	ldd,mb		-0x100(%r30), %r3	C delay slot: move sp back by 0x100 and restore r3
    700      1.1  mrg EPILOGUE(mpn_submul_1)
    701