Home | History | Annotate | Line # | Download | only in pa64
mul_1.asm revision 1.1.1.2
      1      1.1  mrg dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
      2      1.1  mrg dnl  the result in a second limb vector.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1.1.2  mrg dnl
      8      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27  1.1.1.2  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg C		    cycles/limb
     35      1.1  mrg C 8000,8200:		6.5
     36      1.1  mrg C 8500,8600,8700:	5.625
     37      1.1  mrg 
     38      1.1  mrg C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     39      1.1  mrg C  could be saved there per call.
     40      1.1  mrg 
     41      1.1  mrg C  DESCRIPTION:
     42      1.1  mrg C  The main loop "BIG" is 4-way unrolled, mainly to allow
     43      1.1  mrg C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     44      1.1  mrg C  registers to the IU registers, have demanded a deep software pipeline, and
     45      1.1  mrg C  a lot of stack slots for partial products in flight.
     46      1.1  mrg C
     47      1.1  mrg C  CODE STRUCTURE:
     48      1.1  mrg C  save-some-registers
     49      1.1  mrg C  do 0, 1, 2, or 3 limbs
     50      1.1  mrg C  if done, restore-some-regs and return
     51      1.1  mrg C  save-many-regs
     52      1.1  mrg C  do 4, 8, ... limbs
     53      1.1  mrg C  restore-all-regs
     54      1.1  mrg 
     55      1.1  mrg C  STACK LAYOUT:
     56      1.1  mrg C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     57      1.1  mrg C  slots marked FREE, as well as some slots in the caller's "frame marker".
     58      1.1  mrg C
     59      1.1  mrg C -00 <- r30
     60      1.1  mrg C -08  FREE
     61      1.1  mrg C -10  tmp
     62      1.1  mrg C -18  tmp
     63      1.1  mrg C -20  tmp
     64      1.1  mrg C -28  tmp
     65      1.1  mrg C -30  tmp
     66      1.1  mrg C -38  tmp
     67      1.1  mrg C -40  tmp
     68      1.1  mrg C -48  tmp
     69      1.1  mrg C -50  tmp
     70      1.1  mrg C -58  tmp
     71      1.1  mrg C -60  tmp
     72      1.1  mrg C -68  tmp
     73      1.1  mrg C -70  tmp
     74      1.1  mrg C -78  tmp
     75      1.1  mrg C -80  tmp
     76      1.1  mrg C -88  tmp
     77      1.1  mrg C -90  FREE
     78      1.1  mrg C -98  FREE
     79      1.1  mrg C -a0  FREE
     80      1.1  mrg C -a8  FREE
     81      1.1  mrg C -b0  r13
     82      1.1  mrg C -b8  r12
     83      1.1  mrg C -c0  r11
     84      1.1  mrg C -c8  r10
     85      1.1  mrg C -d0  r9
     86      1.1  mrg C -d8  r8
     87      1.1  mrg C -e0  r7
     88      1.1  mrg C -e8  r6
     89      1.1  mrg C -f0  r5
     90      1.1  mrg C -f8  r4
     91      1.1  mrg C -100 r3
     92      1.1  mrg C  Previous frame:
     93      1.1  mrg C  [unused area]
     94      1.1  mrg C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     95      1.1  mrg 
     96      1.1  mrg 
     97      1.1  mrg include(`../config.m4')
     98      1.1  mrg 
     99      1.1  mrg C INPUT PARAMETERS:
    100      1.1  mrg define(`rp',`%r26')	C result limb vector, written with std
    101      1.1  mrg define(`up',`%r25')	C source limb vector, read with fldd
    102      1.1  mrg define(`n',`%r24')	C limb count
    103      1.1  mrg define(`vlimb',`%r23')	C multiplier limb
    104      1.1  mrg 
    105      1.1  mrg define(`climb',`%r23')	C carry limb, shares %r23 with vlimb
    106      1.1  mrg 
    107      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    108      1.1  mrg `	.level	2.0w
    109      1.1  mrg ',`	.level	2.0
    110      1.1  mrg ')
    111      1.1  mrg PROLOGUE(mpn_mul_1)	C multiply n limbs at up by vlimb, store at rp, climb goes to r28 at done
    112      1.1  mrg 
    113      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    114      1.1  mrg `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    115      1.1  mrg ')
    116      1.1  mrg 	std,ma		%r3, 0x100(%r30)	C save r3, then raise r30 by 0x100 (r3 ends up at -0x100)
    117      1.1  mrg 	std		%r4, -0xf8(%r30)	C save r4
    118      1.1  mrg 	std		%r5, -0xf0(%r30)	C save r5
    119      1.1  mrg 	ldo		0(%r0), climb		C clear climb
    120      1.1  mrg 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register (home slot is -0x138 from the raised r30)
    121      1.1  mrg 
    122      1.1  mrg define(`p032a1',`%r1')	C mid product reloaded from -0x78
    123      1.1  mrg define(`p032a2',`%r19')	C mid product reloaded from -0x70
    124      1.1  mrg 
    125      1.1  mrg define(`m032',`%r20')	C sum of the two mid products
    126      1.1  mrg define(`m096',`%r21')	C carry out of the mid product sum
    127      1.1  mrg 
    128      1.1  mrg define(`p000a',`%r22')	C low product reloaded from -0x80
    129      1.1  mrg define(`p064a',`%r29')	C high product reloaded from -0x68
    130      1.1  mrg 
    131      1.1  mrg define(`s000',`%r31')	C result limb being accumulated
    132      1.1  mrg 
    133      1.1  mrg define(`ma000',`%r4')	C mid sum moved up 32 bits, added into the result limb
    134      1.1  mrg define(`ma064',`%r20')	C upper part of mid sum plus its carry, shares %r20 with m032
    135      1.1  mrg 
    136      1.1  mrg C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
    137      1.1  mrg 
    138      1.1  mrg 	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n, the leftover limb count
    139      1.1  mrg 	cmpb,=		%r5, %r0, L(BIG)	C no leftover limbs: go straight to the unrolled code
    140      1.1  mrg 	nop	C branch delay slot
    141      1.1  mrg 
    142      1.1  mrg 	fldd		0(up), %fr4	C fetch first leftover limb
    143      1.1  mrg 	ldo		8(up), up	C advance source pointer
    144      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22	C first mid (cross) product
    145      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23	C second mid (cross) product
    146      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    147      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24	C low product
    148      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25	C high product
    149      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    150      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    151      1.1  mrg 	addib,<>	-1, %r5, L(two_or_more)	C r5 -= 1, branch if more leftover limbs remain
    152      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    153      1.1  mrg LDEF(one)	C exactly one leftover limb: reload its four partial products
    154      1.1  mrg 	ldd		-0x78(%r30), p032a1
    155      1.1  mrg 	ldd		-0x70(%r30), p032a2
    156      1.1  mrg 	ldd		-0x80(%r30), p000a
    157      1.1  mrg 	b		L(0_one_out)	C finish in the shared single limb wind-down
    158      1.1  mrg 	ldd		-0x68(%r30), p064a	C branch delay slot
    159      1.1  mrg 
    160      1.1  mrg LDEF(two_or_more)	C multiply second leftover limb while reloading products of the first
    161      1.1  mrg 	fldd		0(up), %fr4
    162      1.1  mrg 	ldo		8(up), up
    163      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    164      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    165      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload older mid product before its slot is overwritten
    166      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    167      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    168      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    169      1.1  mrg 	ldd		-0x70(%r30), p032a2	C reload older mid product before its slot is overwritten
    170      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    171      1.1  mrg 	ldd		-0x80(%r30), p000a	C reload older low product
    172      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    173      1.1  mrg 	ldd		-0x68(%r30), p064a	C reload older high product
    174      1.1  mrg 	addib,<>	-1, %r5, L(three_or_more)	C r5 -= 1, branch if a third leftover limb remains
    175      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    176      1.1  mrg LDEF(two)	C exactly two leftover limbs: combine mid products of the first
    177      1.1  mrg 	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
    178      1.1  mrg 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    179      1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 placed in the upper 32 bits
    180      1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = upper 32 bits of m032
    181      1.1  mrg 	b		L(0_two_out)
    182      1.1  mrg 	depd		m096, 31, 32, ma064	C branch delay slot: carry into upper half of ma064
    183      1.1  mrg 
    184      1.1  mrg LDEF(three_or_more)	C three leftover limbs: fetch the third, combine mid products of the first
    185      1.1  mrg 	fldd		0(up), %fr4	C third leftover limb, up is advanced later at 0_out
    186      1.1  mrg 	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
    187      1.1  mrg 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    188      1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 placed in the upper 32 bits
    189      1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = upper 32 bits of m032
    190      1.1  mrg C	addib,=		-1, %r5, L(0_out)
    191      1.1  mrg 	depd		m096, 31, 32, ma064	C carry into upper half of ma064
    192      1.1  mrg LDEF(loop0)	C loop body below is commented out, so execution falls through to 0_out
    193      1.1  mrg C	xmpyu		%fr8R, %fr4L, %fr22
    194      1.1  mrg C	xmpyu		%fr8L, %fr4R, %fr23
    195      1.1  mrg C	ldd		-0x78(%r30), p032a1
    196      1.1  mrg C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    197      1.1  mrg C
    198      1.1  mrg C	xmpyu		%fr8R, %fr4R, %fr24
    199      1.1  mrg C	xmpyu		%fr8L, %fr4L, %fr25
    200      1.1  mrg C	ldd		-0x70(%r30), p032a2
    201      1.1  mrg C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    202      1.1  mrg C
    203      1.1  mrg C	ldo		8(rp), rp
    204      1.1  mrg C	add		climb, p000a, s000
    205      1.1  mrg C	ldd		-0x80(%r30), p000a
    206      1.1  mrg C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    207      1.1  mrg C
    208      1.1  mrg C	add,dc		p064a, %r0, climb
    209      1.1  mrg C	ldo		8(up), up
    210      1.1  mrg C	ldd		-0x68(%r30), p064a
    211      1.1  mrg C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    212      1.1  mrg C
    213      1.1  mrg C	add		ma000, s000, s000
    214      1.1  mrg C	add,dc		ma064, climb, climb
    215      1.1  mrg C	fldd		0(up), %fr4
    216      1.1  mrg C
    217      1.1  mrg C	std		s000, -8(rp)
    218      1.1  mrg C
    219      1.1  mrg C	add		p032a1, p032a2, m032
    220      1.1  mrg C	add,dc		%r0, %r0, m096
    221      1.1  mrg C
    222      1.1  mrg C	depd,z		m032, 31, 32, ma000
    223      1.1  mrg C	extrd,u		m032, 31, 32, ma064
    224      1.1  mrg C	addib,<>	-1, %r5, L(loop0)
    225      1.1  mrg C	depd		m096, 31, 32, ma064
    226      1.1  mrg LDEF(0_out)	C multiply the last leftover limb and retire the oldest one
    227      1.1  mrg 	ldo		8(up), up	C step past the limb already loaded into fr4
    228      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    229      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    230      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload older mid product before its slot is overwritten
    231      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    232      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr24
    233      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr25
    234      1.1  mrg 	ldd		-0x70(%r30), p032a2	C reload older mid product before its slot is overwritten
    235      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    236      1.1  mrg 	ldo		8(rp), rp	C advance rp, the limb is stored at -8(rp) below
    237      1.1  mrg 	add		climb, p000a, s000	C s000 = incoming carry limb + low product
    238      1.1  mrg 	ldd		-0x80(%r30), p000a
    239      1.1  mrg 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    240      1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    241      1.1  mrg 	ldd		-0x68(%r30), p064a
    242      1.1  mrg 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    243      1.1  mrg 	add		ma000, s000, s000	C add shifted mid sum into the result limb
    244      1.1  mrg 	add,dc		ma064, climb, climb	C add upper mid part plus carry into climb
    245      1.1  mrg 	std		s000, -8(rp)	C store finished result limb
    246      1.1  mrg 	add		p032a1, p032a2, m032	C start combining mid products of the next limb
    247      1.1  mrg 	add,dc		%r0, %r0, m096
    248      1.1  mrg 	depd,z		m032, 31, 32, ma000
    249      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    250      1.1  mrg 	depd		m096, 31, 32, ma064
    251      1.1  mrg LDEF(0_two_out)	C two limbs outstanding: retire one, reload products of the last
    252      1.1  mrg 	ldd		-0x78(%r30), p032a1	C mid products of the last limb
    253      1.1  mrg 	ldd		-0x70(%r30), p032a2
    254      1.1  mrg 	ldo		8(rp), rp	C advance rp, the limb is stored at -8(rp) below
    255      1.1  mrg 	add		climb, p000a, s000	C s000 = carry limb + low product
    256      1.1  mrg 	ldd		-0x80(%r30), p000a	C low product of the last limb
    257      1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    258      1.1  mrg 	ldd		-0x68(%r30), p064a	C high product of the last limb
    259      1.1  mrg 	add		ma000, s000, s000	C add shifted mid sum into the result limb
    260      1.1  mrg 	add,dc		ma064, climb, climb	C add upper mid part plus carry into climb
    261      1.1  mrg 	std		s000, -8(rp)	C store finished result limb
    262      1.1  mrg LDEF(0_one_out)	C one limb outstanding: combine its products and store it
    263      1.1  mrg 	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
    264      1.1  mrg 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    265      1.1  mrg 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 placed in the upper 32 bits
    266      1.1  mrg 	extrd,u		m032, 31, 32, ma064	C ma064 = upper 32 bits of m032
    267      1.1  mrg 	depd		m096, 31, 32, ma064	C carry into upper half of ma064
    268      1.1  mrg 
    269      1.1  mrg 	add		climb, p000a, s000	C s000 = carry limb + low product
    270      1.1  mrg 	add,dc		p064a, %r0, climb	C climb = high product + carry
    271      1.1  mrg 	add		ma000, s000, s000	C add shifted mid sum into the result limb
    272      1.1  mrg 	add,dc		ma064, climb, climb	C add upper mid part plus carry into climb
    273      1.1  mrg 	std		s000, 0(rp)	C store finished result limb
    274      1.1  mrg 
    275      1.1  mrg 	cmpib,>=	4, n, L(done)	C branch if 4 >= n: nothing left for the unrolled code
    276      1.1  mrg 	ldo		8(rp), rp	C branch delay slot: step rp past the limb just stored
    277      1.1  mrg 
    278      1.1  mrg C 4-way unrolled code.
    279      1.1  mrg 
    280      1.1  mrg LDEF(BIG)	C entry to the 4-way unrolled code, register names are redefined below
    281      1.1  mrg 
    282      1.1  mrg define(`p032a1',`%r1')	C mid products of limb 0 of a group (slots -0x78, -0x70)
    283      1.1  mrg define(`p032a2',`%r19')	C
    284      1.1  mrg define(`p096b1',`%r20')	C mid products of limb 1 (slots -0x38, -0x30)
    285      1.1  mrg define(`p096b2',`%r21')	C
    286      1.1  mrg define(`p160c1',`%r22')	C mid products of limb 2 (slots -0x58, -0x50)
    287      1.1  mrg define(`p160c2',`%r29')	C
    288      1.1  mrg define(`p224d1',`%r31')	C mid products of limb 3 (slots -0x18, -0x10)
    289      1.1  mrg define(`p224d2',`%r3')	C
    290      1.1  mrg 			C
    291      1.1  mrg define(`m032',`%r4')	C mid product sums, one per limb
    292      1.1  mrg define(`m096',`%r5')	C
    293      1.1  mrg define(`m160',`%r6')	C
    294      1.1  mrg define(`m224',`%r7')	C
    295      1.1  mrg define(`m288',`%r8')	C carry out of the mid product sum chain
    296      1.1  mrg 			C
    297      1.1  mrg define(`p000a',`%r1')	C low and high products, these reuse the mid product registers
    298      1.1  mrg define(`p064a',`%r19')	C
    299      1.1  mrg define(`p064b',`%r20')	C
    300      1.1  mrg define(`p128b',`%r21')	C
    301      1.1  mrg define(`p128c',`%r22')	C
    302      1.1  mrg define(`p192c',`%r29')	C
    303      1.1  mrg define(`p192d',`%r31')	C
    304      1.1  mrg define(`p256d',`%r3')	C
    305      1.1  mrg 			C
    306      1.1  mrg define(`s000',`%r10')	C the four result limbs of a group
    307      1.1  mrg define(`s064',`%r11')	C
    308      1.1  mrg define(`s128',`%r12')	C
    309      1.1  mrg define(`s192',`%r13')	C
    310      1.1  mrg 			C
    311      1.1  mrg define(`ma000',`%r9')	C mid sums realigned by 32 bits, ma064..ma256 reuse the m registers
    312      1.1  mrg define(`ma064',`%r4')	C
    313      1.1  mrg define(`ma128',`%r5')	C
    314      1.1  mrg define(`ma192',`%r6')	C
    315      1.1  mrg define(`ma256',`%r7')	C
    316      1.1  mrg 
    317      1.1  mrg 	std		%r6, -0xe8(%r30)	C save r6 through r13, restored after end1
    318      1.1  mrg 	std		%r7, -0xe0(%r30)
    319      1.1  mrg 	std		%r8, -0xd8(%r30)
    320      1.1  mrg 	std		%r9, -0xd0(%r30)
    321      1.1  mrg 	std		%r10, -0xc8(%r30)
    322      1.1  mrg 	std		%r11, -0xc0(%r30)
    323      1.1  mrg 	std		%r12, -0xb8(%r30)
    324      1.1  mrg 	std		%r13, -0xb0(%r30)
    325      1.1  mrg 
    326      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    327      1.1  mrg `	extrd,u		n, 61, 62, n		C right shift 2
    328      1.1  mrg ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    329      1.1  mrg ')
    330      1.1  mrg 
    331      1.1  mrg LDEF(4_or_more)	C feed-in: load and multiply the first group of four limbs
    332      1.1  mrg 	fldd		0(up), %fr4
    333      1.1  mrg 	fldd		8(up), %fr5
    334      1.1  mrg 	fldd		16(up), %fr6
    335      1.1  mrg 	fldd		24(up), %fr7
    336      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    337      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    338      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    339      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    340      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    341      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    342      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    343      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    344      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    345      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    346      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    347      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    348      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    349      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    350      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    351      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    352      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    353      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    354      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    355      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    356      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    357      1.1  mrg 	addib,<>	-1, n, L(8_or_more)	C n -= 1 (n counts groups of four here), branch if more groups follow
    358      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C branch delay slot
    359      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    360      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    361      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    362      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    363      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    364      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    365      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    366      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    367      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    368      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    369      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload all eight mid products for end1
    370      1.1  mrg 	ldd		-0x70(%r30), p032a2
    371      1.1  mrg 	ldd		-0x38(%r30), p096b1
    372      1.1  mrg 	ldd		-0x30(%r30), p096b2
    373      1.1  mrg 	ldd		-0x58(%r30), p160c1
    374      1.1  mrg 	ldd		-0x50(%r30), p160c2
    375      1.1  mrg 	ldd		-0x18(%r30), p224d1
    376      1.1  mrg 	ldd		-0x10(%r30), p224d2
    377      1.1  mrg 	b		L(end1)	C single group only: skip to the final wind-down
    378      1.1  mrg 	nop	C branch delay slot
    379      1.1  mrg 
    380      1.1  mrg LDEF(8_or_more)	C spill first group products, then load and multiply the second group
    381      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    382      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    383      1.1  mrg 	ldo		32(up), up	C advance source pointer to the next four limbs
    384      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    385      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    386      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    387      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    388      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    389      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    390      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    391      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    392      1.1  mrg 	fldd		0(up), %fr4
    393      1.1  mrg 	fldd		8(up), %fr5
    394      1.1  mrg 	fldd		16(up), %fr6
    395      1.1  mrg 	fldd		24(up), %fr7
    396      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    397      1.1  mrg 	ldd		-0x78(%r30), p032a1	C each mid product is reloaded before its slot is overwritten
    398      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    399      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    400      1.1  mrg 	ldd		-0x70(%r30), p032a2
    401      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    402      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    403      1.1  mrg 	ldd		-0x38(%r30), p096b1
    404      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    405      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    406      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    407      1.1  mrg 	ldd		-0x30(%r30), p096b2
    408      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    409      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    410      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    411      1.1  mrg 	ldd		-0x58(%r30), p160c1
    412      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    413      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    414      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    415      1.1  mrg 	ldd		-0x50(%r30), p160c2
    416      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    417      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    418      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    419      1.1  mrg 	ldd		-0x18(%r30), p224d1
    420      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    421      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    422      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    423      1.1  mrg 	ldd		-0x10(%r30), p224d2
    424      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    425      1.1  mrg 	addib,=		-1, n, L(end2)	C n -= 1, branch to end2 when no further group remains
    426      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27	C branch delay slot
    427      1.1  mrg LDEF(loop)	C main loop: sums and stores an older group while newer products are spilled and multiplied
    428      1.1  mrg 	add		p032a1, p032a2, m032	C start of the mid product sum chain
    429      1.1  mrg 	ldd		-0x80(%r30), p000a	C stack slots are read here before the fstd further down reuses them
    430      1.1  mrg 	add,dc		p096b1, p096b2, m096
    431      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    432      1.1  mrg 
    433      1.1  mrg 	add,dc		p160c1, p160c2, m160
    434      1.1  mrg 	ldd		-0x68(%r30), p064a
    435      1.1  mrg 	add,dc		p224d1, p224d2, m224
    436      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    437      1.1  mrg 
    438      1.1  mrg 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid product sum chain
    439      1.1  mrg 	ldd		-0x40(%r30), p064b
    440      1.1  mrg 	ldo		32(up), up	C advance source pointer to the next four limbs
    441      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    442      1.1  mrg 
    443      1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign mid sums by 32 bits into ma000..ma256
    444      1.1  mrg 	ldd		-0x28(%r30), p128b
    445      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    446      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    447      1.1  mrg 
    448      1.1  mrg 	depd		m096, 31, 32, ma064
    449      1.1  mrg 	ldd		-0x60(%r30), p128c
    450      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    451      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    452      1.1  mrg 
    453      1.1  mrg 	depd		m160, 31, 32, ma128
    454      1.1  mrg 	ldd		-0x48(%r30), p192c
    455      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    456      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    457      1.1  mrg 
    458      1.1  mrg 	depd		m224, 31, 32, ma192
    459      1.1  mrg 	ldd		-0x20(%r30), p192d
    460      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    461      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    462      1.1  mrg 
    463      1.1  mrg 	depd		m288, 31, 32, ma256
    464      1.1  mrg 	ldd		-0x88(%r30), p256d
    465      1.1  mrg 	add		climb, p000a, s000	C start of the low/high product sum chain, seeded with climb
    466      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    467      1.1  mrg 
    468      1.1  mrg 	add,dc		p064a, p064b, s064
    469      1.1  mrg 	add,dc		p128b, p128c, s128
    470      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    471      1.1  mrg 
    472      1.1  mrg 	add,dc		p192c, p192d, s192
    473      1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    474      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    475      1.1  mrg 
    476      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    477      1.1  mrg 	fldd		0(up), %fr4
    478      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    479      1.1  mrg 	std		s000, 0(rp)
    480      1.1  mrg 
    481      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    482      1.1  mrg 	fldd		8(up), %fr5
    483      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    484      1.1  mrg 	std		s064, 8(rp)
    485      1.1  mrg 
    486      1.1  mrg 	add,dc		ma256, climb, climb	C fold top mid part plus carry into climb
    487      1.1  mrg 	fldd		16(up), %fr6
    488      1.1  mrg 	std		s128, 16(rp)
    489      1.1  mrg 
    490      1.1  mrg 	xmpyu		%fr8R, %fr4L, %fr22
    491      1.1  mrg 	ldd		-0x78(%r30), p032a1
    492      1.1  mrg 	xmpyu		%fr8L, %fr4R, %fr23
    493      1.1  mrg 	fldd		24(up), %fr7
    494      1.1  mrg 
    495      1.1  mrg 	xmpyu		%fr8R, %fr5L, %fr24
    496      1.1  mrg 	ldd		-0x70(%r30), p032a2
    497      1.1  mrg 	xmpyu		%fr8L, %fr5R, %fr25
    498      1.1  mrg 	std		s192, 24(rp)
    499      1.1  mrg 
    500      1.1  mrg 	xmpyu		%fr8R, %fr6L, %fr26
    501      1.1  mrg 	ldd		-0x38(%r30), p096b1
    502      1.1  mrg 	xmpyu		%fr8L, %fr6R, %fr27
    503      1.1  mrg 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    504      1.1  mrg 
    505      1.1  mrg 	xmpyu		%fr8R, %fr7L, %fr28
    506      1.1  mrg 	ldd		-0x30(%r30), p096b2
    507      1.1  mrg 	xmpyu		%fr8L, %fr7R, %fr29
    508      1.1  mrg 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    509      1.1  mrg 
    510      1.1  mrg 	xmpyu		%fr8R, %fr4R, %fr30
    511      1.1  mrg 	ldd		-0x58(%r30), p160c1
    512      1.1  mrg 	xmpyu		%fr8L, %fr4L, %fr31
    513      1.1  mrg 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    514      1.1  mrg 
    515      1.1  mrg 	xmpyu		%fr8R, %fr5R, %fr22
    516      1.1  mrg 	ldd		-0x50(%r30), p160c2
    517      1.1  mrg 	xmpyu		%fr8L, %fr5L, %fr23
    518      1.1  mrg 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    519      1.1  mrg 
    520      1.1  mrg 	xmpyu		%fr8R, %fr6R, %fr24
    521      1.1  mrg 	ldd		-0x18(%r30), p224d1
    522      1.1  mrg 	xmpyu		%fr8L, %fr6L, %fr25
    523      1.1  mrg 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    524      1.1  mrg 
    525      1.1  mrg 	xmpyu		%fr8R, %fr7R, %fr26
    526      1.1  mrg 	ldd		-0x10(%r30), p224d2
    527      1.1  mrg 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    528      1.1  mrg 	xmpyu		%fr8L, %fr7L, %fr27
    529      1.1  mrg 
    530      1.1  mrg 	addib,<>	-1, n, L(loop)	C n -= 1, loop while groups remain
    531      1.1  mrg 	ldo		32(rp), rp	C branch delay slot: advance rp past the four limbs just stored
    532      1.1  mrg 
    533      1.1  mrg LDEF(end2)	C wind-down 1: same summing as the loop, but no new loads or multiplies
    534      1.1  mrg 	add		p032a1, p032a2, m032	C start of the mid product sum chain
    535      1.1  mrg 	ldd		-0x80(%r30), p000a
    536      1.1  mrg 	add,dc		p096b1, p096b2, m096
    537      1.1  mrg 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    538      1.1  mrg 	add,dc		p160c1, p160c2, m160
    539      1.1  mrg 	ldd		-0x68(%r30), p064a
    540      1.1  mrg 	add,dc		p224d1, p224d2, m224
    541      1.1  mrg 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    542      1.1  mrg 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid product sum chain
    543      1.1  mrg 	ldd		-0x40(%r30), p064b
    544      1.1  mrg 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    545      1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign mid sums by 32 bits into ma000..ma256
    546      1.1  mrg 	ldd		-0x28(%r30), p128b
    547      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    548      1.1  mrg 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    549      1.1  mrg 	depd		m096, 31, 32, ma064
    550      1.1  mrg 	ldd		-0x60(%r30), p128c
    551      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    552      1.1  mrg 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    553      1.1  mrg 	depd		m160, 31, 32, ma128
    554      1.1  mrg 	ldd		-0x48(%r30), p192c
    555      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    556      1.1  mrg 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    557      1.1  mrg 	depd		m224, 31, 32, ma192
    558      1.1  mrg 	ldd		-0x20(%r30), p192d
    559      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    560      1.1  mrg 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    561      1.1  mrg 	depd		m288, 31, 32, ma256
    562      1.1  mrg 	ldd		-0x88(%r30), p256d
    563      1.1  mrg 	add		climb, p000a, s000	C start of the low/high product sum chain, seeded with climb
    564      1.1  mrg 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    565      1.1  mrg 	add,dc		p064a, p064b, s064
    566      1.1  mrg 	add,dc		p128b, p128c, s128
    567      1.1  mrg 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    568      1.1  mrg 	add,dc		p192c, p192d, s192
    569      1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    570      1.1  mrg 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    571      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    572      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    573      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    574      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    575      1.1  mrg 	add,dc		ma256, climb, climb	C fold top mid part plus carry into climb
    576      1.1  mrg 	std		s000, 0(rp)
    577      1.1  mrg 	std		s064, 8(rp)
    578      1.1  mrg 	ldd		-0x78(%r30), p032a1	C reload mid products of the last group for end1
    579      1.1  mrg 	std		s128, 16(rp)
    580      1.1  mrg 	ldd		-0x70(%r30), p032a2
    581      1.1  mrg 	std		s192, 24(rp)
    582      1.1  mrg 	ldd		-0x38(%r30), p096b1
    583      1.1  mrg 	ldd		-0x30(%r30), p096b2
    584      1.1  mrg 	ldd		-0x58(%r30), p160c1
    585      1.1  mrg 	ldd		-0x50(%r30), p160c2
    586      1.1  mrg 	ldd		-0x18(%r30), p224d1
    587      1.1  mrg 	ldd		-0x10(%r30), p224d2
    588      1.1  mrg 	ldo		32(rp), rp	C advance rp past the four limbs just stored
    589      1.1  mrg 
    590      1.1  mrg LDEF(end1)	C wind-down 2: sum and store the final group of four limbs
    591      1.1  mrg 	add		p032a1, p032a2, m032	C start of the mid product sum chain
    592      1.1  mrg 	ldd		-0x80(%r30), p000a
    593      1.1  mrg 	add,dc		p096b1, p096b2, m096
    594      1.1  mrg 	add,dc		p160c1, p160c2, m160
    595      1.1  mrg 	ldd		-0x68(%r30), p064a
    596      1.1  mrg 	add,dc		p224d1, p224d2, m224
    597      1.1  mrg 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid product sum chain
    598      1.1  mrg 	ldd		-0x40(%r30), p064b
    599      1.1  mrg 	depd,z		m032, 31, 32, ma000	C realign mid sums by 32 bits into ma000..ma256
    600      1.1  mrg 	ldd		-0x28(%r30), p128b
    601      1.1  mrg 	extrd,u		m032, 31, 32, ma064
    602      1.1  mrg 	depd		m096, 31, 32, ma064
    603      1.1  mrg 	ldd		-0x60(%r30), p128c
    604      1.1  mrg 	extrd,u		m096, 31, 32, ma128
    605      1.1  mrg 	depd		m160, 31, 32, ma128
    606      1.1  mrg 	ldd		-0x48(%r30), p192c
    607      1.1  mrg 	extrd,u		m160, 31, 32, ma192
    608      1.1  mrg 	depd		m224, 31, 32, ma192
    609      1.1  mrg 	ldd		-0x20(%r30), p192d
    610      1.1  mrg 	extrd,u		m224, 31, 32, ma256
    611      1.1  mrg 	depd		m288, 31, 32, ma256
    612      1.1  mrg 	ldd		-0x88(%r30), p256d
    613      1.1  mrg 	add		climb, p000a, s000	C start of the low/high product sum chain, seeded with climb
    614      1.1  mrg 	add,dc		p064a, p064b, s064
    615      1.1  mrg 	add,dc		p128b, p128c, s128
    616      1.1  mrg 	add,dc		p192c, p192d, s192
    617      1.1  mrg 	add,dc		p256d, %r0, climb	C climb = top high product + carry
    618      1.1  mrg 	add		ma000, s000, s000	C accum mid 0
    619      1.1  mrg 	add,dc		ma064, s064, s064	C accum mid 1
    620      1.1  mrg 	add,dc		ma128, s128, s128	C accum mid 2
    621      1.1  mrg 	add,dc		ma192, s192, s192	C accum mid 3
    622      1.1  mrg 	add,dc		ma256, climb, climb	C fold top mid part plus carry into climb
    623      1.1  mrg 	std		s000, 0(rp)
    624      1.1  mrg 	std		s064, 8(rp)
    625      1.1  mrg 	std		s128, 16(rp)
    626      1.1  mrg 	std		s192, 24(rp)
    627      1.1  mrg 
    628      1.1  mrg 	ldd		-0xb0(%r30), %r13	C restore r13 down to r6, saved at BIG
    629      1.1  mrg 	ldd		-0xb8(%r30), %r12
    630      1.1  mrg 	ldd		-0xc0(%r30), %r11
    631      1.1  mrg 	ldd		-0xc8(%r30), %r10
    632      1.1  mrg 	ldd		-0xd0(%r30), %r9
    633      1.1  mrg 	ldd		-0xd8(%r30), %r8
    634      1.1  mrg 	ldd		-0xe0(%r30), %r7
    635      1.1  mrg 	ldd		-0xe8(%r30), %r6
    636      1.1  mrg LDEF(done)	C common exit: hand back climb, restore r3-r5, pop the frame
    637      1.1  mrg ifdef(`HAVE_ABI_2_0w',
    638      1.1  mrg `	copy		climb, %r28	C whole carry limb to r28 (presumably the ABI return register)
    639      1.1  mrg ',`	extrd,u		climb, 63, 32, %r29	C low 32 bits of climb to r29
    640      1.1  mrg 	extrd,u		climb, 31, 32, %r28	C high 32 bits of climb to r28
    641      1.1  mrg ')
    642      1.1  mrg 	ldd		-0xf0(%r30), %r5	C restore r5
    643      1.1  mrg 	ldd		-0xf8(%r30), %r4	C restore r4
    644      1.1  mrg 	bve		(%r2)	C return through r2
    645      1.1  mrg 	ldd,mb		-0x100(%r30), %r3	C branch delay slot: lower r30 by 0x100, then reload r3
    646      1.1  mrg EPILOGUE(mpn_mul_1)
    647