Home | History | Annotate | Line # | Download | only in pa64
mul_1.asm revision 1.1
      1 dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
      2 dnl  the result in a second limb vector.
      3 
      4 dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of the GNU Lesser General Public License as published
     10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11 dnl  your option) any later version.
     12 
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16 dnl  License for more details.
     17 
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 C		    cycles/limb
     24 C 8000,8200:		6.5
     25 C 8500,8600,8700:	5.625
     26 
     27 C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     28 C  could be saved there per call.
     29 
     30 C  DESCRIPTION:
     31 C  The main loop "BIG" is 4-way unrolled, mainly to allow
     32 C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     33 C  registers to the IU registers, have demanded a deep software pipeline, and
     34 C  a lot of stack slots for partial products in flight.
     35 C
     36 C  CODE STRUCTURE:
     37 C  save-some-registers
     38 C  do 0, 1, 2, or 3 limbs
     39 C  if done, restore-some-regs and return
     40 C  save-many-regs
     41 C  do 4, 8, ... limbs
     42 C  restore-all-regs
     43 
     44 C  STACK LAYOUT:
     45 C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     46 C  slots marked FREE, as well as some slots in the caller's "frame marker".
     47 C
     48 C -00 <- r30
     49 C -08  FREE
     50 C -10  tmp
     51 C -18  tmp
     52 C -20  tmp
     53 C -28  tmp
     54 C -30  tmp
     55 C -38  tmp
     56 C -40  tmp
     57 C -48  tmp
     58 C -50  tmp
     59 C -58  tmp
     60 C -60  tmp
     61 C -68  tmp
     62 C -70  tmp
     63 C -78  tmp
     64 C -80  tmp
     65 C -88  tmp
     66 C -90  FREE
     67 C -98  FREE
     68 C -a0  FREE
     69 C -a8  FREE
     70 C -b0  r13
     71 C -b8  r12
     72 C -c0  r11
     73 C -c8  r10
     74 C -d0  r9
     75 C -d8  r8
     76 C -e0  r7
     77 C -e8  r6
     78 C -f0  r5
     79 C -f8  r4
     80 C -100 r3
     81 C  Previous frame:
     82 C  [unused area]
     83 C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     84 
     85 
     86 include(`../config.m4')
     87 
     88 C INPUT PARAMETERS:
     89 define(`rp',`%r26')	C destination limb vector, result limbs are stored through it
     90 define(`up',`%r25')	C source limb vector, limbs are loaded from it into FP registers
     91 define(`n',`%r24')	C limb count
     92 define(`vlimb',`%r23')	C multiplier limb as a register (2.0w); for 2.0N it arrives in the stack home slot
     93 
     94 define(`climb',`%r23')	C carry limb, shares %r23 with vlimb and is cleared once vlimb is in memory
     95 
     96 ifdef(`HAVE_ABI_2_0w',
     97 `	.level	2.0w
     98 ',`	.level	2.0
     99 ')
    100 PROLOGUE(mpn_mul_1)
    101 C Entry: open a 0x100 byte frame, save r3-r5, get vlimb into %fr8, then dispatch on n mod 4.
    102 ifdef(`HAVE_ABI_2_0w',
    103 `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    104 ')
    105 	std,ma		%r3, 0x100(%r30)	C save r3 at the old r30, then advance r30 by 0x100
    106 	std		%r4, -0xf8(%r30)	C save r4 (used as ma000/m032 below)
    107 	std		%r5, -0xf0(%r30)	C save r5 (used as the n mod 4 counter below)
    108 	ldo		0(%r0), climb		C clear climb
    109 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
    110 C Register roles for the 1..3 limb feed-in code that follows.
    111 define(`p032a1',`%r1')	C mid product reloaded from -0x78
    112 define(`p032a2',`%r19')	C mid product reloaded from -0x70
    113 
    114 define(`m032',`%r20')	C sum of the two mid products
    115 define(`m096',`%r21')	C carry out of that sum
    116 
    117 define(`p000a',`%r22')	C low product reloaded from -0x80
    118 define(`p064a',`%r29')	C high product reloaded from -0x68
    119 
    120 define(`s000',`%r31')	C result limb being assembled
    121 
    122 define(`ma000',`%r4')	C m032 shifted left 32 bits
    123 define(`ma064',`%r20')	C m032 shifted right 32 bits with m096 in the upper half (reuses %r20)
    124 
    125 C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
    126 
    127 	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n, i.e. n mod 4
    128 	cmpb,=		%r5, %r0, L(BIG)	C n mod 4 = 0: go straight to the 4-way unrolled code
    129 	nop				C branch delay slot
    130 
    131 	fldd		0(up), %fr4	C feed-in for n mod 4 = 1..3: load the first limb
    132 	ldo		8(up), up	C step up past it
    133 	xmpyu		%fr8R, %fr4L, %fr22
    134 	xmpyu		%fr8L, %fr4R, %fr23
    135 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    136 	xmpyu		%fr8R, %fr4R, %fr24
    137 	xmpyu		%fr8L, %fr4L, %fr25
    138 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    139 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    140 	addib,<>	-1, %r5, L(two_or_more)	C decrement r5, branch if more odd limbs remain
    141 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    142 LDEF(one)
    143 	ldd		-0x78(%r30), p032a1	C single odd limb: move its four partial products to integer regs
    144 	ldd		-0x70(%r30), p032a2
    145 	ldd		-0x80(%r30), p000a
    146 	b		L(0_one_out)
    147 	ldd		-0x68(%r30), p064a	C (delay slot)
    148 C Second limb: multiply it while reloading the products of the first limb from the same stack slots.
    149 LDEF(two_or_more)
    150 	fldd		0(up), %fr4
    151 	ldo		8(up), up
    152 	xmpyu		%fr8R, %fr4L, %fr22
    153 	xmpyu		%fr8L, %fr4R, %fr23
    154 	ldd		-0x78(%r30), p032a1	C reload before the slot is overwritten below
    155 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    156 	xmpyu		%fr8R, %fr4R, %fr24
    157 	xmpyu		%fr8L, %fr4L, %fr25
    158 	ldd		-0x70(%r30), p032a2
    159 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    160 	ldd		-0x80(%r30), p000a
    161 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    162 	ldd		-0x68(%r30), p064a
    163 	addib,<>	-1, %r5, L(three_or_more)	C decrement r5, branch if a third limb remains
    164 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    165 LDEF(two)
    166 	add		p032a1, p032a2, m032	C sum the mid products of the first limb
    167 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    168 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    169 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    170 	b		L(0_two_out)
    171 	depd		m096, 31, 32, ma064	C (delay slot) m096 into the upper half of ma064
    172 C Third limb: load it here, its multiplies are issued at 0_out.
    173 LDEF(three_or_more)
    174 	fldd		0(up), %fr4
    175 	add		p032a1, p032a2, m032	C sum the mid products of the first limb
    176 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    177 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    178 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    179 C	addib,=		-1, %r5, L(0_out)
    180 	depd		m096, 31, 32, ma064	C m096 into the upper half of ma064
    181 LDEF(loop0)
    182 C	xmpyu		%fr8R, %fr4L, %fr22
    183 C	xmpyu		%fr8L, %fr4R, %fr23
    184 C	ldd		-0x78(%r30), p032a1
    185 C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    186 C
    187 C	xmpyu		%fr8R, %fr4R, %fr24
    188 C	xmpyu		%fr8L, %fr4L, %fr25
    189 C	ldd		-0x70(%r30), p032a2
    190 C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    191 C
    192 C	ldo		8(rp), rp
    193 C	add		climb, p000a, s000
    194 C	ldd		-0x80(%r30), p000a
    195 C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    196 C
    197 C	add,dc		p064a, %r0, climb
    198 C	ldo		8(up), up
    199 C	ldd		-0x68(%r30), p064a
    200 C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    201 C
    202 C	add		ma000, s000, s000
    203 C	add,dc		ma064, climb, climb
    204 C	fldd		0(up), %fr4
    205 C
    206 C	std		s000, -8(rp)
    207 C
    208 C	add		p032a1, p032a2, m032
    209 C	add,dc		%r0, %r0, m096
    210 C
    211 C	depd,z		m032, 31, 32, ma000
    212 C	extrd,u		m032, 31, 32, ma064
    213 C	addib,<>	-1, %r5, L(loop0)
    214 C	depd		m096, 31, 32, ma064
    215 LDEF(0_out)
    216 	ldo		8(up), up	C step up past the third limb (already in %fr4)
    217 	xmpyu		%fr8R, %fr4L, %fr22
    218 	xmpyu		%fr8L, %fr4R, %fr23
    219 	ldd		-0x78(%r30), p032a1	C reload products of the second limb before the slots are reused
    220 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    221 	xmpyu		%fr8R, %fr4R, %fr24
    222 	xmpyu		%fr8L, %fr4L, %fr25
    223 	ldd		-0x70(%r30), p032a2
    224 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    225 	ldo		8(rp), rp
    226 	add		climb, p000a, s000	C s000 = carry-in + low product of the first limb
    227 	ldd		-0x80(%r30), p000a
    228 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    229 	add,dc		p064a, %r0, climb	C climb = high product + carry
    230 	ldd		-0x68(%r30), p064a
    231 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    232 	add		ma000, s000, s000	C add the realigned mid sum, low part
    233 	add,dc		ma064, climb, climb	C add the realigned mid sum, high part
    234 	std		s000, -8(rp)	C store result limb (rp was already advanced)
    235 	add		p032a1, p032a2, m032	C mid sum for the second limb
    236 	add,dc		%r0, %r0, m096
    237 	depd,z		m032, 31, 32, ma000
    238 	extrd,u		m032, 31, 32, ma064
    239 	depd		m096, 31, 32, ma064
    240 LDEF(0_two_out)
    241 	ldd		-0x78(%r30), p032a1	C reload the products of the last odd limb
    242 	ldd		-0x70(%r30), p032a2
    243 	ldo		8(rp), rp
    244 	add		climb, p000a, s000	C finish the limb whose mid sum is in ma000/ma064
    245 	ldd		-0x80(%r30), p000a
    246 	add,dc		p064a, %r0, climb
    247 	ldd		-0x68(%r30), p064a
    248 	add		ma000, s000, s000
    249 	add,dc		ma064, climb, climb
    250 	std		s000, -8(rp)	C store result limb (rp was already advanced)
    251 LDEF(0_one_out)
    252 	add		p032a1, p032a2, m032	C mid sum for the last odd limb
    253 	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
    254 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    255 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    256 	depd		m096, 31, 32, ma064	C m096 into the upper half of ma064
    257 
    258 	add		climb, p000a, s000	C s000 = carry-in + low product
    259 	add,dc		p064a, %r0, climb	C climb = high product + carry
    260 	add		ma000, s000, s000
    261 	add,dc		ma064, climb, climb	C climb is now the carry limb for whatever follows
    262 	std		s000, 0(rp)
    263 
    264 	cmpib,>=	4, n, L(done)	C taken when 4 >= n, i.e. no group of four limbs is left
    265 	ldo		8(rp), rp	C (delay slot) step rp past the limb just stored
    266 
    267 C 4-way unrolled code.
    268 
    269 LDEF(BIG)
    270 C Register roles are redefined for the 4-way unrolled code; r6-r13 are saved before use and n becomes a count of 4-limb groups.
    271 define(`p032a1',`%r1')	C mid products of limb a (slots -0x78, -0x70)
    272 define(`p032a2',`%r19')	C
    273 define(`p096b1',`%r20')	C mid products of limb b (slots -0x38, -0x30)
    274 define(`p096b2',`%r21')	C
    275 define(`p160c1',`%r22')	C mid products of limb c (slots -0x58, -0x50)
    276 define(`p160c2',`%r29')	C
    277 define(`p224d1',`%r31')	C mid products of limb d (slots -0x18, -0x10)
    278 define(`p224d2',`%r3')	C
    279 			C
    280 define(`m032',`%r4')	C sums of the mid product pairs, chained with carry
    281 define(`m096',`%r5')	C
    282 define(`m160',`%r6')	C
    283 define(`m224',`%r7')	C
    284 define(`m288',`%r8')	C carry out of the mid sum chain
    285 			C
    286 define(`p000a',`%r1')	C low product of limb a (slot -0x80); same registers as the mid products above
    287 define(`p064a',`%r19')	C high product of limb a (slot -0x68)
    288 define(`p064b',`%r20')	C low product of limb b (slot -0x40)
    289 define(`p128b',`%r21')	C high product of limb b (slot -0x28)
    290 define(`p128c',`%r22')	C low product of limb c (slot -0x60)
    291 define(`p192c',`%r29')	C high product of limb c (slot -0x48)
    292 define(`p192d',`%r31')	C low product of limb d (slot -0x20)
    293 define(`p256d',`%r3')	C high product of limb d (slot -0x88)
    294 			C
    295 define(`s000',`%r10')	C the four result limbs of one group
    296 define(`s064',`%r11')	C
    297 define(`s128',`%r12')	C
    298 define(`s192',`%r13')	C
    299 			C
    300 define(`ma000',`%r9')	C mid sums realigned by 32 bits; ma064..ma256 overwrite m032..m224
    301 define(`ma064',`%r4')	C
    302 define(`ma128',`%r5')	C
    303 define(`ma192',`%r6')	C
    304 define(`ma256',`%r7')	C
    305 
    306 	std		%r6, -0xe8(%r30)	C save r6-r13 in the frame, reloaded after end1
    307 	std		%r7, -0xe0(%r30)
    308 	std		%r8, -0xd8(%r30)
    309 	std		%r9, -0xd0(%r30)
    310 	std		%r10, -0xc8(%r30)
    311 	std		%r11, -0xc0(%r30)
    312 	std		%r12, -0xb8(%r30)
    313 	std		%r13, -0xb0(%r30)
    314 
    315 ifdef(`HAVE_ABI_2_0w',
    316 `	extrd,u		n, 61, 62, n		C right shift 2
    317 ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    318 ')
    319 
    320 LDEF(4_or_more)
    321 	fldd		0(up), %fr4	C first group: load four limbs (a, b, c, d)
    322 	fldd		8(up), %fr5
    323 	fldd		16(up), %fr6
    324 	fldd		24(up), %fr7
    325 	xmpyu		%fr8R, %fr4L, %fr22	C sixteen 32x32 partial products follow, four per limb
    326 	xmpyu		%fr8L, %fr4R, %fr23
    327 	xmpyu		%fr8R, %fr5L, %fr24
    328 	xmpyu		%fr8L, %fr5R, %fr25
    329 	xmpyu		%fr8R, %fr6L, %fr26
    330 	xmpyu		%fr8L, %fr6R, %fr27
    331 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    332 	xmpyu		%fr8R, %fr7L, %fr28
    333 	xmpyu		%fr8L, %fr7R, %fr29
    334 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    335 	xmpyu		%fr8R, %fr4R, %fr30
    336 	xmpyu		%fr8L, %fr4L, %fr31
    337 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    338 	xmpyu		%fr8R, %fr5R, %fr22	C fr22-fr27 are reused once their mid products are stored
    339 	xmpyu		%fr8L, %fr5L, %fr23
    340 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    341 	xmpyu		%fr8R, %fr6R, %fr24
    342 	xmpyu		%fr8L, %fr6L, %fr25
    343 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    344 	xmpyu		%fr8R, %fr7R, %fr26
    345 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    346 	addib,<>	-1, n, L(8_or_more)	C decrement group count, branch if another group follows
    347 	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
    348 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    349 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    350 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    351 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    352 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    353 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    354 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    355 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    356 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    357 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    358 	ldd		-0x78(%r30), p032a1	C only one group: reload the mid products, end1 does the rest
    359 	ldd		-0x70(%r30), p032a2
    360 	ldd		-0x38(%r30), p096b1
    361 	ldd		-0x30(%r30), p096b2
    362 	ldd		-0x58(%r30), p160c1
    363 	ldd		-0x50(%r30), p160c2
    364 	ldd		-0x18(%r30), p224d1
    365 	ldd		-0x10(%r30), p224d2
    366 	b		L(end1)
    367 	nop				C (delay slot)
    368 
    369 LDEF(8_or_more)
    370 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    371 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    372 	ldo		32(up), up	C advance up to the second group of four limbs
    373 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    374 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    375 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    376 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    377 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    378 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    379 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    380 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    381 	fldd		0(up), %fr4	C load the second group
    382 	fldd		8(up), %fr5
    383 	fldd		16(up), %fr6
    384 	fldd		24(up), %fr7
    385 	xmpyu		%fr8R, %fr4L, %fr22	C multiply the second group while the first group mid products are reloaded
    386 	ldd		-0x78(%r30), p032a1	C each ldd reads a slot before the matching fstd overwrites it
    387 	xmpyu		%fr8L, %fr4R, %fr23
    388 	xmpyu		%fr8R, %fr5L, %fr24
    389 	ldd		-0x70(%r30), p032a2
    390 	xmpyu		%fr8L, %fr5R, %fr25
    391 	xmpyu		%fr8R, %fr6L, %fr26
    392 	ldd		-0x38(%r30), p096b1
    393 	xmpyu		%fr8L, %fr6R, %fr27
    394 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    395 	xmpyu		%fr8R, %fr7L, %fr28
    396 	ldd		-0x30(%r30), p096b2
    397 	xmpyu		%fr8L, %fr7R, %fr29
    398 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    399 	xmpyu		%fr8R, %fr4R, %fr30
    400 	ldd		-0x58(%r30), p160c1
    401 	xmpyu		%fr8L, %fr4L, %fr31
    402 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    403 	xmpyu		%fr8R, %fr5R, %fr22
    404 	ldd		-0x50(%r30), p160c2
    405 	xmpyu		%fr8L, %fr5L, %fr23
    406 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    407 	xmpyu		%fr8R, %fr6R, %fr24
    408 	ldd		-0x18(%r30), p224d1
    409 	xmpyu		%fr8L, %fr6L, %fr25
    410 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    411 	xmpyu		%fr8R, %fr7R, %fr26
    412 	ldd		-0x10(%r30), p224d2
    413 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    414 	addib,=		-1, n, L(end2)	C decrement group count, skip the loop when this was the last group
    415 	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
    416 LDEF(loop)
    417 	add		p032a1, p032a2, m032	C main loop: sum the mid products of the group already in integer regs
    418 	ldd		-0x80(%r30), p000a	C p000a reuses %r1, free now that p032a1 is consumed
    419 	add,dc		p096b1, p096b2, m096
    420 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    421 
    422 	add,dc		p160c1, p160c2, m160
    423 	ldd		-0x68(%r30), p064a
    424 	add,dc		p224d1, p224d2, m224
    425 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    426 
    427 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid sum chain
    428 	ldd		-0x40(%r30), p064b
    429 	ldo		32(up), up	C advance up to the next group of four limbs
    430 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    431 
    432 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    433 	ldd		-0x28(%r30), p128b
    434 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    435 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    436 
    437 	depd		m096, 31, 32, ma064	C low half of m096 into the upper half of ma064
    438 	ldd		-0x60(%r30), p128c
    439 	extrd,u		m096, 31, 32, ma128	C same realignment repeats for ma128, ma192, ma256
    440 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    441 
    442 	depd		m160, 31, 32, ma128
    443 	ldd		-0x48(%r30), p192c
    444 	extrd,u		m160, 31, 32, ma192
    445 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    446 
    447 	depd		m224, 31, 32, ma192
    448 	ldd		-0x20(%r30), p192d
    449 	extrd,u		m224, 31, 32, ma256
    450 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    451 
    452 	depd		m288, 31, 32, ma256
    453 	ldd		-0x88(%r30), p256d
    454 	add		climb, p000a, s000	C carry-in + low product of limb a
    455 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    456 
    457 	add,dc		p064a, p064b, s064	C high product of a + low product of b, and so on up the chain
    458 	add,dc		p128b, p128c, s128
    459 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    460 
    461 	add,dc		p192c, p192d, s192
    462 	add,dc		p256d, %r0, climb	C high product of d + carry starts the new carry limb
    463 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    464 
    465 	add		ma000, s000, s000	C accum mid 0
    466 	fldd		0(up), %fr4	C begin loading the next group
    467 	add,dc		ma064, s064, s064	C accum mid 1
    468 	std		s000, 0(rp)
    469 
    470 	add,dc		ma128, s128, s128	C accum mid 2
    471 	fldd		8(up), %fr5
    472 	add,dc		ma192, s192, s192	C accum mid 3
    473 	std		s064, 8(rp)
    474 
    475 	add,dc		ma256, climb, climb	C climb = carry limb passed to the next group
    476 	fldd		16(up), %fr6
    477 	std		s128, 16(rp)
    478 
    479 	xmpyu		%fr8R, %fr4L, %fr22	C multiply the next group while reloading the mid products stored above
    480 	ldd		-0x78(%r30), p032a1
    481 	xmpyu		%fr8L, %fr4R, %fr23
    482 	fldd		24(up), %fr7
    483 
    484 	xmpyu		%fr8R, %fr5L, %fr24
    485 	ldd		-0x70(%r30), p032a2
    486 	xmpyu		%fr8L, %fr5R, %fr25
    487 	std		s192, 24(rp)
    488 
    489 	xmpyu		%fr8R, %fr6L, %fr26
    490 	ldd		-0x38(%r30), p096b1
    491 	xmpyu		%fr8L, %fr6R, %fr27
    492 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    493 
    494 	xmpyu		%fr8R, %fr7L, %fr28
    495 	ldd		-0x30(%r30), p096b2
    496 	xmpyu		%fr8L, %fr7R, %fr29
    497 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    498 
    499 	xmpyu		%fr8R, %fr4R, %fr30
    500 	ldd		-0x58(%r30), p160c1
    501 	xmpyu		%fr8L, %fr4L, %fr31
    502 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    503 
    504 	xmpyu		%fr8R, %fr5R, %fr22
    505 	ldd		-0x50(%r30), p160c2
    506 	xmpyu		%fr8L, %fr5L, %fr23
    507 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    508 
    509 	xmpyu		%fr8R, %fr6R, %fr24
    510 	ldd		-0x18(%r30), p224d1
    511 	xmpyu		%fr8L, %fr6L, %fr25
    512 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    513 
    514 	xmpyu		%fr8R, %fr7R, %fr26
    515 	ldd		-0x10(%r30), p224d2
    516 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    517 	xmpyu		%fr8L, %fr7L, %fr27
    518 
    519 	addib,<>	-1, n, L(loop)	C decrement group count, loop while groups remain
    520 	ldo		32(rp), rp	C (delay slot) advance rp past the four limbs just stored
    521 
    522 LDEF(end2)
    523 	add		p032a1, p032a2, m032	C wind-down: same as the loop body but with no further loads from up or multiplies
    524 	ldd		-0x80(%r30), p000a
    525 	add,dc		p096b1, p096b2, m096
    526 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    527 	add,dc		p160c1, p160c2, m160
    528 	ldd		-0x68(%r30), p064a
    529 	add,dc		p224d1, p224d2, m224
    530 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    531 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid sum chain
    532 	ldd		-0x40(%r30), p064b
    533 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    534 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    535 	ldd		-0x28(%r30), p128b
    536 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    537 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    538 	depd		m096, 31, 32, ma064	C low half of m096 into the upper half of ma064
    539 	ldd		-0x60(%r30), p128c
    540 	extrd,u		m096, 31, 32, ma128
    541 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    542 	depd		m160, 31, 32, ma128
    543 	ldd		-0x48(%r30), p192c
    544 	extrd,u		m160, 31, 32, ma192
    545 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    546 	depd		m224, 31, 32, ma192
    547 	ldd		-0x20(%r30), p192d
    548 	extrd,u		m224, 31, 32, ma256
    549 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    550 	depd		m288, 31, 32, ma256
    551 	ldd		-0x88(%r30), p256d
    552 	add		climb, p000a, s000	C carry-in + low product of limb a
    553 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    554 	add,dc		p064a, p064b, s064
    555 	add,dc		p128b, p128c, s128
    556 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    557 	add,dc		p192c, p192d, s192
    558 	add,dc		p256d, %r0, climb	C high product of d + carry starts the new carry limb
    559 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    560 	add		ma000, s000, s000	C accum mid 0
    561 	add,dc		ma064, s064, s064	C accum mid 1
    562 	add,dc		ma128, s128, s128	C accum mid 2
    563 	add,dc		ma192, s192, s192	C accum mid 3
    564 	add,dc		ma256, climb, climb	C climb = carry limb passed to end1
    565 	std		s000, 0(rp)
    566 	std		s064, 8(rp)
    567 	ldd		-0x78(%r30), p032a1	C reload the mid products of the final group for end1
    568 	std		s128, 16(rp)
    569 	ldd		-0x70(%r30), p032a2
    570 	std		s192, 24(rp)
    571 	ldd		-0x38(%r30), p096b1
    572 	ldd		-0x30(%r30), p096b2
    573 	ldd		-0x58(%r30), p160c1
    574 	ldd		-0x50(%r30), p160c2
    575 	ldd		-0x18(%r30), p224d1
    576 	ldd		-0x10(%r30), p224d2
    577 	ldo		32(rp), rp	C advance rp past the four limbs just stored
    578 
    579 LDEF(end1)
    580 	add		p032a1, p032a2, m032	C final group: all its products are on the stack, only integer work remains
    581 	ldd		-0x80(%r30), p000a
    582 	add,dc		p096b1, p096b2, m096
    583 	add,dc		p160c1, p160c2, m160
    584 	ldd		-0x68(%r30), p064a
    585 	add,dc		p224d1, p224d2, m224
    586 	add,dc		%r0, %r0, m288	C m288 = carry out of the mid sum chain
    587 	ldd		-0x40(%r30), p064b
    588 	depd,z		m032, 31, 32, ma000	C ma000 = m032 << 32
    589 	ldd		-0x28(%r30), p128b
    590 	extrd,u		m032, 31, 32, ma064	C ma064 = m032 >> 32
    591 	depd		m096, 31, 32, ma064	C low half of m096 into the upper half of ma064
    592 	ldd		-0x60(%r30), p128c
    593 	extrd,u		m096, 31, 32, ma128
    594 	depd		m160, 31, 32, ma128
    595 	ldd		-0x48(%r30), p192c
    596 	extrd,u		m160, 31, 32, ma192
    597 	depd		m224, 31, 32, ma192
    598 	ldd		-0x20(%r30), p192d
    599 	extrd,u		m224, 31, 32, ma256
    600 	depd		m288, 31, 32, ma256
    601 	ldd		-0x88(%r30), p256d
    602 	add		climb, p000a, s000	C carry-in + low product of limb a
    603 	add,dc		p064a, p064b, s064
    604 	add,dc		p128b, p128c, s128
    605 	add,dc		p192c, p192d, s192
    606 	add,dc		p256d, %r0, climb	C high product of d + carry
    607 	add		ma000, s000, s000	C accum mid 0
    608 	add,dc		ma064, s064, s064	C accum mid 1
    609 	add,dc		ma128, s128, s128	C accum mid 2
    610 	add,dc		ma192, s192, s192	C accum mid 3
    611 	add,dc		ma256, climb, climb	C climb = final carry limb
    612 	std		s000, 0(rp)
    613 	std		s064, 8(rp)
    614 	std		s128, 16(rp)
    615 	std		s192, 24(rp)
    616 
    617 	ldd		-0xb0(%r30), %r13	C reload r6-r13 from the slots written at BIG
    618 	ldd		-0xb8(%r30), %r12
    619 	ldd		-0xc0(%r30), %r11
    620 	ldd		-0xc8(%r30), %r10
    621 	ldd		-0xd0(%r30), %r9
    622 	ldd		-0xd8(%r30), %r8
    623 	ldd		-0xe0(%r30), %r7
    624 	ldd		-0xe8(%r30), %r6
    625 LDEF(done)
    626 ifdef(`HAVE_ABI_2_0w',
    627 `	copy		climb, %r28	C carry limb to r28, presumably the return value register
    628 ',`	extrd,u		climb, 63, 32, %r29	C low 32 bits of the carry limb to r29
    629 	extrd,u		climb, 31, 32, %r28	C high 32 bits of the carry limb to r28
    630 ')
    631 	ldd		-0xf0(%r30), %r5	C reload r5 and r4 saved at entry
    632 	ldd		-0xf8(%r30), %r4
    633 	bve		(%r2)	C branch to the address in r2
    634 	ldd,mb		-0x100(%r30), %r3	C (delay slot) move r30 back by 0x100 and reload r3 from there
    635 EPILOGUE(mpn_mul_1)
    636