Home | History | Annotate | Line # | Download | only in pa64
mul_1.asm revision 1.1.1.2
      1 dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
      2 dnl  the result in a second limb vector.
      3 
      4 dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 C		    cycles/limb
     35 C 8000,8200:		6.5
     36 C 8500,8600,8700:	5.625
     37 
     38 C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     39 C  could be saved there per call.
     40 
     41 C  DESCRIPTION:
     42 C  The main loop "BIG" is 4-way unrolled, mainly to allow
     43 C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     44 C  registers to the IU registers, have demanded a deep software pipeline, and
     45 C  a lot of stack slots for partial products in flight.
     46 C
     47 C  CODE STRUCTURE:
     48 C  save-some-registers
     49 C  do 0, 1, 2, or 3 limbs
     50 C  if done, restore-some-regs and return
     51 C  save-many-regs
     52 C  do 4, 8, ... limbs
     53 C  restore-all-regs
     54 
     55 C  STACK LAYOUT:
     56 C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     57 C  slots marked FREE, as well as some slots in the caller's "frame marker".
     58 C
     59 C -00 <- r30
     60 C -08  FREE
     61 C -10  tmp
     62 C -18  tmp
     63 C -20  tmp
     64 C -28  tmp
     65 C -30  tmp
     66 C -38  tmp
     67 C -40  tmp
     68 C -48  tmp
     69 C -50  tmp
     70 C -58  tmp
     71 C -60  tmp
     72 C -68  tmp
     73 C -70  tmp
     74 C -78  tmp
     75 C -80  tmp
     76 C -88  tmp
     77 C -90  FREE
     78 C -98  FREE
     79 C -a0  FREE
     80 C -a8  FREE
     81 C -b0  r13
     82 C -b8  r12
     83 C -c0  r11
     84 C -c8  r10
     85 C -d0  r9
     86 C -d8  r8
     87 C -e0  r7
     88 C -e8  r6
     89 C -f0  r5
     90 C -f8  r4
     91 C -100 r3
     92 C  Previous frame:
     93 C  [unused area]
     94 C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     95 
     96 
     97 include(`../config.m4')
     98 
     99 C INPUT PARAMETERS:
    100 define(`rp',`%r26')	C destination limb vector
    101 define(`up',`%r25')	C source limb vector
    102 define(`n',`%r24')	C number of limbs to process
    103 define(`vlimb',`%r23')	C multiplier limb
    104 
    105 define(`climb',`%r23')	C carry limb, shares the vlimb register
    106 
    107 ifdef(`HAVE_ABI_2_0w',
    108 `	.level	2.0w
    109 ',`	.level	2.0
    110 ')
    111 PROLOGUE(mpn_mul_1)
    112 
    113 ifdef(`HAVE_ABI_2_0w',
    114 `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    115 ')
    116 	std,ma		%r3, 0x100(%r30)	C save r3 and allocate the 0x100 byte frame
    117 	std		%r4, -0xf8(%r30)	C save r4
    118 	std		%r5, -0xf0(%r30)	C save r5
    119 	ldo		0(%r0), climb		C clear climb
    120 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register, home slot is now at -0x138
    121 
    122 define(`p032a1',`%r1')	C mid product, vlimb low half times limb high half
    123 define(`p032a2',`%r19')	C mid product, vlimb high half times limb low half
    124 
    125 define(`m032',`%r20')	C sum of the two mid products
    126 define(`m096',`%r21')	C carry out of the mid product sum
    127 
    128 define(`p000a',`%r22')	C low product
    129 define(`p064a',`%r29')	C high product
    130 
    131 define(`s000',`%r31')	C result limb
    132 
    133 define(`ma000',`%r4')	C low 32 bits of the mid sum moved to the upper half
    134 define(`ma064',`%r20')	C high 32 bits of the mid sum with the carry above them
    135 
    136 C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
    137 
    138 	extrd,u		n, 63, 2, %r5		C r5 = n mod 4
    139 	cmpb,=		%r5, %r0, L(BIG)	C no leftover limbs, go straight to the 4-way code
    140 	nop
    141 
    142 	fldd		0(up), %fr4		C first leftover limb
    143 	ldo		8(up), up
    144 	xmpyu		%fr8R, %fr4L, %fr22
    145 	xmpyu		%fr8L, %fr4R, %fr23
    146 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    147 	xmpyu		%fr8R, %fr4R, %fr24
    148 	xmpyu		%fr8L, %fr4L, %fr25
    149 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    150 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    151 	addib,<>	-1, %r5, L(two_or_more)	C branch if more leftover limbs remain
    152 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    153 LDEF(one)
    154 	ldd		-0x78(%r30), p032a1	C move the four partial products to integer registers
    155 	ldd		-0x70(%r30), p032a2
    156 	ldd		-0x80(%r30), p000a
    157 	b		L(0_one_out)
    158 	ldd		-0x68(%r30), p064a
    159 
    160 LDEF(two_or_more)
    161 	fldd		0(up), %fr4		C second leftover limb
    162 	ldo		8(up), up
    163 	xmpyu		%fr8R, %fr4L, %fr22
    164 	xmpyu		%fr8L, %fr4R, %fr23
    165 	ldd		-0x78(%r30), p032a1	C read the first limb product before its slot is reused
    166 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    167 	xmpyu		%fr8R, %fr4R, %fr24
    168 	xmpyu		%fr8L, %fr4L, %fr25
    169 	ldd		-0x70(%r30), p032a2
    170 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    171 	ldd		-0x80(%r30), p000a
    172 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    173 	ldd		-0x68(%r30), p064a
    174 	addib,<>	-1, %r5, L(three_or_more)	C branch if a third leftover limb remains
    175 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    176 LDEF(two)
    177 	add		p032a1, p032a2, m032	C sum the two mid products of the first limb
    178 	add,dc		%r0, %r0, m096		C capture the carry out of that sum
    179 	depd,z		m032, 31, 32, ma000	C low 32 bits of the sum into the upper half, rest zero
    180 	extrd,u		m032, 31, 32, ma064	C high 32 bits of the sum into the lower half
    181 	b		L(0_two_out)
    182 	depd		m096, 31, 32, ma064	C carry goes into the upper half of ma064
    183 
    184 LDEF(three_or_more)
    185 	fldd		0(up), %fr4		C third leftover limb, up is advanced at 0_out
    186 	add		p032a1, p032a2, m032	C sum the two mid products of the first limb
    187 	add,dc		%r0, %r0, m096		C capture the carry out of that sum
    188 	depd,z		m032, 31, 32, ma000	C low 32 bits of the sum into the upper half, rest zero
    189 	extrd,u		m032, 31, 32, ma064	C high 32 bits of the sum into the lower half
    190 C	addib,=		-1, %r5, L(0_out)
    191 	depd		m096, 31, 32, ma064	C carry goes into the upper half of ma064
    192 LDEF(loop0)	C the loop body below is commented out, execution continues at 0_out
    193 C	xmpyu		%fr8R, %fr4L, %fr22
    194 C	xmpyu		%fr8L, %fr4R, %fr23
    195 C	ldd		-0x78(%r30), p032a1
    196 C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    197 C
    198 C	xmpyu		%fr8R, %fr4R, %fr24
    199 C	xmpyu		%fr8L, %fr4L, %fr25
    200 C	ldd		-0x70(%r30), p032a2
    201 C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    202 C
    203 C	ldo		8(rp), rp
    204 C	add		climb, p000a, s000
    205 C	ldd		-0x80(%r30), p000a
    206 C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    207 C
    208 C	add,dc		p064a, %r0, climb
    209 C	ldo		8(up), up
    210 C	ldd		-0x68(%r30), p064a
    211 C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    212 C
    213 C	add		ma000, s000, s000
    214 C	add,dc		ma064, climb, climb
    215 C	fldd		0(up), %fr4
    216 C
    217 C	std		s000, -8(rp)
    218 C
    219 C	add		p032a1, p032a2, m032
    220 C	add,dc		%r0, %r0, m096
    221 C
    222 C	depd,z		m032, 31, 32, ma000
    223 C	extrd,u		m032, 31, 32, ma064
    224 C	addib,<>	-1, %r5, L(loop0)
    225 C	depd		m096, 31, 32, ma064
    226 LDEF(0_out)
    227 	ldo		8(up), up
    228 	xmpyu		%fr8R, %fr4L, %fr22
    229 	xmpyu		%fr8L, %fr4R, %fr23
    230 	ldd		-0x78(%r30), p032a1	C read the second limb product before its slot is reused
    231 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    232 	xmpyu		%fr8R, %fr4R, %fr24
    233 	xmpyu		%fr8L, %fr4L, %fr25
    234 	ldd		-0x70(%r30), p032a2
    235 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    236 	ldo		8(rp), rp
    237 	add		climb, p000a, s000	C low product of the first limb plus the carry limb
    238 	ldd		-0x80(%r30), p000a
    239 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    240 	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
    241 	ldd		-0x68(%r30), p064a
    242 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    243 	add		ma000, s000, s000	C add the low part of the mid sum
    244 	add,dc		ma064, climb, climb	C add the high part of the mid sum into the carry limb
    245 	std		s000, -8(rp)		C store the first result limb, rp was already advanced
    246 	add		p032a1, p032a2, m032	C mid sum for the second limb
    247 	add,dc		%r0, %r0, m096
    248 	depd,z		m032, 31, 32, ma000
    249 	extrd,u		m032, 31, 32, ma064
    250 	depd		m096, 31, 32, ma064
    251 LDEF(0_two_out)
    252 	ldd		-0x78(%r30), p032a1	C partial products of the last leftover limb
    253 	ldd		-0x70(%r30), p032a2
    254 	ldo		8(rp), rp
    255 	add		climb, p000a, s000	C low product of the previous limb plus the carry limb
    256 	ldd		-0x80(%r30), p000a
    257 	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
    258 	ldd		-0x68(%r30), p064a
    259 	add		ma000, s000, s000	C add the low part of the mid sum
    260 	add,dc		ma064, climb, climb	C add the high part of the mid sum into the carry limb
    261 	std		s000, -8(rp)		C store the result limb, rp was already advanced
    262 LDEF(0_one_out)
    263 	add		p032a1, p032a2, m032	C mid sum for the last leftover limb
    264 	add,dc		%r0, %r0, m096		C capture the carry out of that sum
    265 	depd,z		m032, 31, 32, ma000	C low 32 bits of the sum into the upper half, rest zero
    266 	extrd,u		m032, 31, 32, ma064	C high 32 bits of the sum into the lower half
    267 	depd		m096, 31, 32, ma064	C carry goes into the upper half of ma064
    268 
    269 	add		climb, p000a, s000	C low product plus the carry limb
    270 	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
    271 	add		ma000, s000, s000	C add the low part of the mid sum
    272 	add,dc		ma064, climb, climb	C add the high part of the mid sum into the carry limb
    273 	std		s000, 0(rp)
    274 
    275 	cmpib,>=	4, n, L(done)	C done if 4 >= n, n is never a multiple of 4 on this path
    276 	ldo		8(rp), rp	C delay slot, step rp past the limb just stored
    277 
    278 C 4-way unrolled code.
    279 
    280 LDEF(BIG)
    281 
    282 define(`p032a1',`%r1')	C mid products of the first limb of a group
    283 define(`p032a2',`%r19')	C
    284 define(`p096b1',`%r20')	C mid products of the second limb
    285 define(`p096b2',`%r21')	C
    286 define(`p160c1',`%r22')	C mid products of the third limb
    287 define(`p160c2',`%r29')	C
    288 define(`p224d1',`%r31')	C mid products of the fourth limb
    289 define(`p224d2',`%r3')	C
    290 			C
    291 define(`m032',`%r4')	C mid product sums, chained through the carry
    292 define(`m096',`%r5')	C
    293 define(`m160',`%r6')	C
    294 define(`m224',`%r7')	C
    295 define(`m288',`%r8')	C carry out of the mid product chain
    296 			C
    297 define(`p000a',`%r1')	C low product of the first limb, reuses the p032a1 register
    298 define(`p064a',`%r19')	C high product of the first limb
    299 define(`p064b',`%r20')	C low product of the second limb
    300 define(`p128b',`%r21')	C high product of the second limb
    301 define(`p128c',`%r22')	C low product of the third limb
    302 define(`p192c',`%r29')	C high product of the third limb
    303 define(`p192d',`%r31')	C low product of the fourth limb
    304 define(`p256d',`%r3')	C high product of the fourth limb
    305 			C
    306 define(`s000',`%r10')	C the four result limbs of a group
    307 define(`s064',`%r11')	C
    308 define(`s128',`%r12')	C
    309 define(`s192',`%r13')	C
    310 			C
    311 define(`ma000',`%r9')	C mid sums realigned by 32 bits
    312 define(`ma064',`%r4')	C
    313 define(`ma128',`%r5')	C
    314 define(`ma192',`%r6')	C
    315 define(`ma256',`%r7')	C
    316 
    317 	std		%r6, -0xe8(%r30)	C save r6 to r13, they are restored after end1
    318 	std		%r7, -0xe0(%r30)
    319 	std		%r8, -0xd8(%r30)
    320 	std		%r9, -0xd0(%r30)
    321 	std		%r10, -0xc8(%r30)
    322 	std		%r11, -0xc0(%r30)
    323 	std		%r12, -0xb8(%r30)
    324 	std		%r13, -0xb0(%r30)
    325 
    326 ifdef(`HAVE_ABI_2_0w',
    327 `	extrd,u		n, 61, 62, n		C right shift 2
    328 ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    329 ')
    330 
    331 LDEF(4_or_more)
    332 	fldd		0(up), %fr4		C load the first group of four limbs
    333 	fldd		8(up), %fr5
    334 	fldd		16(up), %fr6
    335 	fldd		24(up), %fr7
    336 	xmpyu		%fr8R, %fr4L, %fr22
    337 	xmpyu		%fr8L, %fr4R, %fr23
    338 	xmpyu		%fr8R, %fr5L, %fr24
    339 	xmpyu		%fr8L, %fr5R, %fr25
    340 	xmpyu		%fr8R, %fr6L, %fr26
    341 	xmpyu		%fr8L, %fr6R, %fr27
    342 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    343 	xmpyu		%fr8R, %fr7L, %fr28
    344 	xmpyu		%fr8L, %fr7R, %fr29
    345 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    346 	xmpyu		%fr8R, %fr4R, %fr30
    347 	xmpyu		%fr8L, %fr4L, %fr31
    348 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    349 	xmpyu		%fr8R, %fr5R, %fr22
    350 	xmpyu		%fr8L, %fr5L, %fr23
    351 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    352 	xmpyu		%fr8R, %fr6R, %fr24
    353 	xmpyu		%fr8L, %fr6L, %fr25
    354 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    355 	xmpyu		%fr8R, %fr7R, %fr26
    356 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    357 	addib,<>	-1, n, L(8_or_more)	C branch if more groups of four remain
    358 	xmpyu		%fr8L, %fr7L, %fr27
    359 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    360 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    361 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    362 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    363 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    364 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    365 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    366 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    367 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    368 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    369 	ldd		-0x78(%r30), p032a1	C single group, fetch its mid products for end1
    370 	ldd		-0x70(%r30), p032a2
    371 	ldd		-0x38(%r30), p096b1
    372 	ldd		-0x30(%r30), p096b2
    373 	ldd		-0x58(%r30), p160c1
    374 	ldd		-0x50(%r30), p160c2
    375 	ldd		-0x18(%r30), p224d1
    376 	ldd		-0x10(%r30), p224d2
    377 	b		L(end1)
    378 	nop
    379 
    380 LDEF(8_or_more)
    381 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    382 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    383 	ldo		32(up), up		C advance up to the next four source limbs
    384 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    385 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    386 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    387 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    388 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    389 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    390 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    391 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    392 	fldd		0(up), %fr4		C load the second group of four limbs
    393 	fldd		8(up), %fr5
    394 	fldd		16(up), %fr6
    395 	fldd		24(up), %fr7
    396 	xmpyu		%fr8R, %fr4L, %fr22
    397 	ldd		-0x78(%r30), p032a1	C read each first group mid product before its slot is reused
    398 	xmpyu		%fr8L, %fr4R, %fr23
    399 	xmpyu		%fr8R, %fr5L, %fr24
    400 	ldd		-0x70(%r30), p032a2
    401 	xmpyu		%fr8L, %fr5R, %fr25
    402 	xmpyu		%fr8R, %fr6L, %fr26
    403 	ldd		-0x38(%r30), p096b1
    404 	xmpyu		%fr8L, %fr6R, %fr27
    405 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    406 	xmpyu		%fr8R, %fr7L, %fr28
    407 	ldd		-0x30(%r30), p096b2
    408 	xmpyu		%fr8L, %fr7R, %fr29
    409 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    410 	xmpyu		%fr8R, %fr4R, %fr30
    411 	ldd		-0x58(%r30), p160c1
    412 	xmpyu		%fr8L, %fr4L, %fr31
    413 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    414 	xmpyu		%fr8R, %fr5R, %fr22
    415 	ldd		-0x50(%r30), p160c2
    416 	xmpyu		%fr8L, %fr5L, %fr23
    417 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    418 	xmpyu		%fr8R, %fr6R, %fr24
    419 	ldd		-0x18(%r30), p224d1
    420 	xmpyu		%fr8L, %fr6L, %fr25
    421 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    422 	xmpyu		%fr8R, %fr7R, %fr26
    423 	ldd		-0x10(%r30), p224d2
    424 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    425 	addib,=		-1, n, L(end2)		C exactly two groups, skip the loop
    426 	xmpyu		%fr8L, %fr7L, %fr27
    427 LDEF(loop)
    428 	add		p032a1, p032a2, m032	C start the mid product chain for the group being finished
    429 	ldd		-0x80(%r30), p000a
    430 	add,dc		p096b1, p096b2, m096
    431 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    432 
    433 	add,dc		p160c1, p160c2, m160
    434 	ldd		-0x68(%r30), p064a
    435 	add,dc		p224d1, p224d2, m224
    436 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    437 
    438 	add,dc		%r0, %r0, m288		C carry out of the mid product chain
    439 	ldd		-0x40(%r30), p064b
    440 	ldo		32(up), up		C advance up to the next four source limbs
    441 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    442 
    443 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
    444 	ldd		-0x28(%r30), p128b
    445 	extrd,u		m032, 31, 32, ma064
    446 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    447 
    448 	depd		m096, 31, 32, ma064
    449 	ldd		-0x60(%r30), p128c
    450 	extrd,u		m096, 31, 32, ma128
    451 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    452 
    453 	depd		m160, 31, 32, ma128
    454 	ldd		-0x48(%r30), p192c
    455 	extrd,u		m160, 31, 32, ma192
    456 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    457 
    458 	depd		m224, 31, 32, ma192
    459 	ldd		-0x20(%r30), p192d
    460 	extrd,u		m224, 31, 32, ma256
    461 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    462 
    463 	depd		m288, 31, 32, ma256
    464 	ldd		-0x88(%r30), p256d
    465 	add		climb, p000a, s000	C start the low and high product chain with the carry limb
    466 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    467 
    468 	add,dc		p064a, p064b, s064
    469 	add,dc		p128b, p128c, s128
    470 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    471 
    472 	add,dc		p192c, p192d, s192
    473 	add,dc		p256d, %r0, climb	C top high product plus carry becomes the new carry limb
    474 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    475 
    476 	add		ma000, s000, s000	C accum mid 0
    477 	fldd		0(up), %fr4
    478 	add,dc		ma064, s064, s064	C accum mid 1
    479 	std		s000, 0(rp)
    480 
    481 	add,dc		ma128, s128, s128	C accum mid 2
    482 	fldd		8(up), %fr5
    483 	add,dc		ma192, s192, s192	C accum mid 3
    484 	std		s064, 8(rp)
    485 
    486 	add,dc		ma256, climb, climb	C top of the mid sums goes into the carry limb
    487 	fldd		16(up), %fr6
    488 	std		s128, 16(rp)
    489 
    490 	xmpyu		%fr8R, %fr4L, %fr22
    491 	ldd		-0x78(%r30), p032a1	C read each pending mid product before its slot is reused
    492 	xmpyu		%fr8L, %fr4R, %fr23
    493 	fldd		24(up), %fr7
    494 
    495 	xmpyu		%fr8R, %fr5L, %fr24
    496 	ldd		-0x70(%r30), p032a2
    497 	xmpyu		%fr8L, %fr5R, %fr25
    498 	std		s192, 24(rp)
    499 
    500 	xmpyu		%fr8R, %fr6L, %fr26
    501 	ldd		-0x38(%r30), p096b1
    502 	xmpyu		%fr8L, %fr6R, %fr27
    503 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    504 
    505 	xmpyu		%fr8R, %fr7L, %fr28
    506 	ldd		-0x30(%r30), p096b2
    507 	xmpyu		%fr8L, %fr7R, %fr29
    508 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    509 
    510 	xmpyu		%fr8R, %fr4R, %fr30
    511 	ldd		-0x58(%r30), p160c1
    512 	xmpyu		%fr8L, %fr4L, %fr31
    513 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    514 
    515 	xmpyu		%fr8R, %fr5R, %fr22
    516 	ldd		-0x50(%r30), p160c2
    517 	xmpyu		%fr8L, %fr5L, %fr23
    518 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    519 
    520 	xmpyu		%fr8R, %fr6R, %fr24
    521 	ldd		-0x18(%r30), p224d1
    522 	xmpyu		%fr8L, %fr6L, %fr25
    523 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    524 
    525 	xmpyu		%fr8R, %fr7R, %fr26
    526 	ldd		-0x10(%r30), p224d2
    527 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    528 	xmpyu		%fr8L, %fr7L, %fr27
    529 
    530 	addib,<>	-1, n, L(loop)		C loop while groups of four remain
    531 	ldo		32(rp), rp		C delay slot, step rp past the four limbs just stored
    532 
    533 LDEF(end2)
    534 	add		p032a1, p032a2, m032	C start the mid product chain for the second to last group
    535 	ldd		-0x80(%r30), p000a
    536 	add,dc		p096b1, p096b2, m096
    537 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    538 	add,dc		p160c1, p160c2, m160
    539 	ldd		-0x68(%r30), p064a
    540 	add,dc		p224d1, p224d2, m224
    541 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    542 	add,dc		%r0, %r0, m288		C carry out of the mid product chain
    543 	ldd		-0x40(%r30), p064b
    544 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    545 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
    546 	ldd		-0x28(%r30), p128b
    547 	extrd,u		m032, 31, 32, ma064
    548 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    549 	depd		m096, 31, 32, ma064
    550 	ldd		-0x60(%r30), p128c
    551 	extrd,u		m096, 31, 32, ma128
    552 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    553 	depd		m160, 31, 32, ma128
    554 	ldd		-0x48(%r30), p192c
    555 	extrd,u		m160, 31, 32, ma192
    556 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    557 	depd		m224, 31, 32, ma192
    558 	ldd		-0x20(%r30), p192d
    559 	extrd,u		m224, 31, 32, ma256
    560 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    561 	depd		m288, 31, 32, ma256
    562 	ldd		-0x88(%r30), p256d
    563 	add		climb, p000a, s000	C start the low and high product chain with the carry limb
    564 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    565 	add,dc		p064a, p064b, s064
    566 	add,dc		p128b, p128c, s128
    567 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    568 	add,dc		p192c, p192d, s192
    569 	add,dc		p256d, %r0, climb	C top high product plus carry becomes the new carry limb
    570 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    571 	add		ma000, s000, s000	C accum mid 0
    572 	add,dc		ma064, s064, s064	C accum mid 1
    573 	add,dc		ma128, s128, s128	C accum mid 2
    574 	add,dc		ma192, s192, s192	C accum mid 3
    575 	add,dc		ma256, climb, climb	C top of the mid sums goes into the carry limb
    576 	std		s000, 0(rp)
    577 	std		s064, 8(rp)
    578 	ldd		-0x78(%r30), p032a1	C fetch the mid products of the last group for end1
    579 	std		s128, 16(rp)
    580 	ldd		-0x70(%r30), p032a2
    581 	std		s192, 24(rp)
    582 	ldd		-0x38(%r30), p096b1
    583 	ldd		-0x30(%r30), p096b2
    584 	ldd		-0x58(%r30), p160c1
    585 	ldd		-0x50(%r30), p160c2
    586 	ldd		-0x18(%r30), p224d1
    587 	ldd		-0x10(%r30), p224d2
    588 	ldo		32(rp), rp		C step rp past the four limbs just stored
    589 
    590 LDEF(end1)
    591 	add		p032a1, p032a2, m032	C start the mid product chain for the last group
    592 	ldd		-0x80(%r30), p000a
    593 	add,dc		p096b1, p096b2, m096
    594 	add,dc		p160c1, p160c2, m160
    595 	ldd		-0x68(%r30), p064a
    596 	add,dc		p224d1, p224d2, m224
    597 	add,dc		%r0, %r0, m288		C carry out of the mid product chain
    598 	ldd		-0x40(%r30), p064b
    599 	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
    600 	ldd		-0x28(%r30), p128b
    601 	extrd,u		m032, 31, 32, ma064
    602 	depd		m096, 31, 32, ma064
    603 	ldd		-0x60(%r30), p128c
    604 	extrd,u		m096, 31, 32, ma128
    605 	depd		m160, 31, 32, ma128
    606 	ldd		-0x48(%r30), p192c
    607 	extrd,u		m160, 31, 32, ma192
    608 	depd		m224, 31, 32, ma192
    609 	ldd		-0x20(%r30), p192d
    610 	extrd,u		m224, 31, 32, ma256
    611 	depd		m288, 31, 32, ma256
    612 	ldd		-0x88(%r30), p256d
    613 	add		climb, p000a, s000	C start the low and high product chain with the carry limb
    614 	add,dc		p064a, p064b, s064
    615 	add,dc		p128b, p128c, s128
    616 	add,dc		p192c, p192d, s192
    617 	add,dc		p256d, %r0, climb	C top high product plus carry becomes the new carry limb
    618 	add		ma000, s000, s000	C accum mid 0
    619 	add,dc		ma064, s064, s064	C accum mid 1
    620 	add,dc		ma128, s128, s128	C accum mid 2
    621 	add,dc		ma192, s192, s192	C accum mid 3
    622 	add,dc		ma256, climb, climb	C top of the mid sums goes into the carry limb
    623 	std		s000, 0(rp)
    624 	std		s064, 8(rp)
    625 	std		s128, 16(rp)
    626 	std		s192, 24(rp)
    627 
    628 	ldd		-0xb0(%r30), %r13	C restore r13 down to r6, saved at BIG
    629 	ldd		-0xb8(%r30), %r12
    630 	ldd		-0xc0(%r30), %r11
    631 	ldd		-0xc8(%r30), %r10
    632 	ldd		-0xd0(%r30), %r9
    633 	ldd		-0xd8(%r30), %r8
    634 	ldd		-0xe0(%r30), %r7
    635 	ldd		-0xe8(%r30), %r6
    636 LDEF(done)
    637 ifdef(`HAVE_ABI_2_0w',
    638 `	copy		climb, %r28		C return the carry limb in r28
    639 ',`	extrd,u		climb, 63, 32, %r29	C low 32 bits of the carry limb to r29
    640 	extrd,u		climb, 31, 32, %r28	C high 32 bits of the carry limb to r28
    641 ')
    642 	ldd		-0xf0(%r30), %r5	C restore r5
    643 	ldd		-0xf8(%r30), %r4	C restore r4
    644 	bve		(%r2)
    645 	ldd,mb		-0x100(%r30), %r3	C delay slot, release the frame and restore r3
    646 EPILOGUE(mpn_mul_1)
    647