Home | History | Annotate | Line # | Download | only in pa64
submul_1.asm revision 1.1
      1 dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
      2 dnl  subtract the result from a second limb vector.
      3 
      4 dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of the GNU Lesser General Public License as published
     10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11 dnl  your option) any later version.
     12 
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16 dnl  License for more details.
     17 
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 C		    cycles/limb
     24 C 8000,8200:		7
     25 C 8500,8600,8700:	6.5
     26 
     27 C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
     28 C  could be saved there per call.
     29 
     30 C  DESCRIPTION:
     31 C  The main loop (under the label "BIG") is 4-way unrolled, mainly to allow
     32 C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
     33 C  registers to the IU registers have demanded a deep software pipeline and
     34 C  a lot of stack slots for partial products in flight.
     35 C
     36 C  CODE STRUCTURE:
     37 C  save-some-registers
     38 C  do 0, 1, 2, or 3 limbs
     39 C  if done, restore-some-regs and return
     40 C  save-many-regs
     41 C  do 4, 8, ... limbs
     42 C  restore-all-regs
     43 
     44 C  STACK LAYOUT:
     45 C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
     46 C  slots marked FREE, as well as some slots in the caller's "frame marker".
     47 C
     48 C -00 <- r30
     49 C -08  FREE
     50 C -10  tmp
     51 C -18  tmp
     52 C -20  tmp
     53 C -28  tmp
     54 C -30  tmp
     55 C -38  tmp
     56 C -40  tmp
     57 C -48  tmp
     58 C -50  tmp
     59 C -58  tmp
     60 C -60  tmp
     61 C -68  tmp
     62 C -70  tmp
     63 C -78  tmp
     64 C -80  tmp
     65 C -88  tmp
     66 C -90  FREE
     67 C -98  FREE
     68 C -a0  FREE
     69 C -a8  FREE
     70 C -b0  r13
     71 C -b8  r12
     72 C -c0  r11
     73 C -c8  r10
     74 C -d0  r9
     75 C -d8  r8
     76 C -e0  r7
     77 C -e8  r6
     78 C -f0  r5
     79 C -f8  r4
     80 C -100 r3
     81 C  Previous frame:
     82 C  [unused area]
     83 C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
     84 
     85 
     86 include(`../config.m4')
     87 
     88 C INPUT PARAMETERS:
     89 define(`rp',`%r26')	C limb vector that is loaded, has the product subtracted, and is stored back
     90 define(`up',`%r25')	C limb vector whose limbs are multiplied by vlimb
     91 define(`n',`%r24')	C limb count; reused as the 4-limb group counter in the unrolled code
     92 define(`vlimb',`%r23')	C multiplier limb; the code uses it from %fr8 after a load from its home slot
     93 
     94 define(`climb',`%r23')	C running carry limb, returned at the end; shares %r23 with vlimb
     95 
     96 ifdef(`HAVE_ABI_2_0w',
     97 `	.level	2.0w
     98 ',`	.level	2.0
     99 ')
        C  mpn_submul_1: multiply the n limbs at up by vlimb, subtract the product
        C  from the n limbs at rp (storing the result back to rp), and return the
        C  final carry limb (climb).
    100 PROLOGUE(mpn_submul_1)
    101 
        C  With the 2.0w ABI vlimb is in a register, so it is first written to the
        C  home slot; the fldd below then picks it up from memory in both ABIs.
    102 ifdef(`HAVE_ABI_2_0w',
    103 `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
    104 ')
    105 	std,ma		%r3, 0x100(%r30)	C save %r3 and advance %r30 by 0x100 (frame allocation)
    106 	std		%r4, -0xf8(%r30)	C save %r4
    107 	std		%r5, -0xf0(%r30)	C save %r5
    108 	ldo		0(%r0), climb		C clear climb
    109 	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
    110 
        C  Register names used by the feed-in code (the n mod 4 leftover limbs).
    111 define(`p032a1',`%r1')	C first mid product (vlimb low half * limb high half)
    112 define(`p032a2',`%r19')	C second mid product (vlimb high half * limb low half)
    113 
    114 define(`m032',`%r20')	C sum of the two mid products
    115 define(`m096',`%r21')	C carry out of that sum
    116 
    117 define(`p000a',`%r22')	C low product
    118 define(`p064a',`%r29')	C high product
    119 
    120 define(`s000',`%r31')	C limb being assembled for the store to rp
    121 
    122 define(`ma000',`%r4')	C low 32 bits of m032, moved up to the upper half
    123 define(`ma064',`%r20')	C upper bits of m096:m032; overwrites m032 (same register)
    124 
    125 define(`r000',`%r3')	C limb loaded from rp
    126 
    127 	extrd,u		n, 63, 2, %r5		C %r5 = low two bits of n, i.e. n mod 4
    128 	cmpb,=		%r5, %r0, L(BIG)	C no leftover limbs: go directly to the unrolled code
    129 	nop					C branch delay slot
    130 
    131 	fldd		0(up), %fr4		C load the first leftover limb
    132 	ldo		8(up), up		C advance up past it
    133 	xmpyu		%fr8R, %fr4L, %fr22
    134 	xmpyu		%fr8L, %fr4R, %fr23
    135 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    136 	xmpyu		%fr8R, %fr4R, %fr24
    137 	xmpyu		%fr8L, %fr4L, %fr25
    138 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    139 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    140 	addib,<>	-1, %r5, L(two_or_more)	C decrement %r5; branch if leftover limbs remain
    141 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    142 LDEF(one)				C exactly one leftover limb
        C  Move the four partial products from the stack slots into integer
        C  registers and join the common single-limb tail.
    143 	ldd		-0x78(%r30), p032a1
    144 	ldd		-0x70(%r30), p032a2
    145 	ldd		-0x80(%r30), p000a
    146 	b		L(0_one_out)
    147 	ldd		-0x68(%r30), p064a	C issued in the branch delay slot
    148 
    149 LDEF(two_or_more)
        C  Multiply the second leftover limb.  Each ldd reads a partial product of
        C  the first limb from its stack slot just before the following fstd
        C  overwrites that slot with the matching product of the second limb.
    150 	fldd		0(up), %fr4
    151 	ldo		8(up), up
    152 	xmpyu		%fr8R, %fr4L, %fr22
    153 	xmpyu		%fr8L, %fr4R, %fr23
    154 	ldd		-0x78(%r30), p032a1
    155 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    156 	xmpyu		%fr8R, %fr4R, %fr24
    157 	xmpyu		%fr8L, %fr4L, %fr25
    158 	ldd		-0x70(%r30), p032a2
    159 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    160 	ldd		-0x80(%r30), p000a
    161 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    162 	ldd		-0x68(%r30), p064a
    163 	addib,<>	-1, %r5, L(three_or_more)	C decrement %r5; branch if a third limb remains
    164 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    165 LDEF(two)				C exactly two leftover limbs
    166 	add		p032a1, p032a2, m032	C sum the two mid products of the first limb
    167 	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
    168 	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 in the upper 32 bits, rest zero
    169 	extrd,u		m032, 31, 32, ma064	C ma064 = upper half of m032, zero extended
    170 	ldd		0(rp), r000		C limb to subtract from
    171 	b		L(0_two_out)
    172 	depd		m096, 31, 32, ma064	C delay slot: insert the carry above the mid-sum bits
    173 
    174 LDEF(three_or_more)
        C  Reached only when %r5 was still nonzero after two decrements, i.e. when
        C  n mod 4 is 3.  Load the third limb and form the aligned mid-product sum
        C  of the first limb.
    175 	fldd		0(up), %fr4
    176 	add		p032a1, p032a2, m032
    177 	add,dc		%r0, %r0, m096
    178 	depd,z		m032, 31, 32, ma000
    179 	extrd,u		m032, 31, 32, ma064
    180 	ldd		0(rp), r000
    181 C	addib,=		-1, %r5, L(0_out)
    182 	depd		m096, 31, 32, ma064
    183 LDEF(loop0)				C loop body below is commented out; control falls through to 0_out
    184 C	xmpyu		%fr8R, %fr4L, %fr22
    185 C	xmpyu		%fr8L, %fr4R, %fr23
    186 C	ldd		-0x78(%r30), p032a1
    187 C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    188 C
    189 C	xmpyu		%fr8R, %fr4R, %fr24
    190 C	xmpyu		%fr8L, %fr4L, %fr25
    191 C	ldd		-0x70(%r30), p032a2
    192 C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    193 C
    194 C	ldo		8(rp), rp
    195 C	add		climb, p000a, s000
    196 C	ldd		-0x80(%r30), p000a
    197 C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    198 C
    199 C	add,dc		p064a, %r0, climb
    200 C	ldo		8(up), up
    201 C	ldd		-0x68(%r30), p064a
    202 C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    203 C
    204 C	add		ma000, s000, s000
    205 C	add,dc		ma064, climb, climb
    206 C	fldd		0(up), %fr4
    207 C
    208 C	sub		r000, s000, s000
    209 C	sub,db		%r0, climb, climb
    210 C	sub		%r0, climb, climb
    211 C	std		s000, -8(rp)
    212 C
    213 C	add		p032a1, p032a2, m032
    214 C	add,dc		%r0, %r0, m096
    215 C
    216 C	depd,z		m032, 31, 32, ma000
    217 C	extrd,u		m032, 31, 32, ma064
    218 C	ldd		0(rp), r000
    219 C	addib,<>	-1, %r5, L(loop0)
    220 C	depd		m096, 31, 32, ma064
    221 LDEF(0_out)
        C  Multiply the third limb, finish and store the first limb, and prepare
        C  the aligned mid-product sum of the second limb.
    222 	ldo		8(up), up
    223 	xmpyu		%fr8R, %fr4L, %fr22
    224 	xmpyu		%fr8L, %fr4R, %fr23
    225 	ldd		-0x78(%r30), p032a1
    226 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    227 	xmpyu		%fr8R, %fr4R, %fr24
    228 	xmpyu		%fr8L, %fr4L, %fr25
    229 	ldd		-0x70(%r30), p032a2
    230 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    231 	ldo		8(rp), rp
    232 	add		climb, p000a, s000	C incoming carry + low product
    233 	ldd		-0x80(%r30), p000a
    234 	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
    235 	add,dc		p064a, %r0, climb	C new carry = high product + carry bit
    236 	ldd		-0x68(%r30), p064a
    237 	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
    238 	add		ma000, s000, s000	C add the aligned mid-product sum
    239 	add,dc		ma064, climb, climb
    240 	sub		r000, s000, s000	C s000 = limb from rp minus the product limb
    241 	sub,db		%r0, climb, climb	C these two subtractions from zero negate climb
    242 	sub		%r0, climb, climb	C twice, folding the borrow of the limb subtraction in
    243 	std		s000, -8(rp)		C rp was already advanced, hence the -8 offset
    244 	add		p032a1, p032a2, m032
    245 	add,dc		%r0, %r0, m096
    246 	depd,z		m032, 31, 32, ma000
    247 	extrd,u		m032, 31, 32, ma064
    248 	ldd		0(rp), r000
    249 	depd		m096, 31, 32, ma064
    250 LDEF(0_two_out)			C two leftover limbs still to finish
        C  Finish and store the older limb while reading the last limb's partial
        C  products from the stack slots.
    251 	ldd		-0x78(%r30), p032a1
    252 	ldd		-0x70(%r30), p032a2
    253 	ldo		8(rp), rp
    254 	add		climb, p000a, s000
    255 	ldd		-0x80(%r30), p000a
    256 	add,dc		p064a, %r0, climb
    257 	ldd		-0x68(%r30), p064a
    258 	add		ma000, s000, s000
    259 	add,dc		ma064, climb, climb
    260 	sub		r000, s000, s000
    261 	sub,db		%r0, climb, climb
    262 	sub		%r0, climb, climb
    263 	std		s000, -8(rp)
    264 LDEF(0_one_out)			C one leftover limb still to finish
    265 	add		p032a1, p032a2, m032	C sum the two mid products
    266 	add,dc		%r0, %r0, m096		C carry out of that sum
    267 	depd,z		m032, 31, 32, ma000
    268 	extrd,u		m032, 31, 32, ma064
    269 	ldd		0(rp), r000
    270 	depd		m096, 31, 32, ma064
    271 
    272 	add		climb, p000a, s000
    273 	add,dc		p064a, %r0, climb
    274 	add		ma000, s000, s000
    275 	add,dc		ma064, climb, climb
    276 	sub		r000, s000, s000
    277 	sub,db		%r0, climb, climb
    278 	sub		%r0, climb, climb
    279 	std		s000, 0(rp)
    280 
        C  n mod 4 is nonzero on this path, so 4 >= n means all limbs are done and
        C  only %r3-%r5 need restoring.
    281 	cmpib,>=	4, n, L(done)
    282 	ldo		8(rp), rp		C delay slot: step rp past the limb just stored
    283 
    284 C 4-way unrolled code.
    285 
    286 LDEF(BIG)
    287 
        C  Register names are redefined for the 4-way unrolled code.  Several names
        C  share one register; each is live in a different phase of an iteration.
        C  Suffixes a, b, c, d refer to the four limbs of a group.
    288 define(`p032a1',`%r1')	C mid products of limb a
    289 define(`p032a2',`%r19')	C
    290 define(`p096b1',`%r20')	C mid products of limb b
    291 define(`p096b2',`%r21')	C
    292 define(`p160c1',`%r22')	C mid products of limb c
    293 define(`p160c2',`%r29')	C
    294 define(`p224d1',`%r31')	C mid products of limb d
    295 define(`p224d2',`%r3')	C
    296 			C
    297 define(`m032',`%r4')	C sums of the mid-product pairs, chained by carry
    298 define(`m096',`%r5')	C
    299 define(`m160',`%r6')	C
    300 define(`m224',`%r7')	C
    301 define(`m288',`%r8')	C carry out of the last mid-product sum
    302 			C
    303 define(`p000a',`%r1')	C low product of limb a
    304 define(`p064a',`%r19')	C high product of limb a
    305 define(`p064b',`%r20')	C low product of limb b
    306 define(`p128b',`%r21')	C high product of limb b
    307 define(`p128c',`%r22')	C low product of limb c
    308 define(`p192c',`%r29')	C high product of limb c
    309 define(`p192d',`%r31')	C low product of limb d
    310 define(`p256d',`%r3')	C high product of limb d
    311 			C
    312 define(`s000',`%r10')	C the four limbs being assembled for the store to rp
    313 define(`s064',`%r11')	C
    314 define(`s128',`%r12')	C
    315 define(`s192',`%r13')	C
    316 			C
    317 define(`ma000',`%r9')	C mid-product sums realigned by 32 bits to limb boundaries
    318 define(`ma064',`%r4')	C
    319 define(`ma128',`%r5')	C
    320 define(`ma192',`%r6')	C
    321 define(`ma256',`%r7')	C
    322 			C
    323 define(`r000',`%r1')	C the four limbs loaded from rp
    324 define(`r064',`%r19')	C
    325 define(`r128',`%r20')	C
    326 define(`r192',`%r21')	C
    327 
        C  Save the additional registers the unrolled code uses (restored after
        C  end1).
    328 	std		%r6, -0xe8(%r30)
    329 	std		%r7, -0xe0(%r30)
    330 	std		%r8, -0xd8(%r30)
    331 	std		%r9, -0xd0(%r30)
    332 	std		%r10, -0xc8(%r30)
    333 	std		%r11, -0xc0(%r30)
    334 	std		%r12, -0xb8(%r30)
    335 	std		%r13, -0xb0(%r30)
    336 
        C  From here on n counts groups of four limbs.
    337 ifdef(`HAVE_ABI_2_0w',
    338 `	extrd,u		n, 61, 62, n		C right shift 2
    339 ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
    340 ')
    341 
    342 LDEF(4_or_more)
        C  Load the first group of four limbs and form all sixteen 32x32 partial
        C  products.  %fr22-%fr27 are stored and then reused, so the first six
        C  mid products go to the stack before the low/high products are computed.
    343 	fldd		0(up), %fr4
    344 	fldd		8(up), %fr5
    345 	fldd		16(up), %fr6
    346 	fldd		24(up), %fr7
    347 	xmpyu		%fr8R, %fr4L, %fr22
    348 	xmpyu		%fr8L, %fr4R, %fr23
    349 	xmpyu		%fr8R, %fr5L, %fr24
    350 	xmpyu		%fr8L, %fr5R, %fr25
    351 	xmpyu		%fr8R, %fr6L, %fr26
    352 	xmpyu		%fr8L, %fr6R, %fr27
    353 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    354 	xmpyu		%fr8R, %fr7L, %fr28
    355 	xmpyu		%fr8L, %fr7R, %fr29
    356 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    357 	xmpyu		%fr8R, %fr4R, %fr30
    358 	xmpyu		%fr8L, %fr4L, %fr31
    359 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    360 	xmpyu		%fr8R, %fr5R, %fr22
    361 	xmpyu		%fr8L, %fr5L, %fr23
    362 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    363 	xmpyu		%fr8R, %fr6R, %fr24
    364 	xmpyu		%fr8L, %fr6L, %fr25
    365 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    366 	xmpyu		%fr8R, %fr7R, %fr26
    367 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    368 	addib,<>	-1, n, L(8_or_more)	C decrement group count; branch if more groups follow
    369 	xmpyu		%fr8L, %fr7L, %fr27	C delay slot: last partial product of the group
        C  Only one group: store the remaining products, load the eight mid
        C  products into integer registers, and go to the final wind-down.
    370 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    371 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    372 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    373 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    374 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    375 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    376 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    377 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    378 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    379 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    380 	ldd		-0x78(%r30), p032a1
    381 	ldd		-0x70(%r30), p032a2
    382 	ldd		-0x38(%r30), p096b1
    383 	ldd		-0x30(%r30), p096b2
    384 	ldd		-0x58(%r30), p160c1
    385 	ldd		-0x50(%r30), p160c2
    386 	ldd		-0x18(%r30), p224d1
    387 	ldd		-0x10(%r30), p224d2
    388 	b		L(end1)
    389 	nop					C branch delay slot
    390 
    391 LDEF(8_or_more)
        C  At least two groups.  Store the rest of the first group's products,
        C  then multiply the second group.  Each ldd below reads a first-group mid
        C  product before the fstd that overwrites the same slot with the
        C  second-group value.
    392 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    393 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    394 	ldo		32(up), up		C step up to the next group of four limbs
    395 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    396 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    397 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    398 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    399 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    400 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    401 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    402 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    403 	fldd		0(up), %fr4
    404 	fldd		8(up), %fr5
    405 	fldd		16(up), %fr6
    406 	fldd		24(up), %fr7
    407 	xmpyu		%fr8R, %fr4L, %fr22
    408 	ldd		-0x78(%r30), p032a1
    409 	xmpyu		%fr8L, %fr4R, %fr23
    410 	xmpyu		%fr8R, %fr5L, %fr24
    411 	ldd		-0x70(%r30), p032a2
    412 	xmpyu		%fr8L, %fr5R, %fr25
    413 	xmpyu		%fr8R, %fr6L, %fr26
    414 	ldd		-0x38(%r30), p096b1
    415 	xmpyu		%fr8L, %fr6R, %fr27
    416 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    417 	xmpyu		%fr8R, %fr7L, %fr28
    418 	ldd		-0x30(%r30), p096b2
    419 	xmpyu		%fr8L, %fr7R, %fr29
    420 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    421 	xmpyu		%fr8R, %fr4R, %fr30
    422 	ldd		-0x58(%r30), p160c1
    423 	xmpyu		%fr8L, %fr4L, %fr31
    424 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    425 	xmpyu		%fr8R, %fr5R, %fr22
    426 	ldd		-0x50(%r30), p160c2
    427 	xmpyu		%fr8L, %fr5L, %fr23
    428 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    429 	xmpyu		%fr8R, %fr6R, %fr24
    430 	ldd		-0x18(%r30), p224d1
    431 	xmpyu		%fr8L, %fr6L, %fr25
    432 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    433 	xmpyu		%fr8R, %fr7R, %fr26
    434 	ldd		-0x10(%r30), p224d2
    435 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    436 	addib,=		-1, n, L(end2)		C decrement group count; exactly two groups skips the loop
    437 	xmpyu		%fr8L, %fr7L, %fr27	C delay slot: last partial product of the group
    438 LDEF(loop)
        C  One iteration works on three groups at once:
        C   - the integer unit combines the oldest group's partial products,
        C     subtracts them from four limbs at rp and stores the result;
        C   - the products still in FP registers (the following group) are stored
        C     to the stack slots as soon as the older values have been read;
        C   - four new limbs are loaded from up and multiplied.
        C  Every ldd of a slot therefore precedes the fstd that overwrites it.
    439 	add		p032a1, p032a2, m032
    440 	ldd		-0x80(%r30), p000a
    441 	add,dc		p096b1, p096b2, m096
    442 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    443 
    444 	add,dc		p160c1, p160c2, m160
    445 	ldd		-0x68(%r30), p064a
    446 	add,dc		p224d1, p224d2, m224
    447 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    448 
    449 	add,dc		%r0, %r0, m288		C carry out of the mid-product sums
    450 	ldd		-0x40(%r30), p064b
    451 	ldo		32(up), up		C step up to the limbs loaded later in this iteration
    452 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    453 
        C  Realign the mid-product sums by 32 bits: each maNNN takes the upper half
        C  of one sum in its low half and the low half of the next sum above it.
    454 	depd,z		m032, 31, 32, ma000
    455 	ldd		-0x28(%r30), p128b
    456 	extrd,u		m032, 31, 32, ma064
    457 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    458 
    459 	depd		m096, 31, 32, ma064
    460 	ldd		-0x60(%r30), p128c
    461 	extrd,u		m096, 31, 32, ma128
    462 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    463 
    464 	depd		m160, 31, 32, ma128
    465 	ldd		-0x48(%r30), p192c
    466 	extrd,u		m160, 31, 32, ma192
    467 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    468 
    469 	depd		m224, 31, 32, ma192
    470 	ldd		-0x20(%r30), p192d
    471 	extrd,u		m224, 31, 32, ma256
    472 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    473 
    474 	depd		m288, 31, 32, ma256
    475 	ldd		-0x88(%r30), p256d
    476 	add		climb, p000a, s000	C incoming carry + low product of limb a
    477 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    478 
    479 	add,dc		p064a, p064b, s064	C high product of one limb + low product of the next
    480 	ldd		0(rp), r000
    481 	add,dc		p128b, p128c, s128
    482 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    483 
    484 	add,dc		p192c, p192d, s192
    485 	ldd		8(rp), r064
    486 	add,dc		p256d, %r0, climb	C new carry = high product of limb d + carry bit
    487 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    488 
    489 	ldd		16(rp), r128
    490 	add		ma000, s000, s000	C accum mid 0
    491 	ldd		24(rp), r192
    492 	add,dc		ma064, s064, s064	C accum mid 1
    493 
    494 	add,dc		ma128, s128, s128	C accum mid 2
    495 	fldd		0(up), %fr4
    496 	add,dc		ma192, s192, s192	C accum mid 3
    497 	fldd		8(up), %fr5
    498 
    499 	add,dc		ma256, climb, climb
    500 	fldd		16(up), %fr6
    501 	sub		r000, s000, s000	C accum rlimb 0
    502 	fldd		24(up), %fr7
    503 
    504 	sub,db		r064, s064, s064	C accum rlimb 1
    505 	sub,db		r128, s128, s128	C accum rlimb 2
    506 	std		s000, 0(rp)
    507 
    508 	sub,db		r192, s192, s192	C accum rlimb 3
    509 	sub,db		%r0, climb, climb	C negate climb twice so that the borrow of the
    510 	sub		%r0, climb, climb	C limb subtractions is added into it
    511 	std		s064, 8(rp)
    512 
        C  Multiply the newly loaded limbs.  The ldd instructions fetch the mid
        C  products consumed by the next iteration before their slots are reused.
    513 	xmpyu		%fr8R, %fr4L, %fr22
    514 	ldd		-0x78(%r30), p032a1
    515 	xmpyu		%fr8L, %fr4R, %fr23
    516 	std		s128, 16(rp)
    517 
    518 	xmpyu		%fr8R, %fr5L, %fr24
    519 	ldd		-0x70(%r30), p032a2
    520 	xmpyu		%fr8L, %fr5R, %fr25
    521 	std		s192, 24(rp)
    522 
    523 	xmpyu		%fr8R, %fr6L, %fr26
    524 	ldd		-0x38(%r30), p096b1
    525 	xmpyu		%fr8L, %fr6R, %fr27
    526 	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
    527 
    528 	xmpyu		%fr8R, %fr7L, %fr28
    529 	ldd		-0x30(%r30), p096b2
    530 	xmpyu		%fr8L, %fr7R, %fr29
    531 	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
    532 
    533 	xmpyu		%fr8R, %fr4R, %fr30
    534 	ldd		-0x58(%r30), p160c1
    535 	xmpyu		%fr8L, %fr4L, %fr31
    536 	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
    537 
    538 	xmpyu		%fr8R, %fr5R, %fr22
    539 	ldd		-0x50(%r30), p160c2
    540 	xmpyu		%fr8L, %fr5L, %fr23
    541 	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
    542 
    543 	xmpyu		%fr8R, %fr6R, %fr24
    544 	ldd		-0x18(%r30), p224d1
    545 	xmpyu		%fr8L, %fr6L, %fr25
    546 	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
    547 
    548 	xmpyu		%fr8R, %fr7R, %fr26
    549 	ldd		-0x10(%r30), p224d2
    550 	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
    551 	xmpyu		%fr8L, %fr7L, %fr27
    552 
    553 	addib,<>	-1, n, L(loop)		C decrement group count; repeat while groups remain
    554 	ldo		32(rp), rp		C delay slot: step rp past the four limbs just stored
    555 
    556 LDEF(end2)
        C  Wind-down, second to last group: same integer work as the loop body,
        C  but no further limbs are loaded or multiplied.  The last group's
        C  products are moved from the FP registers to the stack slots and its
        C  mid products are loaded for end1.
    557 	add		p032a1, p032a2, m032
    558 	ldd		-0x80(%r30), p000a
    559 	add,dc		p096b1, p096b2, m096
    560 	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
    561 	add,dc		p160c1, p160c2, m160
    562 	ldd		-0x68(%r30), p064a
    563 	add,dc		p224d1, p224d2, m224
    564 	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
    565 	add,dc		%r0, %r0, m288
    566 	ldd		-0x40(%r30), p064b
    567 	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
    568 	depd,z		m032, 31, 32, ma000
    569 	ldd		-0x28(%r30), p128b
    570 	extrd,u		m032, 31, 32, ma064
    571 	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
    572 	depd		m096, 31, 32, ma064
    573 	ldd		-0x60(%r30), p128c
    574 	extrd,u		m096, 31, 32, ma128
    575 	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
    576 	depd		m160, 31, 32, ma128
    577 	ldd		-0x48(%r30), p192c
    578 	extrd,u		m160, 31, 32, ma192
    579 	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
    580 	depd		m224, 31, 32, ma192
    581 	ldd		-0x20(%r30), p192d
    582 	extrd,u		m224, 31, 32, ma256
    583 	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
    584 	depd		m288, 31, 32, ma256
    585 	ldd		-0x88(%r30), p256d
    586 	add		climb, p000a, s000
    587 	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
    588 	add,dc		p064a, p064b, s064
    589 	ldd		0(rp), r000
    590 	add,dc		p128b, p128c, s128
    591 	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
    592 	add,dc		p192c, p192d, s192
    593 	ldd		8(rp), r064
    594 	add,dc		p256d, %r0, climb
    595 	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
    596 	ldd		16(rp), r128
    597 	add		ma000, s000, s000	C accum mid 0
    598 	ldd		24(rp), r192
    599 	add,dc		ma064, s064, s064	C accum mid 1
    600 	add,dc		ma128, s128, s128	C accum mid 2
    601 	add,dc		ma192, s192, s192	C accum mid 3
    602 	add,dc		ma256, climb, climb
    603 	sub		r000, s000, s000	C accum rlimb 0
    604 	sub,db		r064, s064, s064	C accum rlimb 1
    605 	sub,db		r128, s128, s128	C accum rlimb 2
    606 	std		s000, 0(rp)
    607 	sub,db		r192, s192, s192	C accum rlimb 3
    608 	sub,db		%r0, climb, climb	C negate climb twice so that the borrow of the
    609 	sub		%r0, climb, climb	C limb subtractions is added into it
    610 	std		s064, 8(rp)
    611 	ldd		-0x78(%r30), p032a1
    612 	std		s128, 16(rp)
    613 	ldd		-0x70(%r30), p032a2
    614 	std		s192, 24(rp)
    615 	ldd		-0x38(%r30), p096b1
    616 	ldd		-0x30(%r30), p096b2
    617 	ldd		-0x58(%r30), p160c1
    618 	ldd		-0x50(%r30), p160c2
    619 	ldd		-0x18(%r30), p224d1
    620 	ldd		-0x10(%r30), p224d2
    621 	ldo		32(rp), rp		C step rp to the last group of four limbs
    622 
    623 LDEF(end1)
        C  Wind-down, last group: all sixteen partial products are on the stack
        C  (mid products already in registers).  Combine them, subtract from the
        C  last four limbs at rp, store, and leave the final carry in climb.
    624 	add		p032a1, p032a2, m032
    625 	ldd		-0x80(%r30), p000a
    626 	add,dc		p096b1, p096b2, m096
    627 	add,dc		p160c1, p160c2, m160
    628 	ldd		-0x68(%r30), p064a
    629 	add,dc		p224d1, p224d2, m224
    630 	add,dc		%r0, %r0, m288
    631 	ldd		-0x40(%r30), p064b
    632 	depd,z		m032, 31, 32, ma000
    633 	ldd		-0x28(%r30), p128b
    634 	extrd,u		m032, 31, 32, ma064
    635 	depd		m096, 31, 32, ma064
    636 	ldd		-0x60(%r30), p128c
    637 	extrd,u		m096, 31, 32, ma128
    638 	depd		m160, 31, 32, ma128
    639 	ldd		-0x48(%r30), p192c
    640 	extrd,u		m160, 31, 32, ma192
    641 	depd		m224, 31, 32, ma192
    642 	ldd		-0x20(%r30), p192d
    643 	extrd,u		m224, 31, 32, ma256
    644 	depd		m288, 31, 32, ma256
    645 	ldd		-0x88(%r30), p256d
    646 	add		climb, p000a, s000
    647 	add,dc		p064a, p064b, s064
    648 	ldd		0(rp), r000
    649 	add,dc		p128b, p128c, s128
    650 	add,dc		p192c, p192d, s192
    651 	ldd		8(rp), r064
    652 	add,dc		p256d, %r0, climb
    653 	ldd		16(rp), r128
    654 	add		ma000, s000, s000	C accum mid 0
    655 	ldd		24(rp), r192
    656 	add,dc		ma064, s064, s064	C accum mid 1
    657 	add,dc		ma128, s128, s128	C accum mid 2
    658 	add,dc		ma192, s192, s192	C accum mid 3
    659 	add,dc		ma256, climb, climb
    660 	sub		r000, s000, s000	C accum rlimb 0
    661 	sub,db		r064, s064, s064	C accum rlimb 1
    662 	sub,db		r128, s128, s128	C accum rlimb 2
    663 	std		s000, 0(rp)
    664 	sub,db		r192, s192, s192	C accum rlimb 3
    665 	sub,db		%r0, climb, climb	C negate climb twice so that the borrow of the
    666 	sub		%r0, climb, climb	C limb subtractions is added into it
    667 	std		s064, 8(rp)
    668 	std		s128, 16(rp)
    669 	std		s192, 24(rp)
    670 
        C  Restore the registers saved at BIG, in reverse order of the saves.
    671 	ldd		-0xb0(%r30), %r13
    672 	ldd		-0xb8(%r30), %r12
    673 	ldd		-0xc0(%r30), %r11
    674 	ldd		-0xc8(%r30), %r10
    675 	ldd		-0xd0(%r30), %r9
    676 	ldd		-0xd8(%r30), %r8
    677 	ldd		-0xe0(%r30), %r7
    678 	ldd		-0xe8(%r30), %r6
    679 LDEF(done)				C common exit; also reached directly when n < 4
        C  Return climb.  With the 2.0w ABI it is copied whole into %r28; otherwise
        C  its low 32 bits go to %r29 and its high 32 bits to %r28.
    680 ifdef(`HAVE_ABI_2_0w',
    681 `	copy		climb, %r28
    682 ',`	extrd,u		climb, 63, 32, %r29
    683 	extrd,u		climb, 31, 32, %r28
    684 ')
    685 	ldd		-0xf0(%r30), %r5	C restore %r5
    686 	ldd		-0xf8(%r30), %r4	C restore %r4
    687 	bve		(%r2)			C return through the address in %r2
    688 	ldd,mb		-0x100(%r30), %r3	C delay slot: move %r30 back by 0x100 and restore %r3
    689 EPILOGUE(mpn_submul_1)
    690