submul_1.asm revision 1.1.1.2 1 dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
3
4 dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
31
32 include(`../config.m4')
33
34 C cycles/limb
35 C 8000,8200: 7
36 C 8500,8600,8700: 6.5
37
38 C The feed-in and wind-down code has not yet been scheduled. Many cycles
39 C could be saved there per call.
40
41 C DESCRIPTION:
42 C The main loop "BIG" is 4-way unrolled, mainly to allow
43 C effective use of ADD,DC. Delays in moving data via the cache from the FP
44 C registers to the IU registers, have demanded a deep software pipeline, and
45 C a lot of stack slots for partial products in flight.
46 C
47 C CODE STRUCTURE:
48 C save-some-registers
49 C do 0, 1, 2, or 3 limbs
50 C if done, restore-some-regs and return
51 C save-many-regs
52 C do 4, 8, ... limbs
53 C restore-all-regs
54
55 C STACK LAYOUT:
56 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
57 C slots marked FREE, as well as some slots in the caller's "frame marker".
58 C
59 C -00 <- r30
60 C -08 FREE
61 C -10 tmp
62 C -18 tmp
63 C -20 tmp
64 C -28 tmp
65 C -30 tmp
66 C -38 tmp
67 C -40 tmp
68 C -48 tmp
69 C -50 tmp
70 C -58 tmp
71 C -60 tmp
72 C -68 tmp
73 C -70 tmp
74 C -78 tmp
75 C -80 tmp
76 C -88 tmp
77 C -90 FREE
78 C -98 FREE
79 C -a0 FREE
80 C -a8 FREE
81 C -b0 r13
82 C -b8 r12
83 C -c0 r11
84 C -c8 r10
85 C -d0 r9
86 C -d8 r8
87 C -e0 r7
88 C -e8 r6
89 C -f0 r5
90 C -f8 r4
91 C -100 r3
92 C Previous frame:
93 C [unused area]
94 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
95
96
97 include(`../config.m4')
98
99 C INPUT PARAMETERS:
100 define(`rp',`%r26') C limb vector that is read, reduced by the product and written back
101 define(`up',`%r25') C limb vector that is multiplied by vlimb
102 define(`n',`%r24') C limb count
103 define(`vlimb',`%r23') C the single multiplier limb
104
105 define(`climb',`%r23') C carry limb passed from one result limb to the next; reuses the vlimb register, vlimb itself is only read from its stack home slot
106
107 ifdef(`HAVE_ABI_2_0w',
108 ` .level 2.0w C used when HAVE_ABI_2_0w is defined
109 ',` .level 2.0 C used otherwise
110 ')
111 PROLOGUE(mpn_submul_1) C entry: rp[] = rp[] - up[] * vlimb over n limbs, final carry limb returned
112
113 ifdef(`HAVE_ABI_2_0w',
114 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
115 ')
116 std,ma %r3, 0x100(%r30) C save %r3 and advance %r30 by 0x100 to allocate the frame
117 std %r4, -0xf8(%r30) C save %r4, used as ma000 in the feed-in code
118 std %r5, -0xf0(%r30) C save %r5, used as the n mod 4 counter
119 ldo 0(%r0), climb C clear climb
120 fldd -0x138(%r30), %fr8 C put vlimb in fp register (home slot -0x38 is at -0x138 after the frame allocation)
121
122 define(`p032a1',`%r1') C first 32x32 mid product of the current limb
123 define(`p032a2',`%r19') C second 32x32 mid product of the current limb
124
125 define(`m032',`%r20') C sum of the two mid products
126 define(`m096',`%r21') C carry out of the mid product sum
127
128 define(`p000a',`%r22') C low 32x32 product
129 define(`p064a',`%r29') C high 32x32 product
130
131 define(`s000',`%r31') C result limb being formed
132
133 define(`ma000',`%r4') C mid sum part that lands in the current limb
134 define(`ma064',`%r20') C mid sum part that lands in the next limb; same register as m032
135
136 define(`r000',`%r3') C limb loaded from rp
137
138 extrd,u n, 63, 2, %r5 C %r5 = low two bits of n, i.e. n mod 4
139 cmpb,= %r5, %r0, L(BIG) C no leading limbs: go straight to the 4-way code
140 nop
141
142 fldd 0(up), %fr4 C load leading limb 1
143 ldo 8(up), up
144 xmpyu %fr8R, %fr4L, %fr22
145 xmpyu %fr8L, %fr4R, %fr23
146 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
147 xmpyu %fr8R, %fr4R, %fr24
148 xmpyu %fr8L, %fr4L, %fr25
149 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
150 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
151 addib,<> -1, %r5, L(two_or_more) C count down; branch if more leading limbs remain
152 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
153 LDEF(one) C exactly one leading limb: fetch its four partial products
154 ldd -0x78(%r30), p032a1
155 ldd -0x70(%r30), p032a2
156 ldd -0x80(%r30), p000a
157 b L(0_one_out)
158 ldd -0x68(%r30), p064a
159
160 LDEF(two_or_more) C multiply leading limb 2 while fetching the products of limb 1
161 fldd 0(up), %fr4
162 ldo 8(up), up
163 xmpyu %fr8R, %fr4L, %fr22
164 xmpyu %fr8L, %fr4R, %fr23
165 ldd -0x78(%r30), p032a1 C read the limb 1 product before the slot is overwritten
166 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
167 xmpyu %fr8R, %fr4R, %fr24
168 xmpyu %fr8L, %fr4L, %fr25
169 ldd -0x70(%r30), p032a2
170 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
171 ldd -0x80(%r30), p000a
172 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
173 ldd -0x68(%r30), p064a
174 addib,<> -1, %r5, L(three_or_more) C count down; branch if a third leading limb remains
175 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
176 LDEF(two) C exactly two leading limbs: combine the mid products of limb 1
177 add p032a1, p032a2, m032 C sum the two mid products
178 add,dc %r0, %r0, m096 C capture the carry of that sum
179 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved into the upper half, rest zero
180 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
181 ldd 0(rp), r000
182 b L(0_two_out)
183 depd m096, 31, 32, ma064 C put the carry into the upper half of ma064
184
185 LDEF(three_or_more) C three leading limbs: load limb 3, combine the mid products of limb 1
186 fldd 0(up), %fr4
187 add p032a1, p032a2, m032
188 add,dc %r0, %r0, m096
189 depd,z m032, 31, 32, ma000
190 extrd,u m032, 31, 32, ma064
191 ldd 0(rp), r000
192 C addib,= -1, %r5, L(0_out)
193 depd m096, 31, 32, ma064
194 LDEF(loop0) C the loop body below is commented out; execution falls through to 0_out
195 C xmpyu %fr8R, %fr4L, %fr22
196 C xmpyu %fr8L, %fr4R, %fr23
197 C ldd -0x78(%r30), p032a1
198 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
199 C
200 C xmpyu %fr8R, %fr4R, %fr24
201 C xmpyu %fr8L, %fr4L, %fr25
202 C ldd -0x70(%r30), p032a2
203 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
204 C
205 C ldo 8(rp), rp
206 C add climb, p000a, s000
207 C ldd -0x80(%r30), p000a
208 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
209 C
210 C add,dc p064a, %r0, climb
211 C ldo 8(up), up
212 C ldd -0x68(%r30), p064a
213 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
214 C
215 C add ma000, s000, s000
216 C add,dc ma064, climb, climb
217 C fldd 0(up), %fr4
218 C
219 C sub r000, s000, s000
220 C sub,db %r0, climb, climb
221 C sub %r0, climb, climb
222 C std s000, -8(rp)
223 C
224 C add p032a1, p032a2, m032
225 C add,dc %r0, %r0, m096
226 C
227 C depd,z m032, 31, 32, ma000
228 C extrd,u m032, 31, 32, ma064
229 C ldd 0(rp), r000
230 C addib,<> -1, %r5, L(loop0)
231 C depd m096, 31, 32, ma064
232 LDEF(0_out) C multiply limb 3, finish limb 1, fetch the products of limb 2
233 ldo 8(up), up
234 xmpyu %fr8R, %fr4L, %fr22
235 xmpyu %fr8L, %fr4R, %fr23
236 ldd -0x78(%r30), p032a1
237 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
238 xmpyu %fr8R, %fr4R, %fr24
239 xmpyu %fr8L, %fr4L, %fr25
240 ldd -0x70(%r30), p032a2
241 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
242 ldo 8(rp), rp
243 add climb, p000a, s000 C s000 = incoming carry limb + low product
244 ldd -0x80(%r30), p000a
245 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
246 add,dc p064a, %r0, climb C climb = high product + carry
247 ldd -0x68(%r30), p064a
248 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
249 add ma000, s000, s000 C add the mid sum part for this limb
250 add,dc ma064, climb, climb C add the mid sum part for the next limb
251 sub r000, s000, s000 C s000 = rp limb - product limb
252 sub,db %r0, climb, climb C this and the next instruction add the borrow to climb
253 sub %r0, climb, climb
254 std s000, -8(rp) C store the result limb; rp was already advanced
255 add p032a1, p032a2, m032
256 add,dc %r0, %r0, m096
257 depd,z m032, 31, 32, ma000
258 extrd,u m032, 31, 32, ma064
259 ldd 0(rp), r000
260 depd m096, 31, 32, ma064
261 LDEF(0_two_out) C finish the next-to-last leading limb, fetch the products of the last one
262 ldd -0x78(%r30), p032a1
263 ldd -0x70(%r30), p032a2
264 ldo 8(rp), rp
265 add climb, p000a, s000
266 ldd -0x80(%r30), p000a
267 add,dc p064a, %r0, climb
268 ldd -0x68(%r30), p064a
269 add ma000, s000, s000
270 add,dc ma064, climb, climb
271 sub r000, s000, s000
272 sub,db %r0, climb, climb
273 sub %r0, climb, climb
274 std s000, -8(rp)
275 LDEF(0_one_out) C finish the last leading limb
276 add p032a1, p032a2, m032
277 add,dc %r0, %r0, m096
278 depd,z m032, 31, 32, ma000
279 extrd,u m032, 31, 32, ma064
280 ldd 0(rp), r000
281 depd m096, 31, 32, ma064
282
283 add climb, p000a, s000
284 add,dc p064a, %r0, climb
285 add ma000, s000, s000
286 add,dc ma064, climb, climb
287 sub r000, s000, s000
288 sub,db %r0, climb, climb
289 sub %r0, climb, climb
290 std s000, 0(rp)
291
292 cmpib,>= 4, n, L(done) C finished if 4 >= n, i.e. no group of four limbs remains
293 ldo 8(rp), rp C advance rp past the last leading limb
294
295 C 4-way unrolled code.
296
297 LDEF(BIG) C entered with n mod 4 limbs already done; processes four limbs a, b, c, d per pass
298
299 define(`p032a1',`%r1') C mid products of limb a
300 define(`p032a2',`%r19') C
301 define(`p096b1',`%r20') C mid products of limb b
302 define(`p096b2',`%r21') C
303 define(`p160c1',`%r22') C mid products of limb c
304 define(`p160c2',`%r29') C
305 define(`p224d1',`%r31') C mid products of limb d
306 define(`p224d2',`%r3') C
307 C
308 define(`m032',`%r4') C sums of the mid product pairs, chained through the carry
309 define(`m096',`%r5') C
310 define(`m160',`%r6') C
311 define(`m224',`%r7') C
312 define(`m288',`%r8') C carry out of the last mid product sum
313 C
314 define(`p000a',`%r1') C low product of limb a; registers of the mid products are reused here
315 define(`p064a',`%r19') C high product of limb a
316 define(`p064b',`%r20') C low product of limb b
317 define(`p128b',`%r21') C high product of limb b
318 define(`p128c',`%r22') C low product of limb c
319 define(`p192c',`%r29') C high product of limb c
320 define(`p192d',`%r31') C low product of limb d
321 define(`p256d',`%r3') C high product of limb d
322 C
323 define(`s000',`%r10') C the four result limbs being formed
324 define(`s064',`%r11') C
325 define(`s128',`%r12') C
326 define(`s192',`%r13') C
327 C
328 define(`ma000',`%r9') C mid sums realigned to limb boundaries
329 define(`ma064',`%r4') C
330 define(`ma128',`%r5') C
331 define(`ma192',`%r6') C
332 define(`ma256',`%r7') C
333 C
334 define(`r000',`%r1') C the four limbs loaded from rp
335 define(`r064',`%r19') C
336 define(`r128',`%r20') C
337 define(`r192',`%r21') C
338
339 std %r6, -0xe8(%r30) C save the additional registers used by the 4-way code
340 std %r7, -0xe0(%r30)
341 std %r8, -0xd8(%r30)
342 std %r9, -0xd0(%r30)
343 std %r10, -0xc8(%r30)
344 std %r11, -0xc0(%r30)
345 std %r12, -0xb8(%r30)
346 std %r13, -0xb0(%r30)
347
348 ifdef(`HAVE_ABI_2_0w',
349 ` extrd,u n, 61, 62, n C right shift 2
350 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
351 ')
352
353 LDEF(4_or_more) C pipeline fill: n now counts groups of four limbs
354 fldd 0(up), %fr4 C load limbs a, b, c, d of the first group
355 fldd 8(up), %fr5
356 fldd 16(up), %fr6
357 fldd 24(up), %fr7
358 xmpyu %fr8R, %fr4L, %fr22
359 xmpyu %fr8L, %fr4R, %fr23
360 xmpyu %fr8R, %fr5L, %fr24
361 xmpyu %fr8L, %fr5R, %fr25
362 xmpyu %fr8R, %fr6L, %fr26
363 xmpyu %fr8L, %fr6R, %fr27
364 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
365 xmpyu %fr8R, %fr7L, %fr28
366 xmpyu %fr8L, %fr7R, %fr29
367 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
368 xmpyu %fr8R, %fr4R, %fr30
369 xmpyu %fr8L, %fr4L, %fr31
370 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
371 xmpyu %fr8R, %fr5R, %fr22
372 xmpyu %fr8L, %fr5L, %fr23
373 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
374 xmpyu %fr8R, %fr6R, %fr24
375 xmpyu %fr8L, %fr6L, %fr25
376 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
377 xmpyu %fr8R, %fr7R, %fr26
378 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
379 addib,<> -1, n, L(8_or_more) C count down; branch if more than one group remains
380 xmpyu %fr8L, %fr7L, %fr27
381 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
382 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
383 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
384 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
385 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
386 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
387 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
388 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
389 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
390 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
391 ldd -0x78(%r30), p032a1 C single group: fetch all eight mid products for end1
392 ldd -0x70(%r30), p032a2
393 ldd -0x38(%r30), p096b1
394 ldd -0x30(%r30), p096b2
395 ldd -0x58(%r30), p160c1
396 ldd -0x50(%r30), p160c2
397 ldd -0x18(%r30), p224d1
398 ldd -0x10(%r30), p224d2
399 b L(end1)
400 nop
401
402 LDEF(8_or_more) C at least two groups: spill the first group, start multiplying the second
403 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
404 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
405 ldo 32(up), up C advance up to the second group
406 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
407 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
408 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
409 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
410 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
411 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
412 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
413 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
414 fldd 0(up), %fr4
415 fldd 8(up), %fr5
416 fldd 16(up), %fr6
417 fldd 24(up), %fr7
418 xmpyu %fr8R, %fr4L, %fr22
419 ldd -0x78(%r30), p032a1 C each mid product of the first group is read before its slot is overwritten
420 xmpyu %fr8L, %fr4R, %fr23
421 xmpyu %fr8R, %fr5L, %fr24
422 ldd -0x70(%r30), p032a2
423 xmpyu %fr8L, %fr5R, %fr25
424 xmpyu %fr8R, %fr6L, %fr26
425 ldd -0x38(%r30), p096b1
426 xmpyu %fr8L, %fr6R, %fr27
427 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
428 xmpyu %fr8R, %fr7L, %fr28
429 ldd -0x30(%r30), p096b2
430 xmpyu %fr8L, %fr7R, %fr29
431 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
432 xmpyu %fr8R, %fr4R, %fr30
433 ldd -0x58(%r30), p160c1
434 xmpyu %fr8L, %fr4L, %fr31
435 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
436 xmpyu %fr8R, %fr5R, %fr22
437 ldd -0x50(%r30), p160c2
438 xmpyu %fr8L, %fr5L, %fr23
439 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
440 xmpyu %fr8R, %fr6R, %fr24
441 ldd -0x18(%r30), p224d1
442 xmpyu %fr8L, %fr6L, %fr25
443 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
444 xmpyu %fr8R, %fr7R, %fr26
445 ldd -0x10(%r30), p224d2
446 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
447 addib,= -1, n, L(end2) C count down; exactly two groups skip the loop
448 xmpyu %fr8L, %fr7L, %fr27
449 LDEF(loop) C steady state: finish one group, spill the next, multiply the one after
450 add p032a1, p032a2, m032 C sum the mid product pairs, chaining the carry
451 ldd -0x80(%r30), p000a
452 add,dc p096b1, p096b2, m096
453 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
454
455 add,dc p160c1, p160c2, m160
456 ldd -0x68(%r30), p064a
457 add,dc p224d1, p224d2, m224
458 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
459
460 add,dc %r0, %r0, m288 C capture the final carry of the mid sums
461 ldd -0x40(%r30), p064b
462 ldo 32(up), up
463 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
464
465 depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits onto limb boundaries
466 ldd -0x28(%r30), p128b
467 extrd,u m032, 31, 32, ma064
468 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
469
470 depd m096, 31, 32, ma064
471 ldd -0x60(%r30), p128c
472 extrd,u m096, 31, 32, ma128
473 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
474
475 depd m160, 31, 32, ma128
476 ldd -0x48(%r30), p192c
477 extrd,u m160, 31, 32, ma192
478 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
479
480 depd m224, 31, 32, ma192
481 ldd -0x20(%r30), p192d
482 extrd,u m224, 31, 32, ma256
483 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
484
485 depd m288, 31, 32, ma256
486 ldd -0x88(%r30), p256d
487 add climb, p000a, s000 C start the limb sums with the incoming carry limb
488 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
489
490 add,dc p064a, p064b, s064
491 ldd 0(rp), r000
492 add,dc p128b, p128c, s128
493 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
494
495 add,dc p192c, p192d, s192
496 ldd 8(rp), r064
497 add,dc p256d, %r0, climb C climb = top high product + carry
498 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
499
500 ldd 16(rp), r128
501 add ma000, s000, s000 C accum mid 0
502 ldd 24(rp), r192
503 add,dc ma064, s064, s064 C accum mid 1
504
505 add,dc ma128, s128, s128 C accum mid 2
506 fldd 0(up), %fr4
507 add,dc ma192, s192, s192 C accum mid 3
508 fldd 8(up), %fr5
509
510 add,dc ma256, climb, climb
511 fldd 16(up), %fr6
512 sub r000, s000, s000 C accum rlimb 0
513 fldd 24(up), %fr7
514
515 sub,db r064, s064, s064 C accum rlimb 1
516 sub,db r128, s128, s128 C accum rlimb 2
517 std s000, 0(rp)
518
519 sub,db r192, s192, s192 C accum rlimb 3
520 sub,db %r0, climb, climb C this and the next instruction add the borrow to climb
521 sub %r0, climb, climb
522 std s064, 8(rp)
523
524 xmpyu %fr8R, %fr4L, %fr22
525 ldd -0x78(%r30), p032a1
526 xmpyu %fr8L, %fr4R, %fr23
527 std s128, 16(rp)
528
529 xmpyu %fr8R, %fr5L, %fr24
530 ldd -0x70(%r30), p032a2
531 xmpyu %fr8L, %fr5R, %fr25
532 std s192, 24(rp)
533
534 xmpyu %fr8R, %fr6L, %fr26
535 ldd -0x38(%r30), p096b1
536 xmpyu %fr8L, %fr6R, %fr27
537 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
538
539 xmpyu %fr8R, %fr7L, %fr28
540 ldd -0x30(%r30), p096b2
541 xmpyu %fr8L, %fr7R, %fr29
542 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
543
544 xmpyu %fr8R, %fr4R, %fr30
545 ldd -0x58(%r30), p160c1
546 xmpyu %fr8L, %fr4L, %fr31
547 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
548
549 xmpyu %fr8R, %fr5R, %fr22
550 ldd -0x50(%r30), p160c2
551 xmpyu %fr8L, %fr5L, %fr23
552 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
553
554 xmpyu %fr8R, %fr6R, %fr24
555 ldd -0x18(%r30), p224d1
556 xmpyu %fr8L, %fr6L, %fr25
557 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
558
559 xmpyu %fr8R, %fr7R, %fr26
560 ldd -0x10(%r30), p224d2
561 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
562 xmpyu %fr8L, %fr7L, %fr27
563
564 addib,<> -1, n, L(loop) C count down the groups; repeat while any remain
565 ldo 32(rp), rp C advance rp to the next group
566
567 LDEF(end2) C wind-down, two groups in flight: finish one, spill the last, no new loads from up
568 add p032a1, p032a2, m032
569 ldd -0x80(%r30), p000a
570 add,dc p096b1, p096b2, m096
571 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
572 add,dc p160c1, p160c2, m160
573 ldd -0x68(%r30), p064a
574 add,dc p224d1, p224d2, m224
575 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
576 add,dc %r0, %r0, m288
577 ldd -0x40(%r30), p064b
578 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
579 depd,z m032, 31, 32, ma000
580 ldd -0x28(%r30), p128b
581 extrd,u m032, 31, 32, ma064
582 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
583 depd m096, 31, 32, ma064
584 ldd -0x60(%r30), p128c
585 extrd,u m096, 31, 32, ma128
586 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
587 depd m160, 31, 32, ma128
588 ldd -0x48(%r30), p192c
589 extrd,u m160, 31, 32, ma192
590 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
591 depd m224, 31, 32, ma192
592 ldd -0x20(%r30), p192d
593 extrd,u m224, 31, 32, ma256
594 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
595 depd m288, 31, 32, ma256
596 ldd -0x88(%r30), p256d
597 add climb, p000a, s000
598 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
599 add,dc p064a, p064b, s064
600 ldd 0(rp), r000
601 add,dc p128b, p128c, s128
602 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
603 add,dc p192c, p192d, s192
604 ldd 8(rp), r064
605 add,dc p256d, %r0, climb
606 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
607 ldd 16(rp), r128
608 add ma000, s000, s000 C accum mid 0
609 ldd 24(rp), r192
610 add,dc ma064, s064, s064 C accum mid 1
611 add,dc ma128, s128, s128 C accum mid 2
612 add,dc ma192, s192, s192 C accum mid 3
613 add,dc ma256, climb, climb
614 sub r000, s000, s000 C accum rlimb 0
615 sub,db r064, s064, s064 C accum rlimb 1
616 sub,db r128, s128, s128 C accum rlimb 2
617 std s000, 0(rp)
618 sub,db r192, s192, s192 C accum rlimb 3
619 sub,db %r0, climb, climb C this and the next instruction add the borrow to climb
620 sub %r0, climb, climb
621 std s064, 8(rp)
622 ldd -0x78(%r30), p032a1 C fetch the mid products of the last group for end1
623 std s128, 16(rp)
624 ldd -0x70(%r30), p032a2
625 std s192, 24(rp)
626 ldd -0x38(%r30), p096b1
627 ldd -0x30(%r30), p096b2
628 ldd -0x58(%r30), p160c1
629 ldd -0x50(%r30), p160c2
630 ldd -0x18(%r30), p224d1
631 ldd -0x10(%r30), p224d2
632 ldo 32(rp), rp C advance rp to the last group
633
634 LDEF(end1) C wind-down, last group: all its partial products are already on the stack
635 add p032a1, p032a2, m032
636 ldd -0x80(%r30), p000a
637 add,dc p096b1, p096b2, m096
638 add,dc p160c1, p160c2, m160
639 ldd -0x68(%r30), p064a
640 add,dc p224d1, p224d2, m224
641 add,dc %r0, %r0, m288
642 ldd -0x40(%r30), p064b
643 depd,z m032, 31, 32, ma000
644 ldd -0x28(%r30), p128b
645 extrd,u m032, 31, 32, ma064
646 depd m096, 31, 32, ma064
647 ldd -0x60(%r30), p128c
648 extrd,u m096, 31, 32, ma128
649 depd m160, 31, 32, ma128
650 ldd -0x48(%r30), p192c
651 extrd,u m160, 31, 32, ma192
652 depd m224, 31, 32, ma192
653 ldd -0x20(%r30), p192d
654 extrd,u m224, 31, 32, ma256
655 depd m288, 31, 32, ma256
656 ldd -0x88(%r30), p256d
657 add climb, p000a, s000
658 add,dc p064a, p064b, s064
659 ldd 0(rp), r000
660 add,dc p128b, p128c, s128
661 add,dc p192c, p192d, s192
662 ldd 8(rp), r064
663 add,dc p256d, %r0, climb
664 ldd 16(rp), r128
665 add ma000, s000, s000 C accum mid 0
666 ldd 24(rp), r192
667 add,dc ma064, s064, s064 C accum mid 1
668 add,dc ma128, s128, s128 C accum mid 2
669 add,dc ma192, s192, s192 C accum mid 3
670 add,dc ma256, climb, climb
671 sub r000, s000, s000 C accum rlimb 0
672 sub,db r064, s064, s064 C accum rlimb 1
673 sub,db r128, s128, s128 C accum rlimb 2
674 std s000, 0(rp)
675 sub,db r192, s192, s192 C accum rlimb 3
676 sub,db %r0, climb, climb C this and the next instruction add the borrow to climb
677 sub %r0, climb, climb
678 std s064, 8(rp)
679 std s128, 16(rp)
680 std s192, 24(rp)
681
682 ldd -0xb0(%r30), %r13 C restore the registers saved at BIG
683 ldd -0xb8(%r30), %r12
684 ldd -0xc0(%r30), %r11
685 ldd -0xc8(%r30), %r10
686 ldd -0xd0(%r30), %r9
687 ldd -0xd8(%r30), %r8
688 ldd -0xe0(%r30), %r7
689 ldd -0xe8(%r30), %r6
690 LDEF(done) C common exit; the short path arrives here with only %r3..%r5 to restore
691 ifdef(`HAVE_ABI_2_0w',
692 ` copy climb, %r28 C return the final carry limb in %r28
693 ',` extrd,u climb, 63, 32, %r29 C low 32 bits of the carry limb to %r29
694 extrd,u climb, 31, 32, %r28 C high 32 bits of the carry limb to %r28
695 ')
696 ldd -0xf0(%r30), %r5
697 ldd -0xf8(%r30), %r4
698 bve (%r2) C return to the caller
699 ldd,mb -0x100(%r30), %r3 C release the frame and restore %r3
700 EPILOGUE(mpn_submul_1)
701