mul_1.asm revision 1.1 1 dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
3
4 dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5
6 dnl This file is part of the GNU MP Library.
7
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
12
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
17
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23 C cycles/limb
24 C 8000,8200: 6.5
25 C 8500,8600,8700: 5.625
26
27 C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 C could be saved there per call.
29
30 C DESCRIPTION:
31 C The main loop "BIG" is 4-way unrolled, mainly to allow
32 C effective use of ADD,DC. Delays in moving data via the cache from the FP
33 C registers to the IU registers, have demanded a deep software pipeline, and
34 C a lot of stack slots for partial products in flight.
35 C
36 C CODE STRUCTURE:
37 C save-some-registers
38 C do 0, 1, 2, or 3 limbs
39 C if done, restore-some-regs and return
40 C save-many-regs
41 C do 4, 8, ... limbs
42 C restore-all-regs
43
44 C STACK LAYOUT:
45 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 C slots marked FREE, as well as some slots in the caller's "frame marker".
47 C
48 C -00 <- r30
49 C -08 FREE
50 C -10 tmp
51 C -18 tmp
52 C -20 tmp
53 C -28 tmp
54 C -30 tmp
55 C -38 tmp
56 C -40 tmp
57 C -48 tmp
58 C -50 tmp
59 C -58 tmp
60 C -60 tmp
61 C -68 tmp
62 C -70 tmp
63 C -78 tmp
64 C -80 tmp
65 C -88 tmp
66 C -90 FREE
67 C -98 FREE
68 C -a0 FREE
69 C -a8 FREE
70 C -b0 r13
71 C -b8 r12
72 C -c0 r11
73 C -c8 r10
74 C -d0 r9
75 C -d8 r8
76 C -e0 r7
77 C -e8 r6
78 C -f0 r5
79 C -f8 r4
80 C -100 r3
81 C Previous frame:
82 C [unused area]
83 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
84
85
86 include(`../config.m4')
87
88 C INPUT PARAMETERS:
89 define(`rp',`%r26') C destination pointer; result limbs are stored through it
90 define(`up',`%r25') C source pointer; multiplicand limbs are loaded through it
91 define(`n',`%r24') C limb count
92 define(`vlimb',`%r23') C multiplier limb
93
C climb is the running carry limb.  It shares %r23 with vlimb.
94 define(`climb',`%r23') C
95
C Select the assembler level that matches the ABI being built.
96 ifdef(`HAVE_ABI_2_0w',
97 ` .level 2.0w
98 ',` .level 2.0
99 ')
C Entry: open a 0x100-byte frame, save r3-r5, clear the carry limb, get vlimb
C into %fr8 through its stack home slot, then dispatch on the low two bits of n.
100 PROLOGUE(mpn_mul_1)
101
102 ifdef(`HAVE_ABI_2_0w',
103 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
104 ')
105 std,ma %r3, 0x100(%r30) C save r3 at the old r30, then r30 += 0x100
106 std %r4, -0xf8(%r30)
107 std %r5, -0xf0(%r30)
108 ldo 0(%r0), climb C clear climb
109 fldd -0x138(%r30), %fr8 C put vlimb in fp register
110
C Register names used by the 1-3 limb feed-in code.
111 define(`p032a1',`%r1') C mid product reloaded from -0x78
112 define(`p032a2',`%r19') C mid product reloaded from -0x70
113
114 define(`m032',`%r20') C sum of the two mid products
115 define(`m096',`%r21') C carry out of the mid product sum
116
117 define(`p000a',`%r22') C low product reloaded from -0x80
118 define(`p064a',`%r29') C high product reloaded from -0x68
119
120 define(`s000',`%r31') C result limb being accumulated
121
122 define(`ma000',`%r4') C low 32 bits of the mid sum, moved to the upper half
123 define(`ma064',`%r20') C upper 32 bits of the mid sum, with m096 deposited above
124
125 C define(`r000',`%r3') C FIXME don't save r3 for n < 4.
126
127 extrd,u n, 63, 2, %r5 C r5 = low two bits of n
128 cmpb,= %r5, %r0, L(BIG) C low two bits are zero: go straight to the 4-way code
129 nop
130
C Feed-in, first limb: form its four 32x32 partial products with xmpyu and
C store them to stack temporaries so they can be reloaded into integer registers.
131 fldd 0(up), %fr4
132 ldo 8(up), up
133 xmpyu %fr8R, %fr4L, %fr22
134 xmpyu %fr8L, %fr4R, %fr23
135 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 xmpyu %fr8R, %fr4R, %fr24
137 xmpyu %fr8L, %fr4L, %fr25
138 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
140 addib,<> -1, %r5, L(two_or_more) C r5 -= 1; branch if limbs remain in the feed-in
141 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly one feed-in limb: reload its partial products and join the common tail.
142 LDEF(one)
143 ldd -0x78(%r30), p032a1
144 ldd -0x70(%r30), p032a2
145 ldd -0x80(%r30), p000a
146 b L(0_one_out)
147 ldd -0x68(%r30), p064a
148
C Second feed-in limb: form its partial products.  Each ldd reloads a product of
C the first limb from a stack temporary before the following fstd overwrites
C that temporary with the corresponding product of the second limb.
149 LDEF(two_or_more)
150 fldd 0(up), %fr4
151 ldo 8(up), up
152 xmpyu %fr8R, %fr4L, %fr22
153 xmpyu %fr8L, %fr4R, %fr23
154 ldd -0x78(%r30), p032a1
155 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 xmpyu %fr8R, %fr4R, %fr24
157 xmpyu %fr8L, %fr4L, %fr25
158 ldd -0x70(%r30), p032a2
159 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 ldd -0x80(%r30), p000a
161 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 ldd -0x68(%r30), p064a
163 addib,<> -1, %r5, L(three_or_more) C r5 -= 1; branch if a third limb remains
164 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly two feed-in limbs: build the realigned mid sum of the first limb, then
C join the tail that finishes two limbs.
165 LDEF(two)
166 add p032a1, p032a2, m032 C sum the two mid products
167 add,dc %r0, %r0, m096 C m096 = carry out of that sum
168 depd,z m032, 31, 32, ma000
169 extrd,u m032, 31, 32, ma064
170 b L(0_two_out)
171 depd m096, 31, 32, ma064
172
C Third feed-in limb: load it and build the realigned mid sum of the first limb.
C The loop0 body below is entirely commented out, so execution falls from here
C straight through to 0_out.
173 LDEF(three_or_more)
174 fldd 0(up), %fr4
175 add p032a1, p032a2, m032 C sum the two mid products
176 add,dc %r0, %r0, m096 C m096 = carry out of that sum
177 depd,z m032, 31, 32, ma000
178 extrd,u m032, 31, 32, ma064
179 C addib,= -1, %r5, L(0_out)
180 depd m096, 31, 32, ma064
181 LDEF(loop0)
182 C xmpyu %fr8R, %fr4L, %fr22
183 C xmpyu %fr8L, %fr4R, %fr23
184 C ldd -0x78(%r30), p032a1
185 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
186 C
187 C xmpyu %fr8R, %fr4R, %fr24
188 C xmpyu %fr8L, %fr4L, %fr25
189 C ldd -0x70(%r30), p032a2
190 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
191 C
192 C ldo 8(rp), rp
193 C add climb, p000a, s000
194 C ldd -0x80(%r30), p000a
195 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
196 C
197 C add,dc p064a, %r0, climb
198 C ldo 8(up), up
199 C ldd -0x68(%r30), p064a
200 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
201 C
202 C add ma000, s000, s000
203 C add,dc ma064, climb, climb
204 C fldd 0(up), %fr4
205 C
206 C std s000, -8(rp)
207 C
208 C add p032a1, p032a2, m032
209 C add,dc %r0, %r0, m096
210 C
211 C depd,z m032, 31, 32, ma000
212 C extrd,u m032, 31, 32, ma064
213 C addib,<> -1, %r5, L(loop0)
214 C depd m096, 31, 32, ma064
C 0_out: form the partial products of the limb in %fr4, finish and store the
C oldest pending result limb, and build the realigned mid sum for the next one.
215 LDEF(0_out)
216 ldo 8(up), up
217 xmpyu %fr8R, %fr4L, %fr22
218 xmpyu %fr8L, %fr4R, %fr23
219 ldd -0x78(%r30), p032a1
220 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
221 xmpyu %fr8R, %fr4R, %fr24
222 xmpyu %fr8L, %fr4L, %fr25
223 ldd -0x70(%r30), p032a2
224 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
225 ldo 8(rp), rp
226 add climb, p000a, s000 C carry-in plus low product
227 ldd -0x80(%r30), p000a
228 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
229 add,dc p064a, %r0, climb C new carry limb = high product + carry
230 ldd -0x68(%r30), p064a
231 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
232 add ma000, s000, s000 C add the realigned mid sum
233 add,dc ma064, climb, climb
234 std s000, -8(rp) C rp was already advanced, hence the -8 offset
235 add p032a1, p032a2, m032
236 add,dc %r0, %r0, m096
237 depd,z m032, 31, 32, ma000
238 extrd,u m032, 31, 32, ma064
239 depd m096, 31, 32, ma064
C 0_two_out: two result limbs are still pending.  Finish and store the older
C one while reloading the partial products of the last limb from the stack.
240 LDEF(0_two_out)
241 ldd -0x78(%r30), p032a1
242 ldd -0x70(%r30), p032a2
243 ldo 8(rp), rp
244 add climb, p000a, s000 C carry-in plus low product
245 ldd -0x80(%r30), p000a
246 add,dc p064a, %r0, climb C new carry limb = high product + carry
247 ldd -0x68(%r30), p064a
248 add ma000, s000, s000 C add the realigned mid sum
249 add,dc ma064, climb, climb
250 std s000, -8(rp) C rp was already advanced, hence the -8 offset
C 0_one_out: one result limb is pending.  Combine its four partial products
C with the carry limb and store it.
251 LDEF(0_one_out)
252 add p032a1, p032a2, m032 C sum the two mid products
253 add,dc %r0, %r0, m096 C m096 = carry out of that sum
254 depd,z m032, 31, 32, ma000
255 extrd,u m032, 31, 32, ma064
256 depd m096, 31, 32, ma064
257
258 add climb, p000a, s000
259 add,dc p064a, %r0, climb
260 add ma000, s000, s000
261 add,dc ma064, climb, climb
262 std s000, 0(rp)
263
264 cmpib,>= 4, n, L(done) C branch to done when 4 >= n; otherwise fall into BIG
265 ldo 8(rp), rp C step rp past the limb just stored
266
267 C 4-way unrolled code.
268
C BIG: set up for the 4-way unrolled code.  The register names are redefined
C for four limbs (a, b, c, d) per pass, r6-r13 are saved to the frame, and n is
C shifted right by 2.
269 LDEF(BIG)
270
C Mid product pairs of limbs a-d, as reloaded from the stack temporaries.
271 define(`p032a1',`%r1') C
272 define(`p032a2',`%r19') C
273 define(`p096b1',`%r20') C
274 define(`p096b2',`%r21') C
275 define(`p160c1',`%r22') C
276 define(`p160c2',`%r29') C
277 define(`p224d1',`%r31') C
278 define(`p224d2',`%r3') C
279 C
C Sums of the mid product pairs; m288 receives the carry out of the chain.
280 define(`m032',`%r4') C
281 define(`m096',`%r5') C
282 define(`m160',`%r6') C
283 define(`m224',`%r7') C
284 define(`m288',`%r8') C
285 C
C Low and high products of limbs a-d.  These reuse the registers of the mid
C product pairs above.
286 define(`p000a',`%r1') C
287 define(`p064a',`%r19') C
288 define(`p064b',`%r20') C
289 define(`p128b',`%r21') C
290 define(`p128c',`%r22') C
291 define(`p192c',`%r29') C
292 define(`p192d',`%r31') C
293 define(`p256d',`%r3') C
294 C
C The four result limbs stored per pass.
295 define(`s000',`%r10') C
296 define(`s064',`%r11') C
297 define(`s128',`%r12') C
298 define(`s192',`%r13') C
299 C
C Mid sums realigned by 32 bits with depd/extrd.  ma064-ma256 reuse the
C registers of m032-m224.
300 define(`ma000',`%r9') C
301 define(`ma064',`%r4') C
302 define(`ma128',`%r5') C
303 define(`ma192',`%r6') C
304 define(`ma256',`%r7') C
305
C Save r6-r13; they are reloaded from the same slots after end1.
306 std %r6, -0xe8(%r30)
307 std %r7, -0xe0(%r30)
308 std %r8, -0xd8(%r30)
309 std %r9, -0xd0(%r30)
310 std %r10, -0xc8(%r30)
311 std %r11, -0xc0(%r30)
312 std %r12, -0xb8(%r30)
313 std %r13, -0xb0(%r30)
314
315 ifdef(`HAVE_ABI_2_0w',
316 ` extrd,u n, 61, 62, n C right shift 2
317 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
318 ')
319
C 4_or_more: load four limbs and form their sixteen partial products.  If this
C is the only group of four, store every product to its stack temporary, reload
C the mid products into integer registers and finish at end1.
320 LDEF(4_or_more)
321 fldd 0(up), %fr4
322 fldd 8(up), %fr5
323 fldd 16(up), %fr6
324 fldd 24(up), %fr7
325 xmpyu %fr8R, %fr4L, %fr22
326 xmpyu %fr8L, %fr4R, %fr23
327 xmpyu %fr8R, %fr5L, %fr24
328 xmpyu %fr8L, %fr5R, %fr25
329 xmpyu %fr8R, %fr6L, %fr26
330 xmpyu %fr8L, %fr6R, %fr27
331 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
332 xmpyu %fr8R, %fr7L, %fr28
333 xmpyu %fr8L, %fr7R, %fr29
334 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
335 xmpyu %fr8R, %fr4R, %fr30
336 xmpyu %fr8L, %fr4L, %fr31
337 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
338 xmpyu %fr8R, %fr5R, %fr22
339 xmpyu %fr8L, %fr5L, %fr23
340 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
341 xmpyu %fr8R, %fr6R, %fr24
342 xmpyu %fr8L, %fr6L, %fr25
343 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
344 xmpyu %fr8R, %fr7R, %fr26
345 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
346 addib,<> -1, n, L(8_or_more) C n -= 1; branch if another group of four remains
347 xmpyu %fr8L, %fr7L, %fr27
348 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
349 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
350 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
351 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
352 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
353 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
354 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
355 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
356 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
357 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
358 ldd -0x78(%r30), p032a1
359 ldd -0x70(%r30), p032a2
360 ldd -0x38(%r30), p096b1
361 ldd -0x30(%r30), p096b2
362 ldd -0x58(%r30), p160c1
363 ldd -0x50(%r30), p160c2
364 ldd -0x18(%r30), p224d1
365 ldd -0x10(%r30), p224d2
366 b L(end1)
367 nop
368
C 8_or_more: at least two groups of four.  Store the remaining products of the
C first group, advance up, load the second group and form its products.  Each
C ldd reloads a first-group mid product before the fstd that overwrites its
C stack temporary with the second-group value.
369 LDEF(8_or_more)
370 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372 ldo 32(up), up
373 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
374 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
375 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
376 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
377 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
378 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
379 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
380 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
381 fldd 0(up), %fr4
382 fldd 8(up), %fr5
383 fldd 16(up), %fr6
384 fldd 24(up), %fr7
385 xmpyu %fr8R, %fr4L, %fr22
386 ldd -0x78(%r30), p032a1
387 xmpyu %fr8L, %fr4R, %fr23
388 xmpyu %fr8R, %fr5L, %fr24
389 ldd -0x70(%r30), p032a2
390 xmpyu %fr8L, %fr5R, %fr25
391 xmpyu %fr8R, %fr6L, %fr26
392 ldd -0x38(%r30), p096b1
393 xmpyu %fr8L, %fr6R, %fr27
394 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
395 xmpyu %fr8R, %fr7L, %fr28
396 ldd -0x30(%r30), p096b2
397 xmpyu %fr8L, %fr7R, %fr29
398 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
399 xmpyu %fr8R, %fr4R, %fr30
400 ldd -0x58(%r30), p160c1
401 xmpyu %fr8L, %fr4L, %fr31
402 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
403 xmpyu %fr8R, %fr5R, %fr22
404 ldd -0x50(%r30), p160c2
405 xmpyu %fr8L, %fr5L, %fr23
406 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
407 xmpyu %fr8R, %fr6R, %fr24
408 ldd -0x18(%r30), p224d1
409 xmpyu %fr8L, %fr6L, %fr25
410 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
411 xmpyu %fr8R, %fr7R, %fr26
412 ldd -0x10(%r30), p224d2
413 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
414 addib,= -1, n, L(end2) C n -= 1; branch if this was the last group of four
415 xmpyu %fr8L, %fr7L, %fr27
C Main loop, four limbs per pass.  The integer instructions combine and store
C the result limbs of the group whose products were written to the stack
C earlier, while the FP instructions load the next four limbs and form their
C products.  Each ldd of a stack temporary comes before the fstd that overwrites
C the same temporary with the newer product, so the instruction order matters.
416 LDEF(loop)
417 add p032a1, p032a2, m032 C sum the mid product pairs, chaining the carry
418 ldd -0x80(%r30), p000a
419 add,dc p096b1, p096b2, m096
420 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
421
422 add,dc p160c1, p160c2, m160
423 ldd -0x68(%r30), p064a
424 add,dc p224d1, p224d2, m224
425 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
426
427 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
428 ldd -0x40(%r30), p064b
429 ldo 32(up), up
430 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
431
432 depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits
433 ldd -0x28(%r30), p128b
434 extrd,u m032, 31, 32, ma064
435 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
436
437 depd m096, 31, 32, ma064
438 ldd -0x60(%r30), p128c
439 extrd,u m096, 31, 32, ma128
440 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
441
442 depd m160, 31, 32, ma128
443 ldd -0x48(%r30), p192c
444 extrd,u m160, 31, 32, ma192
445 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
446
447 depd m224, 31, 32, ma192
448 ldd -0x20(%r30), p192d
449 extrd,u m224, 31, 32, ma256
450 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
451
452 depd m288, 31, 32, ma256
453 ldd -0x88(%r30), p256d
454 add climb, p000a, s000 C carry-in plus low/high products, chaining the carry
455 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
456
457 add,dc p064a, p064b, s064
458 add,dc p128b, p128c, s128
459 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
460
461 add,dc p192c, p192d, s192
462 add,dc p256d, %r0, climb
463 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
464
465 add ma000, s000, s000 C accum mid 0
466 fldd 0(up), %fr4
467 add,dc ma064, s064, s064 C accum mid 1
468 std s000, 0(rp)
469
470 add,dc ma128, s128, s128 C accum mid 2
471 fldd 8(up), %fr5
472 add,dc ma192, s192, s192 C accum mid 3
473 std s064, 8(rp)
474
475 add,dc ma256, climb, climb C carry limb passed to the next group
476 fldd 16(up), %fr6
477 std s128, 16(rp)
478
479 xmpyu %fr8R, %fr4L, %fr22
480 ldd -0x78(%r30), p032a1
481 xmpyu %fr8L, %fr4R, %fr23
482 fldd 24(up), %fr7
483
484 xmpyu %fr8R, %fr5L, %fr24
485 ldd -0x70(%r30), p032a2
486 xmpyu %fr8L, %fr5R, %fr25
487 std s192, 24(rp)
488
489 xmpyu %fr8R, %fr6L, %fr26
490 ldd -0x38(%r30), p096b1
491 xmpyu %fr8L, %fr6R, %fr27
492 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
493
494 xmpyu %fr8R, %fr7L, %fr28
495 ldd -0x30(%r30), p096b2
496 xmpyu %fr8L, %fr7R, %fr29
497 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
498
499 xmpyu %fr8R, %fr4R, %fr30
500 ldd -0x58(%r30), p160c1
501 xmpyu %fr8L, %fr4L, %fr31
502 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
503
504 xmpyu %fr8R, %fr5R, %fr22
505 ldd -0x50(%r30), p160c2
506 xmpyu %fr8L, %fr5L, %fr23
507 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
508
509 xmpyu %fr8R, %fr6R, %fr24
510 ldd -0x18(%r30), p224d1
511 xmpyu %fr8L, %fr6L, %fr25
512 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
513
514 xmpyu %fr8R, %fr7R, %fr26
515 ldd -0x10(%r30), p224d2
516 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
517 xmpyu %fr8L, %fr7L, %fr27
518
519 addib,<> -1, n, L(loop) C n -= 1; repeat while groups of four remain
520 ldo 32(rp), rp
521
C end2: wind-down with two groups in flight.  Combine and store the older
C group exactly as the loop does, store the remaining products of the final
C group to the stack, reload its mid products and advance rp.  No further limbs
C are loaded from up.
522 LDEF(end2)
523 add p032a1, p032a2, m032
524 ldd -0x80(%r30), p000a
525 add,dc p096b1, p096b2, m096
526 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
527 add,dc p160c1, p160c2, m160
528 ldd -0x68(%r30), p064a
529 add,dc p224d1, p224d2, m224
530 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
531 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
532 ldd -0x40(%r30), p064b
533 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
534 depd,z m032, 31, 32, ma000
535 ldd -0x28(%r30), p128b
536 extrd,u m032, 31, 32, ma064
537 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
538 depd m096, 31, 32, ma064
539 ldd -0x60(%r30), p128c
540 extrd,u m096, 31, 32, ma128
541 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
542 depd m160, 31, 32, ma128
543 ldd -0x48(%r30), p192c
544 extrd,u m160, 31, 32, ma192
545 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
546 depd m224, 31, 32, ma192
547 ldd -0x20(%r30), p192d
548 extrd,u m224, 31, 32, ma256
549 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
550 depd m288, 31, 32, ma256
551 ldd -0x88(%r30), p256d
552 add climb, p000a, s000
553 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
554 add,dc p064a, p064b, s064
555 add,dc p128b, p128c, s128
556 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
557 add,dc p192c, p192d, s192
558 add,dc p256d, %r0, climb
559 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
560 add ma000, s000, s000 C accum mid 0
561 add,dc ma064, s064, s064 C accum mid 1
562 add,dc ma128, s128, s128 C accum mid 2
563 add,dc ma192, s192, s192 C accum mid 3
564 add,dc ma256, climb, climb
565 std s000, 0(rp)
566 std s064, 8(rp)
567 ldd -0x78(%r30), p032a1
568 std s128, 16(rp)
569 ldd -0x70(%r30), p032a2
570 std s192, 24(rp)
571 ldd -0x38(%r30), p096b1
572 ldd -0x30(%r30), p096b2
573 ldd -0x58(%r30), p160c1
574 ldd -0x50(%r30), p160c2
575 ldd -0x18(%r30), p224d1
576 ldd -0x10(%r30), p224d2
577 ldo 32(rp), rp
578
C end1: final group.  All of its products are already on the stack and its mid
C products are in registers, so only the integer combine and the four stores
C remain.  Then r6-r13 are restored.
579 LDEF(end1)
580 add p032a1, p032a2, m032
581 ldd -0x80(%r30), p000a
582 add,dc p096b1, p096b2, m096
583 add,dc p160c1, p160c2, m160
584 ldd -0x68(%r30), p064a
585 add,dc p224d1, p224d2, m224
586 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
587 ldd -0x40(%r30), p064b
588 depd,z m032, 31, 32, ma000
589 ldd -0x28(%r30), p128b
590 extrd,u m032, 31, 32, ma064
591 depd m096, 31, 32, ma064
592 ldd -0x60(%r30), p128c
593 extrd,u m096, 31, 32, ma128
594 depd m160, 31, 32, ma128
595 ldd -0x48(%r30), p192c
596 extrd,u m160, 31, 32, ma192
597 depd m224, 31, 32, ma192
598 ldd -0x20(%r30), p192d
599 extrd,u m224, 31, 32, ma256
600 depd m288, 31, 32, ma256
601 ldd -0x88(%r30), p256d
602 add climb, p000a, s000
603 add,dc p064a, p064b, s064
604 add,dc p128b, p128c, s128
605 add,dc p192c, p192d, s192
606 add,dc p256d, %r0, climb
607 add ma000, s000, s000 C accum mid 0
608 add,dc ma064, s064, s064 C accum mid 1
609 add,dc ma128, s128, s128 C accum mid 2
610 add,dc ma192, s192, s192 C accum mid 3
611 add,dc ma256, climb, climb C final carry limb
612 std s000, 0(rp)
613 std s064, 8(rp)
614 std s128, 16(rp)
615 std s192, 24(rp)
616
C Restore the registers that were saved at BIG.
617 ldd -0xb0(%r30), %r13
618 ldd -0xb8(%r30), %r12
619 ldd -0xc0(%r30), %r11
620 ldd -0xc8(%r30), %r10
621 ldd -0xd0(%r30), %r9
622 ldd -0xd8(%r30), %r8
623 ldd -0xe0(%r30), %r7
624 ldd -0xe8(%r30), %r6
C done: common exit, also reached directly from the 1-3 limb code.  Copy the
C carry limb to the return register(s), restore r3-r5, pop the frame and return.
625 LDEF(done)
626 ifdef(`HAVE_ABI_2_0w',
627 ` copy climb, %r28
628 ',` extrd,u climb, 63, 32, %r29
629 extrd,u climb, 31, 32, %r28
630 ')
631 ldd -0xf0(%r30), %r5
632 ldd -0xf8(%r30), %r4
633 bve (%r2)
634 ldd,mb -0x100(%r30), %r3 C r30 -= 0x100, then reload r3 from the frame base
635 EPILOGUE(mpn_mul_1)
636