mul_1.asm revision 1.1 1 1.1 mrg dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 1.1 mrg dnl the result in a second limb vector.
3 1.1 mrg
4 1.1 mrg dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg dnl This file is part of the GNU MP Library.
7 1.1 mrg
8 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
10 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
11 1.1 mrg dnl your option) any later version.
12 1.1 mrg
13 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 1.1 mrg dnl License for more details.
17 1.1 mrg
18 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
19 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 1.1 mrg
21 1.1 mrg include(`../config.m4')
22 1.1 mrg
23 1.1 mrg C cycles/limb
24 1.1 mrg C 8000,8200: 6.5
25 1.1 mrg C 8500,8600,8700: 5.625
26 1.1 mrg
27 1.1 mrg C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 1.1 mrg C could be saved there per call.
29 1.1 mrg
30 1.1 mrg C DESCRIPTION:
31 1.1 mrg C The main loop "BIG" is 4-way unrolled, mainly to allow
32 1.1 mrg C effective use of ADD,DC. Delays in moving data via the cache from the FP
33 1.1 mrg C registers to the IU registers, have demanded a deep software pipeline, and
34 1.1 mrg C a lot of stack slots for partial products in flight.
35 1.1 mrg C
36 1.1 mrg C CODE STRUCTURE:
37 1.1 mrg C save-some-registers
38 1.1 mrg C do 0, 1, 2, or 3 limbs
39 1.1 mrg C if done, restore-some-regs and return
40 1.1 mrg C save-many-regs
41 1.1 mrg C do 4, 8, ... limbs
42 1.1 mrg C restore-all-regs
43 1.1 mrg
44 1.1 mrg C STACK LAYOUT:
45 1.1 mrg C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 1.1 mrg C slots marked FREE, as well as some slots in the caller's "frame marker".
47 1.1 mrg C
48 1.1 mrg C -00 <- r30
49 1.1 mrg C -08 FREE
50 1.1 mrg C -10 tmp
51 1.1 mrg C -18 tmp
52 1.1 mrg C -20 tmp
53 1.1 mrg C -28 tmp
54 1.1 mrg C -30 tmp
55 1.1 mrg C -38 tmp
56 1.1 mrg C -40 tmp
57 1.1 mrg C -48 tmp
58 1.1 mrg C -50 tmp
59 1.1 mrg C -58 tmp
60 1.1 mrg C -60 tmp
61 1.1 mrg C -68 tmp
62 1.1 mrg C -70 tmp
63 1.1 mrg C -78 tmp
64 1.1 mrg C -80 tmp
65 1.1 mrg C -88 tmp
66 1.1 mrg C -90 FREE
67 1.1 mrg C -98 FREE
68 1.1 mrg C -a0 FREE
69 1.1 mrg C -a8 FREE
70 1.1 mrg C -b0 r13
71 1.1 mrg C -b8 r12
72 1.1 mrg C -c0 r11
73 1.1 mrg C -c8 r10
74 1.1 mrg C -d0 r9
75 1.1 mrg C -d8 r8
76 1.1 mrg C -e0 r7
77 1.1 mrg C -e8 r6
78 1.1 mrg C -f0 r5
79 1.1 mrg C -f8 r4
80 1.1 mrg C -100 r3
81 1.1 mrg C Previous frame:
82 1.1 mrg C [unused area]
83 1.1 mrg C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
84 1.1 mrg
85 1.1 mrg
86 1.1 mrg include(`../config.m4')
87 1.1 mrg
88 1.1 mrg C INPUT PARAMETERS:
89 1.1 mrg define(`rp',`%r26') C destination limb vector pointer
90 1.1 mrg define(`up',`%r25') C source limb vector pointer
91 1.1 mrg define(`n',`%r24') C limb count
92 1.1 mrg define(`vlimb',`%r23') C multiplier limb (in register for 2.0w; in home slot for 2.0N)
93 1.1 mrg
94 1.1 mrg define(`climb',`%r23') C carry limb; reuses %r23 once vlimb is in %fr8
95 1.1 mrg
96 1.1 mrg ifdef(`HAVE_ABI_2_0w',
97 1.1 mrg ` .level 2.0w
98 1.1 mrg ',` .level 2.0
99 1.1 mrg ')
100 1.1 mrg PROLOGUE(mpn_mul_1)
101 1.1 mrg C Entry: allocate the 0x100 byte frame, save r3..r5, move vlimb to %fr8.
102 1.1 mrg ifdef(`HAVE_ABI_2_0w',
103 1.1 mrg ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
104 1.1 mrg ')
105 1.1 mrg std,ma %r3, 0x100(%r30) C save r3 at old r30, then r30 += 0x100
106 1.1 mrg std %r4, -0xf8(%r30) C save r4
107 1.1 mrg std %r5, -0xf0(%r30) C save r5
108 1.1 mrg ldo 0(%r0), climb C clear climb
109 1.1 mrg fldd -0x138(%r30), %fr8 C put vlimb in fp register
110 1.1 mrg
111 1.1 mrg define(`p032a1',`%r1') C first mid product (loaded from -0x78)
112 1.1 mrg define(`p032a2',`%r19') C second mid product (loaded from -0x70)
113 1.1 mrg
114 1.1 mrg define(`m032',`%r20') C sum of the two mid products
115 1.1 mrg define(`m096',`%r21') C carry out of the m032 sum
116 1.1 mrg
117 1.1 mrg define(`p000a',`%r22') C low product (loaded from -0x80)
118 1.1 mrg define(`p064a',`%r29') C high product (loaded from -0x68)
119 1.1 mrg
120 1.1 mrg define(`s000',`%r31') C result limb being accumulated
121 1.1 mrg
122 1.1 mrg define(`ma000',`%r4') C m032 << 32
123 1.1 mrg define(`ma064',`%r20') C (m032 >> 32) | (m096 << 32); same register as m032
124 1.1 mrg
125 1.1 mrg C define(`r000',`%r3') C FIXME don't save r3 for n < 4.
126 1.1 mrg
127 1.1 mrg extrd,u n, 63, 2, %r5 C r5 = low two bits of n (n mod 4)
128 1.1 mrg cmpb,= %r5, %r0, L(BIG) C n mod 4 == 0: skip the 1..3 limb code
129 1.1 mrg nop C branch delay slot
130 1.1 mrg
131 1.1 mrg fldd 0(up), %fr4 C load first source limb
132 1.1 mrg ldo 8(up), up C advance up by one limb
133 1.1 mrg xmpyu %fr8R, %fr4L, %fr22 C vlimb low half * limb high half
134 1.1 mrg xmpyu %fr8L, %fr4R, %fr23 C vlimb high half * limb low half
135 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 1.1 mrg xmpyu %fr8R, %fr4R, %fr24 C low half * low half
137 1.1 mrg xmpyu %fr8L, %fr4L, %fr25 C high half * high half
138 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
140 1.1 mrg addib,<> -1, %r5, L(two_or_more) C decrement r5; branch if nonzero
141 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
142 1.1 mrg LDEF(one)
143 1.1 mrg ldd -0x78(%r30), p032a1 C fetch the four partial products
144 1.1 mrg ldd -0x70(%r30), p032a2
145 1.1 mrg ldd -0x80(%r30), p000a
146 1.1 mrg b L(0_one_out) C finish the single limb in the wind-down code
147 1.1 mrg ldd -0x68(%r30), p064a C (delay slot)
148 1.1 mrg
149 1.1 mrg LDEF(two_or_more)
150 1.1 mrg fldd 0(up), %fr4 C load second source limb
151 1.1 mrg ldo 8(up), up C advance up by one limb
152 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
153 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
154 1.1 mrg ldd -0x78(%r30), p032a1 C fetch first-limb product before its slot is reused
155 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
157 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
158 1.1 mrg ldd -0x70(%r30), p032a2
159 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 1.1 mrg ldd -0x80(%r30), p000a
161 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 1.1 mrg ldd -0x68(%r30), p064a
163 1.1 mrg addib,<> -1, %r5, L(three_or_more) C decrement r5; branch if nonzero
164 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
165 1.1 mrg LDEF(two)
166 1.1 mrg add p032a1, p032a2, m032 C m032 = sum of the two mid products
167 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
168 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
169 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
170 1.1 mrg b L(0_two_out)
171 1.1 mrg depd m096, 31, 32, ma064 C (delay slot) m096 into upper 32 bits of ma064
172 1.1 mrg
173 1.1 mrg LDEF(three_or_more)
174 1.1 mrg fldd 0(up), %fr4 C load third source limb
175 1.1 mrg add p032a1, p032a2, m032 C m032 = sum of first-limb mid products
176 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
177 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
178 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
179 1.1 mrg C addib,= -1, %r5, L(0_out)
180 1.1 mrg depd m096, 31, 32, ma064 C m096 into upper 32 bits of ma064
181 1.1 mrg LDEF(loop0)
182 1.1 mrg C xmpyu %fr8R, %fr4L, %fr22
183 1.1 mrg C xmpyu %fr8L, %fr4R, %fr23
184 1.1 mrg C ldd -0x78(%r30), p032a1
185 1.1 mrg C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
186 1.1 mrg C NOTE: this loop0 body is disabled; control falls through to 0_out.
187 1.1 mrg C xmpyu %fr8R, %fr4R, %fr24
188 1.1 mrg C xmpyu %fr8L, %fr4L, %fr25
189 1.1 mrg C ldd -0x70(%r30), p032a2
190 1.1 mrg C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
191 1.1 mrg C
192 1.1 mrg C ldo 8(rp), rp
193 1.1 mrg C add climb, p000a, s000
194 1.1 mrg C ldd -0x80(%r30), p000a
195 1.1 mrg C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
196 1.1 mrg C
197 1.1 mrg C add,dc p064a, %r0, climb
198 1.1 mrg C ldo 8(up), up
199 1.1 mrg C ldd -0x68(%r30), p064a
200 1.1 mrg C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
201 1.1 mrg C
202 1.1 mrg C add ma000, s000, s000
203 1.1 mrg C add,dc ma064, climb, climb
204 1.1 mrg C fldd 0(up), %fr4
205 1.1 mrg C
206 1.1 mrg C std s000, -8(rp)
207 1.1 mrg C
208 1.1 mrg C add p032a1, p032a2, m032
209 1.1 mrg C add,dc %r0, %r0, m096
210 1.1 mrg C
211 1.1 mrg C depd,z m032, 31, 32, ma000
212 1.1 mrg C extrd,u m032, 31, 32, ma064
213 1.1 mrg C addib,<> -1, %r5, L(loop0)
214 1.1 mrg C depd m096, 31, 32, ma064
215 1.1 mrg LDEF(0_out)
216 1.1 mrg ldo 8(up), up C advance up past the third limb
217 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
218 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
219 1.1 mrg ldd -0x78(%r30), p032a1 C fetch second-limb product before its slot is reused
220 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
221 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
222 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
223 1.1 mrg ldd -0x70(%r30), p032a2
224 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
225 1.1 mrg ldo 8(rp), rp C advance rp; the store below uses -8(rp)
226 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product
227 1.1 mrg ldd -0x80(%r30), p000a
228 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
229 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
230 1.1 mrg ldd -0x68(%r30), p064a
231 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
232 1.1 mrg add ma000, s000, s000 C add realigned mid sum, low part
233 1.1 mrg add,dc ma064, climb, climb C add realigned mid sum, high part
234 1.1 mrg std s000, -8(rp) C store first result limb
235 1.1 mrg add p032a1, p032a2, m032 C mid sum for the second limb
236 1.1 mrg add,dc %r0, %r0, m096
237 1.1 mrg depd,z m032, 31, 32, ma000
238 1.1 mrg extrd,u m032, 31, 32, ma064
239 1.1 mrg depd m096, 31, 32, ma064
240 1.1 mrg LDEF(0_two_out)
241 1.1 mrg ldd -0x78(%r30), p032a1 C fetch mid products of the last limb
242 1.1 mrg ldd -0x70(%r30), p032a2
243 1.1 mrg ldo 8(rp), rp C advance rp; the store below uses -8(rp)
244 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product
245 1.1 mrg ldd -0x80(%r30), p000a C fetch low product of the last limb
246 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
247 1.1 mrg ldd -0x68(%r30), p064a C fetch high product of the last limb
248 1.1 mrg add ma000, s000, s000 C add realigned mid sum, low part
249 1.1 mrg add,dc ma064, climb, climb C add realigned mid sum, high part
250 1.1 mrg std s000, -8(rp) C store result limb
251 1.1 mrg LDEF(0_one_out)
252 1.1 mrg add p032a1, p032a2, m032 C m032 = sum of the two mid products
253 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
254 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
255 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
256 1.1 mrg depd m096, 31, 32, ma064 C m096 into upper 32 bits of ma064
257 1.1 mrg
258 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product
259 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
260 1.1 mrg add ma000, s000, s000 C add realigned mid sum, low part
261 1.1 mrg add,dc ma064, climb, climb C add realigned mid sum, high part
262 1.1 mrg std s000, 0(rp) C store last result limb of the 1..3 limb part
263 1.1 mrg
264 1.1 mrg cmpib,>= 4, n, L(done) C if 4 >= n, nothing is left for the unrolled code
265 1.1 mrg ldo 8(rp), rp C (delay slot) advance rp past the limb just stored
266 1.1 mrg
267 1.1 mrg C 4-way unrolled code.
268 1.1 mrg
269 1.1 mrg LDEF(BIG)
270 1.1 mrg
271 1.1 mrg define(`p032a1',`%r1') C mid product pair of limb a
272 1.1 mrg define(`p032a2',`%r19') C
273 1.1 mrg define(`p096b1',`%r20') C mid product pair of limb b
274 1.1 mrg define(`p096b2',`%r21') C
275 1.1 mrg define(`p160c1',`%r22') C mid product pair of limb c
276 1.1 mrg define(`p160c2',`%r29') C
277 1.1 mrg define(`p224d1',`%r31') C mid product pair of limb d
278 1.1 mrg define(`p224d2',`%r3') C
279 1.1 mrg C
280 1.1 mrg define(`m032',`%r4') C sum of limb a mid products
281 1.1 mrg define(`m096',`%r5') C sum of limb b mid products plus carry
282 1.1 mrg define(`m160',`%r6') C sum of limb c mid products plus carry
283 1.1 mrg define(`m224',`%r7') C sum of limb d mid products plus carry
284 1.1 mrg define(`m288',`%r8') C carry out of the mid product chain
285 1.1 mrg C
286 1.1 mrg define(`p000a',`%r1') C low product of limb a (reuses the p032a1 register)
287 1.1 mrg define(`p064a',`%r19') C high product of limb a
288 1.1 mrg define(`p064b',`%r20') C low product of limb b
289 1.1 mrg define(`p128b',`%r21') C high product of limb b
290 1.1 mrg define(`p128c',`%r22') C low product of limb c
291 1.1 mrg define(`p192c',`%r29') C high product of limb c
292 1.1 mrg define(`p192d',`%r31') C low product of limb d
293 1.1 mrg define(`p256d',`%r3') C high product of limb d
294 1.1 mrg C
295 1.1 mrg define(`s000',`%r10') C result limb 0 of the group
296 1.1 mrg define(`s064',`%r11') C result limb 1 of the group
297 1.1 mrg define(`s128',`%r12') C result limb 2 of the group
298 1.1 mrg define(`s192',`%r13') C result limb 3 of the group
299 1.1 mrg C
300 1.1 mrg define(`ma000',`%r9') C m032 << 32
301 1.1 mrg define(`ma064',`%r4') C (m032 >> 32) | (m096 << 32)
302 1.1 mrg define(`ma128',`%r5') C (m096 >> 32) | (m160 << 32)
303 1.1 mrg define(`ma192',`%r6') C (m160 >> 32) | (m224 << 32)
304 1.1 mrg define(`ma256',`%r7') C (m224 >> 32) | (m288 << 32)
305 1.1 mrg
306 1.1 mrg std %r6, -0xe8(%r30) C save r6..r13; reloaded just before done
307 1.1 mrg std %r7, -0xe0(%r30)
308 1.1 mrg std %r8, -0xd8(%r30)
309 1.1 mrg std %r9, -0xd0(%r30)
310 1.1 mrg std %r10, -0xc8(%r30)
311 1.1 mrg std %r11, -0xc0(%r30)
312 1.1 mrg std %r12, -0xb8(%r30)
313 1.1 mrg std %r13, -0xb0(%r30)
314 1.1 mrg
315 1.1 mrg ifdef(`HAVE_ABI_2_0w',
316 1.1 mrg ` extrd,u n, 61, 62, n C right shift 2
317 1.1 mrg ',` extrd,u n, 61, 30, n C right shift 2, zero extend
318 1.1 mrg ')
319 1.1 mrg
320 1.1 mrg LDEF(4_or_more)
321 1.1 mrg fldd 0(up), %fr4 C load limbs a, b, c, d of the first group
322 1.1 mrg fldd 8(up), %fr5
323 1.1 mrg fldd 16(up), %fr6
324 1.1 mrg fldd 24(up), %fr7
325 1.1 mrg xmpyu %fr8R, %fr4L, %fr22 C mid products of limb a
326 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
327 1.1 mrg xmpyu %fr8R, %fr5L, %fr24 C mid products of limb b
328 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
329 1.1 mrg xmpyu %fr8R, %fr6L, %fr26 C mid products of limb c
330 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
331 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
332 1.1 mrg xmpyu %fr8R, %fr7L, %fr28 C mid products of limb d
333 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
334 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
335 1.1 mrg xmpyu %fr8R, %fr4R, %fr30 C low and high products of limb a
336 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
337 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
338 1.1 mrg xmpyu %fr8R, %fr5R, %fr22 C low and high products of limb b
339 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
340 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
341 1.1 mrg xmpyu %fr8R, %fr6R, %fr24 C low and high products of limb c
342 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
343 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
344 1.1 mrg xmpyu %fr8R, %fr7R, %fr26 C low product of limb d
345 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
346 1.1 mrg addib,<> -1, n, L(8_or_more) C decrement group count; branch if more groups
347 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C (delay slot) high product of limb d
348 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
349 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
350 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
351 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
352 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
353 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
354 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
355 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
356 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
357 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
358 1.1 mrg ldd -0x78(%r30), p032a1 C single group only: fetch all mid products
359 1.1 mrg ldd -0x70(%r30), p032a2
360 1.1 mrg ldd -0x38(%r30), p096b1
361 1.1 mrg ldd -0x30(%r30), p096b2
362 1.1 mrg ldd -0x58(%r30), p160c1
363 1.1 mrg ldd -0x50(%r30), p160c2
364 1.1 mrg ldd -0x18(%r30), p224d1
365 1.1 mrg ldd -0x10(%r30), p224d2
366 1.1 mrg b L(end1) C go sum and store this group
367 1.1 mrg nop C branch delay slot
368 1.1 mrg
369 1.1 mrg LDEF(8_or_more)
370 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372 1.1 mrg ldo 32(up), up C advance up by four limbs
373 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
374 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
375 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
376 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
377 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
378 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
379 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
380 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
381 1.1 mrg fldd 0(up), %fr4 C load limbs a, b, c, d of the second group
382 1.1 mrg fldd 8(up), %fr5
383 1.1 mrg fldd 16(up), %fr6
384 1.1 mrg fldd 24(up), %fr7
385 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
386 1.1 mrg ldd -0x78(%r30), p032a1 C fetch first-group mid product before slot is reused
387 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
388 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
389 1.1 mrg ldd -0x70(%r30), p032a2
390 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
391 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
392 1.1 mrg ldd -0x38(%r30), p096b1
393 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
394 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
395 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
396 1.1 mrg ldd -0x30(%r30), p096b2
397 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
398 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
399 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
400 1.1 mrg ldd -0x58(%r30), p160c1
401 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
402 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
403 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
404 1.1 mrg ldd -0x50(%r30), p160c2
405 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
406 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
407 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
408 1.1 mrg ldd -0x18(%r30), p224d1
409 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
410 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
411 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
412 1.1 mrg ldd -0x10(%r30), p224d2
413 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
414 1.1 mrg addib,= -1, n, L(end2) C decrement group count; skip the loop if zero
415 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C (delay slot) high product of limb d
416 1.1 mrg LDEF(loop)
417 1.1 mrg add p032a1, p032a2, m032 C start mid product carry chain (limb a)
418 1.1 mrg ldd -0x80(%r30), p000a
419 1.1 mrg add,dc p096b1, p096b2, m096 C limb b mid sum plus carry
420 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
421 1.1 mrg
422 1.1 mrg add,dc p160c1, p160c2, m160 C limb c mid sum plus carry
423 1.1 mrg ldd -0x68(%r30), p064a
424 1.1 mrg add,dc p224d1, p224d2, m224 C limb d mid sum plus carry
425 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
426 1.1 mrg
427 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid chain
428 1.1 mrg ldd -0x40(%r30), p064b
429 1.1 mrg ldo 32(up), up C advance up by four limbs
430 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
431 1.1 mrg
432 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
433 1.1 mrg ldd -0x28(%r30), p128b
434 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
435 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
436 1.1 mrg
437 1.1 mrg depd m096, 31, 32, ma064 C low half of m096 into upper half of ma064
438 1.1 mrg ldd -0x60(%r30), p128c
439 1.1 mrg extrd,u m096, 31, 32, ma128 C ma128 = m096 >> 32
440 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
441 1.1 mrg
442 1.1 mrg depd m160, 31, 32, ma128 C low half of m160 into upper half of ma128
443 1.1 mrg ldd -0x48(%r30), p192c
444 1.1 mrg extrd,u m160, 31, 32, ma192 C ma192 = m160 >> 32
445 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
446 1.1 mrg
447 1.1 mrg depd m224, 31, 32, ma192 C low half of m224 into upper half of ma192
448 1.1 mrg ldd -0x20(%r30), p192d
449 1.1 mrg extrd,u m224, 31, 32, ma256 C ma256 = m224 >> 32
450 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
451 1.1 mrg
452 1.1 mrg depd m288, 31, 32, ma256 C low half of m288 into upper half of ma256
453 1.1 mrg ldd -0x88(%r30), p256d
454 1.1 mrg add climb, p000a, s000 C start low/high product chain with carry limb
455 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
456 1.1 mrg
457 1.1 mrg add,dc p064a, p064b, s064 C high of a + low of b
458 1.1 mrg add,dc p128b, p128c, s128 C high of b + low of c
459 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
460 1.1 mrg
461 1.1 mrg add,dc p192c, p192d, s192 C high of c + low of d
462 1.1 mrg add,dc p256d, %r0, climb C climb = high of d + carry
463 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
464 1.1 mrg
465 1.1 mrg add ma000, s000, s000 C accum mid 0
466 1.1 mrg fldd 0(up), %fr4 C load limb a of the next group
467 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
468 1.1 mrg std s000, 0(rp) C store result limb 0
469 1.1 mrg
470 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
471 1.1 mrg fldd 8(up), %fr5 C load limb b of the next group
472 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
473 1.1 mrg std s064, 8(rp) C store result limb 1
474 1.1 mrg
475 1.1 mrg add,dc ma256, climb, climb C fold top of mid sums into carry limb
476 1.1 mrg fldd 16(up), %fr6 C load limb c of the next group
477 1.1 mrg std s128, 16(rp) C store result limb 2
478 1.1 mrg
479 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
480 1.1 mrg ldd -0x78(%r30), p032a1
481 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
482 1.1 mrg fldd 24(up), %fr7 C load limb d of the next group
483 1.1 mrg
484 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
485 1.1 mrg ldd -0x70(%r30), p032a2
486 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
487 1.1 mrg std s192, 24(rp) C store result limb 3
488 1.1 mrg
489 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
490 1.1 mrg ldd -0x38(%r30), p096b1
491 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
492 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
493 1.1 mrg
494 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
495 1.1 mrg ldd -0x30(%r30), p096b2
496 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
497 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
498 1.1 mrg
499 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
500 1.1 mrg ldd -0x58(%r30), p160c1
501 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
502 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
503 1.1 mrg
504 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
505 1.1 mrg ldd -0x50(%r30), p160c2
506 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
507 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
508 1.1 mrg
509 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
510 1.1 mrg ldd -0x18(%r30), p224d1
511 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
512 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
513 1.1 mrg
514 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
515 1.1 mrg ldd -0x10(%r30), p224d2
516 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
517 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
518 1.1 mrg
519 1.1 mrg addib,<> -1, n, L(loop) C decrement group count; loop while nonzero
520 1.1 mrg ldo 32(rp), rp C (delay slot) advance rp by four limbs
521 1.1 mrg
522 1.1 mrg LDEF(end2)
523 1.1 mrg add p032a1, p032a2, m032 C start mid product carry chain (limb a)
524 1.1 mrg ldd -0x80(%r30), p000a
525 1.1 mrg add,dc p096b1, p096b2, m096
526 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
527 1.1 mrg add,dc p160c1, p160c2, m160
528 1.1 mrg ldd -0x68(%r30), p064a
529 1.1 mrg add,dc p224d1, p224d2, m224
530 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
531 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid chain
532 1.1 mrg ldd -0x40(%r30), p064b
533 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
534 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
535 1.1 mrg ldd -0x28(%r30), p128b
536 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
537 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
538 1.1 mrg depd m096, 31, 32, ma064
539 1.1 mrg ldd -0x60(%r30), p128c
540 1.1 mrg extrd,u m096, 31, 32, ma128
541 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
542 1.1 mrg depd m160, 31, 32, ma128
543 1.1 mrg ldd -0x48(%r30), p192c
544 1.1 mrg extrd,u m160, 31, 32, ma192
545 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
546 1.1 mrg depd m224, 31, 32, ma192
547 1.1 mrg ldd -0x20(%r30), p192d
548 1.1 mrg extrd,u m224, 31, 32, ma256
549 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
550 1.1 mrg depd m288, 31, 32, ma256
551 1.1 mrg ldd -0x88(%r30), p256d
552 1.1 mrg add climb, p000a, s000 C start low/high product chain with carry limb
553 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
554 1.1 mrg add,dc p064a, p064b, s064
555 1.1 mrg add,dc p128b, p128c, s128
556 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
557 1.1 mrg add,dc p192c, p192d, s192
558 1.1 mrg add,dc p256d, %r0, climb C climb = high of d + carry
559 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
560 1.1 mrg add ma000, s000, s000 C accum mid 0
561 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
562 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
563 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
564 1.1 mrg add,dc ma256, climb, climb C fold top of mid sums into carry limb
565 1.1 mrg std s000, 0(rp) C store the four result limbs of this group
566 1.1 mrg std s064, 8(rp)
567 1.1 mrg ldd -0x78(%r30), p032a1 C fetch mid products of the final group
568 1.1 mrg std s128, 16(rp)
569 1.1 mrg ldd -0x70(%r30), p032a2
570 1.1 mrg std s192, 24(rp)
571 1.1 mrg ldd -0x38(%r30), p096b1
572 1.1 mrg ldd -0x30(%r30), p096b2
573 1.1 mrg ldd -0x58(%r30), p160c1
574 1.1 mrg ldd -0x50(%r30), p160c2
575 1.1 mrg ldd -0x18(%r30), p224d1
576 1.1 mrg ldd -0x10(%r30), p224d2
577 1.1 mrg ldo 32(rp), rp C advance rp by four limbs
578 1.1 mrg
579 1.1 mrg LDEF(end1)
580 1.1 mrg add p032a1, p032a2, m032 C start mid product carry chain (limb a)
581 1.1 mrg ldd -0x80(%r30), p000a
582 1.1 mrg add,dc p096b1, p096b2, m096
583 1.1 mrg add,dc p160c1, p160c2, m160
584 1.1 mrg ldd -0x68(%r30), p064a
585 1.1 mrg add,dc p224d1, p224d2, m224
586 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid chain
587 1.1 mrg ldd -0x40(%r30), p064b
588 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = m032 << 32
589 1.1 mrg ldd -0x28(%r30), p128b
590 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = m032 >> 32
591 1.1 mrg depd m096, 31, 32, ma064
592 1.1 mrg ldd -0x60(%r30), p128c
593 1.1 mrg extrd,u m096, 31, 32, ma128
594 1.1 mrg depd m160, 31, 32, ma128
595 1.1 mrg ldd -0x48(%r30), p192c
596 1.1 mrg extrd,u m160, 31, 32, ma192
597 1.1 mrg depd m224, 31, 32, ma192
598 1.1 mrg ldd -0x20(%r30), p192d
599 1.1 mrg extrd,u m224, 31, 32, ma256
600 1.1 mrg depd m288, 31, 32, ma256
601 1.1 mrg ldd -0x88(%r30), p256d
602 1.1 mrg add climb, p000a, s000 C start low/high product chain with carry limb
603 1.1 mrg add,dc p064a, p064b, s064
604 1.1 mrg add,dc p128b, p128c, s128
605 1.1 mrg add,dc p192c, p192d, s192
606 1.1 mrg add,dc p256d, %r0, climb C climb = high of d + carry
607 1.1 mrg add ma000, s000, s000 C accum mid 0
608 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
609 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
610 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
611 1.1 mrg add,dc ma256, climb, climb C final carry limb
612 1.1 mrg std s000, 0(rp) C store the last four result limbs
613 1.1 mrg std s064, 8(rp)
614 1.1 mrg std s128, 16(rp)
615 1.1 mrg std s192, 24(rp)
616 1.1 mrg
617 1.1 mrg ldd -0xb0(%r30), %r13 C restore r13..r6 saved at BIG
618 1.1 mrg ldd -0xb8(%r30), %r12
619 1.1 mrg ldd -0xc0(%r30), %r11
620 1.1 mrg ldd -0xc8(%r30), %r10
621 1.1 mrg ldd -0xd0(%r30), %r9
622 1.1 mrg ldd -0xd8(%r30), %r8
623 1.1 mrg ldd -0xe0(%r30), %r7
624 1.1 mrg ldd -0xe8(%r30), %r6
625 1.1 mrg LDEF(done)
626 1.1 mrg ifdef(`HAVE_ABI_2_0w',
627 1.1 mrg ` copy climb, %r28 C carry limb to r28
628 1.1 mrg ',` extrd,u climb, 63, 32, %r29 C low 32 bits of carry limb to r29
629 1.1 mrg extrd,u climb, 31, 32, %r28 C high 32 bits of carry limb to r28
630 1.1 mrg ')
631 1.1 mrg ldd -0xf0(%r30), %r5 C restore r5
632 1.1 mrg ldd -0xf8(%r30), %r4 C restore r4
633 1.1 mrg bve (%r2) C branch to the address in r2 (presumably the return pointer)
634 1.1 mrg ldd,mb -0x100(%r30), %r3 C (delay slot) r30 -= 0x100, then restore r3
635 1.1 mrg EPILOGUE(mpn_mul_1)
636