mul_1.asm revision 1.1.1.2 1 1.1 mrg dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 1.1 mrg dnl the result in a second limb vector.
3 1.1 mrg
4 1.1.1.2 mrg dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg dnl This file is part of the GNU MP Library.
7 1.1.1.2 mrg dnl
8 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1.1.2 mrg dnl it under the terms of either:
10 1.1.1.2 mrg dnl
11 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free
12 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your
13 1.1.1.2 mrg dnl option) any later version.
14 1.1.1.2 mrg dnl
15 1.1.1.2 mrg dnl or
16 1.1.1.2 mrg dnl
17 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software
18 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any
19 1.1.1.2 mrg dnl later version.
20 1.1.1.2 mrg dnl
21 1.1.1.2 mrg dnl or both in parallel, as here.
22 1.1.1.2 mrg dnl
23 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 1.1.1.2 mrg dnl for more details.
27 1.1.1.2 mrg dnl
28 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the
29 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/.
31 1.1 mrg
32 1.1 mrg include(`../config.m4')
33 1.1 mrg
34 1.1 mrg C cycles/limb
35 1.1 mrg C 8000,8200: 6.5
36 1.1 mrg C 8500,8600,8700: 5.625
37 1.1 mrg
38 1.1 mrg C The feed-in and wind-down code has not yet been scheduled. Many cycles
39 1.1 mrg C could be saved there per call.
40 1.1 mrg
41 1.1 mrg C DESCRIPTION:
42 1.1 mrg C The main loop "BIG" is 4-way unrolled, mainly to allow
43 1.1 mrg C effective use of ADD,DC. Delays in moving data via the cache from the FP
44 1.1 mrg C registers to the IU registers, have demanded a deep software pipeline, and
45 1.1 mrg C a lot of stack slots for partial products in flight.
46 1.1 mrg C
47 1.1 mrg C CODE STRUCTURE:
48 1.1 mrg C save-some-registers
49 1.1 mrg C do 0, 1, 2, or 3 limbs
50 1.1 mrg C if done, restore-some-regs and return
51 1.1 mrg C save-many-regs
52 1.1 mrg C do 4, 8, ... limbs
53 1.1 mrg C restore-all-regs
54 1.1 mrg
55 1.1 mrg C STACK LAYOUT:
56 1.1 mrg C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
57 1.1 mrg C slots marked FREE, as well as some slots in the caller's "frame marker".
58 1.1 mrg C
59 1.1 mrg C -00 <- r30
60 1.1 mrg C -08 FREE
61 1.1 mrg C -10 tmp
62 1.1 mrg C -18 tmp
63 1.1 mrg C -20 tmp
64 1.1 mrg C -28 tmp
65 1.1 mrg C -30 tmp
66 1.1 mrg C -38 tmp
67 1.1 mrg C -40 tmp
68 1.1 mrg C -48 tmp
69 1.1 mrg C -50 tmp
70 1.1 mrg C -58 tmp
71 1.1 mrg C -60 tmp
72 1.1 mrg C -68 tmp
73 1.1 mrg C -70 tmp
74 1.1 mrg C -78 tmp
75 1.1 mrg C -80 tmp
76 1.1 mrg C -88 tmp
77 1.1 mrg C -90 FREE
78 1.1 mrg C -98 FREE
79 1.1 mrg C -a0 FREE
80 1.1 mrg C -a8 FREE
81 1.1 mrg C -b0 r13
82 1.1 mrg C -b8 r12
83 1.1 mrg C -c0 r11
84 1.1 mrg C -c8 r10
85 1.1 mrg C -d0 r9
86 1.1 mrg C -d8 r8
87 1.1 mrg C -e0 r7
88 1.1 mrg C -e8 r6
89 1.1 mrg C -f0 r5
90 1.1 mrg C -f8 r4
91 1.1 mrg C -100 r3
92 1.1 mrg C Previous frame:
93 1.1 mrg C [unused area]
94 1.1 mrg C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
95 1.1 mrg
96 1.1 mrg
97 1.1 mrg include(`../config.m4')
98 1.1 mrg
99 1.1 mrg C INPUT PARAMETERS:
C rp:    pointer the result limbs are stored through (std ..., N(rp)).
C up:    pointer the source limbs are loaded from (fldd N(up)).
C n:     limb count; its low two bits select the feed-in path and n >> 2
C        counts the 4-limb groups handled by the unrolled code.
C vlimb: the multiplier limb; it reaches %fr8 through its stack home slot.
100 1.1 mrg define(`rp',`%r26') C
101 1.1 mrg define(`up',`%r25') C
102 1.1 mrg define(`n',`%r24') C
103 1.1 mrg define(`vlimb',`%r23') C
104 1.1 mrg
C climb (the carry limb) reuses vlimb's register %r23. It is cleared in the
C entry code only after the 2.0w store of vlimb to its home slot, and it is
C the value copied to the return register(s) at L(done).
105 1.1 mrg define(`climb',`%r23') C
106 1.1 mrg
C Select the assembler level that matches the ABI in use.
107 1.1 mrg ifdef(`HAVE_ABI_2_0w',
108 1.1 mrg ` .level 2.0w
109 1.1 mrg ',` .level 2.0
110 1.1 mrg ')
C mpn_mul_1: multiply the n limbs at up by vlimb and store the low limb of each
C step at rp; the running carry limb is kept in climb and returned at L(done).
111 1.1 mrg PROLOGUE(mpn_mul_1)
112 1.1 mrg
C Under the 2.0w ABI, spill vlimb from its register to the home slot so that
C the fldd below can pick it up from memory in both ABIs.
113 1.1 mrg ifdef(`HAVE_ABI_2_0w',
114 1.1 mrg ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
115 1.1 mrg ')
C Save %r3 at the old stack pointer and advance %r30 by 0x100 in the same
C instruction (see STACK LAYOUT: %r3 ends up at -0x100). The home slot that was
C at -0x38 is therefore addressed as -0x138 from here on.
116 1.1 mrg std,ma %r3, 0x100(%r30)
C %r4 and %r5 are used by the feed-in code, so they are saved on every path.
117 1.1 mrg std %r4, -0xf8(%r30)
118 1.1 mrg std %r5, -0xf0(%r30)
119 1.1 mrg ldo 0(%r0), climb C clear climb
120 1.1 mrg fldd -0x138(%r30), %fr8 C put vlimb in fp register
121 1.1 mrg
C Register names for the feed-in code that handles the first n mod 4 limbs.
C Most of these names are redefined after L(BIG) for the unrolled code.
C
C p032a1, p032a2: the two mid partial products of one limb.
122 1.1 mrg define(`p032a1',`%r1') C
123 1.1 mrg define(`p032a2',`%r19') C
124 1.1 mrg
C m032: sum of the two mid products; m096: carry out of that sum.
125 1.1 mrg define(`m032',`%r20') C
126 1.1 mrg define(`m096',`%r21') C
127 1.1 mrg
C p000a: low partial product; p064a: high partial product.
128 1.1 mrg define(`p000a',`%r22') C
129 1.1 mrg define(`p064a',`%r29') C
130 1.1 mrg
C s000: the result limb being accumulated before it is stored at rp.
131 1.1 mrg define(`s000',`%r31') C
132 1.1 mrg
C ma000, ma064: the mid sum (with its carry) realigned into two words that are
C added to the low word s000 and to the carry word climb respectively.
C ma064 shares %r20 with m032, so the code always forms ma000 from m032 before
C the instruction that writes ma064.
133 1.1 mrg define(`ma000',`%r4') C
134 1.1 mrg define(`ma064',`%r20') C
135 1.1 mrg
136 1.1 mrg C define(`r000',`%r3') C FIXME don't save r3 for n < 4.
137 1.1 mrg
C %r5 = low two bits of n, i.e. the number of limbs (n mod 4) to process before
C the 4-way unrolled code. If it is zero, go straight to L(BIG).
138 1.1 mrg extrd,u n, 63, 2, %r5
139 1.1 mrg cmpb,= %r5, %r0, L(BIG)
140 1.1 mrg nop
141 1.1 mrg
C First feed-in limb: load it, advance up, form the four 32x32 partial products
C with vlimb (%fr8) and spill them to the stack so that the integer unit can
C reload them.
142 1.1 mrg fldd 0(up), %fr4
143 1.1 mrg ldo 8(up), up
144 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
145 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
146 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
147 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
148 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
149 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
150 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
C Count this limb; continue at L(two_or_more) if more feed-in limbs remain.
C The store of the high product sits in the branch delay slot and is needed on
C both paths.
151 1.1 mrg addib,<> -1, %r5, L(two_or_more)
152 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly one feed-in limb: reload its four partial products into integer
C registers and finish it at L(0_one_out). The last load is in the delay slot.
153 1.1 mrg LDEF(one)
154 1.1 mrg ldd -0x78(%r30), p032a1
155 1.1 mrg ldd -0x70(%r30), p032a2
156 1.1 mrg ldd -0x80(%r30), p000a
157 1.1 mrg b L(0_one_out)
158 1.1 mrg ldd -0x68(%r30), p064a
159 1.1 mrg
160 1.1 mrg LDEF(two_or_more)
C Second feed-in limb. Each ldd fetches a product of the previous limb from
C its stack slot just before the fstd that overwrites the same slot with the
C corresponding product of the new limb, so the load/store order matters.
161 1.1 mrg fldd 0(up), %fr4
162 1.1 mrg ldo 8(up), up
163 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
164 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
165 1.1 mrg ldd -0x78(%r30), p032a1
166 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
167 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
168 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
169 1.1 mrg ldd -0x70(%r30), p032a2
170 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
171 1.1 mrg ldd -0x80(%r30), p000a
172 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
173 1.1 mrg ldd -0x68(%r30), p064a
C Count this limb; a third feed-in limb remains only if %r5 is still nonzero.
C The high-product store is in the delay slot and runs on both paths.
174 1.1 mrg addib,<> -1, %r5, L(three_or_more)
175 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly two feed-in limbs: add the first limb's two mid products, keep the
C carry in m096, realign the sum into ma000/ma064, then finish both limbs at
C L(0_two_out). ma000 is formed before ma064 because ma064 overwrites m032's
C register. The final depd is in the branch delay slot.
176 1.1 mrg LDEF(two)
177 1.1 mrg add p032a1, p032a2, m032
178 1.1 mrg add,dc %r0, %r0, m096
179 1.1 mrg depd,z m032, 31, 32, ma000
180 1.1 mrg extrd,u m032, 31, 32, ma064
181 1.1 mrg b L(0_two_out)
182 1.1 mrg depd m096, 31, 32, ma064
183 1.1 mrg
184 1.1 mrg LDEF(three_or_more)
C Reached only when n mod 4 was 3. Load the third limb (up is advanced later,
C at L(0_out)) and meanwhile combine the first limb's mid products into
C ma000/ma064, exactly as at L(two).
185 1.1 mrg fldd 0(up), %fr4
186 1.1 mrg add p032a1, p032a2, m032
187 1.1 mrg add,dc %r0, %r0, m096
188 1.1 mrg depd,z m032, 31, 32, ma000
189 1.1 mrg extrd,u m032, 31, 32, ma064
190 1.1 mrg C addib,= -1, %r5, L(0_out)
191 1.1 mrg depd m096, 31, 32, ma064
C The body of loop0 below is disabled (every line is commented out), so
C execution falls straight through the label into L(0_out).
C NOTE(review): presumably kept as a template for a general feed-in loop; with
C at most three feed-in limbs no iteration is needed -- confirm before removing.
192 1.1 mrg LDEF(loop0)
193 1.1 mrg C xmpyu %fr8R, %fr4L, %fr22
194 1.1 mrg C xmpyu %fr8L, %fr4R, %fr23
195 1.1 mrg C ldd -0x78(%r30), p032a1
196 1.1 mrg C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
197 1.1 mrg C
198 1.1 mrg C xmpyu %fr8R, %fr4R, %fr24
199 1.1 mrg C xmpyu %fr8L, %fr4L, %fr25
200 1.1 mrg C ldd -0x70(%r30), p032a2
201 1.1 mrg C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
202 1.1 mrg C
203 1.1 mrg C ldo 8(rp), rp
204 1.1 mrg C add climb, p000a, s000
205 1.1 mrg C ldd -0x80(%r30), p000a
206 1.1 mrg C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
207 1.1 mrg C
208 1.1 mrg C add,dc p064a, %r0, climb
209 1.1 mrg C ldo 8(up), up
210 1.1 mrg C ldd -0x68(%r30), p064a
211 1.1 mrg C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
212 1.1 mrg C
213 1.1 mrg C add ma000, s000, s000
214 1.1 mrg C add,dc ma064, climb, climb
215 1.1 mrg C fldd 0(up), %fr4
216 1.1 mrg C
217 1.1 mrg C std s000, -8(rp)
218 1.1 mrg C
219 1.1 mrg C add p032a1, p032a2, m032
220 1.1 mrg C add,dc %r0, %r0, m096
221 1.1 mrg C
222 1.1 mrg C depd,z m032, 31, 32, ma000
223 1.1 mrg C extrd,u m032, 31, 32, ma064
224 1.1 mrg C addib,<> -1, %r5, L(loop0)
225 1.1 mrg C depd m096, 31, 32, ma064
226 1.1 mrg LDEF(0_out)
C Multiply the third limb while finishing the first one. The stack slots still
C hold the second limb's products: each is reloaded just before the third
C limb's product is stored over it.
227 1.1 mrg ldo 8(up), up
228 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
229 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
230 1.1 mrg ldd -0x78(%r30), p032a1
231 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
232 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
233 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
234 1.1 mrg ldd -0x70(%r30), p032a2
235 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
C First result limb: s000 = climb + low product, then add the realigned mid
C sum; the high product and the carries become the new climb. rp has already
C been advanced, hence the store to -8(rp).
236 1.1 mrg ldo 8(rp), rp
237 1.1 mrg add climb, p000a, s000
238 1.1 mrg ldd -0x80(%r30), p000a
239 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
240 1.1 mrg add,dc p064a, %r0, climb
241 1.1 mrg ldd -0x68(%r30), p064a
242 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
243 1.1 mrg add ma000, s000, s000
244 1.1 mrg add,dc ma064, climb, climb
245 1.1 mrg std s000, -8(rp)
C Combine the second limb's mid products for use at L(0_two_out).
246 1.1 mrg add p032a1, p032a2, m032
247 1.1 mrg add,dc %r0, %r0, m096
248 1.1 mrg depd,z m032, 31, 32, ma000
249 1.1 mrg extrd,u m032, 31, 32, ma064
250 1.1 mrg depd m096, 31, 32, ma064
251 1.1 mrg LDEF(0_two_out)
C Two limbs are still unfinished. On entry p000a/p064a and ma000/ma064 belong
C to the older one, and the stack slots hold the four products of the last one.
C Reload the last limb's products while completing and storing the older limb.
252 1.1 mrg ldd -0x78(%r30), p032a1
253 1.1 mrg ldd -0x70(%r30), p032a2
254 1.1 mrg ldo 8(rp), rp
255 1.1 mrg add climb, p000a, s000
256 1.1 mrg ldd -0x80(%r30), p000a
257 1.1 mrg add,dc p064a, %r0, climb
258 1.1 mrg ldd -0x68(%r30), p064a
259 1.1 mrg add ma000, s000, s000
260 1.1 mrg add,dc ma064, climb, climb
261 1.1 mrg std s000, -8(rp)
262 1.1 mrg LDEF(0_one_out)
C Finish the last feed-in limb: add the two mid products (carry kept in m096)
C and realign the sum into ma000/ma064.
263 1.1 mrg add p032a1, p032a2, m032
264 1.1 mrg add,dc %r0, %r0, m096
265 1.1 mrg depd,z m032, 31, 32, ma000
266 1.1 mrg extrd,u m032, 31, 32, ma064
267 1.1 mrg depd m096, 31, 32, ma064
268 1.1 mrg
C s000 = climb + low product + ma000 is the result limb; the high product,
C ma064 and the carries form the new climb.
269 1.1 mrg add climb, p000a, s000
270 1.1 mrg add,dc p064a, %r0, climb
271 1.1 mrg add ma000, s000, s000
272 1.1 mrg add,dc ma064, climb, climb
273 1.1 mrg std s000, 0(rp)
274 1.1 mrg
C This path is only taken when n mod 4 is nonzero, so 4 >= n means no complete
C group of four limbs remains and the routine can return. The delay slot moves
C rp past the limb just stored on both paths; otherwise fall into L(BIG).
275 1.1 mrg cmpib,>= 4, n, L(done)
276 1.1 mrg ldo 8(rp), rp
277 1.1 mrg
278 1.1 mrg C 4-way unrolled code.
279 1.1 mrg
280 1.1 mrg LDEF(BIG)
281 1.1 mrg
C Register names for the unrolled code; these replace the feed-in definitions.
C The letters a..d denote the four limbs of a group.
C
C Mid partial products, two per limb.
282 1.1 mrg define(`p032a1',`%r1') C
283 1.1 mrg define(`p032a2',`%r19') C
284 1.1 mrg define(`p096b1',`%r20') C
285 1.1 mrg define(`p096b2',`%r21') C
286 1.1 mrg define(`p160c1',`%r22') C
287 1.1 mrg define(`p160c2',`%r29') C
288 1.1 mrg define(`p224d1',`%r31') C
289 1.1 mrg define(`p224d2',`%r3') C
290 1.1 mrg C
C Sums of the mid product pairs; m288 receives the final carry of that chain.
291 1.1 mrg define(`m032',`%r4') C
292 1.1 mrg define(`m096',`%r5') C
293 1.1 mrg define(`m160',`%r6') C
294 1.1 mrg define(`m224',`%r7') C
295 1.1 mrg define(`m288',`%r8') C
296 1.1 mrg C
C Low and high partial products. They occupy the same eight registers as the
C mid products above and are loaded only after the mid sums have been formed.
297 1.1 mrg define(`p000a',`%r1') C
298 1.1 mrg define(`p064a',`%r19') C
299 1.1 mrg define(`p064b',`%r20') C
300 1.1 mrg define(`p128b',`%r21') C
301 1.1 mrg define(`p128c',`%r22') C
302 1.1 mrg define(`p192c',`%r29') C
303 1.1 mrg define(`p192d',`%r31') C
304 1.1 mrg define(`p256d',`%r3') C
305 1.1 mrg C
C The four result limbs of a group.
306 1.1 mrg define(`s000',`%r10') C
307 1.1 mrg define(`s064',`%r11') C
308 1.1 mrg define(`s128',`%r12') C
309 1.1 mrg define(`s192',`%r13') C
310 1.1 mrg C
C Realigned mid sums. ma064..ma256 share %r4..%r7 with m032..m224, so each
C ma word is built right after the last read of the m register it replaces.
311 1.1 mrg define(`ma000',`%r9') C
312 1.1 mrg define(`ma064',`%r4') C
313 1.1 mrg define(`ma128',`%r5') C
314 1.1 mrg define(`ma192',`%r6') C
315 1.1 mrg define(`ma256',`%r7') C
316 1.1 mrg
C Save %r6..%r13, which only the unrolled code uses; they are restored after
C L(end1). %r3, %r4 and %r5 were already saved in the entry code.
317 1.1 mrg std %r6, -0xe8(%r30)
318 1.1 mrg std %r7, -0xe0(%r30)
319 1.1 mrg std %r8, -0xd8(%r30)
320 1.1 mrg std %r9, -0xd0(%r30)
321 1.1 mrg std %r10, -0xc8(%r30)
322 1.1 mrg std %r11, -0xc0(%r30)
323 1.1 mrg std %r12, -0xb8(%r30)
324 1.1 mrg std %r13, -0xb0(%r30)
325 1.1 mrg
C Turn n into the number of 4-limb groups. The non-2.0w variant keeps only 30
C bits of the quotient, which zero extends the count.
326 1.1 mrg ifdef(`HAVE_ABI_2_0w',
327 1.1 mrg ` extrd,u n, 61, 62, n C right shift 2
328 1.1 mrg ',` extrd,u n, 61, 30, n C right shift 2, zero extend
329 1.1 mrg ')
330 1.1 mrg
331 1.1 mrg LDEF(4_or_more)
C Load the first group of four limbs and form all 16 partial products.
C %fr22..%fr27 are used twice: first for mid products, and again for low/high
C products once the corresponding mid product has been stored.
332 1.1 mrg fldd 0(up), %fr4
333 1.1 mrg fldd 8(up), %fr5
334 1.1 mrg fldd 16(up), %fr6
335 1.1 mrg fldd 24(up), %fr7
336 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
337 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
338 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
339 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
340 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
341 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
342 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
343 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
344 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
345 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
346 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
347 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
348 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
349 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
350 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
351 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
352 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
353 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
354 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
355 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
356 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Count this group; if further groups remain, switch to the pipelined path at
C L(8_or_more). The last multiply is in the delay slot and runs on both paths.
357 1.1 mrg addib,<> -1, n, L(8_or_more)
358 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
C Only one group: spill the remaining products, reload the eight mid products
C into integer registers and finish at L(end1).
359 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
360 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
361 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
362 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
363 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
364 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
365 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
366 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
367 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
368 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
369 1.1 mrg ldd -0x78(%r30), p032a1
370 1.1 mrg ldd -0x70(%r30), p032a2
371 1.1 mrg ldd -0x38(%r30), p096b1
372 1.1 mrg ldd -0x30(%r30), p096b2
373 1.1 mrg ldd -0x58(%r30), p160c1
374 1.1 mrg ldd -0x50(%r30), p160c2
375 1.1 mrg ldd -0x18(%r30), p224d1
376 1.1 mrg ldd -0x10(%r30), p224d2
377 1.1 mrg b L(end1)
378 1.1 mrg nop
379 1.1 mrg
380 1.1 mrg LDEF(8_or_more)
C At least two groups. Spill the rest of the first group's products, advance up
C to the second group and load its four limbs.
381 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
382 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
383 1.1 mrg ldo 32(up), up
384 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
385 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
386 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
387 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
388 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
389 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
390 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
391 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
392 1.1 mrg fldd 0(up), %fr4
393 1.1 mrg fldd 8(up), %fr5
394 1.1 mrg fldd 16(up), %fr6
395 1.1 mrg fldd 24(up), %fr7
C Multiply the second group while reloading the first group's mid products.
C Each mid slot is read (ldd) before the second group's product is stored over
C it (fstd). The -0x18/-0x10 slots are only read here; the new %fr28/%fr29 are
C stored at the top of L(loop) or L(end2).
396 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
397 1.1 mrg ldd -0x78(%r30), p032a1
398 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
399 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
400 1.1 mrg ldd -0x70(%r30), p032a2
401 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
402 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
403 1.1 mrg ldd -0x38(%r30), p096b1
404 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
405 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
406 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
407 1.1 mrg ldd -0x30(%r30), p096b2
408 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
409 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
410 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
411 1.1 mrg ldd -0x58(%r30), p160c1
412 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
413 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
414 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
415 1.1 mrg ldd -0x50(%r30), p160c2
416 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
417 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
418 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
419 1.1 mrg ldd -0x18(%r30), p224d1
420 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
421 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
422 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
423 1.1 mrg ldd -0x10(%r30), p224d2
424 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Count the second group; if it is the last one, skip the loop and wind down at
C L(end2). The final multiply is in the delay slot and runs on both paths.
425 1.1 mrg addib,= -1, n, L(end2)
426 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
427 1.1 mrg LDEF(loop)
C Main software-pipelined loop, one 4-limb group per pass. On entry the integer
C registers hold the mid products of the oldest group in flight, the stack
C holds that group's low/high products, and the FP registers hold the not yet
C stored products of the following group. Each pass
C   - finishes and stores the four result limbs of the oldest group,
C   - spills the following group's products to the stack, and
C   - loads and multiplies the four limbs after that.
C Every stack slot is read by ldd before an fstd writes the newer value to it.
C The add / add,dc chains are interleaved with loads, stores, deposits and
C extracts, so the code relies on those instructions leaving the carry alone.
C
C Sum the mid product pairs with carry propagation; m288 gets the final carry.
428 1.1 mrg add p032a1, p032a2, m032
429 1.1 mrg ldd -0x80(%r30), p000a
430 1.1 mrg add,dc p096b1, p096b2, m096
431 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
432 1.1 mrg
433 1.1 mrg add,dc p160c1, p160c2, m160
434 1.1 mrg ldd -0x68(%r30), p064a
435 1.1 mrg add,dc p224d1, p224d2, m224
436 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
437 1.1 mrg
438 1.1 mrg add,dc %r0, %r0, m288
439 1.1 mrg ldd -0x40(%r30), p064b
440 1.1 mrg ldo 32(up), up
441 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
442 1.1 mrg
C Realign the mid sums into ma000..ma256. Each ma word takes over the register
C of an m word that has just been read for the last time.
443 1.1 mrg depd,z m032, 31, 32, ma000
444 1.1 mrg ldd -0x28(%r30), p128b
445 1.1 mrg extrd,u m032, 31, 32, ma064
446 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
447 1.1 mrg
448 1.1 mrg depd m096, 31, 32, ma064
449 1.1 mrg ldd -0x60(%r30), p128c
450 1.1 mrg extrd,u m096, 31, 32, ma128
451 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
452 1.1 mrg
453 1.1 mrg depd m160, 31, 32, ma128
454 1.1 mrg ldd -0x48(%r30), p192c
455 1.1 mrg extrd,u m160, 31, 32, ma192
456 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
457 1.1 mrg
458 1.1 mrg depd m224, 31, 32, ma192
459 1.1 mrg ldd -0x20(%r30), p192d
460 1.1 mrg extrd,u m224, 31, 32, ma256
461 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
462 1.1 mrg
463 1.1 mrg depd m288, 31, 32, ma256
464 1.1 mrg ldd -0x88(%r30), p256d
C Add the incoming carry limb and the overlapping low/high products; the top
C high product plus carry becomes the new climb.
465 1.1 mrg add climb, p000a, s000
466 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
467 1.1 mrg
468 1.1 mrg add,dc p064a, p064b, s064
469 1.1 mrg add,dc p128b, p128c, s128
470 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
471 1.1 mrg
472 1.1 mrg add,dc p192c, p192d, s192
473 1.1 mrg add,dc p256d, %r0, climb
474 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
475 1.1 mrg
C Add the realigned mid sums, store the four result limbs, and start loading
C the next four source limbs.
476 1.1 mrg add ma000, s000, s000 C accum mid 0
477 1.1 mrg fldd 0(up), %fr4
478 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
479 1.1 mrg std s000, 0(rp)
480 1.1 mrg
481 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
482 1.1 mrg fldd 8(up), %fr5
483 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
484 1.1 mrg std s064, 8(rp)
485 1.1 mrg
486 1.1 mrg add,dc ma256, climb, climb
487 1.1 mrg fldd 16(up), %fr6
488 1.1 mrg std s128, 16(rp)
489 1.1 mrg
C Multiply the newly loaded limbs while reloading the mid products that the
C next pass will sum.
490 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
491 1.1 mrg ldd -0x78(%r30), p032a1
492 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
493 1.1 mrg fldd 24(up), %fr7
494 1.1 mrg
495 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
496 1.1 mrg ldd -0x70(%r30), p032a2
497 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
498 1.1 mrg std s192, 24(rp)
499 1.1 mrg
500 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
501 1.1 mrg ldd -0x38(%r30), p096b1
502 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
503 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
504 1.1 mrg
505 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
506 1.1 mrg ldd -0x30(%r30), p096b2
507 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
508 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
509 1.1 mrg
510 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
511 1.1 mrg ldd -0x58(%r30), p160c1
512 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
513 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
514 1.1 mrg
515 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
516 1.1 mrg ldd -0x50(%r30), p160c2
517 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
518 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
519 1.1 mrg
520 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
521 1.1 mrg ldd -0x18(%r30), p224d1
522 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
523 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
524 1.1 mrg
525 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
526 1.1 mrg ldd -0x10(%r30), p224d2
527 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
528 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
529 1.1 mrg
C Count the group just multiplied and repeat while groups remain. The delay
C slot advances rp past the four limbs stored in this pass.
530 1.1 mrg addib,<> -1, n, L(loop)
531 1.1 mrg ldo 32(rp), rp
532 1.1 mrg
533 1.1 mrg LDEF(end2)
C Wind-down with two groups in flight: same work as one pass of L(loop) but
C with no further loads from up and no new multiplications. It finishes and
C stores the older group while spilling the last group's products to the stack.
534 1.1 mrg add p032a1, p032a2, m032
535 1.1 mrg ldd -0x80(%r30), p000a
536 1.1 mrg add,dc p096b1, p096b2, m096
537 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
538 1.1 mrg add,dc p160c1, p160c2, m160
539 1.1 mrg ldd -0x68(%r30), p064a
540 1.1 mrg add,dc p224d1, p224d2, m224
541 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
542 1.1 mrg add,dc %r0, %r0, m288
543 1.1 mrg ldd -0x40(%r30), p064b
544 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
545 1.1 mrg depd,z m032, 31, 32, ma000
546 1.1 mrg ldd -0x28(%r30), p128b
547 1.1 mrg extrd,u m032, 31, 32, ma064
548 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
549 1.1 mrg depd m096, 31, 32, ma064
550 1.1 mrg ldd -0x60(%r30), p128c
551 1.1 mrg extrd,u m096, 31, 32, ma128
552 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
553 1.1 mrg depd m160, 31, 32, ma128
554 1.1 mrg ldd -0x48(%r30), p192c
555 1.1 mrg extrd,u m160, 31, 32, ma192
556 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
557 1.1 mrg depd m224, 31, 32, ma192
558 1.1 mrg ldd -0x20(%r30), p192d
559 1.1 mrg extrd,u m224, 31, 32, ma256
560 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
561 1.1 mrg depd m288, 31, 32, ma256
562 1.1 mrg ldd -0x88(%r30), p256d
563 1.1 mrg add climb, p000a, s000
564 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
565 1.1 mrg add,dc p064a, p064b, s064
566 1.1 mrg add,dc p128b, p128c, s128
567 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
568 1.1 mrg add,dc p192c, p192d, s192
569 1.1 mrg add,dc p256d, %r0, climb
570 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
571 1.1 mrg add ma000, s000, s000 C accum mid 0
572 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
573 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
574 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
575 1.1 mrg add,dc ma256, climb, climb
C Store the four result limbs and reload the last group's mid products; the
C p032a1.. names reuse registers whose low/high products were consumed above.
576 1.1 mrg std s000, 0(rp)
577 1.1 mrg std s064, 8(rp)
578 1.1 mrg ldd -0x78(%r30), p032a1
579 1.1 mrg std s128, 16(rp)
580 1.1 mrg ldd -0x70(%r30), p032a2
581 1.1 mrg std s192, 24(rp)
582 1.1 mrg ldd -0x38(%r30), p096b1
583 1.1 mrg ldd -0x30(%r30), p096b2
584 1.1 mrg ldd -0x58(%r30), p160c1
585 1.1 mrg ldd -0x50(%r30), p160c2
586 1.1 mrg ldd -0x18(%r30), p224d1
587 1.1 mrg ldd -0x10(%r30), p224d2
C Advance rp to the last group and fall through into L(end1).
588 1.1 mrg ldo 32(rp), rp
589 1.1 mrg
590 1.1 mrg LDEF(end1)
C Wind-down for the final group: its mid products are already in registers and
C its low/high products are on the stack. Sum the mid pairs, realign them, add
C the low/high products and the incoming climb, and store the last four limbs.
C No stack slot is rewritten here, so only loads are interleaved.
591 1.1 mrg add p032a1, p032a2, m032
592 1.1 mrg ldd -0x80(%r30), p000a
593 1.1 mrg add,dc p096b1, p096b2, m096
594 1.1 mrg add,dc p160c1, p160c2, m160
595 1.1 mrg ldd -0x68(%r30), p064a
596 1.1 mrg add,dc p224d1, p224d2, m224
597 1.1 mrg add,dc %r0, %r0, m288
598 1.1 mrg ldd -0x40(%r30), p064b
599 1.1 mrg depd,z m032, 31, 32, ma000
600 1.1 mrg ldd -0x28(%r30), p128b
601 1.1 mrg extrd,u m032, 31, 32, ma064
602 1.1 mrg depd m096, 31, 32, ma064
603 1.1 mrg ldd -0x60(%r30), p128c
604 1.1 mrg extrd,u m096, 31, 32, ma128
605 1.1 mrg depd m160, 31, 32, ma128
606 1.1 mrg ldd -0x48(%r30), p192c
607 1.1 mrg extrd,u m160, 31, 32, ma192
608 1.1 mrg depd m224, 31, 32, ma192
609 1.1 mrg ldd -0x20(%r30), p192d
610 1.1 mrg extrd,u m224, 31, 32, ma256
611 1.1 mrg depd m288, 31, 32, ma256
612 1.1 mrg ldd -0x88(%r30), p256d
613 1.1 mrg add climb, p000a, s000
614 1.1 mrg add,dc p064a, p064b, s064
615 1.1 mrg add,dc p128b, p128c, s128
616 1.1 mrg add,dc p192c, p192d, s192
617 1.1 mrg add,dc p256d, %r0, climb
618 1.1 mrg add ma000, s000, s000 C accum mid 0
619 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
620 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
621 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
622 1.1 mrg add,dc ma256, climb, climb
623 1.1 mrg std s000, 0(rp)
624 1.1 mrg std s064, 8(rp)
625 1.1 mrg std s128, 16(rp)
626 1.1 mrg std s192, 24(rp)
627 1.1 mrg
C Restore the registers saved at L(BIG). The feed-in-only exit jumps directly
C to L(done) and skips these loads, since it never saved %r6..%r13.
628 1.1 mrg ldd -0xb0(%r30), %r13
629 1.1 mrg ldd -0xb8(%r30), %r12
630 1.1 mrg ldd -0xc0(%r30), %r11
631 1.1 mrg ldd -0xc8(%r30), %r10
632 1.1 mrg ldd -0xd0(%r30), %r9
633 1.1 mrg ldd -0xd8(%r30), %r8
634 1.1 mrg ldd -0xe0(%r30), %r7
635 1.1 mrg ldd -0xe8(%r30), %r6
636 1.1 mrg LDEF(done)
C Common exit. Deliver the final carry limb: under 2.0w it is copied whole to
C %r28; otherwise it is split, with the low 32 bits going to %r29 and the high
C 32 bits to %r28.
637 1.1 mrg ifdef(`HAVE_ABI_2_0w',
638 1.1 mrg ` copy climb, %r28
639 1.1 mrg ',` extrd,u climb, 63, 32, %r29
640 1.1 mrg extrd,u climb, 31, 32, %r28
641 1.1 mrg ')
C Restore the registers saved in the entry code and return through %r2. The
C delay-slot ldd,mb moves %r30 back down by 0x100 and reloads %r3 from there,
C undoing the std,ma at entry.
642 1.1 mrg ldd -0xf0(%r30), %r5
643 1.1 mrg ldd -0xf8(%r30), %r4
644 1.1 mrg bve (%r2)
645 1.1 mrg ldd,mb -0x100(%r30), %r3
646 1.1 mrg EPILOGUE(mpn_mul_1)
647