submul_1.asm revision 1.1.1.2 1 1.1 mrg dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 1.1 mrg dnl subtract the result from a second limb vector.
3 1.1 mrg
4 1.1.1.2 mrg dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg dnl This file is part of the GNU MP Library.
7 1.1.1.2 mrg dnl
8 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1.1.2 mrg dnl it under the terms of either:
10 1.1.1.2 mrg dnl
11 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free
12 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your
13 1.1.1.2 mrg dnl option) any later version.
14 1.1.1.2 mrg dnl
15 1.1.1.2 mrg dnl or
16 1.1.1.2 mrg dnl
17 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software
18 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any
19 1.1.1.2 mrg dnl later version.
20 1.1.1.2 mrg dnl
21 1.1.1.2 mrg dnl or both in parallel, as here.
22 1.1.1.2 mrg dnl
23 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 1.1.1.2 mrg dnl for more details.
27 1.1.1.2 mrg dnl
28 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the
29 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/.
31 1.1 mrg
32 1.1 mrg include(`../config.m4')
33 1.1 mrg
34 1.1 mrg C cycles/limb
35 1.1 mrg C 8000,8200: 7
36 1.1 mrg C 8500,8600,8700: 6.5
37 1.1 mrg
38 1.1 mrg C The feed-in and wind-down code has not yet been scheduled. Many cycles
39 1.1 mrg C could be saved there per call.
40 1.1 mrg
41 1.1 mrg C DESCRIPTION:
42 1.1 mrg C The main loop "BIG" is 4-way unrolled, mainly to allow
43 1.1 mrg C effective use of ADD,DC. Delays in moving data via the cache from the FP
44 1.1 mrg C registers to the IU registers, have demanded a deep software pipeline, and
45 1.1 mrg C a lot of stack slots for partial products in flight.
46 1.1 mrg C
47 1.1 mrg C CODE STRUCTURE:
48 1.1 mrg C save-some-registers
49 1.1 mrg C do 0, 1, 2, or 3 limbs
50 1.1 mrg C if done, restore-some-regs and return
51 1.1 mrg C save-many-regs
52 1.1 mrg C do 4, 8, ... limbs
53 1.1 mrg C restore-all-regs
54 1.1 mrg
55 1.1 mrg C STACK LAYOUT:
56 1.1 mrg C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
57 1.1 mrg C slots marked FREE, as well as some slots in the caller's "frame marker".
58 1.1 mrg C
59 1.1 mrg C -00 <- r30
60 1.1 mrg C -08 FREE
61 1.1 mrg C -10 tmp
62 1.1 mrg C -18 tmp
63 1.1 mrg C -20 tmp
64 1.1 mrg C -28 tmp
65 1.1 mrg C -30 tmp
66 1.1 mrg C -38 tmp
67 1.1 mrg C -40 tmp
68 1.1 mrg C -48 tmp
69 1.1 mrg C -50 tmp
70 1.1 mrg C -58 tmp
71 1.1 mrg C -60 tmp
72 1.1 mrg C -68 tmp
73 1.1 mrg C -70 tmp
74 1.1 mrg C -78 tmp
75 1.1 mrg C -80 tmp
76 1.1 mrg C -88 tmp
77 1.1 mrg C -90 FREE
78 1.1 mrg C -98 FREE
79 1.1 mrg C -a0 FREE
80 1.1 mrg C -a8 FREE
81 1.1 mrg C -b0 r13
82 1.1 mrg C -b8 r12
83 1.1 mrg C -c0 r11
84 1.1 mrg C -c8 r10
85 1.1 mrg C -d0 r9
86 1.1 mrg C -d8 r8
87 1.1 mrg C -e0 r7
88 1.1 mrg C -e8 r6
89 1.1 mrg C -f0 r5
90 1.1 mrg C -f8 r4
91 1.1 mrg C -100 r3
92 1.1 mrg C Previous frame:
93 1.1 mrg C [unused area]
94 1.1 mrg C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
95 1.1 mrg
96 1.1 mrg
97 1.1 mrg include(`../config.m4')
98 1.1 mrg
99 1.1 mrg C INPUT PARAMETERS:
100 1.1 mrg define(`rp',`%r26') C pointer to the limb vector that is loaded, subtracted from and stored back
101 1.1 mrg define(`up',`%r25') C pointer to the limb vector that is multiplied by vlimb
102 1.1 mrg define(`n',`%r24') C limb count
103 1.1 mrg define(`vlimb',`%r23') C the single multiplier limb
104 1.1 mrg
105 1.1 mrg define(`climb',`%r23') C carry limb; same register as vlimb, which is only used via memory and %fr8
106 1.1 mrg
107 1.1 mrg ifdef(`HAVE_ABI_2_0w',
108 1.1 mrg ` .level 2.0w
109 1.1 mrg ',` .level 2.0
110 1.1 mrg ')
111 1.1 mrg PROLOGUE(mpn_submul_1)
112 1.1 mrg
113 1.1 mrg ifdef(`HAVE_ABI_2_0w',
114 1.1 mrg ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
115 1.1 mrg ')
116 1.1 mrg std,ma %r3, 0x100(%r30) C save r3 at the old r30 and advance r30 by 0x100
117 1.1 mrg std %r4, -0xf8(%r30) C r4 and r5 are used by the 0..3 limb code below
118 1.1 mrg std %r5, -0xf0(%r30)
119 1.1 mrg ldo 0(%r0), climb C clear climb
120 1.1 mrg fldd -0x138(%r30), %fr8 C put vlimb in fp register
121 1.1 mrg
122 1.1 mrg define(`p032a1',`%r1') C mid partial product read back from slot -0x78
123 1.1 mrg define(`p032a2',`%r19') C mid partial product read back from slot -0x70
124 1.1 mrg
125 1.1 mrg define(`m032',`%r20') C sum of the two mid partial products
126 1.1 mrg define(`m096',`%r21') C carry out of that sum
127 1.1 mrg
128 1.1 mrg define(`p000a',`%r22') C low partial product read back from slot -0x80
129 1.1 mrg define(`p064a',`%r29') C high partial product read back from slot -0x68
130 1.1 mrg
131 1.1 mrg define(`s000',`%r31') C result limb being accumulated
132 1.1 mrg
133 1.1 mrg define(`ma000',`%r4') C low 32 bits of m032 moved to the upper half
134 1.1 mrg define(`ma064',`%r20') C upper 32 bits of m032 plus the m096 carry; same register as m032
135 1.1 mrg
136 1.1 mrg define(`r000',`%r3') C limb loaded from rp
137 1.1 mrg
138 1.1 mrg extrd,u n, 63, 2, %r5 C r5 = low two bits of n (n mod 4)
139 1.1 mrg cmpb,= %r5, %r0, L(BIG) C no leftover limbs: go straight to the unrolled code
140 1.1 mrg nop
141 1.1 mrg
142 1.1 mrg fldd 0(up), %fr4 C load the first up limb
143 1.1 mrg ldo 8(up), up C advance up by one limb
144 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
145 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
146 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
147 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
148 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
149 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
150 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
151 1.1 mrg addib,<> -1, %r5, L(two_or_more) C decrement leftover count; branch if limbs remain
152 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
153 1.1 mrg LDEF(one)
154 1.1 mrg ldd -0x78(%r30), p032a1 C move the four partial products to integer registers
155 1.1 mrg ldd -0x70(%r30), p032a2
156 1.1 mrg ldd -0x80(%r30), p000a
157 1.1 mrg b L(0_one_out)
158 1.1 mrg ldd -0x68(%r30), p064a C (delay slot)
159 1.1 mrg
160 1.1 mrg LDEF(two_or_more)
161 1.1 mrg fldd 0(up), %fr4 C load the second up limb
162 1.1 mrg ldo 8(up), up C advance up by one limb
163 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
164 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
165 1.1 mrg ldd -0x78(%r30), p032a1 C read the previous limb's product before its slot is overwritten
166 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
167 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
168 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
169 1.1 mrg ldd -0x70(%r30), p032a2
170 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
171 1.1 mrg ldd -0x80(%r30), p000a
172 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
173 1.1 mrg ldd -0x68(%r30), p064a
174 1.1 mrg addib,<> -1, %r5, L(three_or_more) C decrement leftover count; branch if a third limb remains
175 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
176 1.1 mrg LDEF(two)
177 1.1 mrg add p032a1, p032a2, m032 C sum the two mid products of the first limb
178 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
179 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
180 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
181 1.1 mrg ldd 0(rp), r000 C load the rp limb to subtract from
182 1.1 mrg b L(0_two_out)
183 1.1 mrg depd m096, 31, 32, ma064 C (delay slot) put the carry into the upper half of ma064
184 1.1 mrg
185 1.1 mrg LDEF(three_or_more)
186 1.1 mrg fldd 0(up), %fr4 C load the third up limb
187 1.1 mrg add p032a1, p032a2, m032 C sum the two mid products of the first limb
188 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
189 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
190 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
191 1.1 mrg ldd 0(rp), r000 C load the rp limb to subtract from
192 1.1 mrg C addib,= -1, %r5, L(0_out)
193 1.1 mrg depd m096, 31, 32, ma064 C put the carry into the upper half of ma064
194 1.1 mrg LDEF(loop0)
195 1.1 mrg C xmpyu %fr8R, %fr4L, %fr22
196 1.1 mrg C xmpyu %fr8L, %fr4R, %fr23
197 1.1 mrg C ldd -0x78(%r30), p032a1
198 1.1 mrg C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
199 1.1 mrg C
200 1.1 mrg C xmpyu %fr8R, %fr4R, %fr24
201 1.1 mrg C xmpyu %fr8L, %fr4L, %fr25
202 1.1 mrg C ldd -0x70(%r30), p032a2
203 1.1 mrg C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
204 1.1 mrg C
205 1.1 mrg C ldo 8(rp), rp
206 1.1 mrg C add climb, p000a, s000
207 1.1 mrg C ldd -0x80(%r30), p000a
208 1.1 mrg C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
209 1.1 mrg C
210 1.1 mrg C add,dc p064a, %r0, climb
211 1.1 mrg C ldo 8(up), up
212 1.1 mrg C ldd -0x68(%r30), p064a
213 1.1 mrg C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
214 1.1 mrg C
215 1.1 mrg C add ma000, s000, s000
216 1.1 mrg C add,dc ma064, climb, climb
217 1.1 mrg C fldd 0(up), %fr4
218 1.1 mrg C
219 1.1 mrg C sub r000, s000, s000
220 1.1 mrg C sub,db %r0, climb, climb
221 1.1 mrg C sub %r0, climb, climb
222 1.1 mrg C std s000, -8(rp)
223 1.1 mrg C
224 1.1 mrg C add p032a1, p032a2, m032
225 1.1 mrg C add,dc %r0, %r0, m096
226 1.1 mrg C
227 1.1 mrg C depd,z m032, 31, 32, ma000
228 1.1 mrg C extrd,u m032, 31, 32, ma064
229 1.1 mrg C ldd 0(rp), r000
230 1.1 mrg C addib,<> -1, %r5, L(loop0)
231 1.1 mrg C depd m096, 31, 32, ma064
232 1.1 mrg LDEF(0_out)
233 1.1 mrg ldo 8(up), up C advance up past the third limb
234 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
235 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
236 1.1 mrg ldd -0x78(%r30), p032a1 C read the second limb's products before the slots are overwritten
237 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
238 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
239 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
240 1.1 mrg ldd -0x70(%r30), p032a2
241 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
242 1.1 mrg ldo 8(rp), rp C advance rp; the finished limb is stored at -8(rp) below
243 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product of the first limb
244 1.1 mrg ldd -0x80(%r30), p000a
245 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
246 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
247 1.1 mrg ldd -0x68(%r30), p064a
248 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
249 1.1 mrg add ma000, s000, s000 C add in the shifted mid-product sum
250 1.1 mrg add,dc ma064, climb, climb
251 1.1 mrg sub r000, s000, s000 C s000 = rp limb - product limb
252 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
253 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
254 1.1 mrg std s000, -8(rp) C store the first result limb
255 1.1 mrg add p032a1, p032a2, m032 C mid-product sum for the second limb
256 1.1 mrg add,dc %r0, %r0, m096
257 1.1 mrg depd,z m032, 31, 32, ma000
258 1.1 mrg extrd,u m032, 31, 32, ma064
259 1.1 mrg ldd 0(rp), r000
260 1.1 mrg depd m096, 31, 32, ma064
261 1.1 mrg LDEF(0_two_out)
262 1.1 mrg ldd -0x78(%r30), p032a1 C fetch the mid products of the last leftover limb
263 1.1 mrg ldd -0x70(%r30), p032a2
264 1.1 mrg ldo 8(rp), rp C advance rp; the finished limb is stored at -8(rp) below
265 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product
266 1.1 mrg ldd -0x80(%r30), p000a
267 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
268 1.1 mrg ldd -0x68(%r30), p064a
269 1.1 mrg add ma000, s000, s000 C add in the shifted mid-product sum
270 1.1 mrg add,dc ma064, climb, climb
271 1.1 mrg sub r000, s000, s000 C s000 = rp limb - product limb
272 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
273 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
274 1.1 mrg std s000, -8(rp)
275 1.1 mrg LDEF(0_one_out)
276 1.1 mrg add p032a1, p032a2, m032 C sum the two mid products of the last leftover limb
277 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
278 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
279 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
280 1.1 mrg ldd 0(rp), r000 C load the rp limb to subtract from
281 1.1 mrg depd m096, 31, 32, ma064 C put the carry into the upper half of ma064
282 1.1 mrg
283 1.1 mrg add climb, p000a, s000 C s000 = carry limb + low product
284 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
285 1.1 mrg add ma000, s000, s000 C add in the shifted mid-product sum
286 1.1 mrg add,dc ma064, climb, climb
287 1.1 mrg sub r000, s000, s000 C s000 = rp limb - product limb
288 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
289 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
290 1.1 mrg std s000, 0(rp) C store the last leftover result limb
291 1.1 mrg
292 1.1 mrg cmpib,>= 4, n, L(done) C nothing left for the unrolled code when 4 >= n
293 1.1 mrg ldo 8(rp), rp C (delay slot) step rp past the limb just stored
294 1.1 mrg
295 1.1 mrg C 4-way unrolled code.
296 1.1 mrg
297 1.1 mrg LDEF(BIG)
298 1.1 mrg
299 1.1 mrg define(`p032a1',`%r1') C mid partial products, two per limb (limbs a, b, c, d)
300 1.1 mrg define(`p032a2',`%r19') C
301 1.1 mrg define(`p096b1',`%r20') C
302 1.1 mrg define(`p096b2',`%r21') C
303 1.1 mrg define(`p160c1',`%r22') C
304 1.1 mrg define(`p160c2',`%r29') C
305 1.1 mrg define(`p224d1',`%r31') C
306 1.1 mrg define(`p224d2',`%r3') C
307 1.1 mrg C
308 1.1 mrg define(`m032',`%r4') C sums of the mid partial products, linked by a carry chain
309 1.1 mrg define(`m096',`%r5') C
310 1.1 mrg define(`m160',`%r6') C
311 1.1 mrg define(`m224',`%r7') C
312 1.1 mrg define(`m288',`%r8') C carry out of the mid-product chain
313 1.1 mrg C
314 1.1 mrg define(`p000a',`%r1') C low and high partial products; these reuse the registers of the mid products
315 1.1 mrg define(`p064a',`%r19') C
316 1.1 mrg define(`p064b',`%r20') C
317 1.1 mrg define(`p128b',`%r21') C
318 1.1 mrg define(`p128c',`%r22') C
319 1.1 mrg define(`p192c',`%r29') C
320 1.1 mrg define(`p192d',`%r31') C
321 1.1 mrg define(`p256d',`%r3') C
322 1.1 mrg C
323 1.1 mrg define(`s000',`%r10') C the four result limbs being accumulated
324 1.1 mrg define(`s064',`%r11') C
325 1.1 mrg define(`s128',`%r12') C
326 1.1 mrg define(`s192',`%r13') C
327 1.1 mrg C
328 1.1 mrg define(`ma000',`%r9') C mid-product sums realigned by 32 bits; ma064..ma256 overwrite m032..m224 in place
329 1.1 mrg define(`ma064',`%r4') C
330 1.1 mrg define(`ma128',`%r5') C
331 1.1 mrg define(`ma192',`%r6') C
332 1.1 mrg define(`ma256',`%r7') C
333 1.1 mrg C
334 1.1 mrg define(`r000',`%r1') C limbs loaded from rp; these reuse %r1 and %r19..%r21
335 1.1 mrg define(`r064',`%r19') C
336 1.1 mrg define(`r128',`%r20') C
337 1.1 mrg define(`r192',`%r21') C
338 1.1 mrg
339 1.1 mrg std %r6, -0xe8(%r30) C save r6..r13, which the unrolled code uses
340 1.1 mrg std %r7, -0xe0(%r30)
341 1.1 mrg std %r8, -0xd8(%r30)
342 1.1 mrg std %r9, -0xd0(%r30)
343 1.1 mrg std %r10, -0xc8(%r30)
344 1.1 mrg std %r11, -0xc0(%r30)
345 1.1 mrg std %r12, -0xb8(%r30)
346 1.1 mrg std %r13, -0xb0(%r30)
347 1.1 mrg
348 1.1 mrg ifdef(`HAVE_ABI_2_0w',
349 1.1 mrg ` extrd,u n, 61, 62, n C right shift 2
350 1.1 mrg ',` extrd,u n, 61, 30, n C right shift 2, zero extend
351 1.1 mrg ')
352 1.1 mrg
353 1.1 mrg LDEF(4_or_more)
354 1.1 mrg fldd 0(up), %fr4 C load four up limbs
355 1.1 mrg fldd 8(up), %fr5
356 1.1 mrg fldd 16(up), %fr6
357 1.1 mrg fldd 24(up), %fr7
358 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
359 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
360 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
361 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
362 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
363 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
364 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
365 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
366 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
367 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
368 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
369 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
370 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
371 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
372 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
373 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
374 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
375 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
376 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
377 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
378 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
379 1.1 mrg addib,<> -1, n, L(8_or_more) C decrement group count; branch if more 4-limb groups remain
380 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C (delay slot)
381 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
382 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
383 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
384 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
385 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
386 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
387 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
388 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
389 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
390 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
391 1.1 mrg ldd -0x78(%r30), p032a1 C read the eight mid products into integer registers
392 1.1 mrg ldd -0x70(%r30), p032a2
393 1.1 mrg ldd -0x38(%r30), p096b1
394 1.1 mrg ldd -0x30(%r30), p096b2
395 1.1 mrg ldd -0x58(%r30), p160c1
396 1.1 mrg ldd -0x50(%r30), p160c2
397 1.1 mrg ldd -0x18(%r30), p224d1
398 1.1 mrg ldd -0x10(%r30), p224d2
399 1.1 mrg b L(end1) C single group: go to the final wind-down
400 1.1 mrg nop
401 1.1 mrg
402 1.1 mrg LDEF(8_or_more)
403 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
404 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
405 1.1 mrg ldo 32(up), up C advance up by four limbs
406 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
407 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
408 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
409 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
410 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
411 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
412 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
413 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
414 1.1 mrg fldd 0(up), %fr4 C load the next four up limbs
415 1.1 mrg fldd 8(up), %fr5
416 1.1 mrg fldd 16(up), %fr6
417 1.1 mrg fldd 24(up), %fr7
418 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
419 1.1 mrg ldd -0x78(%r30), p032a1 C each ldd reads a previous-group mid product before its slot is overwritten
420 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
421 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
422 1.1 mrg ldd -0x70(%r30), p032a2
423 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
424 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
425 1.1 mrg ldd -0x38(%r30), p096b1
426 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
427 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
428 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
429 1.1 mrg ldd -0x30(%r30), p096b2
430 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
431 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
432 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
433 1.1 mrg ldd -0x58(%r30), p160c1
434 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
435 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
436 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
437 1.1 mrg ldd -0x50(%r30), p160c2
438 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
439 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
440 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
441 1.1 mrg ldd -0x18(%r30), p224d1
442 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
443 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
444 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
445 1.1 mrg ldd -0x10(%r30), p224d2
446 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
447 1.1 mrg addib,= -1, n, L(end2) C decrement group count; skip the loop when none remain
448 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C (delay slot)
449 1.1 mrg LDEF(loop)
450 1.1 mrg add p032a1, p032a2, m032 C start the carry chain over the four mid-product sums
451 1.1 mrg ldd -0x80(%r30), p000a
452 1.1 mrg add,dc p096b1, p096b2, m096
453 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
454 1.1 mrg
455 1.1 mrg add,dc p160c1, p160c2, m160
456 1.1 mrg ldd -0x68(%r30), p064a
457 1.1 mrg add,dc p224d1, p224d2, m224
458 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
459 1.1 mrg
460 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid-product chain
461 1.1 mrg ldd -0x40(%r30), p064b
462 1.1 mrg ldo 32(up), up C advance up by four limbs
463 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
464 1.1 mrg
465 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
466 1.1 mrg ldd -0x28(%r30), p128b
467 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 ...
468 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
469 1.1 mrg
470 1.1 mrg depd m096, 31, 32, ma064 C ... with the low half of m096 on top; same pattern for ma128..ma256
471 1.1 mrg ldd -0x60(%r30), p128c
472 1.1 mrg extrd,u m096, 31, 32, ma128
473 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
474 1.1 mrg
475 1.1 mrg depd m160, 31, 32, ma128
476 1.1 mrg ldd -0x48(%r30), p192c
477 1.1 mrg extrd,u m160, 31, 32, ma192
478 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
479 1.1 mrg
480 1.1 mrg depd m224, 31, 32, ma192
481 1.1 mrg ldd -0x20(%r30), p192d
482 1.1 mrg extrd,u m224, 31, 32, ma256
483 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
484 1.1 mrg
485 1.1 mrg depd m288, 31, 32, ma256
486 1.1 mrg ldd -0x88(%r30), p256d
487 1.1 mrg add climb, p000a, s000 C carry limb + low product of limb a starts the low/high chain
488 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
489 1.1 mrg
490 1.1 mrg add,dc p064a, p064b, s064
491 1.1 mrg ldd 0(rp), r000
492 1.1 mrg add,dc p128b, p128c, s128
493 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
494 1.1 mrg
495 1.1 mrg add,dc p192c, p192d, s192
496 1.1 mrg ldd 8(rp), r064
497 1.1 mrg add,dc p256d, %r0, climb C climb = top high product + carry
498 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
499 1.1 mrg
500 1.1 mrg ldd 16(rp), r128
501 1.1 mrg add ma000, s000, s000 C accum mid 0
502 1.1 mrg ldd 24(rp), r192
503 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
504 1.1 mrg
505 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
506 1.1 mrg fldd 0(up), %fr4 C begin loading the next four up limbs
507 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
508 1.1 mrg fldd 8(up), %fr5
509 1.1 mrg
510 1.1 mrg add,dc ma256, climb, climb C top of the mid-product chain goes into the carry limb
511 1.1 mrg fldd 16(up), %fr6
512 1.1 mrg sub r000, s000, s000 C accum rlimb 0
513 1.1 mrg fldd 24(up), %fr7
514 1.1 mrg
515 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
516 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
517 1.1 mrg std s000, 0(rp)
518 1.1 mrg
519 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
520 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
521 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
522 1.1 mrg std s064, 8(rp)
523 1.1 mrg
524 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
525 1.1 mrg ldd -0x78(%r30), p032a1
526 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
527 1.1 mrg std s128, 16(rp)
528 1.1 mrg
529 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
530 1.1 mrg ldd -0x70(%r30), p032a2
531 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
532 1.1 mrg std s192, 24(rp)
533 1.1 mrg
534 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
535 1.1 mrg ldd -0x38(%r30), p096b1
536 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
537 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
538 1.1 mrg
539 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
540 1.1 mrg ldd -0x30(%r30), p096b2
541 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
542 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
543 1.1 mrg
544 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
545 1.1 mrg ldd -0x58(%r30), p160c1
546 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
547 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
548 1.1 mrg
549 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
550 1.1 mrg ldd -0x50(%r30), p160c2
551 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
552 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
553 1.1 mrg
554 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
555 1.1 mrg ldd -0x18(%r30), p224d1
556 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
557 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
558 1.1 mrg
559 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
560 1.1 mrg ldd -0x10(%r30), p224d2
561 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
562 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
563 1.1 mrg
564 1.1 mrg addib,<> -1, n, L(loop) C decrement group count; loop while groups remain
565 1.1 mrg ldo 32(rp), rp C (delay slot) advance rp by four limbs
566 1.1 mrg
567 1.1 mrg LDEF(end2)
568 1.1 mrg add p032a1, p032a2, m032 C start the carry chain over the four mid-product sums
569 1.1 mrg ldd -0x80(%r30), p000a
570 1.1 mrg add,dc p096b1, p096b2, m096
571 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
572 1.1 mrg add,dc p160c1, p160c2, m160
573 1.1 mrg ldd -0x68(%r30), p064a
574 1.1 mrg add,dc p224d1, p224d2, m224
575 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
576 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid-product chain
577 1.1 mrg ldd -0x40(%r30), p064b
578 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
579 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
580 1.1 mrg ldd -0x28(%r30), p128b
581 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 ...
582 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
583 1.1 mrg depd m096, 31, 32, ma064 C ... with the low half of m096 on top; same pattern for ma128..ma256
584 1.1 mrg ldd -0x60(%r30), p128c
585 1.1 mrg extrd,u m096, 31, 32, ma128
586 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
587 1.1 mrg depd m160, 31, 32, ma128
588 1.1 mrg ldd -0x48(%r30), p192c
589 1.1 mrg extrd,u m160, 31, 32, ma192
590 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
591 1.1 mrg depd m224, 31, 32, ma192
592 1.1 mrg ldd -0x20(%r30), p192d
593 1.1 mrg extrd,u m224, 31, 32, ma256
594 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
595 1.1 mrg depd m288, 31, 32, ma256
596 1.1 mrg ldd -0x88(%r30), p256d
597 1.1 mrg add climb, p000a, s000 C carry limb + low product of limb a starts the low/high chain
598 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
599 1.1 mrg add,dc p064a, p064b, s064
600 1.1 mrg ldd 0(rp), r000
601 1.1 mrg add,dc p128b, p128c, s128
602 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
603 1.1 mrg add,dc p192c, p192d, s192
604 1.1 mrg ldd 8(rp), r064
605 1.1 mrg add,dc p256d, %r0, climb C climb = top high product + carry
606 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
607 1.1 mrg ldd 16(rp), r128
608 1.1 mrg add ma000, s000, s000 C accum mid 0
609 1.1 mrg ldd 24(rp), r192
610 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
611 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
612 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
613 1.1 mrg add,dc ma256, climb, climb C top of the mid-product chain goes into the carry limb
614 1.1 mrg sub r000, s000, s000 C accum rlimb 0
615 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
616 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
617 1.1 mrg std s000, 0(rp)
618 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
619 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
620 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
621 1.1 mrg std s064, 8(rp)
622 1.1 mrg ldd -0x78(%r30), p032a1 C read the last group's mid products for end1
623 1.1 mrg std s128, 16(rp)
624 1.1 mrg ldd -0x70(%r30), p032a2
625 1.1 mrg std s192, 24(rp)
626 1.1 mrg ldd -0x38(%r30), p096b1
627 1.1 mrg ldd -0x30(%r30), p096b2
628 1.1 mrg ldd -0x58(%r30), p160c1
629 1.1 mrg ldd -0x50(%r30), p160c2
630 1.1 mrg ldd -0x18(%r30), p224d1
631 1.1 mrg ldd -0x10(%r30), p224d2
632 1.1 mrg ldo 32(rp), rp C advance rp by four limbs
633 1.1 mrg
634 1.1 mrg LDEF(end1)
635 1.1 mrg add p032a1, p032a2, m032 C start the carry chain over the four mid-product sums
636 1.1 mrg ldd -0x80(%r30), p000a
637 1.1 mrg add,dc p096b1, p096b2, m096
638 1.1 mrg add,dc p160c1, p160c2, m160
639 1.1 mrg ldd -0x68(%r30), p064a
640 1.1 mrg add,dc p224d1, p224d2, m224
641 1.1 mrg add,dc %r0, %r0, m288 C m288 = carry out of the mid-product chain
642 1.1 mrg ldd -0x40(%r30), p064b
643 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper half, rest zero
644 1.1 mrg ldd -0x28(%r30), p128b
645 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 ...
646 1.1 mrg depd m096, 31, 32, ma064 C ... with the low half of m096 on top; same pattern for ma128..ma256
647 1.1 mrg ldd -0x60(%r30), p128c
648 1.1 mrg extrd,u m096, 31, 32, ma128
649 1.1 mrg depd m160, 31, 32, ma128
650 1.1 mrg ldd -0x48(%r30), p192c
651 1.1 mrg extrd,u m160, 31, 32, ma192
652 1.1 mrg depd m224, 31, 32, ma192
653 1.1 mrg ldd -0x20(%r30), p192d
654 1.1 mrg extrd,u m224, 31, 32, ma256
655 1.1 mrg depd m288, 31, 32, ma256
656 1.1 mrg ldd -0x88(%r30), p256d
657 1.1 mrg add climb, p000a, s000 C carry limb + low product of limb a starts the low/high chain
658 1.1 mrg add,dc p064a, p064b, s064
659 1.1 mrg ldd 0(rp), r000
660 1.1 mrg add,dc p128b, p128c, s128
661 1.1 mrg add,dc p192c, p192d, s192
662 1.1 mrg ldd 8(rp), r064
663 1.1 mrg add,dc p256d, %r0, climb C climb = top high product + carry
664 1.1 mrg ldd 16(rp), r128
665 1.1 mrg add ma000, s000, s000 C accum mid 0
666 1.1 mrg ldd 24(rp), r192
667 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
668 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
669 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
670 1.1 mrg add,dc ma256, climb, climb C top of the mid-product chain goes into the carry limb
671 1.1 mrg sub r000, s000, s000 C accum rlimb 0
672 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
673 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
674 1.1 mrg std s000, 0(rp)
675 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
676 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
677 1.1 mrg sub %r0, climb, climb C negate back, so climb now includes the borrow
678 1.1 mrg std s064, 8(rp)
679 1.1 mrg std s128, 16(rp)
680 1.1 mrg std s192, 24(rp)
681 1.1 mrg
682 1.1 mrg ldd -0xb0(%r30), %r13 C restore r6..r13 saved at BIG
683 1.1 mrg ldd -0xb8(%r30), %r12
684 1.1 mrg ldd -0xc0(%r30), %r11
685 1.1 mrg ldd -0xc8(%r30), %r10
686 1.1 mrg ldd -0xd0(%r30), %r9
687 1.1 mrg ldd -0xd8(%r30), %r8
688 1.1 mrg ldd -0xe0(%r30), %r7
689 1.1 mrg ldd -0xe8(%r30), %r6
690 1.1 mrg LDEF(done)
691 1.1 mrg ifdef(`HAVE_ABI_2_0w',
692 1.1 mrg ` copy climb, %r28
693 1.1 mrg ',` extrd,u climb, 63, 32, %r29
694 1.1 mrg extrd,u climb, 31, 32, %r28
695 1.1 mrg ')
696 1.1 mrg ldd -0xf0(%r30), %r5 C restore r5 and r4 saved in the prologue
697 1.1 mrg ldd -0xf8(%r30), %r4
698 1.1 mrg bve (%r2) C branch to the address in r2
699 1.1 mrg ldd,mb -0x100(%r30), %r3 C (delay slot) move r30 back by 0x100 and reload r3 from there
700 1.1 mrg EPILOGUE(mpn_submul_1)
701