submul_1.asm revision 1.1 1 1.1 mrg dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 1.1 mrg dnl subtract the result from a second limb vector.
3 1.1 mrg
4 1.1 mrg dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg dnl This file is part of the GNU MP Library.
7 1.1 mrg
8 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
10 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
11 1.1 mrg dnl your option) any later version.
12 1.1 mrg
13 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 1.1 mrg dnl License for more details.
17 1.1 mrg
18 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
19 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 1.1 mrg
21 1.1 mrg include(`../config.m4')
22 1.1 mrg
23 1.1 mrg C cycles/limb
24 1.1 mrg C 8000,8200: 7
25 1.1 mrg C 8500,8600,8700: 6.5
26 1.1 mrg
27 1.1 mrg C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 1.1 mrg C could be saved there per call.
29 1.1 mrg
30 1.1 mrg C DESCRIPTION:
31 1.1 mrg C The main loop "BIG" is 4-way unrolled, mainly to allow
32 1.1 mrg C effective use of ADD,DC. Delays in moving data via the cache from the FP
33 1.1 mrg C registers to the IU registers have demanded a deep software pipeline and
34 1.1 mrg C a lot of stack slots for partial products in flight.
35 1.1 mrg C
36 1.1 mrg C CODE STRUCTURE:
37 1.1 mrg C save-some-registers
38 1.1 mrg C do 0, 1, 2, or 3 limbs
39 1.1 mrg C if done, restore-some-regs and return
40 1.1 mrg C save-many-regs
41 1.1 mrg C do 4, 8, ... limbs
42 1.1 mrg C restore-all-regs
43 1.1 mrg
44 1.1 mrg C STACK LAYOUT:
45 1.1 mrg C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 1.1 mrg C slots marked FREE, as well as some slots in the caller's "frame marker".
47 1.1 mrg C
48 1.1 mrg C -00 <- r30
49 1.1 mrg C -08 FREE
50 1.1 mrg C -10 tmp
51 1.1 mrg C -18 tmp
52 1.1 mrg C -20 tmp
53 1.1 mrg C -28 tmp
54 1.1 mrg C -30 tmp
55 1.1 mrg C -38 tmp
56 1.1 mrg C -40 tmp
57 1.1 mrg C -48 tmp
58 1.1 mrg C -50 tmp
59 1.1 mrg C -58 tmp
60 1.1 mrg C -60 tmp
61 1.1 mrg C -68 tmp
62 1.1 mrg C -70 tmp
63 1.1 mrg C -78 tmp
64 1.1 mrg C -80 tmp
65 1.1 mrg C -88 tmp
66 1.1 mrg C -90 FREE
67 1.1 mrg C -98 FREE
68 1.1 mrg C -a0 FREE
69 1.1 mrg C -a8 FREE
70 1.1 mrg C -b0 r13
71 1.1 mrg C -b8 r12
72 1.1 mrg C -c0 r11
73 1.1 mrg C -c8 r10
74 1.1 mrg C -d0 r9
75 1.1 mrg C -d8 r8
76 1.1 mrg C -e0 r7
77 1.1 mrg C -e8 r6
78 1.1 mrg C -f0 r5
79 1.1 mrg C -f8 r4
80 1.1 mrg C -100 r3
81 1.1 mrg C Previous frame:
82 1.1 mrg C [unused area]
83 1.1 mrg C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
84 1.1 mrg
85 1.1 mrg
86 1.1 mrg include(`../config.m4')
87 1.1 mrg
88 1.1 mrg C INPUT PARAMETERS:
89 1.1 mrg define(`rp',`%r26') C limb vector that is loaded, reduced by the product and stored back
90 1.1 mrg define(`up',`%r25') C limb vector that is multiplied by vlimb
91 1.1 mrg define(`n',`%r24') C limb count
92 1.1 mrg define(`vlimb',`%r23') C the single multiplier limb
93 1.1 mrg
94 1.1 mrg define(`climb',`%r23') C running carry/borrow limb; same register as vlimb
95 1.1 mrg
C Entry sequence: pick the assembler level, open a 0x100-byte frame while
C saving r3-r5, clear climb, load vlimb into %fr8 and dispatch on n mod 4.
C vlimb is read back from memory because the multiplies run in the FP unit.
96 1.1 mrg ifdef(`HAVE_ABI_2_0w',
97 1.1 mrg ` .level 2.0w
98 1.1 mrg ',` .level 2.0
99 1.1 mrg ')
100 1.1 mrg PROLOGUE(mpn_submul_1)
101 1.1 mrg
102 1.1 mrg ifdef(`HAVE_ABI_2_0w',
103 1.1 mrg ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
104 1.1 mrg ')
105 1.1 mrg std,ma %r3, 0x100(%r30) C save r3 at the old r30, then r30 += 0x100
106 1.1 mrg std %r4, -0xf8(%r30) C save r4
107 1.1 mrg std %r5, -0xf0(%r30) C save r5
108 1.1 mrg ldo 0(%r0), climb C clear climb
109 1.1 mrg fldd -0x138(%r30), %fr8 C put vlimb in fp register
110 1.1 mrg
C Register names for the 0-3 limb code below. Several of them are redefined
C at BIG for the 4-way unrolled code.
111 1.1 mrg define(`p032a1',`%r1') C mid product fr8R * fr4L
112 1.1 mrg define(`p032a2',`%r19') C mid product fr8L * fr4R
113 1.1 mrg
114 1.1 mrg define(`m032',`%r20') C sum of the two mid products
115 1.1 mrg define(`m096',`%r21') C carry out of that sum
116 1.1 mrg
117 1.1 mrg define(`p000a',`%r22') C low product fr8R * fr4R
118 1.1 mrg define(`p064a',`%r29') C high product fr8L * fr4L
119 1.1 mrg
120 1.1 mrg define(`s000',`%r31') C result limb being assembled
121 1.1 mrg
122 1.1 mrg define(`ma000',`%r4') C m032 shifted left 32 bits
123 1.1 mrg define(`ma064',`%r20') C m032 shifted right 32 bits with m096 deposited above; shares %r20 with m032
124 1.1 mrg
125 1.1 mrg define(`r000',`%r3') C limb loaded from rp
126 1.1 mrg
127 1.1 mrg extrd,u n, 63, 2, %r5 C r5 = low two bits of n, i.e. n mod 4
128 1.1 mrg cmpb,= %r5, %r0, L(BIG) C n mod 4 = 0: go straight to the unrolled code
129 1.1 mrg nop C fills the branch delay slot
130 1.1 mrg
C Feed-in for n mod 4 = 1, 2 or 3. Each limb of up is split into 32-bit halves
C and multiplied by the halves of vlimb with four xmpyu instructions; the
C four 64-bit partial products travel to the integer registers through the
C stack slots -0x80 (low), -0x78 and -0x70 (mid) and -0x68 (high).
131 1.1 mrg fldd 0(up), %fr4 C first limb of up
132 1.1 mrg ldo 8(up), up
133 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
134 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
135 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
137 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
138 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
140 1.1 mrg addib,<> -1, %r5, L(two_or_more) C r5 -= 1; branch while limbs remain
141 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly one limb: fetch its four partial products and finish in 0_one_out.
142 1.1 mrg LDEF(one)
143 1.1 mrg ldd -0x78(%r30), p032a1
144 1.1 mrg ldd -0x70(%r30), p032a2
145 1.1 mrg ldd -0x80(%r30), p000a
146 1.1 mrg b L(0_one_out)
147 1.1 mrg ldd -0x68(%r30), p064a C executed in the delay slot of the branch
148 1.1 mrg
C Second limb: each slot is read (previous limb) before it is overwritten
C with the product of the new limb, so every ldd must stay ahead of the
C fstd to the same offset.
149 1.1 mrg LDEF(two_or_more)
150 1.1 mrg fldd 0(up), %fr4
151 1.1 mrg ldo 8(up), up
152 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
153 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
154 1.1 mrg ldd -0x78(%r30), p032a1
155 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
157 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
158 1.1 mrg ldd -0x70(%r30), p032a2
159 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 1.1 mrg ldd -0x80(%r30), p000a
161 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 1.1 mrg ldd -0x68(%r30), p064a
163 1.1 mrg addib,<> -1, %r5, L(three_or_more) C r5 -= 1; branch if a third limb remains
164 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly two limbs: combine the mid products of the first limb, then let
C 0_two_out finish both limbs.
165 1.1 mrg LDEF(two)
166 1.1 mrg add p032a1, p032a2, m032 C sum of the mid products
167 1.1 mrg add,dc %r0, %r0, m096 C m096 = carry out of that sum
168 1.1 mrg depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved to the high half, rest zero
169 1.1 mrg extrd,u m032, 31, 32, ma064 C ma064 = high half of m032
170 1.1 mrg ldd 0(rp), r000
171 1.1 mrg b L(0_two_out)
172 1.1 mrg depd m096, 31, 32, ma064 C delay slot: put the carry above the high half
173 1.1 mrg
C Three limbs: load the third limb while combining the mid products of the
C first one. The loop0 body is entirely commented out, so control falls
C from three_or_more straight through to 0_out.
174 1.1 mrg LDEF(three_or_more)
175 1.1 mrg fldd 0(up), %fr4
176 1.1 mrg add p032a1, p032a2, m032
177 1.1 mrg add,dc %r0, %r0, m096
178 1.1 mrg depd,z m032, 31, 32, ma000
179 1.1 mrg extrd,u m032, 31, 32, ma064
180 1.1 mrg ldd 0(rp), r000
181 1.1 mrg C addib,= -1, %r5, L(0_out)
182 1.1 mrg depd m096, 31, 32, ma064
183 1.1 mrg LDEF(loop0)
184 1.1 mrg C xmpyu %fr8R, %fr4L, %fr22
185 1.1 mrg C xmpyu %fr8L, %fr4R, %fr23
186 1.1 mrg C ldd -0x78(%r30), p032a1
187 1.1 mrg C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
188 1.1 mrg C
189 1.1 mrg C xmpyu %fr8R, %fr4R, %fr24
190 1.1 mrg C xmpyu %fr8L, %fr4L, %fr25
191 1.1 mrg C ldd -0x70(%r30), p032a2
192 1.1 mrg C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
193 1.1 mrg C
194 1.1 mrg C ldo 8(rp), rp
195 1.1 mrg C add climb, p000a, s000
196 1.1 mrg C ldd -0x80(%r30), p000a
197 1.1 mrg C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
198 1.1 mrg C
199 1.1 mrg C add,dc p064a, %r0, climb
200 1.1 mrg C ldo 8(up), up
201 1.1 mrg C ldd -0x68(%r30), p064a
202 1.1 mrg C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
203 1.1 mrg C
204 1.1 mrg C add ma000, s000, s000
205 1.1 mrg C add,dc ma064, climb, climb
206 1.1 mrg C fldd 0(up), %fr4
207 1.1 mrg C
208 1.1 mrg C sub r000, s000, s000
209 1.1 mrg C sub,db %r0, climb, climb
210 1.1 mrg C sub %r0, climb, climb
211 1.1 mrg C std s000, -8(rp)
212 1.1 mrg C
213 1.1 mrg C add p032a1, p032a2, m032
214 1.1 mrg C add,dc %r0, %r0, m096
215 1.1 mrg C
216 1.1 mrg C depd,z m032, 31, 32, ma000
217 1.1 mrg C extrd,u m032, 31, 32, ma064
218 1.1 mrg C ldd 0(rp), r000
219 1.1 mrg C addib,<> -1, %r5, L(loop0)
220 1.1 mrg C depd m096, 31, 32, ma064
C 0_out multiplies the third limb, finishes the first limb (store to -8(rp)
C after rp has advanced) and prepares the mid sum of the second limb.
221 1.1 mrg LDEF(0_out)
222 1.1 mrg ldo 8(up), up
223 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
224 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
225 1.1 mrg ldd -0x78(%r30), p032a1
226 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
227 1.1 mrg xmpyu %fr8R, %fr4R, %fr24
228 1.1 mrg xmpyu %fr8L, %fr4L, %fr25
229 1.1 mrg ldd -0x70(%r30), p032a2
230 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
231 1.1 mrg ldo 8(rp), rp
232 1.1 mrg add climb, p000a, s000 C s000 = incoming carry + low product
233 1.1 mrg ldd -0x80(%r30), p000a
234 1.1 mrg fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
235 1.1 mrg add,dc p064a, %r0, climb C climb = high product + carry
236 1.1 mrg ldd -0x68(%r30), p064a
237 1.1 mrg fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
238 1.1 mrg add ma000, s000, s000 C add the mid contribution to the low limb
239 1.1 mrg add,dc ma064, climb, climb C and to the carry limb
240 1.1 mrg sub r000, s000, s000 C s000 = rp limb - product limb
241 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
242 1.1 mrg sub %r0, climb, climb C negate again: climb now includes the borrow
243 1.1 mrg std s000, -8(rp)
244 1.1 mrg add p032a1, p032a2, m032
245 1.1 mrg add,dc %r0, %r0, m096
246 1.1 mrg depd,z m032, 31, 32, ma000
247 1.1 mrg extrd,u m032, 31, 32, ma064
248 1.1 mrg ldd 0(rp), r000
249 1.1 mrg depd m096, 31, 32, ma064
C Wind-down of the 1-3 limb code. 0_two_out fetches the partial products of
C the last limb (nothing is multiplied any more) while completing the limb
C before it; 0_one_out then completes the last limb itself.
250 1.1 mrg LDEF(0_two_out)
251 1.1 mrg ldd -0x78(%r30), p032a1
252 1.1 mrg ldd -0x70(%r30), p032a2
253 1.1 mrg ldo 8(rp), rp
254 1.1 mrg add climb, p000a, s000
255 1.1 mrg ldd -0x80(%r30), p000a
256 1.1 mrg add,dc p064a, %r0, climb
257 1.1 mrg ldd -0x68(%r30), p064a
258 1.1 mrg add ma000, s000, s000
259 1.1 mrg add,dc ma064, climb, climb
260 1.1 mrg sub r000, s000, s000
261 1.1 mrg sub,db %r0, climb, climb
262 1.1 mrg sub %r0, climb, climb
263 1.1 mrg std s000, -8(rp)
264 1.1 mrg LDEF(0_one_out)
265 1.1 mrg add p032a1, p032a2, m032 C sum of the mid products
266 1.1 mrg add,dc %r0, %r0, m096 C carry out of that sum
267 1.1 mrg depd,z m032, 31, 32, ma000
268 1.1 mrg extrd,u m032, 31, 32, ma064
269 1.1 mrg ldd 0(rp), r000
270 1.1 mrg depd m096, 31, 32, ma064
271 1.1 mrg
272 1.1 mrg add climb, p000a, s000
273 1.1 mrg add,dc p064a, %r0, climb
274 1.1 mrg add ma000, s000, s000
275 1.1 mrg add,dc ma064, climb, climb
276 1.1 mrg sub r000, s000, s000 C s000 = rp limb - product limb
277 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
278 1.1 mrg sub %r0, climb, climb C negate again: climb now includes the borrow
279 1.1 mrg std s000, 0(rp)
280 1.1 mrg
281 1.1 mrg cmpib,>= 4, n, L(done) C branch to done when 4 >= n; r6-r13 were never saved on this path
282 1.1 mrg ldo 8(rp), rp C delay slot: step rp past the limb just stored
283 1.1 mrg
284 1.1 mrg C 4-way unrolled code.
285 1.1 mrg
C Set-up for the 4-way unrolled code: rename the registers, save r6-r13
C (r3-r5 were saved at entry) and turn n into a count of 4-limb groups.
C The names appear to encode the bit weight of a value (032, 064, ...) and
C the source limb a-d (%fr4-%fr7). Many names share one register, e.g.
C p032a1, p000a and r000 are all %r1, so each is live only in its own phase.
286 1.1 mrg LDEF(BIG)
287 1.1 mrg
288 1.1 mrg define(`p032a1',`%r1') C mid products, two per limb
289 1.1 mrg define(`p032a2',`%r19') C
290 1.1 mrg define(`p096b1',`%r20') C
291 1.1 mrg define(`p096b2',`%r21') C
292 1.1 mrg define(`p160c1',`%r22') C
293 1.1 mrg define(`p160c2',`%r29') C
294 1.1 mrg define(`p224d1',`%r31') C
295 1.1 mrg define(`p224d2',`%r3') C
296 1.1 mrg C
297 1.1 mrg define(`m032',`%r4') C sums of the mid product pairs
298 1.1 mrg define(`m096',`%r5') C
299 1.1 mrg define(`m160',`%r6') C
300 1.1 mrg define(`m224',`%r7') C
301 1.1 mrg define(`m288',`%r8') C final carry of the mid sum chain
302 1.1 mrg C
303 1.1 mrg define(`p000a',`%r1') C low and high products
304 1.1 mrg define(`p064a',`%r19') C
305 1.1 mrg define(`p064b',`%r20') C
306 1.1 mrg define(`p128b',`%r21') C
307 1.1 mrg define(`p128c',`%r22') C
308 1.1 mrg define(`p192c',`%r29') C
309 1.1 mrg define(`p192d',`%r31') C
310 1.1 mrg define(`p256d',`%r3') C
311 1.1 mrg C
312 1.1 mrg define(`s000',`%r10') C the four result limbs being assembled
313 1.1 mrg define(`s064',`%r11') C
314 1.1 mrg define(`s128',`%r12') C
315 1.1 mrg define(`s192',`%r13') C
316 1.1 mrg C
317 1.1 mrg define(`ma000',`%r9') C mid sums realigned by 32 bits
318 1.1 mrg define(`ma064',`%r4') C
319 1.1 mrg define(`ma128',`%r5') C
320 1.1 mrg define(`ma192',`%r6') C
321 1.1 mrg define(`ma256',`%r7') C
322 1.1 mrg C
323 1.1 mrg define(`r000',`%r1') C limbs loaded from rp
324 1.1 mrg define(`r064',`%r19') C
325 1.1 mrg define(`r128',`%r20') C
326 1.1 mrg define(`r192',`%r21') C
327 1.1 mrg
328 1.1 mrg std %r6, -0xe8(%r30)
329 1.1 mrg std %r7, -0xe0(%r30)
330 1.1 mrg std %r8, -0xd8(%r30)
331 1.1 mrg std %r9, -0xd0(%r30)
332 1.1 mrg std %r10, -0xc8(%r30)
333 1.1 mrg std %r11, -0xc0(%r30)
334 1.1 mrg std %r12, -0xb8(%r30)
335 1.1 mrg std %r13, -0xb0(%r30)
336 1.1 mrg
337 1.1 mrg ifdef(`HAVE_ABI_2_0w',
338 1.1 mrg ` extrd,u n, 61, 62, n C right shift 2
339 1.1 mrg ',` extrd,u n, 61, 30, n C right shift 2, zero extend
340 1.1 mrg ')
341 1.1 mrg
C First group of four limbs: load them into %fr4-%fr7 and form all sixteen
C partial products. The mid products in %fr22-%fr27 are stored early so the
C same registers can receive low/high products. If this was the only group,
C spill the rest, fetch the mid products into integer registers and go to end1.
342 1.1 mrg LDEF(4_or_more)
343 1.1 mrg fldd 0(up), %fr4
344 1.1 mrg fldd 8(up), %fr5
345 1.1 mrg fldd 16(up), %fr6
346 1.1 mrg fldd 24(up), %fr7
347 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
348 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
349 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
350 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
351 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
352 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
353 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
354 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
355 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
356 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
357 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
358 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
359 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
360 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
361 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
362 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
363 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
364 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
365 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
366 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
367 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
368 1.1 mrg addib,<> -1, n, L(8_or_more) C n -= 1; branch if another group follows
369 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C delay slot: last product of this group
370 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
373 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
374 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
375 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
376 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
377 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
378 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
379 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
380 1.1 mrg ldd -0x78(%r30), p032a1
381 1.1 mrg ldd -0x70(%r30), p032a2
382 1.1 mrg ldd -0x38(%r30), p096b1
383 1.1 mrg ldd -0x30(%r30), p096b2
384 1.1 mrg ldd -0x58(%r30), p160c1
385 1.1 mrg ldd -0x50(%r30), p160c2
386 1.1 mrg ldd -0x18(%r30), p224d1
387 1.1 mrg ldd -0x10(%r30), p224d2
388 1.1 mrg b L(end1)
389 1.1 mrg nop
390 1.1 mrg
C At least two groups: spill the remaining products of the first group,
C advance up, load the second group and start multiplying it. The mid
C products of the first group are fetched from each slot before the slot is
C overwritten with a product of the second group.
391 1.1 mrg LDEF(8_or_more)
392 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
393 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
394 1.1 mrg ldo 32(up), up
395 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
396 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
397 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
398 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
399 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
400 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
401 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
402 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
403 1.1 mrg fldd 0(up), %fr4
404 1.1 mrg fldd 8(up), %fr5
405 1.1 mrg fldd 16(up), %fr6
406 1.1 mrg fldd 24(up), %fr7
407 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
408 1.1 mrg ldd -0x78(%r30), p032a1
409 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
410 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
411 1.1 mrg ldd -0x70(%r30), p032a2
412 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
413 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
414 1.1 mrg ldd -0x38(%r30), p096b1
415 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
416 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
417 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
418 1.1 mrg ldd -0x30(%r30), p096b2
419 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
420 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
421 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
422 1.1 mrg ldd -0x58(%r30), p160c1
423 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
424 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
425 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
426 1.1 mrg ldd -0x50(%r30), p160c2
427 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
428 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
429 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
430 1.1 mrg ldd -0x18(%r30), p224d1
431 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
432 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
433 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
434 1.1 mrg ldd -0x10(%r30), p224d2
435 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
436 1.1 mrg addib,= -1, n, L(end2) C n -= 1; no further group: wind down at end2
437 1.1 mrg xmpyu %fr8L, %fr7L, %fr27 C delay slot: last product of this group
C Main software-pipelined loop, four limbs per pass. Each pass
C   - finishes the group whose products are already in memory and stores
C     its four result limbs to rp,
C   - spills the products of the group multiplied during the previous pass,
C   - loads and multiplies the following group of four limbs from up.
C Every stack slot is read with ldd before the fstd that overwrites it, and
C integer registers are reused under several names, so the statement order
C is significant.
438 1.1 mrg LDEF(loop)
439 1.1 mrg add p032a1, p032a2, m032 C start of the mid product carry chain
440 1.1 mrg ldd -0x80(%r30), p000a
441 1.1 mrg add,dc p096b1, p096b2, m096
442 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
443 1.1 mrg
444 1.1 mrg add,dc p160c1, p160c2, m160
445 1.1 mrg ldd -0x68(%r30), p064a
446 1.1 mrg add,dc p224d1, p224d2, m224
447 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
448 1.1 mrg
449 1.1 mrg add,dc %r0, %r0, m288 C carry out of the chain
450 1.1 mrg ldd -0x40(%r30), p064b
451 1.1 mrg ldo 32(up), up
452 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
453 1.1 mrg
454 1.1 mrg depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits into ma000..ma256
455 1.1 mrg ldd -0x28(%r30), p128b
456 1.1 mrg extrd,u m032, 31, 32, ma064
457 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
458 1.1 mrg
459 1.1 mrg depd m096, 31, 32, ma064
460 1.1 mrg ldd -0x60(%r30), p128c
461 1.1 mrg extrd,u m096, 31, 32, ma128
462 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
463 1.1 mrg
464 1.1 mrg depd m160, 31, 32, ma128
465 1.1 mrg ldd -0x48(%r30), p192c
466 1.1 mrg extrd,u m160, 31, 32, ma192
467 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
468 1.1 mrg
469 1.1 mrg depd m224, 31, 32, ma192
470 1.1 mrg ldd -0x20(%r30), p192d
471 1.1 mrg extrd,u m224, 31, 32, ma256
472 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
473 1.1 mrg
474 1.1 mrg depd m288, 31, 32, ma256
475 1.1 mrg ldd -0x88(%r30), p256d
476 1.1 mrg add climb, p000a, s000 C incoming carry + low/high products
477 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
478 1.1 mrg
479 1.1 mrg add,dc p064a, p064b, s064
480 1.1 mrg ldd 0(rp), r000
481 1.1 mrg add,dc p128b, p128c, s128
482 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
483 1.1 mrg
484 1.1 mrg add,dc p192c, p192d, s192
485 1.1 mrg ldd 8(rp), r064
486 1.1 mrg add,dc p256d, %r0, climb
487 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
488 1.1 mrg
489 1.1 mrg ldd 16(rp), r128
490 1.1 mrg add ma000, s000, s000 C accum mid 0
491 1.1 mrg ldd 24(rp), r192
492 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
493 1.1 mrg
494 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
495 1.1 mrg fldd 0(up), %fr4
496 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
497 1.1 mrg fldd 8(up), %fr5
498 1.1 mrg
499 1.1 mrg add,dc ma256, climb, climb
500 1.1 mrg fldd 16(up), %fr6
501 1.1 mrg sub r000, s000, s000 C accum rlimb 0
502 1.1 mrg fldd 24(up), %fr7
503 1.1 mrg
504 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
505 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
506 1.1 mrg std s000, 0(rp)
507 1.1 mrg
508 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
509 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
510 1.1 mrg sub %r0, climb, climb C negate again: climb now includes the borrow
511 1.1 mrg std s064, 8(rp)
512 1.1 mrg
513 1.1 mrg xmpyu %fr8R, %fr4L, %fr22
514 1.1 mrg ldd -0x78(%r30), p032a1
515 1.1 mrg xmpyu %fr8L, %fr4R, %fr23
516 1.1 mrg std s128, 16(rp)
517 1.1 mrg
518 1.1 mrg xmpyu %fr8R, %fr5L, %fr24
519 1.1 mrg ldd -0x70(%r30), p032a2
520 1.1 mrg xmpyu %fr8L, %fr5R, %fr25
521 1.1 mrg std s192, 24(rp)
522 1.1 mrg
523 1.1 mrg xmpyu %fr8R, %fr6L, %fr26
524 1.1 mrg ldd -0x38(%r30), p096b1
525 1.1 mrg xmpyu %fr8L, %fr6R, %fr27
526 1.1 mrg fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
527 1.1 mrg
528 1.1 mrg xmpyu %fr8R, %fr7L, %fr28
529 1.1 mrg ldd -0x30(%r30), p096b2
530 1.1 mrg xmpyu %fr8L, %fr7R, %fr29
531 1.1 mrg fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
532 1.1 mrg
533 1.1 mrg xmpyu %fr8R, %fr4R, %fr30
534 1.1 mrg ldd -0x58(%r30), p160c1
535 1.1 mrg xmpyu %fr8L, %fr4L, %fr31
536 1.1 mrg fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
537 1.1 mrg
538 1.1 mrg xmpyu %fr8R, %fr5R, %fr22
539 1.1 mrg ldd -0x50(%r30), p160c2
540 1.1 mrg xmpyu %fr8L, %fr5L, %fr23
541 1.1 mrg fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
542 1.1 mrg
543 1.1 mrg xmpyu %fr8R, %fr6R, %fr24
544 1.1 mrg ldd -0x18(%r30), p224d1
545 1.1 mrg xmpyu %fr8L, %fr6L, %fr25
546 1.1 mrg fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
547 1.1 mrg
548 1.1 mrg xmpyu %fr8R, %fr7R, %fr26
549 1.1 mrg ldd -0x10(%r30), p224d2
550 1.1 mrg fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
551 1.1 mrg xmpyu %fr8L, %fr7L, %fr27
552 1.1 mrg
553 1.1 mrg addib,<> -1, n, L(loop) C n -= 1; repeat while groups remain
554 1.1 mrg ldo 32(rp), rp C delay slot: advance rp by four limbs
555 1.1 mrg
C Wind-down, part 1: same arithmetic as one pass of the loop, but no new
C limbs are loaded or multiplied. It finishes the second-to-last group,
C spills the products of the last group, fetches the mid products of that
C group and falls through to end1.
556 1.1 mrg LDEF(end2)
557 1.1 mrg add p032a1, p032a2, m032
558 1.1 mrg ldd -0x80(%r30), p000a
559 1.1 mrg add,dc p096b1, p096b2, m096
560 1.1 mrg fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
561 1.1 mrg add,dc p160c1, p160c2, m160
562 1.1 mrg ldd -0x68(%r30), p064a
563 1.1 mrg add,dc p224d1, p224d2, m224
564 1.1 mrg fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
565 1.1 mrg add,dc %r0, %r0, m288
566 1.1 mrg ldd -0x40(%r30), p064b
567 1.1 mrg fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
568 1.1 mrg depd,z m032, 31, 32, ma000
569 1.1 mrg ldd -0x28(%r30), p128b
570 1.1 mrg extrd,u m032, 31, 32, ma064
571 1.1 mrg fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
572 1.1 mrg depd m096, 31, 32, ma064
573 1.1 mrg ldd -0x60(%r30), p128c
574 1.1 mrg extrd,u m096, 31, 32, ma128
575 1.1 mrg fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
576 1.1 mrg depd m160, 31, 32, ma128
577 1.1 mrg ldd -0x48(%r30), p192c
578 1.1 mrg extrd,u m160, 31, 32, ma192
579 1.1 mrg fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
580 1.1 mrg depd m224, 31, 32, ma192
581 1.1 mrg ldd -0x20(%r30), p192d
582 1.1 mrg extrd,u m224, 31, 32, ma256
583 1.1 mrg fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
584 1.1 mrg depd m288, 31, 32, ma256
585 1.1 mrg ldd -0x88(%r30), p256d
586 1.1 mrg add climb, p000a, s000
587 1.1 mrg fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
588 1.1 mrg add,dc p064a, p064b, s064
589 1.1 mrg ldd 0(rp), r000
590 1.1 mrg add,dc p128b, p128c, s128
591 1.1 mrg fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
592 1.1 mrg add,dc p192c, p192d, s192
593 1.1 mrg ldd 8(rp), r064
594 1.1 mrg add,dc p256d, %r0, climb
595 1.1 mrg fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
596 1.1 mrg ldd 16(rp), r128
597 1.1 mrg add ma000, s000, s000 C accum mid 0
598 1.1 mrg ldd 24(rp), r192
599 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
600 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
601 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
602 1.1 mrg add,dc ma256, climb, climb
603 1.1 mrg sub r000, s000, s000 C accum rlimb 0
604 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
605 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
606 1.1 mrg std s000, 0(rp)
607 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
608 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
609 1.1 mrg sub %r0, climb, climb C negate again: climb now includes the borrow
610 1.1 mrg std s064, 8(rp)
611 1.1 mrg ldd -0x78(%r30), p032a1
612 1.1 mrg std s128, 16(rp)
613 1.1 mrg ldd -0x70(%r30), p032a2
614 1.1 mrg std s192, 24(rp)
615 1.1 mrg ldd -0x38(%r30), p096b1
616 1.1 mrg ldd -0x30(%r30), p096b2
617 1.1 mrg ldd -0x58(%r30), p160c1
618 1.1 mrg ldd -0x50(%r30), p160c2
619 1.1 mrg ldd -0x18(%r30), p224d1
620 1.1 mrg ldd -0x10(%r30), p224d2
621 1.1 mrg ldo 32(rp), rp C advance rp by four limbs for end1
622 1.1 mrg
C Wind-down, part 2: finish the last group of four limbs. All its partial
C products are already in memory (mid products already in registers), so
C this is the loop arithmetic with no FP stores, loads from up or multiplies.
623 1.1 mrg LDEF(end1)
624 1.1 mrg add p032a1, p032a2, m032
625 1.1 mrg ldd -0x80(%r30), p000a
626 1.1 mrg add,dc p096b1, p096b2, m096
627 1.1 mrg add,dc p160c1, p160c2, m160
628 1.1 mrg ldd -0x68(%r30), p064a
629 1.1 mrg add,dc p224d1, p224d2, m224
630 1.1 mrg add,dc %r0, %r0, m288
631 1.1 mrg ldd -0x40(%r30), p064b
632 1.1 mrg depd,z m032, 31, 32, ma000
633 1.1 mrg ldd -0x28(%r30), p128b
634 1.1 mrg extrd,u m032, 31, 32, ma064
635 1.1 mrg depd m096, 31, 32, ma064
636 1.1 mrg ldd -0x60(%r30), p128c
637 1.1 mrg extrd,u m096, 31, 32, ma128
638 1.1 mrg depd m160, 31, 32, ma128
639 1.1 mrg ldd -0x48(%r30), p192c
640 1.1 mrg extrd,u m160, 31, 32, ma192
641 1.1 mrg depd m224, 31, 32, ma192
642 1.1 mrg ldd -0x20(%r30), p192d
643 1.1 mrg extrd,u m224, 31, 32, ma256
644 1.1 mrg depd m288, 31, 32, ma256
645 1.1 mrg ldd -0x88(%r30), p256d
646 1.1 mrg add climb, p000a, s000
647 1.1 mrg add,dc p064a, p064b, s064
648 1.1 mrg ldd 0(rp), r000
649 1.1 mrg add,dc p128b, p128c, s128
650 1.1 mrg add,dc p192c, p192d, s192
651 1.1 mrg ldd 8(rp), r064
652 1.1 mrg add,dc p256d, %r0, climb
653 1.1 mrg ldd 16(rp), r128
654 1.1 mrg add ma000, s000, s000 C accum mid 0
655 1.1 mrg ldd 24(rp), r192
656 1.1 mrg add,dc ma064, s064, s064 C accum mid 1
657 1.1 mrg add,dc ma128, s128, s128 C accum mid 2
658 1.1 mrg add,dc ma192, s192, s192 C accum mid 3
659 1.1 mrg add,dc ma256, climb, climb
660 1.1 mrg sub r000, s000, s000 C accum rlimb 0
661 1.1 mrg sub,db r064, s064, s064 C accum rlimb 1
662 1.1 mrg sub,db r128, s128, s128 C accum rlimb 2
663 1.1 mrg std s000, 0(rp)
664 1.1 mrg sub,db r192, s192, s192 C accum rlimb 3
665 1.1 mrg sub,db %r0, climb, climb C climb = 0 - climb - borrow
666 1.1 mrg sub %r0, climb, climb C negate again: climb now includes the borrow
667 1.1 mrg std s064, 8(rp)
668 1.1 mrg std s128, 16(rp)
669 1.1 mrg std s192, 24(rp)
670 1.1 mrg
C Exit. The unrolled path first reloads r6-r13; the short path enters at
C done, where only r3-r5 need restoring. climb is then copied out as the
C result (presumably in the ABI return registers) and the frame is released.
671 1.1 mrg ldd -0xb0(%r30), %r13
672 1.1 mrg ldd -0xb8(%r30), %r12
673 1.1 mrg ldd -0xc0(%r30), %r11
674 1.1 mrg ldd -0xc8(%r30), %r10
675 1.1 mrg ldd -0xd0(%r30), %r9
676 1.1 mrg ldd -0xd8(%r30), %r8
677 1.1 mrg ldd -0xe0(%r30), %r7
678 1.1 mrg ldd -0xe8(%r30), %r6
679 1.1 mrg LDEF(done)
C With HAVE_ABI_2_0w climb is copied whole into %r28; otherwise it is split,
C upper 32 bits into %r28 and lower 32 bits into %r29.
680 1.1 mrg ifdef(`HAVE_ABI_2_0w',
681 1.1 mrg ` copy climb, %r28
682 1.1 mrg ',` extrd,u climb, 63, 32, %r29
683 1.1 mrg extrd,u climb, 31, 32, %r28
684 1.1 mrg ')
685 1.1 mrg ldd -0xf0(%r30), %r5
686 1.1 mrg ldd -0xf8(%r30), %r4
687 1.1 mrg bve (%r2) C branch to the address held in %r2
688 1.1 mrg ldd,mb -0x100(%r30), %r3 C delay slot: r30 -= 0x100, then reload r3 from there
689 1.1 mrg EPILOGUE(mpn_submul_1)
690