dnl  sqr_basecase.asm revision 1.1.1.2
dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use it when
C    iterating each outer loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving the high cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);

C Register aliases (m4).  rp/up walk the product/source; n and un are
C loop counters (mostly kept negative so 'inc'/'add $4' can detect the end).
define(`rp', `%edi')
define(`up', `%esi')
define(`n',  `%ecx')

define(`un', `%ebp')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
C cdecl entry.  After the two pushes below the incoming args sit at
C 12(%esp)=wp, 16(%esp)=xp, 20(%esp)=xn.
	push	%edi
	push	%esi
	mov	12(%esp), rp
	mov	16(%esp), up
	mov	20(%esp), n

	lea	4(rp), rp		C write triangular product starting at rp[1]
	dec	n
	movd	(up), %mm7		C mm7 = invariant multiplier limb u[0]

	jz	L(one)			C xn = 1: single squaring, handled at the end
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n			C keep n negative; loops count up towards 0
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6		C mm6 = 64-bit accumulator (low 32 = limb out)
	and	$3, %eax		C dispatch on n mod 4 into the 4-way
	mov	n, un			C   unrolled mul_1 below
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1 (u[0] times the rest of U), entry for n == 3 (mod 4)
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)		C store final carry limb

	inc	n
C	jz	L(done)
	lea	-12(up), up
	lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1 entry for n == 0 (mod 4)
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)		C store final carry limb

	inc	n
C	jz	L(done)
	lea	-8(up), up
	lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1 entry for n == 1 (mod 4)
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)		C store final carry limb

	inc	n
	jz	L(done)			C goes away when we add special n=2 code
	lea	-20(up), up
	lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1 entry for n == 2 (mod 4)
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)		C store final carry limb

	inc	n
C	jz	L(done)
	lea	-16(up), up
C	lea	(rp), rp
C	jmp	L(ol1)

C ================================================================
C Outer addmul_1 loops.  Each L(olX) block picks the next invariant U limb
C into mm7, then runs a 4-way unrolled addmul_1 whose carry chain mixes the
C x86 CF (adc) with 32-bit halves extracted from the MMX products.
C The four variants differ only in their software-pipeline entry point.

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7		C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)			C only a 1-limb addmul left

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la1)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7		C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7		C read next U invariant limb
	lea	(rp,n,4), rp		C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx		C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7		C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un			C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)			C only a 2-limb addmul left

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx		C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Wind-down: final 2-limb by 1-limb addmul_1, then fall into L(re1).
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7		C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n			C make n zeroness assumption below true
	mov	%eax, 8(rp)

C ================================================================
C Final pass: add the diagonal squares u[i]^2 while doubling the
C off-diagonal triangle in place; the doubling carry is kept in n via
C the adc n,n / rcr n pair.  Re-reads xp and xn from the stack
C (24/28(%esp) with four registers pushed).
L(done):				C n is zero here
	mov	24(%esp), up
	mov	28(%esp), %eax

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0		C u[0]^2
	lea	4(up), up
	mov	20(%esp), rp
	shr	%eax
	movd	%mm0, (rp)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)		C loop count = ceil((xn+1)/2), spilled
	jnc	L(odd)

	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n			C save doubling carry in n's top bit
	jmp	L(ent)

C	ALIGN(16)			C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n			C restore doubling carry
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n			C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)		C rp[i] += rp[i] + square part => doubled
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)		C most significant limb

L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

C xn = 1: rp was already advanced by 4, so -4(rp) is wp[0]; movq stores
C both 32-bit halves of u[0]^2 at once.
L(one):	pmuludq	%mm7, %mm7
	movq	%mm7, -4(rp)
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()
