mul_1.asm revision 1.1 1 dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
3
4 dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
5 dnl Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
13
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
18
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C cycles/limb
25 C POWER3/PPC630: 6-18
26 C POWER4/PPC970: 7.25
27 C POWER5: 7.75
28
29 C TODO
30 C * Try to reduce the number of needed live registers (at least r5 and r10
31 C could be combined)
32 C * Optimize feed-in code, for speed and size.
33 C * Clean up r12/r7 usage in feed-in code.
34
35 C INPUT PARAMETERS
C Symbolic names for the registers that carry the arguments on entry.
36 define(`rp', `r3')	C base register for the stores of result limbs
37 define(`up', `r4')	C base register for the loads of source limbs
38 define(`n', `r5')	C limb count; only used to derive the ctr count, r5 is scratch afterwards
39 define(`vl', `r6')	C multiplier limb; the instructions in this file name r6 directly
40
41 ASM_START()
C mpn_mul_1c -- entry point that starts the multiplication with a carry limb
C supplied in r7 (presumably the fifth argument; confirm against the caller).
C It only saves r26/r27 and sets r12, then shares all remaining code with
C mpn_mul_1 by branching to L(ent).
42 PROLOGUE(mpn_mul_1c)
43 std r27, -40(r1)	C save r27 at a negative offset from r1; reloaded at L(ret)
44 std r26, -48(r1)	C save r26 the same way
45 mr r12, r7	C r12 = carry-in limb taken from r7
46 b L(ent)	C join the common code inside mpn_mul_1
47 EPILOGUE()
C mpn_mul_1 -- multiply the n limbs at up by the limb in r6, store the n low
C result limbs at rp, and leave the final carry limb in r3.
C
C Register roles in the code below:
C   r12            carry limb added into the next low half (0 on entry here)
C   r26, r27       limbs loaded from up; saved on entry, reloaded before blr
C   r0, r7, r9, r11    low product halves (mulld), stored to rp
C   r5, r8, r10, r12   high product halves (mulhdu), added into the next limb
C   ctr            (n+3)>>2, decremented by the bdnz/bdz instructions
C r5 (n) is reused as a scratch register once ctr has been set.
C
C Feed-in: n mod 4 selects L(b00), L(b01), L(b10) or L(b11), which consume
C 2, 3 (or 1 when n = 1), 0 or 1 leading limbs respectively. What remains is
C handled 4 limbs per L(top) iteration, with the last 2 limbs done at L(end).
C
C NOTE(review): n = 0 does not appear to be handled (ctr would start at 0);
C presumably callers guarantee n >= 1 -- confirm.
C NOTE(review): r26/r27 are stored below r1 without adjusting r1; presumably
C the target ABI permits this for leaf code -- confirm.
48 PROLOGUE(mpn_mul_1)
49 std r27, -40(r1)	C save r27 at a negative offset from r1; reloaded at L(ret)
50 std r26, -48(r1)	C save r26 the same way
51 li r12, 0 C cy_limb = 0
52 L(ent): ld r26, 0(up)	C r26 = up[0]; mpn_mul_1c enters here with r12 = carry-in
53
54 rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
55 cmpdi cr6, r0, 2	C cr6 = comparison of (n & 3) with 2
C addic also writes CA. The L(b10) path reaches its first adde without a
C preceding addc, so it presumably relies on CA being clear here (n+3 is
C assumed not to wrap).
56 addic n, n, 3 C compute count...
57 srdi n, n, 2 C ...for ctr
58 mtctr n C copy count into ctr
59 beq cr0, L(b00)	C n mod 4 = 0
60 blt cr6, L(b01)	C n mod 4 = 1
61 beq cr6, L(b10)	C n mod 4 = 2; otherwise fall through with n mod 4 = 3
62
C n mod 4 = 3: handle one limb, then continue at L(fic).
63 L(b11): mr r7, r12	C keep the carry-in in r7; r12 is overwritten by mulhdu below
64 mulld r0, r26, r6	C r0 = low half of up[0] * r6
65 mulhdu r12, r26, r6	C r12 = high half = carry limb for the next limb
66 addi up, up, 8	C step past the limb just consumed
67 addc r0, r0, r7	C add carry-in; CA feeds the next adde
68 std r0, 0(rp)
69 addi rp, rp, 8
70 b L(fic)
71
C n mod 4 = 0: handle two limbs, then continue at L(fic).
72 L(b00): ld r27, 8(up)	C r27 = up[1]
73 addi up, up, 16
74 mulld r0, r26, r6
75 mulhdu r5, r26, r6
76 mulld r7, r27, r6
77 mulhdu r8, r27, r6
78 addc r0, r0, r12	C low0 + carry-in
79 adde r7, r7, r5	C low1 + high0 + CA
80 addze r12, r8	C r12 = high1 + CA = carry limb for the next limb
81 std r0, 0(rp)
82 std r7, 8(rp)
83 addi rp, rp, 16
84 b L(fic)
85
86 nop C alignment
C n mod 4 = 1: either the single-limb case (n = 1) or three feed-in limbs.
87 L(b01): bdnz L(gt1)	C decrement ctr; falls through only when it reaches 0 (n = 1)
88 mulld r0, r26, r6
89 mulhdu r8, r26, r6	C high half goes to r8, which L(ret) adds into r3
90 addc r0, r0, r12	C low + carry-in; CA is consumed by addze at L(ret)
91 std r0, 0(rp)
92 b L(ret)
93 L(gt1): ld r27, 8(up)	C r27 = up[1]
94 nop	C presumably scheduling/alignment padding
95 mulld r0, r26, r6
96 mulhdu r5, r26, r6
97 ld r26, 16(up)	C r26 = up[2]; up[0] has already been multiplied
98 mulld r7, r27, r6
99 mulhdu r8, r27, r6
100 mulld r9, r26, r6
101 mulhdu r10, r26, r6
102 addc r0, r0, r12	C low0 + carry-in
103 adde r7, r7, r5	C low1 + high0 + CA
104 adde r9, r9, r8	C low2 + high1 + CA
105 addze r12, r10	C r12 = high2 + CA = carry limb for the next limb
106 std r0, 0(rp)
107 std r7, 8(rp)
108 std r9, 16(rp)
109 addi up, up, 24
110 addi rp, rp, 24
111 b L(fic)
112
113 nop
C Common continuation: load the next two limbs. L(b10) skips the first load
C because r26 still holds up[0] from L(ent).
114 L(fic): ld r26, 0(up)
115 L(b10): ld r27, 8(up)
116 addi up, up, 16	C up now points past the two limbs held in r26/r27
117 bdz L(end)	C decrement ctr; when it reaches 0 only these two limbs remain
118
C Main loop: four limbs per iteration. On entry r26/r27 hold limbs 0 and 1,
C r12 holds the incoming carry limb and CA the incoming carry bit.
119 L(top): mulld r0, r26, r6	C limb 0, low half
120 mulhdu r5, r26, r6	C limb 0, high half
121 ld r26, 0(up)	C preload limb 2
122 nop	C presumably scheduling padding
123
124 mulld r7, r27, r6	C limb 1, low half
125 mulhdu r8, r27, r6	C limb 1, high half
126 ld r27, 8(up)	C preload limb 3
127 nop
128
129 adde r0, r0, r12	C limb 0 low + incoming carry limb + CA
130 adde r7, r7, r5	C limb 1 low + limb 0 high + CA
131
132 mulld r9, r26, r6	C limb 2, low half
133 mulhdu r10, r26, r6	C limb 2, high half
134 ld r26, 16(up)	C preload limb 0 of the next iteration (or of L(end))
135 nop
136
137 mulld r11, r27, r6	C limb 3, low half
138 mulhdu r12, r27, r6	C limb 3, high half = carry limb for the next iteration
139 ld r27, 24(up)	C preload limb 1 of the next iteration (or of L(end))
140
141 std r0, 0(rp)
142 adde r9, r9, r8	C limb 2 low + limb 1 high + CA
143 std r7, 8(rp)
144 adde r11, r11, r10	C limb 3 low + limb 2 high + CA; CA is consumed by the next adde
145 std r9, 16(rp)
146 addi up, up, 32
147 std r11, 24(rp)
148
149 addi rp, rp, 32
150 bdnz L(top)	C decrement ctr and loop while it is nonzero
151
C Wind-down: the final two limbs, already loaded into r26/r27.
152 L(end): mulld r0, r26, r6
153 mulhdu r5, r26, r6
154
155 mulld r7, r27, r6
156 mulhdu r8, r27, r6	C high half of the last limb
157
158 adde r0, r0, r12	C low + incoming carry limb + CA
159 adde r7, r7, r5	C low + previous high + CA
160
161 std r0, 0(rp)
162 std r7, 8(rp)
163 L(ret): addze r3, r8	C r3 = last high half + CA, the value left in r3 at return
164 ld r27, -40(r1)	C reload the r27 value saved on entry
165 ld r26, -48(r1)	C reload the r26 value saved on entry
166 blr
167 EPILOGUE()
168