toom33_mul.c revision 1.1.1.2 1 /* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
2 size. Or more accurately, bn <= an < (3/2)bn.
3
4 Contributed to the GNU project by Torbjorn Granlund.
5 Additional improvements by Marco Bodrato.
6
7 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
8 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10
11 Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
12
13 This file is part of the GNU MP Library.
14
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of the GNU Lesser General Public License as published by
17 the Free Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20 The GNU MP Library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
23 License for more details.
24
25 You should have received a copy of the GNU Lesser General Public License
26 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
27
28
29 #include "gmp.h"
30 #include "gmp-impl.h"
31
32 /* Evaluate in: -1, 0, +1, +2, +inf
33
34 <-s--><--n--><--n--><--n-->
35 ____ ______ ______ ______
36 |_a3_|___a2_|___a1_|___a0_|
37 |b3_|___b2_|___b1_|___b0_|
38 <-t-><--n--><--n--><--n-->
39
40 v0 = a0 * b0 # A(0)*B(0)
41 v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
42 vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1
43 v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6
44 vinf= a2 * b2 # A(inf)*B(inf)
45 */
46
47 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
48 #define MAYBE_mul_basecase 1
49 #define MAYBE_mul_toom33 1
50 #else
51 #define MAYBE_mul_basecase \
52 (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
53 #define MAYBE_mul_toom33 \
54 (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
55 #endif
56
57 /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
58 multiplication at the infinity point. We may have
59 MAYBE_mul_basecase == 0, and still get s just below
60 MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
61 s == 1 and mpn_toom22_mul will crash.
62 */
63
64 #define TOOM33_MUL_N_REC(p, a, b, n, ws) \
65 do { \
66 if (MAYBE_mul_basecase \
67 && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
68 mpn_mul_basecase (p, a, n, b, n); \
69 else if (! MAYBE_mul_toom33 \
70 || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
71 mpn_toom22_mul (p, a, n, b, n, ws); \
72 else \
73 mpn_toom33_mul (p, a, n, b, n, ws); \
74 } while (0)
75
76 void
77 mpn_toom33_mul (mp_ptr pp,
78 mp_srcptr ap, mp_size_t an,
79 mp_srcptr bp, mp_size_t bn,
80 mp_ptr scratch)
81 {
82 const int __gmpn_cpuvec_initialized = 1;
83 mp_size_t n, s, t;
84 int vm1_neg;
85 mp_limb_t cy, vinf0;
86 mp_ptr gp;
87 mp_ptr as1, asm1, as2;
88 mp_ptr bs1, bsm1, bs2;
89
90 #define a0 ap
91 #define a1 (ap + n)
92 #define a2 (ap + 2*n)
93 #define b0 bp
94 #define b1 (bp + n)
95 #define b2 (bp + 2*n)
96
97 n = (an + 2) / (size_t) 3;
98
99 s = an - 2 * n;
100 t = bn - 2 * n;
101
102 ASSERT (an >= bn);
103
104 ASSERT (0 < s && s <= n);
105 ASSERT (0 < t && t <= n);
106
107 as1 = scratch + 4 * n + 4;
108 asm1 = scratch + 2 * n + 2;
109 as2 = pp + n + 1;
110
111 bs1 = pp;
112 bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
113 bs2 = pp + 2 * n + 2;
114
115 gp = scratch;
116
117 vm1_neg = 0;
118
119 /* Compute as1 and asm1. */
120 cy = mpn_add (gp, a0, n, a2, s);
121 #if HAVE_NATIVE_mpn_add_n_sub_n
122 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
123 {
124 cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
125 as1[n] = cy >> 1;
126 asm1[n] = 0;
127 vm1_neg = 1;
128 }
129 else
130 {
131 mp_limb_t cy2;
132 cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
133 as1[n] = cy + (cy2 >> 1);
134 asm1[n] = cy - (cy2 & 1);
135 }
136 #else
137 as1[n] = cy + mpn_add_n (as1, gp, a1, n);
138 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
139 {
140 mpn_sub_n (asm1, a1, gp, n);
141 asm1[n] = 0;
142 vm1_neg = 1;
143 }
144 else
145 {
146 cy -= mpn_sub_n (asm1, gp, a1, n);
147 asm1[n] = cy;
148 }
149 #endif
150
151 /* Compute as2. */
152 #if HAVE_NATIVE_mpn_rsblsh1_n
153 cy = mpn_add_n (as2, a2, as1, s);
154 if (s != n)
155 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
156 cy += as1[n];
157 cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
158 #else
159 #if HAVE_NATIVE_mpn_addlsh1_n
160 cy = mpn_addlsh1_n (as2, a1, a2, s);
161 if (s != n)
162 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
163 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
164 #else
165 cy = mpn_add_n (as2, a2, as1, s);
166 if (s != n)
167 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
168 cy += as1[n];
169 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
170 cy -= mpn_sub_n (as2, as2, a0, n);
171 #endif
172 #endif
173 as2[n] = cy;
174
175 /* Compute bs1 and bsm1. */
176 cy = mpn_add (gp, b0, n, b2, t);
177 #if HAVE_NATIVE_mpn_add_n_sub_n
178 if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
179 {
180 cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
181 bs1[n] = cy >> 1;
182 bsm1[n] = 0;
183 vm1_neg ^= 1;
184 }
185 else
186 {
187 mp_limb_t cy2;
188 cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
189 bs1[n] = cy + (cy2 >> 1);
190 bsm1[n] = cy - (cy2 & 1);
191 }
192 #else
193 bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
194 if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
195 {
196 mpn_sub_n (bsm1, b1, gp, n);
197 bsm1[n] = 0;
198 vm1_neg ^= 1;
199 }
200 else
201 {
202 cy -= mpn_sub_n (bsm1, gp, b1, n);
203 bsm1[n] = cy;
204 }
205 #endif
206
207 /* Compute bs2. */
208 #if HAVE_NATIVE_mpn_rsblsh1_n
209 cy = mpn_add_n (bs2, b2, bs1, t);
210 if (t != n)
211 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
212 cy += bs1[n];
213 cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
214 #else
215 #if HAVE_NATIVE_mpn_addlsh1_n
216 cy = mpn_addlsh1_n (bs2, b1, b2, t);
217 if (t != n)
218 cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
219 cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
220 #else
221 cy = mpn_add_n (bs2, bs1, b2, t);
222 if (t != n)
223 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
224 cy += bs1[n];
225 cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
226 cy -= mpn_sub_n (bs2, bs2, b0, n);
227 #endif
228 #endif
229 bs2[n] = cy;
230
231 ASSERT (as1[n] <= 2);
232 ASSERT (bs1[n] <= 2);
233 ASSERT (asm1[n] <= 1);
234 ASSERT (bsm1[n] <= 1);
235 ASSERT (as2[n] <= 6);
236 ASSERT (bs2[n] <= 6);
237
238 #define v0 pp /* 2n */
239 #define v1 (pp + 2 * n) /* 2n+1 */
240 #define vinf (pp + 4 * n) /* s+t */
241 #define vm1 scratch /* 2n+1 */
242 #define v2 (scratch + 2 * n + 1) /* 2n+2 */
243 #define scratch_out (scratch + 5 * n + 5)
244
245 /* vm1, 2n+1 limbs */
246 #ifdef SMALLER_RECURSION
247 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
248 cy = 0;
249 if (asm1[n] != 0)
250 cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
251 if (bsm1[n] != 0)
252 cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
253 vm1[2 * n] = cy;
254 #else
255 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
256 #endif
257
258 TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
259
260 /* vinf, s+t limbs */
261 if (s > t) mpn_mul (vinf, a2, s, b2, t);
262 else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
263
264 vinf0 = vinf[0]; /* v1 overlaps with this */
265
266 #ifdef SMALLER_RECURSION
267 /* v1, 2n+1 limbs */
268 TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
269 if (as1[n] == 1)
270 {
271 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
272 }
273 else if (as1[n] != 0)
274 {
275 #if HAVE_NATIVE_mpn_addlsh1_n
276 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
277 #else
278 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
279 #endif
280 }
281 else
282 cy = 0;
283 if (bs1[n] == 1)
284 {
285 cy += mpn_add_n (v1 + n, v1 + n, as1, n);
286 }
287 else if (bs1[n] != 0)
288 {
289 #if HAVE_NATIVE_mpn_addlsh1_n
290 cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
291 #else
292 cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
293 #endif
294 }
295 v1[2 * n] = cy;
296 #else
297 cy = vinf[1];
298 TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
299 vinf[1] = cy;
300 #endif
301
302 TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
303
304 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
305 }
306