/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/

/* Compile-time reachability of the recursion endpoints: when tuning or
   building a fat binary, any threshold setting is possible, so all paths
   must be compiled in; otherwise the threshold relations below let the
   unreachable branches of TOOM33_MUL_N_REC be optimized away.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Recursive balanced n x n multiply used for the five pointwise products:
   dispatch on n to basecase, toom22, or toom33 (self-recursion).  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)

87 1.1 mrg void
88 1.1 mrg mpn_toom33_mul (mp_ptr pp,
89 1.1 mrg mp_srcptr ap, mp_size_t an,
90 1.1 mrg mp_srcptr bp, mp_size_t bn,
91 1.1 mrg mp_ptr scratch)
92 1.1 mrg {
93 1.1.1.2 mrg const int __gmpn_cpuvec_initialized = 1;
94 1.1 mrg mp_size_t n, s, t;
95 1.1 mrg int vm1_neg;
96 1.1 mrg mp_limb_t cy, vinf0;
97 1.1 mrg mp_ptr gp;
98 1.1 mrg mp_ptr as1, asm1, as2;
99 1.1 mrg mp_ptr bs1, bsm1, bs2;
100 1.1 mrg
101 1.1 mrg #define a0 ap
102 1.1 mrg #define a1 (ap + n)
103 1.1 mrg #define a2 (ap + 2*n)
104 1.1 mrg #define b0 bp
105 1.1 mrg #define b1 (bp + n)
106 1.1 mrg #define b2 (bp + 2*n)
107 1.1 mrg
108 1.1 mrg n = (an + 2) / (size_t) 3;
109 1.1 mrg
110 1.1 mrg s = an - 2 * n;
111 1.1 mrg t = bn - 2 * n;
112 1.1 mrg
113 1.1 mrg ASSERT (an >= bn);
114 1.1 mrg
115 1.1 mrg ASSERT (0 < s && s <= n);
116 1.1 mrg ASSERT (0 < t && t <= n);
117 1.1 mrg
118 1.1 mrg as1 = scratch + 4 * n + 4;
119 1.1 mrg asm1 = scratch + 2 * n + 2;
120 1.1 mrg as2 = pp + n + 1;
121 1.1 mrg
122 1.1 mrg bs1 = pp;
123 1.1 mrg bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
124 1.1 mrg bs2 = pp + 2 * n + 2;
125 1.1 mrg
126 1.1 mrg gp = scratch;
127 1.1 mrg
128 1.1 mrg vm1_neg = 0;
129 1.1 mrg
130 1.1 mrg /* Compute as1 and asm1. */
131 1.1 mrg cy = mpn_add (gp, a0, n, a2, s);
132 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
133 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
134 1.1 mrg {
135 1.1 mrg cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
136 1.1 mrg as1[n] = cy >> 1;
137 1.1 mrg asm1[n] = 0;
138 1.1 mrg vm1_neg = 1;
139 1.1 mrg }
140 1.1 mrg else
141 1.1 mrg {
142 1.1 mrg mp_limb_t cy2;
143 1.1 mrg cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
144 1.1 mrg as1[n] = cy + (cy2 >> 1);
145 1.1 mrg asm1[n] = cy - (cy2 & 1);
146 1.1 mrg }
147 1.1 mrg #else
148 1.1 mrg as1[n] = cy + mpn_add_n (as1, gp, a1, n);
149 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
150 1.1 mrg {
151 1.1 mrg mpn_sub_n (asm1, a1, gp, n);
152 1.1 mrg asm1[n] = 0;
153 1.1 mrg vm1_neg = 1;
154 1.1 mrg }
155 1.1 mrg else
156 1.1 mrg {
157 1.1 mrg cy -= mpn_sub_n (asm1, gp, a1, n);
158 1.1 mrg asm1[n] = cy;
159 1.1 mrg }
160 1.1 mrg #endif
161 1.1 mrg
162 1.1 mrg /* Compute as2. */
163 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
164 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
165 1.1 mrg if (s != n)
166 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
167 1.1 mrg cy += as1[n];
168 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
169 1.1 mrg #else
170 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
171 1.1 mrg cy = mpn_addlsh1_n (as2, a1, a2, s);
172 1.1 mrg if (s != n)
173 1.1 mrg cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
174 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
175 1.1 mrg #else
176 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
177 1.1 mrg if (s != n)
178 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
179 1.1 mrg cy += as1[n];
180 1.1 mrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
181 1.1 mrg cy -= mpn_sub_n (as2, as2, a0, n);
182 1.1 mrg #endif
183 1.1 mrg #endif
184 1.1 mrg as2[n] = cy;
185 1.1 mrg
186 1.1 mrg /* Compute bs1 and bsm1. */
187 1.1 mrg cy = mpn_add (gp, b0, n, b2, t);
188 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
189 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
190 1.1 mrg {
191 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
192 1.1 mrg bs1[n] = cy >> 1;
193 1.1 mrg bsm1[n] = 0;
194 1.1 mrg vm1_neg ^= 1;
195 1.1 mrg }
196 1.1 mrg else
197 1.1 mrg {
198 1.1 mrg mp_limb_t cy2;
199 1.1 mrg cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
200 1.1 mrg bs1[n] = cy + (cy2 >> 1);
201 1.1 mrg bsm1[n] = cy - (cy2 & 1);
202 1.1 mrg }
203 1.1 mrg #else
204 1.1 mrg bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
205 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
206 1.1 mrg {
207 1.1 mrg mpn_sub_n (bsm1, b1, gp, n);
208 1.1 mrg bsm1[n] = 0;
209 1.1 mrg vm1_neg ^= 1;
210 1.1 mrg }
211 1.1 mrg else
212 1.1 mrg {
213 1.1 mrg cy -= mpn_sub_n (bsm1, gp, b1, n);
214 1.1 mrg bsm1[n] = cy;
215 1.1 mrg }
216 1.1 mrg #endif
217 1.1 mrg
218 1.1 mrg /* Compute bs2. */
219 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
220 1.1 mrg cy = mpn_add_n (bs2, b2, bs1, t);
221 1.1 mrg if (t != n)
222 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
223 1.1 mrg cy += bs1[n];
224 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
225 1.1 mrg #else
226 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
227 1.1 mrg cy = mpn_addlsh1_n (bs2, b1, b2, t);
228 1.1 mrg if (t != n)
229 1.1 mrg cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
230 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
231 1.1 mrg #else
232 1.1 mrg cy = mpn_add_n (bs2, bs1, b2, t);
233 1.1 mrg if (t != n)
234 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
235 1.1 mrg cy += bs1[n];
236 1.1 mrg cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
237 1.1 mrg cy -= mpn_sub_n (bs2, bs2, b0, n);
238 1.1 mrg #endif
239 1.1 mrg #endif
240 1.1 mrg bs2[n] = cy;
241 1.1 mrg
242 1.1 mrg ASSERT (as1[n] <= 2);
243 1.1 mrg ASSERT (bs1[n] <= 2);
244 1.1 mrg ASSERT (asm1[n] <= 1);
245 1.1 mrg ASSERT (bsm1[n] <= 1);
246 1.1 mrg ASSERT (as2[n] <= 6);
247 1.1 mrg ASSERT (bs2[n] <= 6);
248 1.1 mrg
249 1.1 mrg #define v0 pp /* 2n */
250 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
251 1.1 mrg #define vinf (pp + 4 * n) /* s+t */
252 1.1 mrg #define vm1 scratch /* 2n+1 */
253 1.1 mrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
254 1.1 mrg #define scratch_out (scratch + 5 * n + 5)
255 1.1 mrg
256 1.1 mrg /* vm1, 2n+1 limbs */
257 1.1 mrg #ifdef SMALLER_RECURSION
258 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
259 1.1 mrg cy = 0;
260 1.1 mrg if (asm1[n] != 0)
261 1.1 mrg cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
262 1.1 mrg if (bsm1[n] != 0)
263 1.1 mrg cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
264 1.1 mrg vm1[2 * n] = cy;
265 1.1 mrg #else
266 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
267 1.1 mrg #endif
268 1.1 mrg
269 1.1 mrg TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
270 1.1 mrg
271 1.1 mrg /* vinf, s+t limbs */
272 1.1 mrg if (s > t) mpn_mul (vinf, a2, s, b2, t);
273 1.1 mrg else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
274 1.1 mrg
275 1.1 mrg vinf0 = vinf[0]; /* v1 overlaps with this */
276 1.1 mrg
277 1.1 mrg #ifdef SMALLER_RECURSION
278 1.1 mrg /* v1, 2n+1 limbs */
279 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
280 1.1 mrg if (as1[n] == 1)
281 1.1 mrg {
282 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
283 1.1 mrg }
284 1.1 mrg else if (as1[n] != 0)
285 1.1 mrg {
286 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
287 1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
288 1.1 mrg #else
289 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
290 1.1 mrg #endif
291 1.1 mrg }
292 1.1 mrg else
293 1.1 mrg cy = 0;
294 1.1 mrg if (bs1[n] == 1)
295 1.1 mrg {
296 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
297 1.1 mrg }
298 1.1 mrg else if (bs1[n] != 0)
299 1.1 mrg {
300 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
301 1.1 mrg cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
302 1.1 mrg #else
303 1.1 mrg cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
304 1.1 mrg #endif
305 1.1 mrg }
306 1.1 mrg v1[2 * n] = cy;
307 1.1 mrg #else
308 1.1 mrg cy = vinf[1];
309 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
310 1.1 mrg vinf[1] = cy;
311 1.1 mrg #endif
312 1.1 mrg
313 1.1 mrg TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
314 1.1 mrg
315 1.1 mrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
316 1.1 mrg }
317