/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

39 1.1 mrg
40 1.1 mrg #include "gmp-impl.h"
41 1.1 mrg
42 1.1 mrg /* Evaluate in: -1, 0, +1, +2, +inf
43 1.1 mrg
44 1.1.1.3 mrg <-s--><--n--><--n-->
45 1.1.1.3 mrg ____ ______ ______
46 1.1.1.3 mrg |_a2_|___a1_|___a0_|
47 1.1.1.3 mrg |b2_|___b1_|___b0_|
48 1.1.1.3 mrg <-t-><--n--><--n-->
49 1.1 mrg
50 1.1 mrg v0 = a0 * b0 # A(0)*B(0)
51 1.1 mrg v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
52 1.1 mrg vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1
53 1.1 mrg v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6
54 1.1 mrg vinf= a2 * b2 # A(inf)*B(inf)
55 1.1 mrg */
56 1.1 mrg
57 1.1.1.2 mrg #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
58 1.1 mrg #define MAYBE_mul_basecase 1
59 1.1 mrg #define MAYBE_mul_toom33 1
60 1.1 mrg #else
61 1.1 mrg #define MAYBE_mul_basecase \
62 1.1 mrg (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
63 1.1 mrg #define MAYBE_mul_toom33 \
64 1.1 mrg (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
65 1.1 mrg #endif
66 1.1 mrg
67 1.1 mrg /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
68 1.1 mrg multiplication at the infinity point. We may have
69 1.1 mrg MAYBE_mul_basecase == 0, and still get s just below
70 1.1 mrg MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
71 1.1 mrg s == 1 and mpn_toom22_mul will crash.
72 1.1 mrg */
73 1.1 mrg
74 1.1 mrg #define TOOM33_MUL_N_REC(p, a, b, n, ws) \
75 1.1 mrg do { \
76 1.1 mrg if (MAYBE_mul_basecase \
77 1.1 mrg && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
78 1.1 mrg mpn_mul_basecase (p, a, n, b, n); \
79 1.1 mrg else if (! MAYBE_mul_toom33 \
80 1.1 mrg || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
81 1.1 mrg mpn_toom22_mul (p, a, n, b, n, ws); \
82 1.1 mrg else \
83 1.1 mrg mpn_toom33_mul (p, a, n, b, n, ws); \
84 1.1 mrg } while (0)
85 1.1 mrg
86 1.1 mrg void
87 1.1 mrg mpn_toom33_mul (mp_ptr pp,
88 1.1 mrg mp_srcptr ap, mp_size_t an,
89 1.1 mrg mp_srcptr bp, mp_size_t bn,
90 1.1 mrg mp_ptr scratch)
91 1.1 mrg {
92 1.1.1.2 mrg const int __gmpn_cpuvec_initialized = 1;
93 1.1 mrg mp_size_t n, s, t;
94 1.1 mrg int vm1_neg;
95 1.1 mrg mp_limb_t cy, vinf0;
96 1.1 mrg mp_ptr gp;
97 1.1 mrg mp_ptr as1, asm1, as2;
98 1.1 mrg mp_ptr bs1, bsm1, bs2;
99 1.1 mrg
100 1.1 mrg #define a0 ap
101 1.1 mrg #define a1 (ap + n)
102 1.1 mrg #define a2 (ap + 2*n)
103 1.1 mrg #define b0 bp
104 1.1 mrg #define b1 (bp + n)
105 1.1 mrg #define b2 (bp + 2*n)
106 1.1 mrg
107 1.1 mrg n = (an + 2) / (size_t) 3;
108 1.1 mrg
109 1.1 mrg s = an - 2 * n;
110 1.1 mrg t = bn - 2 * n;
111 1.1 mrg
112 1.1 mrg ASSERT (an >= bn);
113 1.1 mrg
114 1.1 mrg ASSERT (0 < s && s <= n);
115 1.1 mrg ASSERT (0 < t && t <= n);
116 1.1 mrg
117 1.1 mrg as1 = scratch + 4 * n + 4;
118 1.1 mrg asm1 = scratch + 2 * n + 2;
119 1.1 mrg as2 = pp + n + 1;
120 1.1 mrg
121 1.1 mrg bs1 = pp;
122 1.1 mrg bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
123 1.1 mrg bs2 = pp + 2 * n + 2;
124 1.1 mrg
125 1.1 mrg gp = scratch;
126 1.1 mrg
127 1.1 mrg vm1_neg = 0;
128 1.1 mrg
129 1.1 mrg /* Compute as1 and asm1. */
130 1.1 mrg cy = mpn_add (gp, a0, n, a2, s);
131 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
132 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
133 1.1 mrg {
134 1.1 mrg cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
135 1.1 mrg as1[n] = cy >> 1;
136 1.1 mrg asm1[n] = 0;
137 1.1 mrg vm1_neg = 1;
138 1.1 mrg }
139 1.1 mrg else
140 1.1 mrg {
141 1.1 mrg mp_limb_t cy2;
142 1.1 mrg cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
143 1.1 mrg as1[n] = cy + (cy2 >> 1);
144 1.1 mrg asm1[n] = cy - (cy2 & 1);
145 1.1 mrg }
146 1.1 mrg #else
147 1.1 mrg as1[n] = cy + mpn_add_n (as1, gp, a1, n);
148 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
149 1.1 mrg {
150 1.1 mrg mpn_sub_n (asm1, a1, gp, n);
151 1.1 mrg asm1[n] = 0;
152 1.1 mrg vm1_neg = 1;
153 1.1 mrg }
154 1.1 mrg else
155 1.1 mrg {
156 1.1 mrg cy -= mpn_sub_n (asm1, gp, a1, n);
157 1.1 mrg asm1[n] = cy;
158 1.1 mrg }
159 1.1 mrg #endif
160 1.1 mrg
161 1.1 mrg /* Compute as2. */
162 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
163 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
164 1.1 mrg if (s != n)
165 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
166 1.1 mrg cy += as1[n];
167 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
168 1.1 mrg #else
169 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
170 1.1 mrg cy = mpn_addlsh1_n (as2, a1, a2, s);
171 1.1 mrg if (s != n)
172 1.1 mrg cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
173 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
174 1.1 mrg #else
175 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
176 1.1 mrg if (s != n)
177 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
178 1.1 mrg cy += as1[n];
179 1.1 mrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
180 1.1 mrg cy -= mpn_sub_n (as2, as2, a0, n);
181 1.1 mrg #endif
182 1.1 mrg #endif
183 1.1 mrg as2[n] = cy;
184 1.1 mrg
185 1.1 mrg /* Compute bs1 and bsm1. */
186 1.1 mrg cy = mpn_add (gp, b0, n, b2, t);
187 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
188 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
189 1.1 mrg {
190 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
191 1.1 mrg bs1[n] = cy >> 1;
192 1.1 mrg bsm1[n] = 0;
193 1.1 mrg vm1_neg ^= 1;
194 1.1 mrg }
195 1.1 mrg else
196 1.1 mrg {
197 1.1 mrg mp_limb_t cy2;
198 1.1 mrg cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
199 1.1 mrg bs1[n] = cy + (cy2 >> 1);
200 1.1 mrg bsm1[n] = cy - (cy2 & 1);
201 1.1 mrg }
202 1.1 mrg #else
203 1.1 mrg bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
204 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
205 1.1 mrg {
206 1.1 mrg mpn_sub_n (bsm1, b1, gp, n);
207 1.1 mrg bsm1[n] = 0;
208 1.1 mrg vm1_neg ^= 1;
209 1.1 mrg }
210 1.1 mrg else
211 1.1 mrg {
212 1.1 mrg cy -= mpn_sub_n (bsm1, gp, b1, n);
213 1.1 mrg bsm1[n] = cy;
214 1.1 mrg }
215 1.1 mrg #endif
216 1.1 mrg
217 1.1 mrg /* Compute bs2. */
218 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
219 1.1 mrg cy = mpn_add_n (bs2, b2, bs1, t);
220 1.1 mrg if (t != n)
221 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
222 1.1 mrg cy += bs1[n];
223 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
224 1.1 mrg #else
225 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
226 1.1 mrg cy = mpn_addlsh1_n (bs2, b1, b2, t);
227 1.1 mrg if (t != n)
228 1.1 mrg cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
229 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
230 1.1 mrg #else
231 1.1 mrg cy = mpn_add_n (bs2, bs1, b2, t);
232 1.1 mrg if (t != n)
233 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
234 1.1 mrg cy += bs1[n];
235 1.1 mrg cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
236 1.1 mrg cy -= mpn_sub_n (bs2, bs2, b0, n);
237 1.1 mrg #endif
238 1.1 mrg #endif
239 1.1 mrg bs2[n] = cy;
240 1.1 mrg
241 1.1 mrg ASSERT (as1[n] <= 2);
242 1.1 mrg ASSERT (bs1[n] <= 2);
243 1.1 mrg ASSERT (asm1[n] <= 1);
244 1.1 mrg ASSERT (bsm1[n] <= 1);
245 1.1 mrg ASSERT (as2[n] <= 6);
246 1.1 mrg ASSERT (bs2[n] <= 6);
247 1.1 mrg
248 1.1 mrg #define v0 pp /* 2n */
249 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
250 1.1 mrg #define vinf (pp + 4 * n) /* s+t */
251 1.1 mrg #define vm1 scratch /* 2n+1 */
252 1.1 mrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
253 1.1 mrg #define scratch_out (scratch + 5 * n + 5)
254 1.1 mrg
255 1.1 mrg /* vm1, 2n+1 limbs */
256 1.1 mrg #ifdef SMALLER_RECURSION
257 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
258 1.1 mrg cy = 0;
259 1.1 mrg if (asm1[n] != 0)
260 1.1 mrg cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
261 1.1 mrg if (bsm1[n] != 0)
262 1.1 mrg cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
263 1.1 mrg vm1[2 * n] = cy;
264 1.1 mrg #else
265 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
266 1.1 mrg #endif
267 1.1 mrg
268 1.1 mrg TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
269 1.1 mrg
270 1.1 mrg /* vinf, s+t limbs */
271 1.1 mrg if (s > t) mpn_mul (vinf, a2, s, b2, t);
272 1.1 mrg else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
273 1.1 mrg
274 1.1 mrg vinf0 = vinf[0]; /* v1 overlaps with this */
275 1.1 mrg
276 1.1 mrg #ifdef SMALLER_RECURSION
277 1.1 mrg /* v1, 2n+1 limbs */
278 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
279 1.1 mrg if (as1[n] == 1)
280 1.1 mrg {
281 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
282 1.1 mrg }
283 1.1 mrg else if (as1[n] != 0)
284 1.1 mrg {
285 1.1.1.4 mrg #if HAVE_NATIVE_mpn_addlsh1_n_ip1
286 1.1.1.4 mrg cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
287 1.1 mrg #else
288 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
289 1.1 mrg #endif
290 1.1 mrg }
291 1.1 mrg else
292 1.1 mrg cy = 0;
293 1.1 mrg if (bs1[n] == 1)
294 1.1 mrg {
295 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
296 1.1 mrg }
297 1.1 mrg else if (bs1[n] != 0)
298 1.1 mrg {
299 1.1.1.4 mrg #if HAVE_NATIVE_mpn_addlsh1_n_ip1
300 1.1.1.4 mrg cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
301 1.1 mrg #else
302 1.1 mrg cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
303 1.1 mrg #endif
304 1.1 mrg }
305 1.1 mrg v1[2 * n] = cy;
306 1.1 mrg #else
307 1.1 mrg cy = vinf[1];
308 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
309 1.1 mrg vinf[1] = cy;
310 1.1 mrg #endif
311 1.1 mrg
312 1.1 mrg TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
313 1.1 mrg
314 1.1 mrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
315 1.1 mrg }
316