/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

29 1.1 mrg #include "gmp.h"
30 1.1 mrg #include "gmp-impl.h"
31 1.1 mrg
32 1.1 mrg /* Evaluate in: -1, 0, +1, +2, +inf
33 1.1 mrg
34 1.1 mrg <-s--><--n--><--n--><--n-->
35 1.1 mrg ____ ______ ______ ______
36 1.1 mrg |_a3_|___a2_|___a1_|___a0_|
37 1.1 mrg |b3_|___b2_|___b1_|___b0_|
38 1.1 mrg <-t-><--n--><--n--><--n-->
39 1.1 mrg
40 1.1 mrg v0 = a0 * b0 # A(0)*B(0)
41 1.1 mrg v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
42 1.1 mrg vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1
43 1.1 mrg v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6
44 1.1 mrg vinf= a2 * b2 # A(inf)*B(inf)
45 1.1 mrg */
46 1.1 mrg
47 1.1.1.2 mrg #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
48 1.1 mrg #define MAYBE_mul_basecase 1
49 1.1 mrg #define MAYBE_mul_toom33 1
50 1.1 mrg #else
51 1.1 mrg #define MAYBE_mul_basecase \
52 1.1 mrg (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
53 1.1 mrg #define MAYBE_mul_toom33 \
54 1.1 mrg (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
55 1.1 mrg #endif
56 1.1 mrg
57 1.1 mrg /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
58 1.1 mrg multiplication at the infinity point. We may have
59 1.1 mrg MAYBE_mul_basecase == 0, and still get s just below
60 1.1 mrg MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
61 1.1 mrg s == 1 and mpn_toom22_mul will crash.
62 1.1 mrg */
63 1.1 mrg
64 1.1 mrg #define TOOM33_MUL_N_REC(p, a, b, n, ws) \
65 1.1 mrg do { \
66 1.1 mrg if (MAYBE_mul_basecase \
67 1.1 mrg && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
68 1.1 mrg mpn_mul_basecase (p, a, n, b, n); \
69 1.1 mrg else if (! MAYBE_mul_toom33 \
70 1.1 mrg || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
71 1.1 mrg mpn_toom22_mul (p, a, n, b, n, ws); \
72 1.1 mrg else \
73 1.1 mrg mpn_toom33_mul (p, a, n, b, n, ws); \
74 1.1 mrg } while (0)
75 1.1 mrg
76 1.1 mrg void
77 1.1 mrg mpn_toom33_mul (mp_ptr pp,
78 1.1 mrg mp_srcptr ap, mp_size_t an,
79 1.1 mrg mp_srcptr bp, mp_size_t bn,
80 1.1 mrg mp_ptr scratch)
81 1.1 mrg {
82 1.1.1.2 mrg const int __gmpn_cpuvec_initialized = 1;
83 1.1 mrg mp_size_t n, s, t;
84 1.1 mrg int vm1_neg;
85 1.1 mrg mp_limb_t cy, vinf0;
86 1.1 mrg mp_ptr gp;
87 1.1 mrg mp_ptr as1, asm1, as2;
88 1.1 mrg mp_ptr bs1, bsm1, bs2;
89 1.1 mrg
90 1.1 mrg #define a0 ap
91 1.1 mrg #define a1 (ap + n)
92 1.1 mrg #define a2 (ap + 2*n)
93 1.1 mrg #define b0 bp
94 1.1 mrg #define b1 (bp + n)
95 1.1 mrg #define b2 (bp + 2*n)
96 1.1 mrg
97 1.1 mrg n = (an + 2) / (size_t) 3;
98 1.1 mrg
99 1.1 mrg s = an - 2 * n;
100 1.1 mrg t = bn - 2 * n;
101 1.1 mrg
102 1.1 mrg ASSERT (an >= bn);
103 1.1 mrg
104 1.1 mrg ASSERT (0 < s && s <= n);
105 1.1 mrg ASSERT (0 < t && t <= n);
106 1.1 mrg
107 1.1 mrg as1 = scratch + 4 * n + 4;
108 1.1 mrg asm1 = scratch + 2 * n + 2;
109 1.1 mrg as2 = pp + n + 1;
110 1.1 mrg
111 1.1 mrg bs1 = pp;
112 1.1 mrg bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
113 1.1 mrg bs2 = pp + 2 * n + 2;
114 1.1 mrg
115 1.1 mrg gp = scratch;
116 1.1 mrg
117 1.1 mrg vm1_neg = 0;
118 1.1 mrg
119 1.1 mrg /* Compute as1 and asm1. */
120 1.1 mrg cy = mpn_add (gp, a0, n, a2, s);
121 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
122 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
123 1.1 mrg {
124 1.1 mrg cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
125 1.1 mrg as1[n] = cy >> 1;
126 1.1 mrg asm1[n] = 0;
127 1.1 mrg vm1_neg = 1;
128 1.1 mrg }
129 1.1 mrg else
130 1.1 mrg {
131 1.1 mrg mp_limb_t cy2;
132 1.1 mrg cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
133 1.1 mrg as1[n] = cy + (cy2 >> 1);
134 1.1 mrg asm1[n] = cy - (cy2 & 1);
135 1.1 mrg }
136 1.1 mrg #else
137 1.1 mrg as1[n] = cy + mpn_add_n (as1, gp, a1, n);
138 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
139 1.1 mrg {
140 1.1 mrg mpn_sub_n (asm1, a1, gp, n);
141 1.1 mrg asm1[n] = 0;
142 1.1 mrg vm1_neg = 1;
143 1.1 mrg }
144 1.1 mrg else
145 1.1 mrg {
146 1.1 mrg cy -= mpn_sub_n (asm1, gp, a1, n);
147 1.1 mrg asm1[n] = cy;
148 1.1 mrg }
149 1.1 mrg #endif
150 1.1 mrg
151 1.1 mrg /* Compute as2. */
152 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
153 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
154 1.1 mrg if (s != n)
155 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
156 1.1 mrg cy += as1[n];
157 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
158 1.1 mrg #else
159 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
160 1.1 mrg cy = mpn_addlsh1_n (as2, a1, a2, s);
161 1.1 mrg if (s != n)
162 1.1 mrg cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
163 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
164 1.1 mrg #else
165 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
166 1.1 mrg if (s != n)
167 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
168 1.1 mrg cy += as1[n];
169 1.1 mrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
170 1.1 mrg cy -= mpn_sub_n (as2, as2, a0, n);
171 1.1 mrg #endif
172 1.1 mrg #endif
173 1.1 mrg as2[n] = cy;
174 1.1 mrg
175 1.1 mrg /* Compute bs1 and bsm1. */
176 1.1 mrg cy = mpn_add (gp, b0, n, b2, t);
177 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
178 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
179 1.1 mrg {
180 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
181 1.1 mrg bs1[n] = cy >> 1;
182 1.1 mrg bsm1[n] = 0;
183 1.1 mrg vm1_neg ^= 1;
184 1.1 mrg }
185 1.1 mrg else
186 1.1 mrg {
187 1.1 mrg mp_limb_t cy2;
188 1.1 mrg cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
189 1.1 mrg bs1[n] = cy + (cy2 >> 1);
190 1.1 mrg bsm1[n] = cy - (cy2 & 1);
191 1.1 mrg }
192 1.1 mrg #else
193 1.1 mrg bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
194 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
195 1.1 mrg {
196 1.1 mrg mpn_sub_n (bsm1, b1, gp, n);
197 1.1 mrg bsm1[n] = 0;
198 1.1 mrg vm1_neg ^= 1;
199 1.1 mrg }
200 1.1 mrg else
201 1.1 mrg {
202 1.1 mrg cy -= mpn_sub_n (bsm1, gp, b1, n);
203 1.1 mrg bsm1[n] = cy;
204 1.1 mrg }
205 1.1 mrg #endif
206 1.1 mrg
207 1.1 mrg /* Compute bs2. */
208 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
209 1.1 mrg cy = mpn_add_n (bs2, b2, bs1, t);
210 1.1 mrg if (t != n)
211 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
212 1.1 mrg cy += bs1[n];
213 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
214 1.1 mrg #else
215 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
216 1.1 mrg cy = mpn_addlsh1_n (bs2, b1, b2, t);
217 1.1 mrg if (t != n)
218 1.1 mrg cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
219 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
220 1.1 mrg #else
221 1.1 mrg cy = mpn_add_n (bs2, bs1, b2, t);
222 1.1 mrg if (t != n)
223 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
224 1.1 mrg cy += bs1[n];
225 1.1 mrg cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
226 1.1 mrg cy -= mpn_sub_n (bs2, bs2, b0, n);
227 1.1 mrg #endif
228 1.1 mrg #endif
229 1.1 mrg bs2[n] = cy;
230 1.1 mrg
231 1.1 mrg ASSERT (as1[n] <= 2);
232 1.1 mrg ASSERT (bs1[n] <= 2);
233 1.1 mrg ASSERT (asm1[n] <= 1);
234 1.1 mrg ASSERT (bsm1[n] <= 1);
235 1.1 mrg ASSERT (as2[n] <= 6);
236 1.1 mrg ASSERT (bs2[n] <= 6);
237 1.1 mrg
238 1.1 mrg #define v0 pp /* 2n */
239 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
240 1.1 mrg #define vinf (pp + 4 * n) /* s+t */
241 1.1 mrg #define vm1 scratch /* 2n+1 */
242 1.1 mrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
243 1.1 mrg #define scratch_out (scratch + 5 * n + 5)
244 1.1 mrg
245 1.1 mrg /* vm1, 2n+1 limbs */
246 1.1 mrg #ifdef SMALLER_RECURSION
247 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
248 1.1 mrg cy = 0;
249 1.1 mrg if (asm1[n] != 0)
250 1.1 mrg cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
251 1.1 mrg if (bsm1[n] != 0)
252 1.1 mrg cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
253 1.1 mrg vm1[2 * n] = cy;
254 1.1 mrg #else
255 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
256 1.1 mrg #endif
257 1.1 mrg
258 1.1 mrg TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
259 1.1 mrg
260 1.1 mrg /* vinf, s+t limbs */
261 1.1 mrg if (s > t) mpn_mul (vinf, a2, s, b2, t);
262 1.1 mrg else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
263 1.1 mrg
264 1.1 mrg vinf0 = vinf[0]; /* v1 overlaps with this */
265 1.1 mrg
266 1.1 mrg #ifdef SMALLER_RECURSION
267 1.1 mrg /* v1, 2n+1 limbs */
268 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
269 1.1 mrg if (as1[n] == 1)
270 1.1 mrg {
271 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
272 1.1 mrg }
273 1.1 mrg else if (as1[n] != 0)
274 1.1 mrg {
275 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
276 1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
277 1.1 mrg #else
278 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
279 1.1 mrg #endif
280 1.1 mrg }
281 1.1 mrg else
282 1.1 mrg cy = 0;
283 1.1 mrg if (bs1[n] == 1)
284 1.1 mrg {
285 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
286 1.1 mrg }
287 1.1 mrg else if (bs1[n] != 0)
288 1.1 mrg {
289 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
290 1.1 mrg cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
291 1.1 mrg #else
292 1.1 mrg cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
293 1.1 mrg #endif
294 1.1 mrg }
295 1.1 mrg v1[2 * n] = cy;
296 1.1 mrg #else
297 1.1 mrg cy = vinf[1];
298 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
299 1.1 mrg vinf[1] = cy;
300 1.1 mrg #endif
301 1.1 mrg
302 1.1 mrg TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
303 1.1 mrg
304 1.1 mrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
305 1.1 mrg }
306