/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008, 2010 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27 1.1 mrg
28 1.1 mrg
29 1.1 mrg #include "gmp.h"
30 1.1 mrg #include "gmp-impl.h"
31 1.1 mrg
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n--><--n-->
   ____ ______ ______ ______
  |_a3_|___a2_|___a1_|___a0_|
   |b3_|___b2_|___b1_|___b0_|
   <-t-><--n--><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
46 1.1 mrg
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
/* Outside a tune build these collapse to compile-time constants, so the
   dead recursion branches below are optimized away.  */
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Recursive balanced n x n multiply used for the five point products:
   pick basecase, toom22 or toom33 depending on n and the thresholds.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
75 1.1 mrg
76 1.1 mrg void
77 1.1 mrg mpn_toom33_mul (mp_ptr pp,
78 1.1 mrg mp_srcptr ap, mp_size_t an,
79 1.1 mrg mp_srcptr bp, mp_size_t bn,
80 1.1 mrg mp_ptr scratch)
81 1.1 mrg {
82 1.1 mrg mp_size_t n, s, t;
83 1.1 mrg int vm1_neg;
84 1.1 mrg mp_limb_t cy, vinf0;
85 1.1 mrg mp_ptr gp;
86 1.1 mrg mp_ptr as1, asm1, as2;
87 1.1 mrg mp_ptr bs1, bsm1, bs2;
88 1.1 mrg
89 1.1 mrg #define a0 ap
90 1.1 mrg #define a1 (ap + n)
91 1.1 mrg #define a2 (ap + 2*n)
92 1.1 mrg #define b0 bp
93 1.1 mrg #define b1 (bp + n)
94 1.1 mrg #define b2 (bp + 2*n)
95 1.1 mrg
96 1.1 mrg n = (an + 2) / (size_t) 3;
97 1.1 mrg
98 1.1 mrg s = an - 2 * n;
99 1.1 mrg t = bn - 2 * n;
100 1.1 mrg
101 1.1 mrg ASSERT (an >= bn);
102 1.1 mrg
103 1.1 mrg ASSERT (0 < s && s <= n);
104 1.1 mrg ASSERT (0 < t && t <= n);
105 1.1 mrg
106 1.1 mrg as1 = scratch + 4 * n + 4;
107 1.1 mrg asm1 = scratch + 2 * n + 2;
108 1.1 mrg as2 = pp + n + 1;
109 1.1 mrg
110 1.1 mrg bs1 = pp;
111 1.1 mrg bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
112 1.1 mrg bs2 = pp + 2 * n + 2;
113 1.1 mrg
114 1.1 mrg gp = scratch;
115 1.1 mrg
116 1.1 mrg vm1_neg = 0;
117 1.1 mrg
118 1.1 mrg /* Compute as1 and asm1. */
119 1.1 mrg cy = mpn_add (gp, a0, n, a2, s);
120 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
121 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
122 1.1 mrg {
123 1.1 mrg cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
124 1.1 mrg as1[n] = cy >> 1;
125 1.1 mrg asm1[n] = 0;
126 1.1 mrg vm1_neg = 1;
127 1.1 mrg }
128 1.1 mrg else
129 1.1 mrg {
130 1.1 mrg mp_limb_t cy2;
131 1.1 mrg cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
132 1.1 mrg as1[n] = cy + (cy2 >> 1);
133 1.1 mrg asm1[n] = cy - (cy2 & 1);
134 1.1 mrg }
135 1.1 mrg #else
136 1.1 mrg as1[n] = cy + mpn_add_n (as1, gp, a1, n);
137 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
138 1.1 mrg {
139 1.1 mrg mpn_sub_n (asm1, a1, gp, n);
140 1.1 mrg asm1[n] = 0;
141 1.1 mrg vm1_neg = 1;
142 1.1 mrg }
143 1.1 mrg else
144 1.1 mrg {
145 1.1 mrg cy -= mpn_sub_n (asm1, gp, a1, n);
146 1.1 mrg asm1[n] = cy;
147 1.1 mrg }
148 1.1 mrg #endif
149 1.1 mrg
150 1.1 mrg /* Compute as2. */
151 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
152 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
153 1.1 mrg if (s != n)
154 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
155 1.1 mrg cy += as1[n];
156 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
157 1.1 mrg #else
158 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
159 1.1 mrg cy = mpn_addlsh1_n (as2, a1, a2, s);
160 1.1 mrg if (s != n)
161 1.1 mrg cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
162 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
163 1.1 mrg #else
164 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
165 1.1 mrg if (s != n)
166 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
167 1.1 mrg cy += as1[n];
168 1.1 mrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
169 1.1 mrg cy -= mpn_sub_n (as2, as2, a0, n);
170 1.1 mrg #endif
171 1.1 mrg #endif
172 1.1 mrg as2[n] = cy;
173 1.1 mrg
174 1.1 mrg /* Compute bs1 and bsm1. */
175 1.1 mrg cy = mpn_add (gp, b0, n, b2, t);
176 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
177 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
178 1.1 mrg {
179 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
180 1.1 mrg bs1[n] = cy >> 1;
181 1.1 mrg bsm1[n] = 0;
182 1.1 mrg vm1_neg ^= 1;
183 1.1 mrg }
184 1.1 mrg else
185 1.1 mrg {
186 1.1 mrg mp_limb_t cy2;
187 1.1 mrg cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
188 1.1 mrg bs1[n] = cy + (cy2 >> 1);
189 1.1 mrg bsm1[n] = cy - (cy2 & 1);
190 1.1 mrg }
191 1.1 mrg #else
192 1.1 mrg bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
193 1.1 mrg if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
194 1.1 mrg {
195 1.1 mrg mpn_sub_n (bsm1, b1, gp, n);
196 1.1 mrg bsm1[n] = 0;
197 1.1 mrg vm1_neg ^= 1;
198 1.1 mrg }
199 1.1 mrg else
200 1.1 mrg {
201 1.1 mrg cy -= mpn_sub_n (bsm1, gp, b1, n);
202 1.1 mrg bsm1[n] = cy;
203 1.1 mrg }
204 1.1 mrg #endif
205 1.1 mrg
206 1.1 mrg /* Compute bs2. */
207 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
208 1.1 mrg cy = mpn_add_n (bs2, b2, bs1, t);
209 1.1 mrg if (t != n)
210 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
211 1.1 mrg cy += bs1[n];
212 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
213 1.1 mrg #else
214 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
215 1.1 mrg cy = mpn_addlsh1_n (bs2, b1, b2, t);
216 1.1 mrg if (t != n)
217 1.1 mrg cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
218 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
219 1.1 mrg #else
220 1.1 mrg cy = mpn_add_n (bs2, bs1, b2, t);
221 1.1 mrg if (t != n)
222 1.1 mrg cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
223 1.1 mrg cy += bs1[n];
224 1.1 mrg cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
225 1.1 mrg cy -= mpn_sub_n (bs2, bs2, b0, n);
226 1.1 mrg #endif
227 1.1 mrg #endif
228 1.1 mrg bs2[n] = cy;
229 1.1 mrg
230 1.1 mrg ASSERT (as1[n] <= 2);
231 1.1 mrg ASSERT (bs1[n] <= 2);
232 1.1 mrg ASSERT (asm1[n] <= 1);
233 1.1 mrg ASSERT (bsm1[n] <= 1);
234 1.1 mrg ASSERT (as2[n] <= 6);
235 1.1 mrg ASSERT (bs2[n] <= 6);
236 1.1 mrg
237 1.1 mrg #define v0 pp /* 2n */
238 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
239 1.1 mrg #define vinf (pp + 4 * n) /* s+t */
240 1.1 mrg #define vm1 scratch /* 2n+1 */
241 1.1 mrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
242 1.1 mrg #define scratch_out (scratch + 5 * n + 5)
243 1.1 mrg
244 1.1 mrg /* vm1, 2n+1 limbs */
245 1.1 mrg #ifdef SMALLER_RECURSION
246 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
247 1.1 mrg cy = 0;
248 1.1 mrg if (asm1[n] != 0)
249 1.1 mrg cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
250 1.1 mrg if (bsm1[n] != 0)
251 1.1 mrg cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
252 1.1 mrg vm1[2 * n] = cy;
253 1.1 mrg #else
254 1.1 mrg TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
255 1.1 mrg #endif
256 1.1 mrg
257 1.1 mrg TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
258 1.1 mrg
259 1.1 mrg /* vinf, s+t limbs */
260 1.1 mrg if (s > t) mpn_mul (vinf, a2, s, b2, t);
261 1.1 mrg else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
262 1.1 mrg
263 1.1 mrg vinf0 = vinf[0]; /* v1 overlaps with this */
264 1.1 mrg
265 1.1 mrg #ifdef SMALLER_RECURSION
266 1.1 mrg /* v1, 2n+1 limbs */
267 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
268 1.1 mrg if (as1[n] == 1)
269 1.1 mrg {
270 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
271 1.1 mrg }
272 1.1 mrg else if (as1[n] != 0)
273 1.1 mrg {
274 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
275 1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
276 1.1 mrg #else
277 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
278 1.1 mrg #endif
279 1.1 mrg }
280 1.1 mrg else
281 1.1 mrg cy = 0;
282 1.1 mrg if (bs1[n] == 1)
283 1.1 mrg {
284 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
285 1.1 mrg }
286 1.1 mrg else if (bs1[n] != 0)
287 1.1 mrg {
288 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
289 1.1 mrg cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
290 1.1 mrg #else
291 1.1 mrg cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
292 1.1 mrg #endif
293 1.1 mrg }
294 1.1 mrg v1[2 * n] = cy;
295 1.1 mrg #else
296 1.1 mrg cy = vinf[1];
297 1.1 mrg TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
298 1.1 mrg vinf[1] = cy;
299 1.1 mrg #endif
300 1.1 mrg
301 1.1 mrg TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
302 1.1 mrg
303 1.1 mrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
304 1.1 mrg }
305