mul.c revision 1.1.1.2 1 /* mpn_mul -- Multiply two natural numbers.
2
3 Contributed to the GNU project by Torbjorn Granlund.
4
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
6 2006, 2007, 2009, 2010, 2012 Free Software Foundation, Inc.
7
8 This file is part of the GNU MP Library.
9
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of the GNU Lesser General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 The GNU MP Library is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
18 License for more details.
19
20 You should have received a copy of the GNU Lesser General Public License
21 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
22
23 #include "gmp.h"
24 #include "gmp-impl.h"
25
26
27 #ifndef MUL_BASECASE_MAX_UN
28 #define MUL_BASECASE_MAX_UN 500
29 #endif
30
31 /* Areas where the different toom algorithms can be called (extracted
32 from the t-toom*.c files, and ignoring small constant offsets):
33
34 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
35 4/7 6/7
36 6/11
37 |--------------------| toom22 (small)
38 || toom22 (large)
39 |xxxx| toom22 called
40 |-------------------------------------| toom32
41 |xxxxxxxxxxxxxxxx| | toom32 called
42 |------------| toom33
43 |x| toom33 called
44 |---------------------------------| | toom42
45 |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
46 |--------------------| toom43
47 |xxxxxxxxxx| toom43 called
48 |-----------------------------| toom52 (unused)
49 |--------| toom44
50 |xxxxxxxx| toom44 called
51 |--------------------| | toom53
52 |xxxxxx| toom53 called
53 |-------------------------| toom62 (unused)
54 |----------------| toom54 (unused)
55 |--------------------| toom63
56 |xxxxxxxxx| | toom63 called
57 |---------------------------------| toom6h
58 |xxxxxxxx| toom6h called
59 |-------------------------| toom8h (32 bit)
60 |------------------------------------------| toom8h (64 bit)
61 |xxxxxxxx| toom8h called
62 */
63
64 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
65 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
66
67 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
68 (pointed to by VP, with VN limbs), and store the result at PRODP. The
69 result is UN + VN limbs. Return the most significant limb of the result.
70
71 NOTE: The space pointed to by PRODP is overwritten before finished with U
72 and V, so overlap is an error.
73
74 Argument constraints:
75 1. UN >= VN.
76 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
77 the multiplier and the multiplicand. */
78
79 /*
80 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
81 ideal lines of the surrounding algorithms. Is that optimal?
82
83 * The toomX3 code now uses a structure similar to the one of toomX2, except
84 that it loops longer in the unbalanced case. The result is that the
85 remaining area might have un < vn. Should we fix the toomX2 code in a
86 similar way?
87
88 * The toomX3 code is used for the largest non-FFT unbalanced operands. It
89 therefore calls mpn_mul recursively for certain cases.
90
91 * Allocate static temp space using THRESHOLD variables (except for toom44
92 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
93
94 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
95 have the same vn threshold. This is not true, we should actually use
96 mul_basecase for slightly larger operands for toom32 than for toom22, and
97 even larger for toom42.
98
99 * That problem is even more prevalent for toomX3. We therefore use special
100 THRESHOLD variables there.
101
102 * Is our ITCH allocation correct?
103 */
104
105 #define ITCH (16*vn + 100)
106
107 mp_limb_t
108 mpn_mul (mp_ptr prodp,
109 mp_srcptr up, mp_size_t un,
110 mp_srcptr vp, mp_size_t vn)
111 {
112 ASSERT (un >= vn);
113 ASSERT (vn >= 1);
114 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
115 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
116
117 if (un == vn)
118 {
119 if (up == vp)
120 mpn_sqr (prodp, up, un);
121 else
122 mpn_mul_n (prodp, up, vp, un);
123 }
124 else if (vn < MUL_TOOM22_THRESHOLD)
125 { /* plain schoolbook multiplication */
126
127 /* Unless un is very large, or else if have an applicable mpn_mul_N,
128 perform basecase multiply directly. */
129 if (un <= MUL_BASECASE_MAX_UN
130 #if HAVE_NATIVE_mpn_mul_2
131 || vn <= 2
132 #else
133 || vn == 1
134 #endif
135 )
136 mpn_mul_basecase (prodp, up, un, vp, vn);
137 else
138 {
139 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
140 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
141 these pieces with the vp[] operand. After each such partial
142 multiplication (but the last) we copy the most significant vn
143 limbs into a temporary buffer since that part would otherwise be
144 overwritten by the next multiplication. After the next
145 multiplication, we add it back. This illustrates the situation:
146
147 -->vn<--
148 | |<------- un ------->|
149 _____________________|
150 X /|
151 /XX__________________/ |
152 _____________________ |
153 X / |
154 /XX__________________/ |
155 _____________________ |
156 / / |
157 /____________________/ |
158 ==================================================================
159
160 The parts marked with X are the parts whose sums are copied into
161 the temporary buffer. */
162
163 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
164 mp_limb_t cy;
165 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
166
167 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
168 prodp += MUL_BASECASE_MAX_UN;
169 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
170 up += MUL_BASECASE_MAX_UN;
171 un -= MUL_BASECASE_MAX_UN;
172 while (un > MUL_BASECASE_MAX_UN)
173 {
174 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
175 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
176 mpn_incr_u (prodp + vn, cy);
177 prodp += MUL_BASECASE_MAX_UN;
178 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
179 up += MUL_BASECASE_MAX_UN;
180 un -= MUL_BASECASE_MAX_UN;
181 }
182 if (un > vn)
183 {
184 mpn_mul_basecase (prodp, up, un, vp, vn);
185 }
186 else
187 {
188 ASSERT (un > 0);
189 mpn_mul_basecase (prodp, vp, vn, up, un);
190 }
191 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
192 mpn_incr_u (prodp + vn, cy);
193 }
194 }
195 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
196 {
197 /* Use ToomX2 variants */
198 mp_ptr scratch;
199 TMP_SDECL; TMP_SMARK;
200
201 scratch = TMP_SALLOC_LIMBS (ITCH);
202
203 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
204 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
205 wise; we would get better balance by slightly moving the bound. We
206 will sometimes end up with un < vn, like in the X3 arm below. */
207 if (un >= 3 * vn)
208 {
209 mp_limb_t cy;
210 mp_ptr ws;
211
212 /* The maximum ws usage is for the mpn_mul result. */
213 ws = TMP_SALLOC_LIMBS (4 * vn);
214
215 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
216 un -= 2 * vn;
217 up += 2 * vn;
218 prodp += 2 * vn;
219
220 while (un >= 3 * vn)
221 {
222 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
223 un -= 2 * vn;
224 up += 2 * vn;
225 cy = mpn_add_n (prodp, prodp, ws, vn);
226 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
227 mpn_incr_u (prodp + vn, cy);
228 prodp += 2 * vn;
229 }
230
231 /* vn <= un < 3vn */
232
233 if (4 * un < 5 * vn)
234 mpn_toom22_mul (ws, up, un, vp, vn, scratch);
235 else if (4 * un < 7 * vn)
236 mpn_toom32_mul (ws, up, un, vp, vn, scratch);
237 else
238 mpn_toom42_mul (ws, up, un, vp, vn, scratch);
239
240 cy = mpn_add_n (prodp, prodp, ws, vn);
241 MPN_COPY (prodp + vn, ws + vn, un);
242 mpn_incr_u (prodp + vn, cy);
243 }
244 else
245 {
246 if (4 * un < 5 * vn)
247 mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
248 else if (4 * un < 7 * vn)
249 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
250 else
251 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
252 }
253 TMP_SFREE;
254 }
255 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
256 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
257 {
258 /* Handle the largest operands that are not in the FFT range. The 2nd
259 condition makes very unbalanced operands avoid the FFT code (except
260 perhaps as coefficient products of the Toom code. */
261
262 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
263 {
264 /* Use ToomX3 variants */
265 mp_ptr scratch;
266 TMP_SDECL; TMP_SMARK;
267
268 scratch = TMP_SALLOC_LIMBS (ITCH);
269
270 if (2 * un >= 5 * vn)
271 {
272 mp_limb_t cy;
273 mp_ptr ws;
274
275 /* The maximum ws usage is for the mpn_mul result. */
276 ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
277
278 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
279 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
280 else
281 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
282 un -= 2 * vn;
283 up += 2 * vn;
284 prodp += 2 * vn;
285
286 while (2 * un >= 5 * vn) /* un >= 2.5vn */
287 {
288 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
289 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
290 else
291 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
292 un -= 2 * vn;
293 up += 2 * vn;
294 cy = mpn_add_n (prodp, prodp, ws, vn);
295 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
296 mpn_incr_u (prodp + vn, cy);
297 prodp += 2 * vn;
298 }
299
300 /* vn / 2 <= un < 2.5vn */
301
302 if (un < vn)
303 mpn_mul (ws, vp, vn, up, un);
304 else
305 mpn_mul (ws, up, un, vp, vn);
306
307 cy = mpn_add_n (prodp, prodp, ws, vn);
308 MPN_COPY (prodp + vn, ws + vn, un);
309 mpn_incr_u (prodp + vn, cy);
310 }
311 else
312 {
313 if (6 * un < 7 * vn)
314 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
315 else if (2 * un < 3 * vn)
316 {
317 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
318 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
319 else
320 mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
321 }
322 else if (6 * un < 11 * vn)
323 {
324 if (4 * un < 7 * vn)
325 {
326 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
327 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
328 else
329 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
330 }
331 else
332 {
333 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
334 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
335 else
336 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
337 }
338 }
339 else
340 {
341 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
342 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
343 else
344 mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
345 }
346 }
347 TMP_SFREE;
348 }
349 else
350 {
351 mp_ptr scratch;
352 TMP_DECL; TMP_MARK;
353
354 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
355 {
356 scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
357 mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
358 }
359 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
360 {
361 scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
362 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
363 }
364 else
365 {
366 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
367 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
368 }
369 TMP_FREE;
370 }
371 }
372 else
373 {
374 if (un >= 8 * vn)
375 {
376 mp_limb_t cy;
377 mp_ptr ws;
378 TMP_DECL; TMP_MARK;
379
380 /* The maximum ws usage is for the mpn_mul result. */
381 ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
382
383 mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
384 un -= 3 * vn;
385 up += 3 * vn;
386 prodp += 3 * vn;
387
388 while (2 * un >= 7 * vn) /* un >= 3.5vn */
389 {
390 mpn_fft_mul (ws, up, 3 * vn, vp, vn);
391 un -= 3 * vn;
392 up += 3 * vn;
393 cy = mpn_add_n (prodp, prodp, ws, vn);
394 MPN_COPY (prodp + vn, ws + vn, 3 * vn);
395 mpn_incr_u (prodp + vn, cy);
396 prodp += 3 * vn;
397 }
398
399 /* vn / 2 <= un < 3.5vn */
400
401 if (un < vn)
402 mpn_mul (ws, vp, vn, up, un);
403 else
404 mpn_mul (ws, up, un, vp, vn);
405
406 cy = mpn_add_n (prodp, prodp, ws, vn);
407 MPN_COPY (prodp + vn, ws + vn, un);
408 mpn_incr_u (prodp + vn, cy);
409
410 TMP_FREE;
411 }
412 else
413 mpn_fft_mul (prodp, up, un, vp, vn);
414 }
415
416 return prodp[un + vn - 1]; /* historic */
417 }
418