/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in:
   0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
			     |_b1_|___b0_|
			     <-t--><--n-->

  v0  =    a0                       *   b0      #   A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #   A(1)*B(1)      ah  <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #   A(2)*B(2)      ah  <= 62  bh <= 2
  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #  A(-2)*B(-2)  -41<=ah<=20  -1<=bh<=0
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) # A(1/2)*B(1/2)    ah  <= 62  bh <= 2
  vinf=                           a5 *      b1  # A(inf)*B(inf)
*/

63 1.1 mrg void
64 1.1 mrg mpn_toom62_mul (mp_ptr pp,
65 1.1 mrg mp_srcptr ap, mp_size_t an,
66 1.1 mrg mp_srcptr bp, mp_size_t bn,
67 1.1 mrg mp_ptr scratch)
68 1.1 mrg {
69 1.1 mrg mp_size_t n, s, t;
70 1.1 mrg mp_limb_t cy;
71 1.1 mrg mp_ptr as1, asm1, as2, asm2, ash;
72 1.1 mrg mp_ptr bs1, bsm1, bs2, bsm2, bsh;
73 1.1 mrg mp_ptr gp;
74 1.1 mrg enum toom7_flags aflags, bflags;
75 1.1 mrg TMP_DECL;
76 1.1 mrg
77 1.1 mrg #define a0 ap
78 1.1 mrg #define a1 (ap + n)
79 1.1 mrg #define a2 (ap + 2*n)
80 1.1 mrg #define a3 (ap + 3*n)
81 1.1 mrg #define a4 (ap + 4*n)
82 1.1 mrg #define a5 (ap + 5*n)
83 1.1 mrg #define b0 bp
84 1.1 mrg #define b1 (bp + n)
85 1.1 mrg
86 1.1 mrg n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
87 1.1 mrg
88 1.1 mrg s = an - 5 * n;
89 1.1 mrg t = bn - n;
90 1.1 mrg
91 1.1 mrg ASSERT (0 < s && s <= n);
92 1.1 mrg ASSERT (0 < t && t <= n);
93 1.1 mrg
94 1.1 mrg TMP_MARK;
95 1.1 mrg
96 1.1 mrg as1 = TMP_SALLOC_LIMBS (n + 1);
97 1.1 mrg asm1 = TMP_SALLOC_LIMBS (n + 1);
98 1.1 mrg as2 = TMP_SALLOC_LIMBS (n + 1);
99 1.1 mrg asm2 = TMP_SALLOC_LIMBS (n + 1);
100 1.1 mrg ash = TMP_SALLOC_LIMBS (n + 1);
101 1.1 mrg
102 1.1 mrg bs1 = TMP_SALLOC_LIMBS (n + 1);
103 1.1 mrg bsm1 = TMP_SALLOC_LIMBS (n);
104 1.1 mrg bs2 = TMP_SALLOC_LIMBS (n + 1);
105 1.1 mrg bsm2 = TMP_SALLOC_LIMBS (n + 1);
106 1.1 mrg bsh = TMP_SALLOC_LIMBS (n + 1);
107 1.1 mrg
108 1.1 mrg gp = pp;
109 1.1 mrg
110 1.1 mrg /* Compute as1 and asm1. */
111 1.1.1.2 mrg aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
112 1.1 mrg
113 1.1 mrg /* Compute as2 and asm2. */
114 1.1.1.3 mrg aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));
115 1.1 mrg
116 1.1 mrg /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
117 1.1 mrg = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */
118 1.1 mrg
119 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
120 1.1 mrg cy = mpn_addlsh1_n (ash, a1, a0, n);
121 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
122 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
123 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
124 1.1 mrg if (s < n)
125 1.1 mrg {
126 1.1 mrg mp_limb_t cy2;
127 1.1 mrg cy2 = mpn_addlsh1_n (ash, a5, ash, s);
128 1.1 mrg ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
129 1.1 mrg MPN_INCR_U (ash + s, n+1-s, cy2);
130 1.1 mrg }
131 1.1 mrg else
132 1.1 mrg ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
133 1.1 mrg #else
134 1.1 mrg cy = mpn_lshift (ash, a0, n, 1);
135 1.1 mrg cy += mpn_add_n (ash, ash, a1, n);
136 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
137 1.1 mrg cy += mpn_add_n (ash, ash, a2, n);
138 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
139 1.1 mrg cy += mpn_add_n (ash, ash, a3, n);
140 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
141 1.1 mrg cy += mpn_add_n (ash, ash, a4, n);
142 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
143 1.1 mrg ash[n] = cy + mpn_add (ash, ash, n, a5, s);
144 1.1 mrg #endif
145 1.1 mrg
146 1.1 mrg /* Compute bs1 and bsm1. */
147 1.1 mrg if (t == n)
148 1.1 mrg {
149 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
150 1.1 mrg if (mpn_cmp (b0, b1, n) < 0)
151 1.1 mrg {
152 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
153 1.1 mrg bflags = toom7_w3_neg;
154 1.1 mrg }
155 1.1 mrg else
156 1.1 mrg {
157 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
158 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
159 1.1 mrg }
160 1.1 mrg bs1[n] = cy >> 1;
161 1.1 mrg #else
162 1.1 mrg bs1[n] = mpn_add_n (bs1, b0, b1, n);
163 1.1 mrg if (mpn_cmp (b0, b1, n) < 0)
164 1.1 mrg {
165 1.1 mrg mpn_sub_n (bsm1, b1, b0, n);
166 1.1 mrg bflags = toom7_w3_neg;
167 1.1 mrg }
168 1.1 mrg else
169 1.1 mrg {
170 1.1 mrg mpn_sub_n (bsm1, b0, b1, n);
171 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
172 1.1 mrg }
173 1.1 mrg #endif
174 1.1 mrg }
175 1.1 mrg else
176 1.1 mrg {
177 1.1 mrg bs1[n] = mpn_add (bs1, b0, n, b1, t);
178 1.1 mrg if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
179 1.1 mrg {
180 1.1 mrg mpn_sub_n (bsm1, b1, b0, t);
181 1.1 mrg MPN_ZERO (bsm1 + t, n - t);
182 1.1 mrg bflags = toom7_w3_neg;
183 1.1 mrg }
184 1.1 mrg else
185 1.1 mrg {
186 1.1 mrg mpn_sub (bsm1, b0, n, b1, t);
187 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
188 1.1 mrg }
189 1.1 mrg }
190 1.1 mrg
191 1.1 mrg /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
192 1.1 mrg bsm1 - b1 */
193 1.1 mrg mpn_add (bs2, bs1, n + 1, b1, t);
194 1.1 mrg if (bflags & toom7_w3_neg)
195 1.1 mrg {
196 1.1 mrg bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
197 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
198 1.1 mrg }
199 1.1 mrg else
200 1.1 mrg {
201 1.1 mrg /* FIXME: Simplify this logic? */
202 1.1 mrg if (t < n)
203 1.1 mrg {
204 1.1 mrg if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
205 1.1 mrg {
206 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
207 1.1 mrg MPN_ZERO (bsm2 + t, n + 1 - t);
208 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
209 1.1 mrg }
210 1.1 mrg else
211 1.1 mrg {
212 1.1 mrg ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
213 1.1 mrg bsm2[n] = 0;
214 1.1 mrg }
215 1.1 mrg }
216 1.1 mrg else
217 1.1 mrg {
218 1.1 mrg if (mpn_cmp (bsm1, b1, n) < 0)
219 1.1 mrg {
220 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
221 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
222 1.1 mrg }
223 1.1 mrg else
224 1.1 mrg {
225 1.1.1.2 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
226 1.1 mrg }
227 1.1 mrg bsm2[n] = 0;
228 1.1 mrg }
229 1.1 mrg }
230 1.1 mrg
231 1.1.1.2 mrg /* Compute bsh, recycling bs1. bsh=bs1+b0; */
232 1.1.1.2 mrg bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);
233 1.1 mrg
234 1.1 mrg ASSERT (as1[n] <= 5);
235 1.1 mrg ASSERT (bs1[n] <= 1);
236 1.1 mrg ASSERT (asm1[n] <= 2);
237 1.1 mrg ASSERT (as2[n] <= 62);
238 1.1 mrg ASSERT (bs2[n] <= 2);
239 1.1 mrg ASSERT (asm2[n] <= 41);
240 1.1 mrg ASSERT (bsm2[n] <= 1);
241 1.1 mrg ASSERT (ash[n] <= 62);
242 1.1 mrg ASSERT (bsh[n] <= 2);
243 1.1 mrg
244 1.1 mrg #define v0 pp /* 2n */
245 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
246 1.1 mrg #define vinf (pp + 6 * n) /* s+t */
247 1.1 mrg #define v2 scratch /* 2n+1 */
248 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
249 1.1 mrg #define vh (scratch + 4 * n + 2) /* 2n+1 */
250 1.1 mrg #define vm1 (scratch + 6 * n + 3) /* 2n+1 */
251 1.1 mrg #define scratch_out (scratch + 8 * n + 4) /* 2n+1 */
252 1.1 mrg /* Total scratch need: 10*n+5 */
253 1.1 mrg
254 1.1 mrg /* Must be in allocation order, as they overwrite one limb beyond
255 1.1 mrg * 2n+1. */
256 1.1 mrg mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
257 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */
258 1.1 mrg mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */
259 1.1 mrg
260 1.1 mrg /* vm1, 2n+1 limbs */
261 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n);
262 1.1 mrg cy = 0;
263 1.1 mrg if (asm1[n] == 1)
264 1.1 mrg {
265 1.1 mrg cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
266 1.1 mrg }
267 1.1 mrg else if (asm1[n] == 2)
268 1.1 mrg {
269 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
270 1.1 mrg cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
271 1.1 mrg #else
272 1.1 mrg cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
273 1.1 mrg #endif
274 1.1 mrg }
275 1.1 mrg vm1[2 * n] = cy;
276 1.1 mrg
277 1.1 mrg /* v1, 2n+1 limbs */
278 1.1 mrg mpn_mul_n (v1, as1, bs1, n);
279 1.1 mrg if (as1[n] == 1)
280 1.1 mrg {
281 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
282 1.1 mrg }
283 1.1 mrg else if (as1[n] == 2)
284 1.1 mrg {
285 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
286 1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
287 1.1 mrg #else
288 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
289 1.1 mrg #endif
290 1.1 mrg }
291 1.1 mrg else if (as1[n] != 0)
292 1.1 mrg {
293 1.1 mrg cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
294 1.1 mrg }
295 1.1 mrg else
296 1.1 mrg cy = 0;
297 1.1 mrg if (bs1[n] != 0)
298 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
299 1.1 mrg v1[2 * n] = cy;
300 1.1 mrg
301 1.1 mrg mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */
302 1.1 mrg
303 1.1 mrg /* vinf, s+t limbs */
304 1.1 mrg if (s > t) mpn_mul (vinf, a5, s, b1, t);
305 1.1 mrg else mpn_mul (vinf, b1, t, a5, s);
306 1.1 mrg
307 1.1.1.2 mrg mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
308 1.1 mrg vm2, vm1, v2, vh, s + t, scratch_out);
309 1.1 mrg
310 1.1 mrg TMP_FREE;
311 1.1 mrg }
312