/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
30 1.1 mrg
31 1.1 mrg #include "gmp.h"
32 1.1 mrg #include "gmp-impl.h"
33 1.1 mrg
34 1.1 mrg /* Evaluate in:
35 1.1 mrg 0, +1, -1, +2, -2, 1/2, +inf
36 1.1 mrg
37 1.1 mrg <-s-><--n--><--n--><--n--><--n--><--n-->
38 1.1 mrg ___ ______ ______ ______ ______ ______
39 1.1 mrg |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
40 1.1 mrg |_b1_|___b0_|
41 1.1 mrg <-t--><--n-->
42 1.1 mrg
43 1.1 mrg v0 = a0 * b0 # A(0)*B(0)
44 1.1 mrg v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1
45 1.1 mrg vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0
46 1.1 mrg v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2
47 1.1 mrg vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0
48 1.1 mrg vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2
49 1.1 mrg vinf= a5 * b1 # A(inf)*B(inf)
50 1.1 mrg */
51 1.1 mrg
52 1.1 mrg void
53 1.1 mrg mpn_toom62_mul (mp_ptr pp,
54 1.1 mrg mp_srcptr ap, mp_size_t an,
55 1.1 mrg mp_srcptr bp, mp_size_t bn,
56 1.1 mrg mp_ptr scratch)
57 1.1 mrg {
58 1.1 mrg mp_size_t n, s, t;
59 1.1 mrg mp_limb_t cy;
60 1.1 mrg mp_ptr as1, asm1, as2, asm2, ash;
61 1.1 mrg mp_ptr bs1, bsm1, bs2, bsm2, bsh;
62 1.1 mrg mp_ptr gp;
63 1.1 mrg enum toom7_flags aflags, bflags;
64 1.1 mrg TMP_DECL;
65 1.1 mrg
66 1.1 mrg #define a0 ap
67 1.1 mrg #define a1 (ap + n)
68 1.1 mrg #define a2 (ap + 2*n)
69 1.1 mrg #define a3 (ap + 3*n)
70 1.1 mrg #define a4 (ap + 4*n)
71 1.1 mrg #define a5 (ap + 5*n)
72 1.1 mrg #define b0 bp
73 1.1 mrg #define b1 (bp + n)
74 1.1 mrg
75 1.1 mrg n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
76 1.1 mrg
77 1.1 mrg s = an - 5 * n;
78 1.1 mrg t = bn - n;
79 1.1 mrg
80 1.1 mrg ASSERT (0 < s && s <= n);
81 1.1 mrg ASSERT (0 < t && t <= n);
82 1.1 mrg
83 1.1 mrg TMP_MARK;
84 1.1 mrg
85 1.1 mrg as1 = TMP_SALLOC_LIMBS (n + 1);
86 1.1 mrg asm1 = TMP_SALLOC_LIMBS (n + 1);
87 1.1 mrg as2 = TMP_SALLOC_LIMBS (n + 1);
88 1.1 mrg asm2 = TMP_SALLOC_LIMBS (n + 1);
89 1.1 mrg ash = TMP_SALLOC_LIMBS (n + 1);
90 1.1 mrg
91 1.1 mrg bs1 = TMP_SALLOC_LIMBS (n + 1);
92 1.1 mrg bsm1 = TMP_SALLOC_LIMBS (n);
93 1.1 mrg bs2 = TMP_SALLOC_LIMBS (n + 1);
94 1.1 mrg bsm2 = TMP_SALLOC_LIMBS (n + 1);
95 1.1 mrg bsh = TMP_SALLOC_LIMBS (n + 1);
96 1.1 mrg
97 1.1 mrg gp = pp;
98 1.1 mrg
99 1.1 mrg /* Compute as1 and asm1. */
100 1.1.1.2 mrg aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
101 1.1 mrg
102 1.1 mrg /* Compute as2 and asm2. */
103 1.1.1.2 mrg aflags = (enum toom7_flags) (aflags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp));
104 1.1 mrg
105 1.1 mrg /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
106 1.1 mrg = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */
107 1.1 mrg
108 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
109 1.1 mrg cy = mpn_addlsh1_n (ash, a1, a0, n);
110 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
111 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
112 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
113 1.1 mrg if (s < n)
114 1.1 mrg {
115 1.1 mrg mp_limb_t cy2;
116 1.1 mrg cy2 = mpn_addlsh1_n (ash, a5, ash, s);
117 1.1 mrg ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
118 1.1 mrg MPN_INCR_U (ash + s, n+1-s, cy2);
119 1.1 mrg }
120 1.1 mrg else
121 1.1 mrg ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
122 1.1 mrg #else
123 1.1 mrg cy = mpn_lshift (ash, a0, n, 1);
124 1.1 mrg cy += mpn_add_n (ash, ash, a1, n);
125 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
126 1.1 mrg cy += mpn_add_n (ash, ash, a2, n);
127 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
128 1.1 mrg cy += mpn_add_n (ash, ash, a3, n);
129 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
130 1.1 mrg cy += mpn_add_n (ash, ash, a4, n);
131 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1);
132 1.1 mrg ash[n] = cy + mpn_add (ash, ash, n, a5, s);
133 1.1 mrg #endif
134 1.1 mrg
135 1.1 mrg /* Compute bs1 and bsm1. */
136 1.1 mrg if (t == n)
137 1.1 mrg {
138 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
139 1.1 mrg if (mpn_cmp (b0, b1, n) < 0)
140 1.1 mrg {
141 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
142 1.1 mrg bflags = toom7_w3_neg;
143 1.1 mrg }
144 1.1 mrg else
145 1.1 mrg {
146 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
147 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
148 1.1 mrg }
149 1.1 mrg bs1[n] = cy >> 1;
150 1.1 mrg #else
151 1.1 mrg bs1[n] = mpn_add_n (bs1, b0, b1, n);
152 1.1 mrg if (mpn_cmp (b0, b1, n) < 0)
153 1.1 mrg {
154 1.1 mrg mpn_sub_n (bsm1, b1, b0, n);
155 1.1 mrg bflags = toom7_w3_neg;
156 1.1 mrg }
157 1.1 mrg else
158 1.1 mrg {
159 1.1 mrg mpn_sub_n (bsm1, b0, b1, n);
160 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
161 1.1 mrg }
162 1.1 mrg #endif
163 1.1 mrg }
164 1.1 mrg else
165 1.1 mrg {
166 1.1 mrg bs1[n] = mpn_add (bs1, b0, n, b1, t);
167 1.1 mrg if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
168 1.1 mrg {
169 1.1 mrg mpn_sub_n (bsm1, b1, b0, t);
170 1.1 mrg MPN_ZERO (bsm1 + t, n - t);
171 1.1 mrg bflags = toom7_w3_neg;
172 1.1 mrg }
173 1.1 mrg else
174 1.1 mrg {
175 1.1 mrg mpn_sub (bsm1, b0, n, b1, t);
176 1.1.1.2 mrg bflags = (enum toom7_flags) 0;
177 1.1 mrg }
178 1.1 mrg }
179 1.1 mrg
180 1.1 mrg /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
181 1.1 mrg bsm1 - b1 */
182 1.1 mrg mpn_add (bs2, bs1, n + 1, b1, t);
183 1.1 mrg if (bflags & toom7_w3_neg)
184 1.1 mrg {
185 1.1 mrg bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
186 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
187 1.1 mrg }
188 1.1 mrg else
189 1.1 mrg {
190 1.1 mrg /* FIXME: Simplify this logic? */
191 1.1 mrg if (t < n)
192 1.1 mrg {
193 1.1 mrg if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
194 1.1 mrg {
195 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
196 1.1 mrg MPN_ZERO (bsm2 + t, n + 1 - t);
197 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
198 1.1 mrg }
199 1.1 mrg else
200 1.1 mrg {
201 1.1 mrg ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
202 1.1 mrg bsm2[n] = 0;
203 1.1 mrg }
204 1.1 mrg }
205 1.1 mrg else
206 1.1 mrg {
207 1.1 mrg if (mpn_cmp (bsm1, b1, n) < 0)
208 1.1 mrg {
209 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
210 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
211 1.1 mrg }
212 1.1 mrg else
213 1.1 mrg {
214 1.1.1.2 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
215 1.1 mrg }
216 1.1 mrg bsm2[n] = 0;
217 1.1 mrg }
218 1.1 mrg }
219 1.1 mrg
220 1.1.1.2 mrg /* Compute bsh, recycling bs1. bsh=bs1+b0; */
221 1.1.1.2 mrg bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);
222 1.1 mrg
223 1.1 mrg ASSERT (as1[n] <= 5);
224 1.1 mrg ASSERT (bs1[n] <= 1);
225 1.1 mrg ASSERT (asm1[n] <= 2);
226 1.1 mrg ASSERT (as2[n] <= 62);
227 1.1 mrg ASSERT (bs2[n] <= 2);
228 1.1 mrg ASSERT (asm2[n] <= 41);
229 1.1 mrg ASSERT (bsm2[n] <= 1);
230 1.1 mrg ASSERT (ash[n] <= 62);
231 1.1 mrg ASSERT (bsh[n] <= 2);
232 1.1 mrg
233 1.1 mrg #define v0 pp /* 2n */
234 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
235 1.1 mrg #define vinf (pp + 6 * n) /* s+t */
236 1.1 mrg #define v2 scratch /* 2n+1 */
237 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
238 1.1 mrg #define vh (scratch + 4 * n + 2) /* 2n+1 */
239 1.1 mrg #define vm1 (scratch + 6 * n + 3) /* 2n+1 */
240 1.1 mrg #define scratch_out (scratch + 8 * n + 4) /* 2n+1 */
241 1.1 mrg /* Total scratch need: 10*n+5 */
242 1.1 mrg
243 1.1 mrg /* Must be in allocation order, as they overwrite one limb beyond
244 1.1 mrg * 2n+1. */
245 1.1 mrg mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
246 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */
247 1.1 mrg mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */
248 1.1 mrg
249 1.1 mrg /* vm1, 2n+1 limbs */
250 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n);
251 1.1 mrg cy = 0;
252 1.1 mrg if (asm1[n] == 1)
253 1.1 mrg {
254 1.1 mrg cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
255 1.1 mrg }
256 1.1 mrg else if (asm1[n] == 2)
257 1.1 mrg {
258 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
259 1.1 mrg cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
260 1.1 mrg #else
261 1.1 mrg cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
262 1.1 mrg #endif
263 1.1 mrg }
264 1.1 mrg vm1[2 * n] = cy;
265 1.1 mrg
266 1.1 mrg /* v1, 2n+1 limbs */
267 1.1 mrg mpn_mul_n (v1, as1, bs1, n);
268 1.1 mrg if (as1[n] == 1)
269 1.1 mrg {
270 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
271 1.1 mrg }
272 1.1 mrg else if (as1[n] == 2)
273 1.1 mrg {
274 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
275 1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
276 1.1 mrg #else
277 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
278 1.1 mrg #endif
279 1.1 mrg }
280 1.1 mrg else if (as1[n] != 0)
281 1.1 mrg {
282 1.1 mrg cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
283 1.1 mrg }
284 1.1 mrg else
285 1.1 mrg cy = 0;
286 1.1 mrg if (bs1[n] != 0)
287 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
288 1.1 mrg v1[2 * n] = cy;
289 1.1 mrg
290 1.1 mrg mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */
291 1.1 mrg
292 1.1 mrg /* vinf, s+t limbs */
293 1.1 mrg if (s > t) mpn_mul (vinf, a5, s, b1, t);
294 1.1 mrg else mpn_mul (vinf, b1, t, a5, s);
295 1.1 mrg
296 1.1.1.2 mrg mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
297 1.1 mrg vm2, vm1, v2, vh, s + t, scratch_out);
298 1.1 mrg
299 1.1 mrg TMP_FREE;
300 1.1 mrg }
301