/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
   times as large as bn.  Or more accurately, bn < an < 2 bn.

   Contributed to the GNU project by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -2, -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
	|_b2_|___b1_|___b0_|
	<-t--><--n--><--n-->

  v0  =  a0             * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3   bh <= 2
  vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
  v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14  bh <= 6
  vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9  |bh| <= 4
  vinf=              a3 *         b2  # A(inf)*B(inf)
*/
60 1.1 mrg
61 1.1 mrg void
62 1.1 mrg mpn_toom43_mul (mp_ptr pp,
63 1.1 mrg mp_srcptr ap, mp_size_t an,
64 1.1 mrg mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
65 1.1 mrg {
66 1.1 mrg mp_size_t n, s, t;
67 1.1 mrg enum toom6_flags flags;
68 1.1 mrg mp_limb_t cy;
69 1.1 mrg
70 1.1 mrg #define a0 ap
71 1.1 mrg #define a1 (ap + n)
72 1.1 mrg #define a2 (ap + 2 * n)
73 1.1 mrg #define a3 (ap + 3 * n)
74 1.1 mrg #define b0 bp
75 1.1 mrg #define b1 (bp + n)
76 1.1 mrg #define b2 (bp + 2 * n)
77 1.1 mrg
78 1.1 mrg n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
79 1.1 mrg
80 1.1 mrg s = an - 3 * n;
81 1.1 mrg t = bn - 2 * n;
82 1.1 mrg
83 1.1 mrg ASSERT (0 < s && s <= n);
84 1.1 mrg ASSERT (0 < t && t <= n);
85 1.1 mrg
86 1.1 mrg /* This is true whenever an >= 25 or bn >= 19, I think. It
87 1.1 mrg guarantees that we can fit 5 values of size n+1 in the product
88 1.1 mrg area. */
89 1.1 mrg ASSERT (s+t >= 5);
90 1.1 mrg
91 1.1 mrg #define v0 pp /* 2n */
92 1.1 mrg #define vm1 (scratch) /* 2n+1 */
93 1.1 mrg #define v1 (pp + 2*n) /* 2n+1 */
94 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
95 1.1 mrg #define v2 (scratch + 4 * n + 2) /* 2n+1 */
96 1.1 mrg #define vinf (pp + 5 * n) /* s+t */
97 1.1 mrg #define bs1 pp /* n+1 */
98 1.1 mrg #define bsm1 (scratch + 2 * n + 2) /* n+1 */
99 1.1 mrg #define asm1 (scratch + 3 * n + 3) /* n+1 */
100 1.1 mrg #define asm2 (scratch + 4 * n + 4) /* n+1 */
101 1.1 mrg #define bsm2 (pp + n + 1) /* n+1 */
102 1.1 mrg #define bs2 (pp + 2 * n + 2) /* n+1 */
103 1.1 mrg #define as2 (pp + 3 * n + 3) /* n+1 */
104 1.1 mrg #define as1 (pp + 4 * n + 4) /* n+1 */
105 1.1 mrg
106 1.1 mrg /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
107 1.1 mrg limb, because products will overwrite 2n+2 limbs. */
108 1.1 mrg
109 1.1 mrg #define a0a2 scratch
110 1.1 mrg #define b0b2 scratch
111 1.1 mrg #define a1a3 asm1
112 1.1 mrg #define b1d bsm1
113 1.1 mrg
114 1.1 mrg /* Compute as2 and asm2. */
115 1.1.1.2 mrg flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
116 1.1 mrg
117 1.1 mrg /* Compute bs2 and bsm2. */
118 1.1 mrg b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */
119 1.1 mrg cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */
120 1.1 mrg cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */
121 1.1 mrg if (t != n)
122 1.1 mrg cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
123 1.1 mrg b0b2[n] = cy;
124 1.1 mrg
125 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
126 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0)
127 1.1 mrg {
128 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
129 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
130 1.1 mrg }
131 1.1 mrg else
132 1.1 mrg {
133 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
134 1.1 mrg }
135 1.1 mrg #else
136 1.1 mrg mpn_add_n (bs2, b0b2, b1d, n+1);
137 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0)
138 1.1 mrg {
139 1.1 mrg mpn_sub_n (bsm2, b1d, b0b2, n+1);
140 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
141 1.1 mrg }
142 1.1 mrg else
143 1.1 mrg {
144 1.1 mrg mpn_sub_n (bsm2, b0b2, b1d, n+1);
145 1.1 mrg }
146 1.1 mrg #endif
147 1.1 mrg
148 1.1 mrg /* Compute as1 and asm1. */
149 1.1.1.3 mrg flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)));
150 1.1 mrg
151 1.1 mrg /* Compute bs1 and bsm1. */
152 1.1 mrg bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
153 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
154 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
155 1.1 mrg {
156 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
157 1.1 mrg bs1[n] = cy >> 1;
158 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
159 1.1 mrg }
160 1.1 mrg else
161 1.1 mrg {
162 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
163 1.1 mrg bs1[n] = bsm1[n] + (cy >> 1);
164 1.1 mrg bsm1[n]-= cy & 1;
165 1.1 mrg }
166 1.1 mrg #else
167 1.1 mrg bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
168 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
169 1.1 mrg {
170 1.1 mrg mpn_sub_n (bsm1, b1, bsm1, n);
171 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
172 1.1 mrg }
173 1.1 mrg else
174 1.1 mrg {
175 1.1 mrg bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
176 1.1 mrg }
177 1.1 mrg #endif
178 1.1 mrg
179 1.1 mrg ASSERT (as1[n] <= 3);
180 1.1 mrg ASSERT (bs1[n] <= 2);
181 1.1 mrg ASSERT (asm1[n] <= 1);
182 1.1 mrg ASSERT (bsm1[n] <= 1);
183 1.1 mrg ASSERT (as2[n] <=14);
184 1.1 mrg ASSERT (bs2[n] <= 6);
185 1.1 mrg ASSERT (asm2[n] <= 9);
186 1.1 mrg ASSERT (bsm2[n] <= 4);
187 1.1 mrg
188 1.1 mrg /* vm1, 2n+1 limbs */
189 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n+1); /* W4 */
190 1.1 mrg
191 1.1 mrg /* vm2, 2n+1 limbs */
192 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
193 1.1 mrg
194 1.1 mrg /* v2, 2n+1 limbs */
195 1.1 mrg mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
196 1.1 mrg
197 1.1 mrg /* v1, 2n+1 limbs */
198 1.1 mrg mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
199 1.1 mrg
200 1.1 mrg /* vinf, s+t limbs */ /* W0 */
201 1.1 mrg if (s > t) mpn_mul (vinf, a3, s, b2, t);
202 1.1 mrg else mpn_mul (vinf, b2, t, a3, s);
203 1.1 mrg
204 1.1 mrg /* v0, 2n limbs */
205 1.1 mrg mpn_mul_n (v0, ap, bp, n); /* W5 */
206 1.1 mrg
207 1.1 mrg mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
208 1.1 mrg
209 1.1 mrg #undef v0
210 1.1 mrg #undef vm1
211 1.1 mrg #undef v1
212 1.1 mrg #undef vm2
213 1.1 mrg #undef v2
214 1.1 mrg #undef vinf
215 1.1 mrg #undef bs1
216 1.1 mrg #undef bs2
217 1.1 mrg #undef bsm1
218 1.1 mrg #undef bsm2
219 1.1 mrg #undef asm1
220 1.1 mrg #undef asm2
221 1.1 mrg /* #undef as1 */
222 1.1 mrg /* #undef as2 */
223 1.1 mrg #undef a0a2
224 1.1 mrg #undef b0b2
225 1.1 mrg #undef a1a3
226 1.1 mrg #undef b1d
227 1.1 mrg #undef a0
228 1.1 mrg #undef a1
229 1.1 mrg #undef a2
230 1.1 mrg #undef a3
231 1.1 mrg #undef b0
232 1.1 mrg #undef b1
233 1.1 mrg #undef b2
234 1.1 mrg }
235