toom43_mul.c revision 1.1.1.2 1 1.1 mrg /* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
2 1.1 mrg times as large as bn. Or more accurately, bn < an < 2 bn.
3 1.1 mrg
4 1.1 mrg Contributed to the GNU project by Marco Bodrato.
5 1.1 mrg
6 1.1 mrg The idea of applying toom to unbalanced multiplication is due to Marco
7 1.1 mrg Bodrato and Alberto Zanoni.
8 1.1 mrg
9 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
10 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
11 1.1 mrg GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
12 1.1 mrg
13 1.1 mrg Copyright 2009 Free Software Foundation, Inc.
14 1.1 mrg
15 1.1 mrg This file is part of the GNU MP Library.
16 1.1 mrg
17 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
18 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
19 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
20 1.1 mrg option) any later version.
21 1.1 mrg
22 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
23 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
25 1.1 mrg License for more details.
26 1.1 mrg
27 1.1 mrg You should have received a copy of the GNU Lesser General Public License
28 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
29 1.1 mrg
30 1.1 mrg
31 1.1 mrg #include "gmp.h"
32 1.1 mrg #include "gmp-impl.h"
33 1.1 mrg
34 1.1 mrg /* Evaluate in: -2, -1, 0, +1, +2, +inf
35 1.1 mrg
36 1.1 mrg <-s-><--n--><--n--><--n-->
37 1.1 mrg ___ ______ ______ ______
38 1.1 mrg |a3_|___a2_|___a1_|___a0_|
39 1.1 mrg |_b2_|___b1_|___b0_|
40 1.1 mrg <-t--><--n--><--n-->
41 1.1 mrg
42 1.1 mrg v0 = a0 * b0 # A(0)*B(0)
43 1.1 mrg v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2
44 1.1 mrg vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1
45 1.1 mrg v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6
46 1.1 mrg vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4
47 1.1 mrg vinf= a3 * b2 # A(inf)*B(inf)
48 1.1 mrg */
49 1.1 mrg
50 1.1 mrg void
51 1.1 mrg mpn_toom43_mul (mp_ptr pp,
52 1.1 mrg mp_srcptr ap, mp_size_t an,
53 1.1 mrg mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
54 1.1 mrg {
55 1.1 mrg mp_size_t n, s, t;
56 1.1 mrg enum toom6_flags flags;
57 1.1 mrg mp_limb_t cy;
58 1.1 mrg
59 1.1 mrg #define a0 ap
60 1.1 mrg #define a1 (ap + n)
61 1.1 mrg #define a2 (ap + 2 * n)
62 1.1 mrg #define a3 (ap + 3 * n)
63 1.1 mrg #define b0 bp
64 1.1 mrg #define b1 (bp + n)
65 1.1 mrg #define b2 (bp + 2 * n)
66 1.1 mrg
67 1.1 mrg n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
68 1.1 mrg
69 1.1 mrg s = an - 3 * n;
70 1.1 mrg t = bn - 2 * n;
71 1.1 mrg
72 1.1 mrg ASSERT (0 < s && s <= n);
73 1.1 mrg ASSERT (0 < t && t <= n);
74 1.1 mrg
75 1.1 mrg /* This is true whenever an >= 25 or bn >= 19, I think. It
76 1.1 mrg guarantees that we can fit 5 values of size n+1 in the product
77 1.1 mrg area. */
78 1.1 mrg ASSERT (s+t >= 5);
79 1.1 mrg
80 1.1 mrg #define v0 pp /* 2n */
81 1.1 mrg #define vm1 (scratch) /* 2n+1 */
82 1.1 mrg #define v1 (pp + 2*n) /* 2n+1 */
83 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
84 1.1 mrg #define v2 (scratch + 4 * n + 2) /* 2n+1 */
85 1.1 mrg #define vinf (pp + 5 * n) /* s+t */
86 1.1 mrg #define bs1 pp /* n+1 */
87 1.1 mrg #define bsm1 (scratch + 2 * n + 2) /* n+1 */
88 1.1 mrg #define asm1 (scratch + 3 * n + 3) /* n+1 */
89 1.1 mrg #define asm2 (scratch + 4 * n + 4) /* n+1 */
90 1.1 mrg #define bsm2 (pp + n + 1) /* n+1 */
91 1.1 mrg #define bs2 (pp + 2 * n + 2) /* n+1 */
92 1.1 mrg #define as2 (pp + 3 * n + 3) /* n+1 */
93 1.1 mrg #define as1 (pp + 4 * n + 4) /* n+1 */
94 1.1 mrg
95 1.1 mrg /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
96 1.1 mrg limb, because products will overwrite 2n+2 limbs. */
97 1.1 mrg
98 1.1 mrg #define a0a2 scratch
99 1.1 mrg #define b0b2 scratch
100 1.1 mrg #define a1a3 asm1
101 1.1 mrg #define b1d bsm1
102 1.1 mrg
103 1.1 mrg /* Compute as2 and asm2. */
104 1.1.1.2 mrg flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
105 1.1 mrg
106 1.1 mrg /* Compute bs2 and bsm2. */
107 1.1 mrg b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */
108 1.1 mrg cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */
109 1.1 mrg cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */
110 1.1 mrg if (t != n)
111 1.1 mrg cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
112 1.1 mrg b0b2[n] = cy;
113 1.1 mrg
114 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
115 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0)
116 1.1 mrg {
117 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
118 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
119 1.1 mrg }
120 1.1 mrg else
121 1.1 mrg {
122 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
123 1.1 mrg }
124 1.1 mrg #else
125 1.1 mrg mpn_add_n (bs2, b0b2, b1d, n+1);
126 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0)
127 1.1 mrg {
128 1.1 mrg mpn_sub_n (bsm2, b1d, b0b2, n+1);
129 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
130 1.1 mrg }
131 1.1 mrg else
132 1.1 mrg {
133 1.1 mrg mpn_sub_n (bsm2, b0b2, b1d, n+1);
134 1.1 mrg }
135 1.1 mrg #endif
136 1.1 mrg
137 1.1 mrg /* Compute as1 and asm1. */
138 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2));
139 1.1 mrg
140 1.1 mrg /* Compute bs1 and bsm1. */
141 1.1 mrg bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
142 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
143 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
144 1.1 mrg {
145 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
146 1.1 mrg bs1[n] = cy >> 1;
147 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
148 1.1 mrg }
149 1.1 mrg else
150 1.1 mrg {
151 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
152 1.1 mrg bs1[n] = bsm1[n] + (cy >> 1);
153 1.1 mrg bsm1[n]-= cy & 1;
154 1.1 mrg }
155 1.1 mrg #else
156 1.1 mrg bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
157 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
158 1.1 mrg {
159 1.1 mrg mpn_sub_n (bsm1, b1, bsm1, n);
160 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
161 1.1 mrg }
162 1.1 mrg else
163 1.1 mrg {
164 1.1 mrg bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
165 1.1 mrg }
166 1.1 mrg #endif
167 1.1 mrg
168 1.1 mrg ASSERT (as1[n] <= 3);
169 1.1 mrg ASSERT (bs1[n] <= 2);
170 1.1 mrg ASSERT (asm1[n] <= 1);
171 1.1 mrg ASSERT (bsm1[n] <= 1);
172 1.1 mrg ASSERT (as2[n] <=14);
173 1.1 mrg ASSERT (bs2[n] <= 6);
174 1.1 mrg ASSERT (asm2[n] <= 9);
175 1.1 mrg ASSERT (bsm2[n] <= 4);
176 1.1 mrg
177 1.1 mrg /* vm1, 2n+1 limbs */
178 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n+1); /* W4 */
179 1.1 mrg
180 1.1 mrg /* vm2, 2n+1 limbs */
181 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
182 1.1 mrg
183 1.1 mrg /* v2, 2n+1 limbs */
184 1.1 mrg mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
185 1.1 mrg
186 1.1 mrg /* v1, 2n+1 limbs */
187 1.1 mrg mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
188 1.1 mrg
189 1.1 mrg /* vinf, s+t limbs */ /* W0 */
190 1.1 mrg if (s > t) mpn_mul (vinf, a3, s, b2, t);
191 1.1 mrg else mpn_mul (vinf, b2, t, a3, s);
192 1.1 mrg
193 1.1 mrg /* v0, 2n limbs */
194 1.1 mrg mpn_mul_n (v0, ap, bp, n); /* W5 */
195 1.1 mrg
196 1.1 mrg mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
197 1.1 mrg
198 1.1 mrg #undef v0
199 1.1 mrg #undef vm1
200 1.1 mrg #undef v1
201 1.1 mrg #undef vm2
202 1.1 mrg #undef v2
203 1.1 mrg #undef vinf
204 1.1 mrg #undef bs1
205 1.1 mrg #undef bs2
206 1.1 mrg #undef bsm1
207 1.1 mrg #undef bsm2
208 1.1 mrg #undef asm1
209 1.1 mrg #undef asm2
210 1.1 mrg /* #undef as1 */
211 1.1 mrg /* #undef as2 */
212 1.1 mrg #undef a0a2
213 1.1 mrg #undef b0b2
214 1.1 mrg #undef a1a3
215 1.1 mrg #undef b1d
216 1.1 mrg #undef a0
217 1.1 mrg #undef a1
218 1.1 mrg #undef a2
219 1.1 mrg #undef a3
220 1.1 mrg #undef b0
221 1.1 mrg #undef b1
222 1.1 mrg #undef b2
223 1.1 mrg }
224