/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=  a2         *  b2         # A(inf)*B(inf)
*/
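/* A worked instance of the scheme above with single-limb coefficients
   (illustrative numbers only, not part of the algorithm): for
   A = 3x^2 + 2x + 1 and B = 6x^2 + 5x + 4, the five evaluations are

     v0   = 1*4                =   4
     v1   = (1+2+3)*(4+5+6)    =  90
     vm1  = (1-2+3)*(4-5+6)    =  10
     v2   = (1+4+12)*(4+10+24) = 646
     vinf = 3*6                =  18

   and the product C = 18x^4 + 27x^3 + 28x^2 + 13x + 4 is recovered by

     c0 = v0 = 4,   c4 = vinf = 18,
     c2        = (v1+vm1)/2 - v0 - vinf    = 50 - 22        = 28,
     c1 + c3   = (v1-vm1)/2                = 40,
     c1 + 4*c3 = (v2-v0)/2 - 2*c2 - 8*c4   = 321 - 56 - 144 = 121,

   giving c3 = (121-40)/3 = 27 and c1 = 40 - 27 = 13.  The exact in-place
   sequence used on limb vectors lives in mpn_toom_interpolate_5pts.  */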
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)

void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;
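  /* Note: all three branches above form A(2) = a0 + 2*a1 + 4*a2 without
     explicit multiplications.  The addlsh1 branch is Horner's rule,
     A(2) = (2*a2 + a1)*2 + a0, one shift-and-add pass per step; in
     single-limb terms (illustrative sketch only):

       as2 = (a2 << 1) + a1;		-- mpn_addlsh1_n (as2, a1, a2, s)
       as2 = (as2 << 1) + a0;		-- mpn_addlsh1_n (as2, a0, as2, n)

     The rsblsh1 branch instead reuses A(1), already in as1, via the
     identity A(2) = 2*(A(1) + a2) - a0, and the generic branch spells
     that same identity out with mpn_lshift and mpn_sub_n.  */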
  /* Compute bs1 and bsm1.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif
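  /* In the SMALLER_RECURSION variant above, the high-limb correction
     follows from expanding the (n+1)-limb operands around the limb
     base B:

       (X + h*B^n)*(Y + k*B^n) = X*Y + (h*Y + k*X)*B^n + h*k*B^(2n)

     with X = {asm1,n}, Y = {bsm1,n} and h = asm1[n], k = bsm1[n], both
     0 or 1 (see the ASSERTs above).  When h = 1, starting cy at bsm1[n]
     accounts for the h*k*B^(2n) term.  The v1 computation below uses the
     same identity, but with as1[n], bs1[n] <= 2, hence its extra
     shift-and-add (or addmul_1 by 2) cases.  */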
  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
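#if 0
/* Minimal usage sketch (illustrative only, hence not compiled).  Callers
   normally reach this function through mpn_mul, which selects the
   algorithm and sizes the scratch space; a direct call, using the
   mpn_toom33_mul_itch macro from gmp-impl.h and GMP's TMP allocation
   idiom, would look like this (toom33_mul_example is a hypothetical
   helper name):  */
static void
toom33_mul_example (mp_ptr pp, mp_srcptr ap, mp_size_t an,
		    mp_srcptr bp, mp_size_t bn)
{
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (bn <= an && 2 * an < 3 * bn);	/* bn <= an < (3/2)bn */

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_toom33_mul_itch (an, bn));
  mpn_toom33_mul (pp, ap, an, bp, bn, scratch);	/* {pp, an+bn} = product */
  TMP_FREE;
}
#endif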