/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006-2008, 2012 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.
*/ 40 1.1 mrg 41 1.1 mrg 42 1.1 mrg #include "gmp-impl.h" 43 1.1 mrg 44 1.1 mrg /* Evaluate in: 45 1.1 mrg 0, +1, -1, +2, -2, 1/2, +inf 46 1.1 mrg 47 1.1 mrg <-s-><--n--><--n--><--n--><--n--><--n--> 48 1.1 mrg ___ ______ ______ ______ ______ ______ 49 1.1 mrg |a5_|___a4_|___a3_|___a2_|___a1_|___a0_| 50 1.1 mrg |_b1_|___b0_| 51 1.1 mrg <-t--><--n--> 52 1.1 mrg 53 1.1 mrg v0 = a0 * b0 # A(0)*B(0) 54 1.1 mrg v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1 55 1.1 mrg vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0 56 1.1 mrg v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2 57 1.1 mrg vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0 58 1.1 mrg vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2 59 1.1 mrg vinf= a5 * b1 # A(inf)*B(inf) 60 1.1 mrg */ 61 1.1 mrg 62 1.1 mrg void 63 1.1 mrg mpn_toom62_mul (mp_ptr pp, 64 1.1 mrg mp_srcptr ap, mp_size_t an, 65 1.1 mrg mp_srcptr bp, mp_size_t bn, 66 1.1 mrg mp_ptr scratch) 67 1.1 mrg { 68 1.1 mrg mp_size_t n, s, t; 69 1.1 mrg mp_limb_t cy; 70 1.1 mrg mp_ptr as1, asm1, as2, asm2, ash; 71 1.1 mrg mp_ptr bs1, bsm1, bs2, bsm2, bsh; 72 1.1 mrg mp_ptr gp; 73 1.1 mrg enum toom7_flags aflags, bflags; 74 1.1 mrg TMP_DECL; 75 1.1 mrg 76 1.1 mrg #define a0 ap 77 1.1 mrg #define a1 (ap + n) 78 1.1 mrg #define a2 (ap + 2*n) 79 1.1 mrg #define a3 (ap + 3*n) 80 1.1 mrg #define a4 (ap + 4*n) 81 1.1 mrg #define a5 (ap + 5*n) 82 1.1 mrg #define b0 bp 83 1.1 mrg #define b1 (bp + n) 84 1.1 mrg 85 1.1 mrg n = 1 + (an >= 3 * bn ? 
(an - 1) / (size_t) 6 : (bn - 1) >> 1); 86 1.1 mrg 87 1.1 mrg s = an - 5 * n; 88 1.1 mrg t = bn - n; 89 1.1 mrg 90 1.1 mrg ASSERT (0 < s && s <= n); 91 1.1 mrg ASSERT (0 < t && t <= n); 92 1.1 mrg 93 1.1 mrg TMP_MARK; 94 1.1 mrg 95 1.1 mrg as1 = TMP_SALLOC_LIMBS (n + 1); 96 1.1 mrg asm1 = TMP_SALLOC_LIMBS (n + 1); 97 1.1 mrg as2 = TMP_SALLOC_LIMBS (n + 1); 98 1.1 mrg asm2 = TMP_SALLOC_LIMBS (n + 1); 99 1.1 mrg ash = TMP_SALLOC_LIMBS (n + 1); 100 1.1 mrg 101 1.1 mrg bs1 = TMP_SALLOC_LIMBS (n + 1); 102 1.1 mrg bsm1 = TMP_SALLOC_LIMBS (n); 103 1.1 mrg bs2 = TMP_SALLOC_LIMBS (n + 1); 104 1.1 mrg bsm2 = TMP_SALLOC_LIMBS (n + 1); 105 1.1 mrg bsh = TMP_SALLOC_LIMBS (n + 1); 106 1.1 mrg 107 1.1 mrg gp = pp; 108 1.1 mrg 109 1.1 mrg /* Compute as1 and asm1. */ 110 1.1.1.2 mrg aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp)); 111 1.1 mrg 112 1.1 mrg /* Compute as2 and asm2. */ 113 1.1.1.3 mrg aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp))); 114 1.1 mrg 115 1.1 mrg /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5 116 1.1 mrg = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */ 117 1.1 mrg 118 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n 119 1.1 mrg cy = mpn_addlsh1_n (ash, a1, a0, n); 120 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); 121 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); 122 1.1 mrg cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); 123 1.1 mrg if (s < n) 124 1.1 mrg { 125 1.1 mrg mp_limb_t cy2; 126 1.1 mrg cy2 = mpn_addlsh1_n (ash, a5, ash, s); 127 1.1 mrg ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); 128 1.1 mrg MPN_INCR_U (ash + s, n+1-s, cy2); 129 1.1 mrg } 130 1.1 mrg else 131 1.1 mrg ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n); 132 1.1 mrg #else 133 1.1 mrg cy = mpn_lshift (ash, a0, n, 1); 134 1.1 mrg cy += mpn_add_n (ash, ash, a1, n); 135 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1); 136 1.1 mrg cy += mpn_add_n (ash, ash, a2, n); 137 1.1 
mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1); 138 1.1 mrg cy += mpn_add_n (ash, ash, a3, n); 139 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1); 140 1.1 mrg cy += mpn_add_n (ash, ash, a4, n); 141 1.1 mrg cy = 2*cy + mpn_lshift (ash, ash, n, 1); 142 1.1 mrg ash[n] = cy + mpn_add (ash, ash, n, a5, s); 143 1.1 mrg #endif 144 1.1 mrg 145 1.1 mrg /* Compute bs1 and bsm1. */ 146 1.1 mrg if (t == n) 147 1.1 mrg { 148 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n 149 1.1 mrg if (mpn_cmp (b0, b1, n) < 0) 150 1.1 mrg { 151 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); 152 1.1 mrg bflags = toom7_w3_neg; 153 1.1 mrg } 154 1.1 mrg else 155 1.1 mrg { 156 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); 157 1.1.1.2 mrg bflags = (enum toom7_flags) 0; 158 1.1 mrg } 159 1.1 mrg bs1[n] = cy >> 1; 160 1.1 mrg #else 161 1.1 mrg bs1[n] = mpn_add_n (bs1, b0, b1, n); 162 1.1 mrg if (mpn_cmp (b0, b1, n) < 0) 163 1.1 mrg { 164 1.1 mrg mpn_sub_n (bsm1, b1, b0, n); 165 1.1 mrg bflags = toom7_w3_neg; 166 1.1 mrg } 167 1.1 mrg else 168 1.1 mrg { 169 1.1 mrg mpn_sub_n (bsm1, b0, b1, n); 170 1.1.1.2 mrg bflags = (enum toom7_flags) 0; 171 1.1 mrg } 172 1.1 mrg #endif 173 1.1 mrg } 174 1.1 mrg else 175 1.1 mrg { 176 1.1 mrg bs1[n] = mpn_add (bs1, b0, n, b1, t); 177 1.1 mrg if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) 178 1.1 mrg { 179 1.1 mrg mpn_sub_n (bsm1, b1, b0, t); 180 1.1 mrg MPN_ZERO (bsm1 + t, n - t); 181 1.1 mrg bflags = toom7_w3_neg; 182 1.1 mrg } 183 1.1 mrg else 184 1.1 mrg { 185 1.1 mrg mpn_sub (bsm1, b0, n, b1, t); 186 1.1.1.2 mrg bflags = (enum toom7_flags) 0; 187 1.1 mrg } 188 1.1 mrg } 189 1.1 mrg 190 1.1 mrg /* Compute bs2 and bsm2. 
Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 = 191 1.1 mrg bsm1 - b1 */ 192 1.1 mrg mpn_add (bs2, bs1, n + 1, b1, t); 193 1.1 mrg if (bflags & toom7_w3_neg) 194 1.1 mrg { 195 1.1 mrg bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); 196 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg); 197 1.1 mrg } 198 1.1 mrg else 199 1.1 mrg { 200 1.1 mrg /* FIXME: Simplify this logic? */ 201 1.1 mrg if (t < n) 202 1.1 mrg { 203 1.1 mrg if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) 204 1.1 mrg { 205 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t)); 206 1.1 mrg MPN_ZERO (bsm2 + t, n + 1 - t); 207 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg); 208 1.1 mrg } 209 1.1 mrg else 210 1.1 mrg { 211 1.1 mrg ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t)); 212 1.1 mrg bsm2[n] = 0; 213 1.1 mrg } 214 1.1 mrg } 215 1.1 mrg else 216 1.1 mrg { 217 1.1 mrg if (mpn_cmp (bsm1, b1, n) < 0) 218 1.1 mrg { 219 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n)); 220 1.1.1.2 mrg bflags = (enum toom7_flags) (bflags | toom7_w1_neg); 221 1.1 mrg } 222 1.1 mrg else 223 1.1 mrg { 224 1.1.1.2 mrg ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n)); 225 1.1 mrg } 226 1.1 mrg bsm2[n] = 0; 227 1.1 mrg } 228 1.1 mrg } 229 1.1 mrg 230 1.1.1.2 mrg /* Compute bsh, recycling bs1. 
bsh=bs1+b0; */ 231 1.1.1.2 mrg bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n); 232 1.1 mrg 233 1.1 mrg ASSERT (as1[n] <= 5); 234 1.1 mrg ASSERT (bs1[n] <= 1); 235 1.1 mrg ASSERT (asm1[n] <= 2); 236 1.1 mrg ASSERT (as2[n] <= 62); 237 1.1 mrg ASSERT (bs2[n] <= 2); 238 1.1 mrg ASSERT (asm2[n] <= 41); 239 1.1 mrg ASSERT (bsm2[n] <= 1); 240 1.1 mrg ASSERT (ash[n] <= 62); 241 1.1 mrg ASSERT (bsh[n] <= 2); 242 1.1 mrg 243 1.1 mrg #define v0 pp /* 2n */ 244 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */ 245 1.1 mrg #define vinf (pp + 6 * n) /* s+t */ 246 1.1 mrg #define v2 scratch /* 2n+1 */ 247 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */ 248 1.1 mrg #define vh (scratch + 4 * n + 2) /* 2n+1 */ 249 1.1 mrg #define vm1 (scratch + 6 * n + 3) /* 2n+1 */ 250 1.1 mrg #define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ 251 1.1 mrg /* Total scratch need: 10*n+5 */ 252 1.1 mrg 253 1.1 mrg /* Must be in allocation order, as they overwrite one limb beyond 254 1.1 mrg * 2n+1. */ 255 1.1 mrg mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ 256 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ 257 1.1 mrg mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ 258 1.1 mrg 259 1.1 mrg /* vm1, 2n+1 limbs */ 260 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n); 261 1.1 mrg cy = 0; 262 1.1 mrg if (asm1[n] == 1) 263 1.1 mrg { 264 1.1 mrg cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); 265 1.1 mrg } 266 1.1 mrg else if (asm1[n] == 2) 267 1.1 mrg { 268 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n 269 1.1 mrg cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n); 270 1.1 mrg #else 271 1.1 mrg cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); 272 1.1 mrg #endif 273 1.1 mrg } 274 1.1 mrg vm1[2 * n] = cy; 275 1.1 mrg 276 1.1 mrg /* v1, 2n+1 limbs */ 277 1.1 mrg mpn_mul_n (v1, as1, bs1, n); 278 1.1 mrg if (as1[n] == 1) 279 1.1 mrg { 280 1.1 mrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); 281 1.1 mrg } 282 1.1 mrg else if (as1[n] == 2) 283 1.1 mrg { 284 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n 285 
1.1 mrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n); 286 1.1 mrg #else 287 1.1 mrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); 288 1.1 mrg #endif 289 1.1 mrg } 290 1.1 mrg else if (as1[n] != 0) 291 1.1 mrg { 292 1.1 mrg cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); 293 1.1 mrg } 294 1.1 mrg else 295 1.1 mrg cy = 0; 296 1.1 mrg if (bs1[n] != 0) 297 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n); 298 1.1 mrg v1[2 * n] = cy; 299 1.1 mrg 300 1.1 mrg mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ 301 1.1 mrg 302 1.1 mrg /* vinf, s+t limbs */ 303 1.1 mrg if (s > t) mpn_mul (vinf, a5, s, b1, t); 304 1.1 mrg else mpn_mul (vinf, b1, t, a5, s); 305 1.1 mrg 306 1.1.1.2 mrg mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags), 307 1.1 mrg vm2, vm1, v2, vh, s + t, scratch_out); 308 1.1 mrg 309 1.1 mrg TMP_FREE; 310 1.1 mrg } 311