1 1.1 mrg /* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3 2 1.1 mrg times as large as bn. Or more accurately, bn < an < 2 bn. 3 1.1 mrg 4 1.1 mrg Contributed to the GNU project by Marco Bodrato. 5 1.1 mrg 6 1.1 mrg The idea of applying toom to unbalanced multiplication is due to Marco 7 1.1 mrg Bodrato and Alberto Zanoni. 8 1.1 mrg 9 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 10 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 11 1.1 mrg GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 12 1.1 mrg 13 1.1 mrg Copyright 2009 Free Software Foundation, Inc. 14 1.1 mrg 15 1.1 mrg This file is part of the GNU MP Library. 16 1.1 mrg 17 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 18 1.1.1.3 mrg it under the terms of either: 19 1.1.1.3 mrg 20 1.1.1.3 mrg * the GNU Lesser General Public License as published by the Free 21 1.1.1.3 mrg Software Foundation; either version 3 of the License, or (at your 22 1.1.1.3 mrg option) any later version. 23 1.1.1.3 mrg 24 1.1.1.3 mrg or 25 1.1.1.3 mrg 26 1.1.1.3 mrg * the GNU General Public License as published by the Free Software 27 1.1.1.3 mrg Foundation; either version 2 of the License, or (at your option) any 28 1.1.1.3 mrg later version. 29 1.1.1.3 mrg 30 1.1.1.3 mrg or both in parallel, as here. 31 1.1 mrg 32 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 33 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 34 1.1.1.3 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 35 1.1.1.3 mrg for more details. 36 1.1 mrg 37 1.1.1.3 mrg You should have received copies of the GNU General Public License and the 38 1.1.1.3 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 39 1.1.1.3 mrg see https://www.gnu.org/licenses/. */ 40 1.1 mrg 41 1.1 mrg 42 1.1 mrg #include "gmp-impl.h" 43 1.1 mrg 44 1.1 mrg /* Evaluate in: -2, -1, 0, +1, +2, +inf 45 1.1 mrg 46 1.1 mrg <-s-><--n--><--n--><--n--> 47 1.1 mrg ___ ______ ______ ______ 48 1.1 mrg |a3_|___a2_|___a1_|___a0_| 49 1.1 mrg |_b2_|___b1_|___b0_| 50 1.1 mrg <-t--><--n--><--n--> 51 1.1 mrg 52 1.1 mrg v0 = a0 * b0 # A(0)*B(0) 53 1.1 mrg v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2 54 1.1 mrg vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1 55 1.1 mrg v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6 56 1.1 mrg vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4 57 1.1 mrg vinf= a3 * b2 # A(inf)*B(inf) 58 1.1 mrg */ 59 1.1 mrg 60 1.1 mrg void 61 1.1 mrg mpn_toom43_mul (mp_ptr pp, 62 1.1 mrg mp_srcptr ap, mp_size_t an, 63 1.1 mrg mp_srcptr bp, mp_size_t bn, mp_ptr scratch) 64 1.1 mrg { 65 1.1 mrg mp_size_t n, s, t; 66 1.1 mrg enum toom6_flags flags; 67 1.1 mrg mp_limb_t cy; 68 1.1 mrg 69 1.1 mrg #define a0 ap 70 1.1 mrg #define a1 (ap + n) 71 1.1 mrg #define a2 (ap + 2 * n) 72 1.1 mrg #define a3 (ap + 3 * n) 73 1.1 mrg #define b0 bp 74 1.1 mrg #define b1 (bp + n) 75 1.1 mrg #define b2 (bp + 2 * n) 76 1.1 mrg 77 1.1 mrg n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3); 78 1.1 mrg 79 1.1 mrg s = an - 3 * n; 80 1.1 mrg t = bn - 2 * n; 81 1.1 mrg 82 1.1 mrg ASSERT (0 < s && s <= n); 83 1.1 mrg ASSERT (0 < t && t <= n); 84 1.1 mrg 85 1.1 mrg /* This is true whenever an >= 25 or bn >= 19, I think. It 86 1.1 mrg guarantees that we can fit 5 values of size n+1 in the product 87 1.1 mrg area. */ 88 1.1 mrg ASSERT (s+t >= 5); 89 1.1 mrg 90 1.1 mrg #define v0 pp /* 2n */ 91 1.1 mrg #define vm1 (scratch) /* 2n+1 */ 92 1.1 mrg #define v1 (pp + 2*n) /* 2n+1 */ 93 1.1 mrg #define vm2 (scratch + 2 * n + 1) /* 2n+1 */ 94 1.1 mrg #define v2 (scratch + 4 * n + 2) /* 2n+1 */ 95 1.1 mrg #define vinf (pp + 5 * n) /* s+t */ 96 1.1 mrg #define bs1 pp /* n+1 */ 97 1.1 mrg #define bsm1 (scratch + 2 * n + 2) /* n+1 */ 98 1.1 mrg #define asm1 (scratch + 3 * n + 3) /* n+1 */ 99 1.1 mrg #define asm2 (scratch + 4 * n + 4) /* n+1 */ 100 1.1 mrg #define bsm2 (pp + n + 1) /* n+1 */ 101 1.1 mrg #define bs2 (pp + 2 * n + 2) /* n+1 */ 102 1.1 mrg #define as2 (pp + 3 * n + 3) /* n+1 */ 103 1.1 mrg #define as1 (pp + 4 * n + 4) /* n+1 */ 104 1.1 mrg 105 1.1 mrg /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra 106 1.1 mrg limb, because products will overwrite 2n+2 limbs. */ 107 1.1 mrg 108 1.1 mrg #define a0a2 scratch 109 1.1 mrg #define b0b2 scratch 110 1.1 mrg #define a1a3 asm1 111 1.1 mrg #define b1d bsm1 112 1.1 mrg 113 1.1 mrg /* Compute as2 and asm2. */ 114 1.1.1.2 mrg flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3)); 115 1.1 mrg 116 1.1 mrg /* Compute bs2 and bsm2. */ 117 1.1 mrg b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */ 118 1.1 mrg cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */ 119 1.1 mrg cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */ 120 1.1 mrg if (t != n) 121 1.1 mrg cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy); 122 1.1 mrg b0b2[n] = cy; 123 1.1 mrg 124 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n 125 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0) 126 1.1 mrg { 127 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1); 128 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); 129 1.1 mrg } 130 1.1 mrg else 131 1.1 mrg { 132 1.1 mrg mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1); 133 1.1 mrg } 134 1.1 mrg #else 135 1.1 mrg mpn_add_n (bs2, b0b2, b1d, n+1); 136 1.1 mrg if (mpn_cmp (b0b2, b1d, n+1) < 0) 137 1.1 mrg { 138 1.1 mrg mpn_sub_n (bsm2, b1d, b0b2, n+1); 139 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); 140 1.1 mrg } 141 1.1 mrg else 142 1.1 mrg { 143 1.1 mrg mpn_sub_n (bsm2, b0b2, b1d, n+1); 144 1.1 mrg } 145 1.1 mrg #endif 146 1.1 mrg 147 1.1 mrg /* Compute as1 and asm1. */ 148 1.1.1.3 mrg flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2))); 149 1.1 mrg 150 1.1 mrg /* Compute bs1 and bsm1. */ 151 1.1 mrg bsm1[n] = mpn_add (bsm1, b0, n, b2, t); 152 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n 153 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) 154 1.1 mrg { 155 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n); 156 1.1 mrg bs1[n] = cy >> 1; 157 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); 158 1.1 mrg } 159 1.1 mrg else 160 1.1 mrg { 161 1.1 mrg cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n); 162 1.1 mrg bs1[n] = bsm1[n] + (cy >> 1); 163 1.1 mrg bsm1[n]-= cy & 1; 164 1.1 mrg } 165 1.1 mrg #else 166 1.1 mrg bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n); 167 1.1 mrg if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) 168 1.1 mrg { 169 1.1 mrg mpn_sub_n (bsm1, b1, bsm1, n); 170 1.1.1.2 mrg flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); 171 1.1 mrg } 172 1.1 mrg else 173 1.1 mrg { 174 1.1 mrg bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n); 175 1.1 mrg } 176 1.1 mrg #endif 177 1.1 mrg 178 1.1 mrg ASSERT (as1[n] <= 3); 179 1.1 mrg ASSERT (bs1[n] <= 2); 180 1.1 mrg ASSERT (asm1[n] <= 1); 181 1.1 mrg ASSERT (bsm1[n] <= 1); 182 1.1 mrg ASSERT (as2[n] <=14); 183 1.1 mrg ASSERT (bs2[n] <= 6); 184 1.1 mrg ASSERT (asm2[n] <= 9); 185 1.1 mrg ASSERT (bsm2[n] <= 4); 186 1.1 mrg 187 1.1 mrg /* vm1, 2n+1 limbs */ 188 1.1 mrg mpn_mul_n (vm1, asm1, bsm1, n+1); /* W4 */ 189 1.1 mrg 190 1.1 mrg /* vm2, 2n+1 limbs */ 191 1.1 mrg mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */ 192 1.1 mrg 193 1.1 mrg /* v2, 2n+1 limbs */ 194 1.1 mrg mpn_mul_n (v2, as2, bs2, n+1); /* W1 */ 195 1.1 mrg 196 1.1 mrg /* v1, 2n+1 limbs */ 197 1.1 mrg mpn_mul_n (v1, as1, bs1, n+1); /* W3 */ 198 1.1 mrg 199 1.1 mrg /* vinf, s+t limbs */ /* W0 */ 200 1.1 mrg if (s > t) mpn_mul (vinf, a3, s, b2, t); 201 1.1 mrg else mpn_mul (vinf, b2, t, a3, s); 202 1.1 mrg 203 1.1 mrg /* v0, 2n limbs */ 204 1.1 mrg mpn_mul_n (v0, ap, bp, n); /* W5 */ 205 1.1 mrg 206 1.1 mrg mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s); 207 1.1 mrg 208 1.1 mrg #undef v0 209 1.1 mrg #undef vm1 210 1.1 mrg #undef v1 211 1.1 mrg #undef vm2 212 1.1 mrg #undef v2 213 1.1 mrg #undef vinf 214 1.1 mrg #undef bs1 215 1.1 mrg #undef bs2 216 1.1 mrg #undef bsm1 217 1.1 mrg #undef bsm2 218 1.1 mrg #undef asm1 219 1.1 mrg #undef asm2 220 1.1 mrg /* #undef as1 */ 221 1.1 mrg /* #undef as2 */ 222 1.1 mrg #undef a0a2 223 1.1 mrg #undef b0b2 224 1.1 mrg #undef a1a3 225 1.1 mrg #undef b1d 226 1.1 mrg #undef a0 227 1.1 mrg #undef a1 228 1.1 mrg #undef a2 229 1.1 mrg #undef a3 230 1.1 mrg #undef b0 231 1.1 mrg #undef b1 232 1.1 mrg #undef b2 233 1.1 mrg } 234