1 1.1 mrg /* mpn_mulmid -- middle product 2 1.1 mrg 3 1.1 mrg Contributed by David Harvey. 4 1.1 mrg 5 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 1.1 mrg GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 1.1 mrg 9 1.1 mrg Copyright 2011 Free Software Foundation, Inc. 10 1.1 mrg 11 1.1 mrg This file is part of the GNU MP Library. 12 1.1 mrg 13 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 14 1.1.1.2 mrg it under the terms of either: 15 1.1.1.2 mrg 16 1.1.1.2 mrg * the GNU Lesser General Public License as published by the Free 17 1.1.1.2 mrg Software Foundation; either version 3 of the License, or (at your 18 1.1.1.2 mrg option) any later version. 19 1.1.1.2 mrg 20 1.1.1.2 mrg or 21 1.1.1.2 mrg 22 1.1.1.2 mrg * the GNU General Public License as published by the Free Software 23 1.1.1.2 mrg Foundation; either version 2 of the License, or (at your option) any 24 1.1.1.2 mrg later version. 25 1.1.1.2 mrg 26 1.1.1.2 mrg or both in parallel, as here. 27 1.1 mrg 28 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 29 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 1.1.1.2 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 1.1.1.2 mrg for more details. 32 1.1 mrg 33 1.1.1.2 mrg You should have received copies of the GNU General Public License and the 34 1.1.1.2 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 35 1.1.1.2 mrg see https://www.gnu.org/licenses/. */ 36 1.1 mrg 37 1.1 mrg 38 1.1 mrg #include "gmp-impl.h" 39 1.1 mrg 40 1.1 mrg 41 1.1 mrg #define CHUNK (200 + MULMID_TOOM42_THRESHOLD) 42 1.1 mrg 43 1.1 mrg 44 1.1 mrg void 45 1.1 mrg mpn_mulmid (mp_ptr rp, 46 1.1 mrg mp_srcptr ap, mp_size_t an, 47 1.1 mrg mp_srcptr bp, mp_size_t bn) 48 1.1 mrg { 49 1.1 mrg mp_size_t rn, k; 50 1.1 mrg mp_ptr scratch, temp; 51 1.1 mrg 52 1.1 mrg ASSERT (an >= bn); 53 1.1 mrg ASSERT (bn >= 1); 54 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an)); 55 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn)); 56 1.1 mrg 57 1.1 mrg if (bn < MULMID_TOOM42_THRESHOLD) 58 1.1 mrg { 59 1.1 mrg /* region not tall enough to make toom42 worthwhile for any portion */ 60 1.1 mrg 61 1.1 mrg if (an < CHUNK) 62 1.1 mrg { 63 1.1 mrg /* region not too wide either, just call basecase directly */ 64 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn); 65 1.1 mrg return; 66 1.1 mrg } 67 1.1 mrg 68 1.1 mrg /* Region quite wide. For better locality, use basecase on chunks: 69 1.1 mrg 70 1.1 mrg AAABBBCC.. 71 1.1 mrg .AAABBBCC. 72 1.1 mrg ..AAABBBCC 73 1.1 mrg */ 74 1.1 mrg 75 1.1 mrg k = CHUNK - bn + 1; /* number of diagonals per chunk */ 76 1.1 mrg 77 1.1 mrg /* first chunk (marked A in the above diagram) */ 78 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); 79 1.1 mrg 80 1.1 mrg /* remaining chunks (B, C, etc) */ 81 1.1 mrg an -= k; 82 1.1 mrg 83 1.1 mrg while (an >= CHUNK) 84 1.1 mrg { 85 1.1 mrg mp_limb_t t0, t1, cy; 86 1.1 mrg ap += k, rp += k; 87 1.1 mrg t0 = rp[0], t1 = rp[1]; 88 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); 89 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ 90 1.1 mrg MPN_INCR_U (rp + 1, k + 1, t1 + cy); 91 1.1 mrg an -= k; 92 1.1 mrg } 93 1.1 mrg 94 1.1 mrg if (an >= bn) 95 1.1 mrg { 96 1.1 mrg /* last remaining chunk */ 97 1.1 mrg mp_limb_t t0, t1, cy; 98 1.1 mrg ap += k, rp += k; 99 1.1 mrg t0 = rp[0], t1 = rp[1]; 100 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn); 101 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); 102 1.1 mrg MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy); 103 1.1 mrg } 104 1.1 mrg 105 1.1 mrg return; 106 1.1 mrg } 107 1.1 mrg 108 1.1 mrg /* region is tall enough for toom42 */ 109 1.1 mrg 110 1.1 mrg rn = an - bn + 1; 111 1.1 mrg 112 1.1 mrg if (rn < MULMID_TOOM42_THRESHOLD) 113 1.1 mrg { 114 1.1 mrg /* region not wide enough to make toom42 worthwhile for any portion */ 115 1.1 mrg 116 1.1 mrg TMP_DECL; 117 1.1 mrg 118 1.1 mrg if (bn < CHUNK) 119 1.1 mrg { 120 1.1 mrg /* region not too tall either, just call basecase directly */ 121 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn); 122 1.1 mrg return; 123 1.1 mrg } 124 1.1 mrg 125 1.1 mrg /* Region quite tall. For better locality, use basecase on chunks: 126 1.1 mrg 127 1.1 mrg AAAAA.... 128 1.1 mrg .AAAAA... 129 1.1 mrg ..BBBBB.. 130 1.1 mrg ...BBBBB. 131 1.1 mrg ....CCCCC 132 1.1 mrg */ 133 1.1 mrg 134 1.1 mrg TMP_MARK; 135 1.1 mrg 136 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2); 137 1.1 mrg 138 1.1 mrg /* first chunk (marked A in the above diagram) */ 139 1.1 mrg bp += bn - CHUNK, an -= bn - CHUNK; 140 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, CHUNK); 141 1.1 mrg 142 1.1 mrg /* remaining chunks (B, C, etc) */ 143 1.1 mrg bn -= CHUNK; 144 1.1 mrg 145 1.1 mrg while (bn >= CHUNK) 146 1.1 mrg { 147 1.1 mrg ap += CHUNK, bp -= CHUNK; 148 1.1 mrg mpn_mulmid_basecase (temp, ap, an, bp, CHUNK); 149 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2); 150 1.1 mrg bn -= CHUNK; 151 1.1 mrg } 152 1.1 mrg 153 1.1 mrg if (bn) 154 1.1 mrg { 155 1.1 mrg /* last remaining chunk */ 156 1.1 mrg ap += CHUNK, bp -= bn; 157 1.1 mrg mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn); 158 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2); 159 1.1 mrg } 160 1.1 mrg 161 1.1 mrg TMP_FREE; 162 1.1 mrg return; 163 1.1 mrg } 164 1.1 mrg 165 1.1 mrg /* we're definitely going to use toom42 somewhere */ 166 1.1 mrg 167 1.1 mrg if (bn > rn) 168 1.1 mrg { 169 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly 170 1.1 mrg the last: 171 1.1 mrg 172 1.1 mrg AA.... 173 1.1 mrg .AA... 174 1.1 mrg ..BB.. 175 1.1 mrg ...BB. 176 1.1 mrg ....CC 177 1.1 mrg */ 178 1.1 mrg 179 1.1 mrg TMP_DECL; 180 1.1 mrg TMP_MARK; 181 1.1 mrg 182 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn)); 183 1.1 mrg scratch = temp + rn + 2; 184 1.1 mrg 185 1.1 mrg /* first chunk (marked A in the above diagram) */ 186 1.1 mrg bp += bn - rn; 187 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, rn, scratch); 188 1.1 mrg 189 1.1 mrg /* remaining chunks (B, C, etc) */ 190 1.1 mrg bn -= rn; 191 1.1 mrg 192 1.1 mrg while (bn >= rn) 193 1.1 mrg { 194 1.1 mrg ap += rn, bp -= rn; 195 1.1 mrg mpn_toom42_mulmid (temp, ap, bp, rn, scratch); 196 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2); 197 1.1 mrg bn -= rn; 198 1.1 mrg } 199 1.1 mrg 200 1.1 mrg if (bn) 201 1.1 mrg { 202 1.1 mrg /* last remaining chunk */ 203 1.1 mrg ap += rn, bp -= bn; 204 1.1 mrg mpn_mulmid (temp, ap, rn + bn - 1, bp, bn); 205 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2); 206 1.1 mrg } 207 1.1 mrg 208 1.1 mrg TMP_FREE; 209 1.1 mrg } 210 1.1 mrg else 211 1.1 mrg { 212 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly 213 1.1 mrg the last: 214 1.1 mrg 215 1.1 mrg AAABBBCC.. 216 1.1 mrg .AAABBBCC. 217 1.1 mrg ..AAABBBCC 218 1.1 mrg */ 219 1.1 mrg 220 1.1 mrg TMP_DECL; 221 1.1 mrg TMP_MARK; 222 1.1 mrg 223 1.1 mrg scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn)); 224 1.1 mrg 225 1.1 mrg /* first chunk (marked A in the above diagram) */ 226 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch); 227 1.1 mrg 228 1.1 mrg /* remaining chunks (B, C, etc) */ 229 1.1 mrg rn -= bn; 230 1.1 mrg 231 1.1 mrg while (rn >= bn) 232 1.1 mrg { 233 1.1 mrg mp_limb_t t0, t1, cy; 234 1.1 mrg ap += bn, rp += bn; 235 1.1 mrg t0 = rp[0], t1 = rp[1]; 236 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch); 237 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ 238 1.1 mrg MPN_INCR_U (rp + 1, bn + 1, t1 + cy); 239 1.1 mrg rn -= bn; 240 1.1 mrg } 241 1.1 mrg 242 1.1 mrg TMP_FREE; 243 1.1 mrg 244 1.1 mrg if (rn) 245 1.1 mrg { 246 1.1 mrg /* last remaining chunk */ 247 1.1 mrg mp_limb_t t0, t1, cy; 248 1.1 mrg ap += bn, rp += bn; 249 1.1 mrg t0 = rp[0], t1 = rp[1]; 250 1.1 mrg mpn_mulmid (rp, ap, rn + bn - 1, bp, bn); 251 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); 252 1.1 mrg MPN_INCR_U (rp + 1, rn + 1, t1 + cy); 253 1.1 mrg } 254 1.1 mrg } 255 1.1 mrg } 256