Home | History | Annotate | Line # | Download | only in generic
mul.c revision 1.1.1.4
      1 /* mpn_mul -- Multiply two natural numbers.
      2 
      3    Contributed to the GNU project by Torbjorn Granlund.
      4 
      5 Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
      6 2014, 2019 Free Software Foundation, Inc.
      7 
      8 This file is part of the GNU MP Library.
      9 
     10 The GNU MP Library is free software; you can redistribute it and/or modify
     11 it under the terms of either:
     12 
     13   * the GNU Lesser General Public License as published by the Free
     14     Software Foundation; either version 3 of the License, or (at your
     15     option) any later version.
     16 
     17 or
     18 
     19   * the GNU General Public License as published by the Free Software
     20     Foundation; either version 2 of the License, or (at your option) any
     21     later version.
     22 
     23 or both in parallel, as here.
     24 
     25 The GNU MP Library is distributed in the hope that it will be useful, but
     26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     27 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     28 for more details.
     29 
     30 You should have received copies of the GNU General Public License and the
     31 GNU Lesser General Public License along with the GNU MP Library.  If not,
     32 see https://www.gnu.org/licenses/.  */
     33 
     34 #include "gmp-impl.h"
     35 
     36 
     37 #ifndef MUL_BASECASE_MAX_UN
     38 #define MUL_BASECASE_MAX_UN 500
     39 #endif
     40 
     41 /* Areas where the different toom algorithms can be called (extracted
     42    from the t-toom*.c files, and ignoring small constant offsets):
     43 
     44    1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
     45                                         4/7              6/7
     46 				       6/11
     47                                        |--------------------| toom22 (small)
     48                                                            || toom22 (large)
     49                                                        |xxxx| toom22 called
     50                       |-------------------------------------| toom32
     51                                          |xxxxxxxxxxxxxxxx| | toom32 called
     52                                                |------------| toom33
     53                                                           |x| toom33 called
     54              |---------------------------------|            | toom42
     55 	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
     56                                        |--------------------| toom43
     57                                                |xxxxxxxxxx|   toom43 called
     58          |-----------------------------|                      toom52 (unused)
     59                                                    |--------| toom44
     60 						   |xxxxxxxx| toom44 called
     61                               |--------------------|        | toom53
     62                                         |xxxxxx|              toom53 called
     63     |-------------------------|                               toom62 (unused)
     64                                            |----------------| toom54 (unused)
     65                       |--------------------|                  toom63
     66 	                      |xxxxxxxxx|                   | toom63 called
     67                           |---------------------------------| toom6h
     68 						   |xxxxxxxx| toom6h called
     69                                   |-------------------------| toom8h (32 bit)
     70                  |------------------------------------------| toom8h (64 bit)
     71 						   |xxxxxxxx| toom8h called
     72 */
     73 
     74 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
     75 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
     76 
     77 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
     78    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
     79    result is UN + VN limbs.  Return the most significant limb of the result.
     80 
     81    NOTE: The space pointed to by PRODP is overwritten before finished with U
     82    and V, so overlap is an error.
     83 
     84    Argument constraints:
     85    1. UN >= VN.
     86    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
     87       the multiplier and the multiplicand.  */
     88 
     89 /*
     90   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
     91     ideal lines of the surrounding algorithms.  Is that optimal?
     92 
     93   * The toomX3 code now uses a structure similar to the one of toomX2, except
     94     that it loops longer in the unbalanced case.  The result is that the
     95     remaining area might have un < vn.  Should we fix the toomX2 code in a
     96     similar way?
     97 
     98   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
     99     therefore calls mpn_mul recursively for certain cases.
    100 
    101   * Allocate static temp space using THRESHOLD variables (except for toom44
    102     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
    103 
    104   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
    105     have the same vn threshold.  This is not true, we should actually use
    106     mul_basecase for slightly larger operands for toom32 than for toom22, and
    107     even larger for toom42.
    108 
    109   * That problem is even more prevalent for toomX3.  We therefore use special
    110     THRESHOLD variables there.
    111 */
    112 
    113 mp_limb_t
    114 mpn_mul (mp_ptr prodp,
    115 	 mp_srcptr up, mp_size_t un,
    116 	 mp_srcptr vp, mp_size_t vn)
    117 {
    118   ASSERT (un >= vn);
    119   ASSERT (vn >= 1);
    120   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
    121   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
    122 
    123   if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
    124     {
    125       /* When un (and thus vn) is below the toom22 range, do mul_basecase.
    126 	 Test un and not vn here not to thwart the un >> vn code below.
    127 	 This special case is not necessary, but cuts the overhead for the
    128 	 smallest operands. */
    129       mpn_mul_basecase (prodp, up, un, vp, vn);
    130     }
    131   else if (un == vn)
    132     {
    133       mpn_mul_n (prodp, up, vp, un);
    134     }
    135   else if (vn < MUL_TOOM22_THRESHOLD)
    136     { /* plain schoolbook multiplication */
    137 
    138       /* Unless un is very large, or else if have an applicable mpn_mul_N,
    139 	 perform basecase multiply directly.  */
    140       if (un <= MUL_BASECASE_MAX_UN
    141 #if HAVE_NATIVE_mpn_mul_2
    142 	  || vn <= 2
    143 #else
    144 	  || vn == 1
    145 #endif
    146 	  )
    147 	mpn_mul_basecase (prodp, up, un, vp, vn);
    148       else
    149 	{
    150 	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
    151 	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
    152 	     these pieces with the vp[] operand.  After each such partial
    153 	     multiplication (but the last) we copy the most significant vn
    154 	     limbs into a temporary buffer since that part would otherwise be
    155 	     overwritten by the next multiplication.  After the next
    156 	     multiplication, we add it back.  This illustrates the situation:
    157 
    158                                                     -->vn<--
    159                                                       |  |<------- un ------->|
    160                                                          _____________________|
    161                                                         X                    /|
    162                                                       /XX__________________/  |
    163                                     _____________________                     |
    164                                    X                    /                     |
    165                                  /XX__________________/                       |
    166                _____________________                                          |
    167               /                    /                                          |
    168             /____________________/                                            |
    169 	    ==================================================================
    170 
    171 	    The parts marked with X are the parts whose sums are copied into
    172 	    the temporary buffer.  */
    173 
    174 	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
    175 	  mp_limb_t cy;
    176 	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
    177 
    178 	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
    179 	  prodp += MUL_BASECASE_MAX_UN;
    180 	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
    181 	  up += MUL_BASECASE_MAX_UN;
    182 	  un -= MUL_BASECASE_MAX_UN;
    183 	  while (un > MUL_BASECASE_MAX_UN)
    184 	    {
    185 	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
    186 	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
    187 	      mpn_incr_u (prodp + vn, cy);
    188 	      prodp += MUL_BASECASE_MAX_UN;
    189 	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
    190 	      up += MUL_BASECASE_MAX_UN;
    191 	      un -= MUL_BASECASE_MAX_UN;
    192 	    }
    193 	  if (un > vn)
    194 	    {
    195 	      mpn_mul_basecase (prodp, up, un, vp, vn);
    196 	    }
    197 	  else
    198 	    {
    199 	      ASSERT (un > 0);
    200 	      mpn_mul_basecase (prodp, vp, vn, up, un);
    201 	    }
    202 	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
    203 	  mpn_incr_u (prodp + vn, cy);
    204 	}
    205     }
    206   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
    207     {
    208       /* Use ToomX2 variants */
    209       mp_ptr scratch;
    210       TMP_SDECL; TMP_SMARK;
    211 
    212 #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
    213       scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
    214       ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
    215       ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
    216       ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
    217 #undef ITCH_TOOMX2
    218 
    219       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
    220 	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
    221 	 wise; we would get better balance by slightly moving the bound.  We
    222 	 will sometimes end up with un < vn, like in the X3 arm below.  */
    223       if (un >= 3 * vn)
    224 	{
    225 	  mp_limb_t cy;
    226 	  mp_ptr ws;
    227 
    228 	  /* The maximum ws usage is for the mpn_mul result.  */
    229 	  ws = TMP_SALLOC_LIMBS (4 * vn);
    230 
    231 	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
    232 	  un -= 2 * vn;
    233 	  up += 2 * vn;
    234 	  prodp += 2 * vn;
    235 
    236 	  while (un >= 3 * vn)
    237 	    {
    238 	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
    239 	      un -= 2 * vn;
    240 	      up += 2 * vn;
    241 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    242 	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
    243 	      mpn_incr_u (prodp + vn, cy);
    244 	      prodp += 2 * vn;
    245 	    }
    246 
    247 	  /* vn <= un < 3vn */
    248 
    249 	  if (4 * un < 5 * vn)
    250 	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
    251 	  else if (4 * un < 7 * vn)
    252 	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
    253 	  else
    254 	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
    255 
    256 	  cy = mpn_add_n (prodp, prodp, ws, vn);
    257 	  MPN_COPY (prodp + vn, ws + vn, un);
    258 	  mpn_incr_u (prodp + vn, cy);
    259 	}
    260       else
    261 	{
    262 	  if (4 * un < 5 * vn)
    263 	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
    264 	  else if (4 * un < 7 * vn)
    265 	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    266 	  else
    267 	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    268 	}
    269       TMP_SFREE;
    270     }
    271   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
    272 	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
    273     {
    274       /* Handle the largest operands that are not in the FFT range.  The 2nd
    275 	 condition makes very unbalanced operands avoid the FFT code (except
    276 	 perhaps as coefficient products of the Toom code.  */
    277 
    278       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
    279 	{
    280 	  /* Use ToomX3 variants */
    281 	  mp_ptr scratch;
    282 	  TMP_DECL; TMP_MARK;
    283 
    284 #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
    285 	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
    286 	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
    287 	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
    288 	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
    289 	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
    290 	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
    291 	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
    292 #undef ITCH_TOOMX3
    293 
    294 	  if (2 * un >= 5 * vn)
    295 	    {
    296 	      mp_limb_t cy;
    297 	      mp_ptr ws;
    298 
    299 	      /* The maximum ws usage is for the mpn_mul result.  */
    300 	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
    301 
    302 	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    303 		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
    304 	      else
    305 		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
    306 	      un -= 2 * vn;
    307 	      up += 2 * vn;
    308 	      prodp += 2 * vn;
    309 
    310 	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
    311 		{
    312 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    313 		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
    314 		  else
    315 		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
    316 		  un -= 2 * vn;
    317 		  up += 2 * vn;
    318 		  cy = mpn_add_n (prodp, prodp, ws, vn);
    319 		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
    320 		  mpn_incr_u (prodp + vn, cy);
    321 		  prodp += 2 * vn;
    322 		}
    323 
    324 	      /* vn / 2 <= un < 2.5vn */
    325 
    326 	      if (un < vn)
    327 		mpn_mul (ws, vp, vn, up, un);
    328 	      else
    329 		mpn_mul (ws, up, un, vp, vn);
    330 
    331 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    332 	      MPN_COPY (prodp + vn, ws + vn, un);
    333 	      mpn_incr_u (prodp + vn, cy);
    334 	    }
    335 	  else
    336 	    {
    337 	      if (6 * un < 7 * vn)
    338 		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
    339 	      else if (2 * un < 3 * vn)
    340 		{
    341 		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
    342 		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    343 		  else
    344 		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
    345 		}
    346 	      else if (6 * un < 11 * vn)
    347 		{
    348 		  if (4 * un < 7 * vn)
    349 		    {
    350 		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
    351 			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    352 		      else
    353 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
    354 		    }
    355 		  else
    356 		    {
    357 		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
    358 			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    359 		      else
    360 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
    361 		    }
    362 		}
    363 	      else
    364 		{
    365 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    366 		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    367 		  else
    368 		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
    369 		}
    370 	    }
    371 	  TMP_FREE;
    372 	}
    373       else
    374 	{
    375 	  mp_ptr scratch;
    376 	  TMP_DECL; TMP_MARK;
    377 
    378 	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
    379 	    {
    380 	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
    381 	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
    382 	    }
    383 	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
    384 	    {
    385 	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
    386 	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
    387 	    }
    388 	  else
    389 	    {
    390 	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
    391 	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
    392 	    }
    393 	  TMP_FREE;
    394 	}
    395     }
    396   else
    397     {
    398       if (un >= 8 * vn)
    399 	{
    400 	  mp_limb_t cy;
    401 	  mp_ptr ws;
    402 	  TMP_DECL; TMP_MARK;
    403 
    404 	  /* The maximum ws usage is for the mpn_mul result.  */
    405 	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
    406 
    407 	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
    408 	  un -= 3 * vn;
    409 	  up += 3 * vn;
    410 	  prodp += 3 * vn;
    411 
    412 	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
    413 	    {
    414 	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
    415 	      un -= 3 * vn;
    416 	      up += 3 * vn;
    417 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    418 	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
    419 	      mpn_incr_u (prodp + vn, cy);
    420 	      prodp += 3 * vn;
    421 	    }
    422 
    423 	  /* vn / 2 <= un < 3.5vn */
    424 
    425 	  if (un < vn)
    426 	    mpn_mul (ws, vp, vn, up, un);
    427 	  else
    428 	    mpn_mul (ws, up, un, vp, vn);
    429 
    430 	  cy = mpn_add_n (prodp, prodp, ws, vn);
    431 	  MPN_COPY (prodp + vn, ws + vn, un);
    432 	  mpn_incr_u (prodp + vn, cy);
    433 
    434 	  TMP_FREE;
    435 	}
    436       else
    437 	mpn_fft_mul (prodp, up, un, vp, vn);
    438     }
    439 
    440   return prodp[un + vn - 1];	/* historic */
    441 }
    442