Home | History | Annotate | Line # | Download | only in generic
mul.c revision 1.1.1.2
      1 /* mpn_mul -- Multiply two natural numbers.
      2 
      3    Contributed to the GNU project by Torbjorn Granlund.
      4 
      5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
      6 2006, 2007, 2009, 2010, 2012 Free Software Foundation, Inc.
      7 
      8 This file is part of the GNU MP Library.
      9 
     10 The GNU MP Library is free software; you can redistribute it and/or modify
     11 it under the terms of the GNU Lesser General Public License as published by
     12 the Free Software Foundation; either version 3 of the License, or (at your
     13 option) any later version.
     14 
     15 The GNU MP Library is distributed in the hope that it will be useful, but
     16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     17 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     18 License for more details.
     19 
     20 You should have received a copy of the GNU Lesser General Public License
     21 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     22 
     23 #include "gmp.h"
     24 #include "gmp-impl.h"
     25 
     26 
     27 #ifndef MUL_BASECASE_MAX_UN
     28 #define MUL_BASECASE_MAX_UN 500
     29 #endif
     30 
     31 /* Areas where the different toom algorithms can be called (extracted
     32    from the t-toom*.c files, and ignoring small constant offsets):
     33 
     34    1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
     35                                         4/7              6/7
     36 				       6/11
     37                                        |--------------------| toom22 (small)
     38                                                            || toom22 (large)
     39                                                        |xxxx| toom22 called
     40                       |-------------------------------------| toom32
     41                                          |xxxxxxxxxxxxxxxx| | toom32 called
     42                                                |------------| toom33
     43                                                           |x| toom33 called
     44              |---------------------------------|            | toom42
     45 	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
     46                                        |--------------------| toom43
     47                                                |xxxxxxxxxx|   toom43 called
     48          |-----------------------------|                      toom52 (unused)
     49                                                    |--------| toom44
     50 						   |xxxxxxxx| toom44 called
     51                               |--------------------|        | toom53
     52                                         |xxxxxx|              toom53 called
     53     |-------------------------|                               toom62 (unused)
     54                                            |----------------| toom54 (unused)
     55                       |--------------------|                  toom63
     56 	                      |xxxxxxxxx|                   | toom63 called
     57                           |---------------------------------| toom6h
     58 						   |xxxxxxxx| toom6h called
     59                                   |-------------------------| toom8h (32 bit)
     60                  |------------------------------------------| toom8h (64 bit)
     61 						   |xxxxxxxx| toom8h called
     62 */
     63 
     64 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
     65 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
     66 
     67 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
     68    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
     69    result is UN + VN limbs.  Return the most significant limb of the result.
     70 
     71    NOTE: The space pointed to by PRODP is overwritten before finished with U
     72    and V, so overlap is an error.
     73 
     74    Argument constraints:
     75    1. UN >= VN.
     76    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
     77       the multiplier and the multiplicand.  */
     78 
     79 /*
     80   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
     81     ideal lines of the surrounding algorithms.  Is that optimal?
     82 
     83   * The toomX3 code now uses a structure similar to the one of toomX2, except
     84     that it loops longer in the unbalanced case.  The result is that the
     85     remaining area might have un < vn.  Should we fix the toomX2 code in a
     86     similar way?
     87 
     88   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
     89     therefore calls mpn_mul recursively for certain cases.
     90 
     91   * Allocate static temp space using THRESHOLD variables (except for toom44
     92     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
     93 
     94   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
     95     have the same vn threshold.  This is not true, we should actually use
     96     mul_basecase for slightly larger operands for toom32 than for toom22, and
     97     even larger for toom42.
     98 
     99   * That problem is even more prevalent for toomX3.  We therefore use special
    100     THRESHOLD variables there.
    101 
    102   * Is our ITCH allocation correct?
    103 */
    104 
    105 #define ITCH (16*vn + 100)
    106 
    107 mp_limb_t
    108 mpn_mul (mp_ptr prodp,
    109 	 mp_srcptr up, mp_size_t un,
    110 	 mp_srcptr vp, mp_size_t vn)
    111 {
    112   ASSERT (un >= vn);
    113   ASSERT (vn >= 1);
    114   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
    115   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
    116 
    117   if (un == vn)
    118     {
    119       if (up == vp)
    120 	mpn_sqr (prodp, up, un);
    121       else
    122 	mpn_mul_n (prodp, up, vp, un);
    123     }
    124   else if (vn < MUL_TOOM22_THRESHOLD)
    125     { /* plain schoolbook multiplication */
    126 
    127       /* Unless un is very large, or else if have an applicable mpn_mul_N,
    128 	 perform basecase multiply directly.  */
    129       if (un <= MUL_BASECASE_MAX_UN
    130 #if HAVE_NATIVE_mpn_mul_2
    131 	  || vn <= 2
    132 #else
    133 	  || vn == 1
    134 #endif
    135 	  )
    136 	mpn_mul_basecase (prodp, up, un, vp, vn);
    137       else
    138 	{
    139 	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
    140 	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
    141 	     these pieces with the vp[] operand.  After each such partial
    142 	     multiplication (but the last) we copy the most significant vn
    143 	     limbs into a temporary buffer since that part would otherwise be
    144 	     overwritten by the next multiplication.  After the next
    145 	     multiplication, we add it back.  This illustrates the situation:
    146 
    147                                                     -->vn<--
    148                                                       |  |<------- un ------->|
    149                                                          _____________________|
    150                                                         X                    /|
    151                                                       /XX__________________/  |
    152                                     _____________________                     |
    153                                    X                    /                     |
    154                                  /XX__________________/                       |
    155                _____________________                                          |
    156               /                    /                                          |
    157             /____________________/                                            |
    158 	    ==================================================================
    159 
    160 	    The parts marked with X are the parts whose sums are copied into
    161 	    the temporary buffer.  */
    162 
    163 	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
    164 	  mp_limb_t cy;
    165 	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
    166 
    167 	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
    168 	  prodp += MUL_BASECASE_MAX_UN;
    169 	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
    170 	  up += MUL_BASECASE_MAX_UN;
    171 	  un -= MUL_BASECASE_MAX_UN;
    172 	  while (un > MUL_BASECASE_MAX_UN)
    173 	    {
    174 	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
    175 	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
    176 	      mpn_incr_u (prodp + vn, cy);
    177 	      prodp += MUL_BASECASE_MAX_UN;
    178 	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
    179 	      up += MUL_BASECASE_MAX_UN;
    180 	      un -= MUL_BASECASE_MAX_UN;
    181 	    }
    182 	  if (un > vn)
    183 	    {
    184 	      mpn_mul_basecase (prodp, up, un, vp, vn);
    185 	    }
    186 	  else
    187 	    {
    188 	      ASSERT (un > 0);
    189 	      mpn_mul_basecase (prodp, vp, vn, up, un);
    190 	    }
    191 	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
    192 	  mpn_incr_u (prodp + vn, cy);
    193 	}
    194     }
    195   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
    196     {
    197       /* Use ToomX2 variants */
    198       mp_ptr scratch;
    199       TMP_SDECL; TMP_SMARK;
    200 
    201       scratch = TMP_SALLOC_LIMBS (ITCH);
    202 
    203       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
    204 	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
    205 	 wise; we would get better balance by slightly moving the bound.  We
    206 	 will sometimes end up with un < vn, like in the X3 arm below.  */
    207       if (un >= 3 * vn)
    208 	{
    209 	  mp_limb_t cy;
    210 	  mp_ptr ws;
    211 
    212 	  /* The maximum ws usage is for the mpn_mul result.  */
    213 	  ws = TMP_SALLOC_LIMBS (4 * vn);
    214 
    215 	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
    216 	  un -= 2 * vn;
    217 	  up += 2 * vn;
    218 	  prodp += 2 * vn;
    219 
    220 	  while (un >= 3 * vn)
    221 	    {
    222 	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
    223 	      un -= 2 * vn;
    224 	      up += 2 * vn;
    225 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    226 	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
    227 	      mpn_incr_u (prodp + vn, cy);
    228 	      prodp += 2 * vn;
    229 	    }
    230 
    231 	  /* vn <= un < 3vn */
    232 
    233 	  if (4 * un < 5 * vn)
    234 	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
    235 	  else if (4 * un < 7 * vn)
    236 	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
    237 	  else
    238 	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
    239 
    240 	  cy = mpn_add_n (prodp, prodp, ws, vn);
    241 	  MPN_COPY (prodp + vn, ws + vn, un);
    242 	  mpn_incr_u (prodp + vn, cy);
    243 	}
    244       else
    245 	{
    246 	  if (4 * un < 5 * vn)
    247 	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
    248 	  else if (4 * un < 7 * vn)
    249 	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    250 	  else
    251 	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    252 	}
    253       TMP_SFREE;
    254     }
    255   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
    256 	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
    257     {
    258       /* Handle the largest operands that are not in the FFT range.  The 2nd
    259 	 condition makes very unbalanced operands avoid the FFT code (except
    260 	 perhaps as coefficient products of the Toom code.  */
    261 
    262       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
    263 	{
    264 	  /* Use ToomX3 variants */
    265 	  mp_ptr scratch;
    266 	  TMP_SDECL; TMP_SMARK;
    267 
    268 	  scratch = TMP_SALLOC_LIMBS (ITCH);
    269 
    270 	  if (2 * un >= 5 * vn)
    271 	    {
    272 	      mp_limb_t cy;
    273 	      mp_ptr ws;
    274 
    275 	      /* The maximum ws usage is for the mpn_mul result.  */
    276 	      ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
    277 
    278 	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    279 		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
    280 	      else
    281 		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
    282 	      un -= 2 * vn;
    283 	      up += 2 * vn;
    284 	      prodp += 2 * vn;
    285 
    286 	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
    287 		{
    288 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    289 		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
    290 		  else
    291 		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
    292 		  un -= 2 * vn;
    293 		  up += 2 * vn;
    294 		  cy = mpn_add_n (prodp, prodp, ws, vn);
    295 		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
    296 		  mpn_incr_u (prodp + vn, cy);
    297 		  prodp += 2 * vn;
    298 		}
    299 
    300 	      /* vn / 2 <= un < 2.5vn */
    301 
    302 	      if (un < vn)
    303 		mpn_mul (ws, vp, vn, up, un);
    304 	      else
    305 		mpn_mul (ws, up, un, vp, vn);
    306 
    307 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    308 	      MPN_COPY (prodp + vn, ws + vn, un);
    309 	      mpn_incr_u (prodp + vn, cy);
    310 	    }
    311 	  else
    312 	    {
    313 	      if (6 * un < 7 * vn)
    314 		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
    315 	      else if (2 * un < 3 * vn)
    316 		{
    317 		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
    318 		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    319 		  else
    320 		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
    321 		}
    322 	      else if (6 * un < 11 * vn)
    323 		{
    324 		  if (4 * un < 7 * vn)
    325 		    {
    326 		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
    327 			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
    328 		      else
    329 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
    330 		    }
    331 		  else
    332 		    {
    333 		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
    334 			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    335 		      else
    336 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
    337 		    }
    338 		}
    339 	      else
    340 		{
    341 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
    342 		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
    343 		  else
    344 		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
    345 		}
    346 	    }
    347 	  TMP_SFREE;
    348 	}
    349       else
    350 	{
    351 	  mp_ptr scratch;
    352 	  TMP_DECL; TMP_MARK;
    353 
    354 	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
    355 	    {
    356 	      scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
    357 	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
    358 	    }
    359 	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
    360 	    {
    361 	      scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
    362 	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
    363 	    }
    364 	  else
    365 	    {
    366 	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
    367 	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
    368 	    }
    369 	  TMP_FREE;
    370 	}
    371     }
    372   else
    373     {
    374       if (un >= 8 * vn)
    375 	{
    376 	  mp_limb_t cy;
    377 	  mp_ptr ws;
    378 	  TMP_DECL; TMP_MARK;
    379 
    380 	  /* The maximum ws usage is for the mpn_mul result.  */
    381 	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
    382 
    383 	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
    384 	  un -= 3 * vn;
    385 	  up += 3 * vn;
    386 	  prodp += 3 * vn;
    387 
    388 	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
    389 	    {
    390 	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
    391 	      un -= 3 * vn;
    392 	      up += 3 * vn;
    393 	      cy = mpn_add_n (prodp, prodp, ws, vn);
    394 	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
    395 	      mpn_incr_u (prodp + vn, cy);
    396 	      prodp += 3 * vn;
    397 	    }
    398 
    399 	  /* vn / 2 <= un < 3.5vn */
    400 
    401 	  if (un < vn)
    402 	    mpn_mul (ws, vp, vn, up, un);
    403 	  else
    404 	    mpn_mul (ws, up, un, vp, vn);
    405 
    406 	  cy = mpn_add_n (prodp, prodp, ws, vn);
    407 	  MPN_COPY (prodp + vn, ws + vn, un);
    408 	  mpn_incr_u (prodp + vn, cy);
    409 
    410 	  TMP_FREE;
    411 	}
    412       else
    413 	mpn_fft_mul (prodp, up, un, vp, vn);
    414     }
    415 
    416   return prodp[un + vn - 1];	/* historic */
    417 }
    418