Home | History | Annotate | Line # | Download | only in generic
toom62_mul.c revision 1.1.1.3
      1      1.1  mrg /* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
      2      1.1  mrg    as large as bn.  Or more accurately, (5/2)bn < an < 6bn.
      3      1.1  mrg 
      4      1.1  mrg    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
      5      1.1  mrg 
      6      1.1  mrg    The idea of applying toom to unbalanced multiplication is due to Marco
      7      1.1  mrg    Bodrato and Alberto Zanoni.
      8      1.1  mrg 
      9      1.1  mrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     10      1.1  mrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     11      1.1  mrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     12      1.1  mrg 
     13  1.1.1.3  mrg Copyright 2006-2008, 2012 Free Software Foundation, Inc.
     14      1.1  mrg 
     15      1.1  mrg This file is part of the GNU MP Library.
     16      1.1  mrg 
     17      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     18  1.1.1.3  mrg it under the terms of either:
     19  1.1.1.3  mrg 
     20  1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
     21  1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
     22  1.1.1.3  mrg     option) any later version.
     23  1.1.1.3  mrg 
     24  1.1.1.3  mrg or
     25  1.1.1.3  mrg 
     26  1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
     27  1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
     28  1.1.1.3  mrg     later version.
     29  1.1.1.3  mrg 
     30  1.1.1.3  mrg or both in parallel, as here.
     31      1.1  mrg 
     32      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     33      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     34  1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     35  1.1.1.3  mrg for more details.
     36      1.1  mrg 
     37  1.1.1.3  mrg You should have received copies of the GNU General Public License and the
     38  1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     39  1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
     40      1.1  mrg 
     41      1.1  mrg 
     42      1.1  mrg #include "gmp.h"
     43      1.1  mrg #include "gmp-impl.h"
     44      1.1  mrg 
     45      1.1  mrg /* Evaluate in:
     46      1.1  mrg    0, +1, -1, +2, -2, 1/2, +inf
     47      1.1  mrg 
     48      1.1  mrg   <-s-><--n--><--n--><--n--><--n--><--n-->
     49      1.1  mrg    ___ ______ ______ ______ ______ ______
     50      1.1  mrg   |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
     51      1.1  mrg 			     |_b1_|___b0_|
     52      1.1  mrg 			     <-t--><--n-->
     53      1.1  mrg 
     54      1.1  mrg   v0  =    a0                       *   b0      #    A(0)*B(0)
     55      1.1  mrg   v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #    A(1)*B(1)      ah  <= 5   bh <= 1
     56      1.1  mrg   vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #   A(-1)*B(-1)    |ah| <= 2   bh  = 0
     57      1.1  mrg   v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #    A(2)*B(2)      ah  <= 62  bh <= 2
     58      1.1  mrg   vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #   A(-2)*B(-2)    -41<=ah<=20 -1<=bh<=0
     59      1.1  mrg   vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) #  A(1/2)*B(1/2)    ah  <= 62  bh <= 2
     60      1.1  mrg   vinf=                           a5 *      b1  #  A(inf)*B(inf)
     61      1.1  mrg */
     62      1.1  mrg 
                       /* Toom-6.5 multiplication, 6 blocks of a by 2 blocks of b.
                          pp      receives the full an+bn limb product
                          ap,an   first operand; split into five n-limb blocks a0..a4
                                  plus a final s-limb block a5 (0 < s <= n)
                          bp,bn   second operand; one n-limb block b0 plus a t-limb
                                  block b1 (0 < t <= n)
                          scratch workspace; the layout macros below use 10*n+5 limbs.
                                  NOTE(review): callers presumably size this via the
                                  matching itch macro in gmp-impl.h -- confirm.
                          The seven evaluations, their pointwise products and the
                          interpolation follow the scheme in the comment above.  */
      63      1.1  mrg void
      64      1.1  mrg mpn_toom62_mul (mp_ptr pp,
      65      1.1  mrg 		mp_srcptr ap, mp_size_t an,
      66      1.1  mrg 		mp_srcptr bp, mp_size_t bn,
      67      1.1  mrg 		mp_ptr scratch)
      68      1.1  mrg {
      69      1.1  mrg   mp_size_t n, s, t;
      70      1.1  mrg   mp_limb_t cy;
      71      1.1  mrg   mp_ptr as1, asm1, as2, asm2, ash;
      72      1.1  mrg   mp_ptr bs1, bsm1, bs2, bsm2, bsh;
      73      1.1  mrg   mp_ptr gp;
      74      1.1  mrg   enum toom7_flags aflags, bflags;
      75      1.1  mrg   TMP_DECL;
      76      1.1  mrg 
      77      1.1  mrg #define a0  ap
      78      1.1  mrg #define a1  (ap + n)
      79      1.1  mrg #define a2  (ap + 2*n)
      80      1.1  mrg #define a3  (ap + 3*n)
      81      1.1  mrg #define a4  (ap + 4*n)
      82      1.1  mrg #define a5  (ap + 5*n)
      83      1.1  mrg #define b0  bp
      84      1.1  mrg #define b1  (bp + n)
      85      1.1  mrg 
                         /* Block size: ceil(an/6) when a dominates, else ceil(bn/2).  */
      86      1.1  mrg   n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
      87      1.1  mrg 
      88      1.1  mrg   s = an - 5 * n;
      89      1.1  mrg   t = bn - n;
      90      1.1  mrg 
      91      1.1  mrg   ASSERT (0 < s && s <= n);
      92      1.1  mrg   ASSERT (0 < t && t <= n);
      93      1.1  mrg 
      94      1.1  mrg   TMP_MARK;
      95      1.1  mrg 
                         /* n+1 limbs each: room for an n-limb evaluation plus its
                            (small) high limb.  bsm1 is exactly n limbs -- see the
                            table above: the -1 evaluation of B has bh = 0.  */
      96      1.1  mrg   as1 = TMP_SALLOC_LIMBS (n + 1);
      97      1.1  mrg   asm1 = TMP_SALLOC_LIMBS (n + 1);
      98      1.1  mrg   as2 = TMP_SALLOC_LIMBS (n + 1);
      99      1.1  mrg   asm2 = TMP_SALLOC_LIMBS (n + 1);
     100      1.1  mrg   ash = TMP_SALLOC_LIMBS (n + 1);
     101      1.1  mrg 
     102      1.1  mrg   bs1 = TMP_SALLOC_LIMBS (n + 1);
     103      1.1  mrg   bsm1 = TMP_SALLOC_LIMBS (n);
     104      1.1  mrg   bs2 = TMP_SALLOC_LIMBS (n + 1);
     105      1.1  mrg   bsm2 = TMP_SALLOC_LIMBS (n + 1);
     106      1.1  mrg   bsh = TMP_SALLOC_LIMBS (n + 1);
     107      1.1  mrg 
                         /* Scratch for the evaluation helpers; aliases the (not yet
                            written) product area.  */
     108      1.1  mrg   gp = pp;
     109      1.1  mrg 
     110      1.1  mrg   /* Compute as1 and asm1.  */
                         /* The eval helpers return a sign flag, masked here onto the
                            toom7 flag bit the interpolation expects (w3 for the +-1
                            point, w1 for the +-2 point).  */
     111  1.1.1.2  mrg   aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
     112      1.1  mrg 
     113      1.1  mrg   /* Compute as2 and asm2. */
     114  1.1.1.3  mrg   aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));
     115      1.1  mrg 
     116      1.1  mrg   /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
     117      1.1  mrg      = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */
     118      1.1  mrg 
     119      1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
     120      1.1  mrg   cy = mpn_addlsh1_n (ash, a1, a0, n);
     121      1.1  mrg   cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
     122      1.1  mrg   cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
     123      1.1  mrg   cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
     124      1.1  mrg   if (s < n)
     125      1.1  mrg     {
                             /* a5 is short: add-and-double only its s limbs, then
                                shift the remaining n-s limbs and fold in cy2.  */
     126      1.1  mrg       mp_limb_t cy2;
     127      1.1  mrg       cy2 = mpn_addlsh1_n (ash, a5, ash, s);
     128      1.1  mrg       ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
     129      1.1  mrg       MPN_INCR_U (ash + s, n+1-s, cy2);
     130      1.1  mrg     }
     131      1.1  mrg   else
     132      1.1  mrg     ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
     133      1.1  mrg #else
                         /* Generic fallback: explicit shift-then-add per Horner step.  */
     134      1.1  mrg   cy = mpn_lshift (ash, a0, n, 1);
     135      1.1  mrg   cy += mpn_add_n (ash, ash, a1, n);
     136      1.1  mrg   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
     137      1.1  mrg   cy += mpn_add_n (ash, ash, a2, n);
     138      1.1  mrg   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
     139      1.1  mrg   cy += mpn_add_n (ash, ash, a3, n);
     140      1.1  mrg   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
     141      1.1  mrg   cy += mpn_add_n (ash, ash, a4, n);
     142      1.1  mrg   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
     143      1.1  mrg   ash[n] = cy + mpn_add (ash, ash, n, a5, s);
     144      1.1  mrg #endif
     145      1.1  mrg 
     146      1.1  mrg   /* Compute bs1 and bsm1.  */
                         /* bsm1 = |b0 - b1|; toom7_w3_neg records when b0 < b1 so the
                            interpolation can account for the sign of B(-1).  */
     147      1.1  mrg   if (t == n)
     148      1.1  mrg     {
     149      1.1  mrg #if HAVE_NATIVE_mpn_add_n_sub_n
     150      1.1  mrg       if (mpn_cmp (b0, b1, n) < 0)
     151      1.1  mrg 	{
     152      1.1  mrg 	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
     153      1.1  mrg 	  bflags = toom7_w3_neg;
     154      1.1  mrg 	}
     155      1.1  mrg       else
     156      1.1  mrg 	{
     157      1.1  mrg 	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
     158  1.1.1.2  mrg 	  bflags = (enum toom7_flags) 0;
     159      1.1  mrg 	}
                             /* add_n_sub_n packs the add carry in bit 1 -- NOTE(review):
                                presumed from the >> 1 here; confirm in gmp-impl.h.  */
     160      1.1  mrg       bs1[n] = cy >> 1;
     161      1.1  mrg #else
     162      1.1  mrg       bs1[n] = mpn_add_n (bs1, b0, b1, n);
     163      1.1  mrg       if (mpn_cmp (b0, b1, n) < 0)
     164      1.1  mrg 	{
     165      1.1  mrg 	  mpn_sub_n (bsm1, b1, b0, n);
     166      1.1  mrg 	  bflags = toom7_w3_neg;
     167      1.1  mrg 	}
     168      1.1  mrg       else
     169      1.1  mrg 	{
     170      1.1  mrg 	  mpn_sub_n (bsm1, b0, b1, n);
     171  1.1.1.2  mrg 	  bflags = (enum toom7_flags) 0;
     172      1.1  mrg 	}
     173      1.1  mrg #endif
     174      1.1  mrg     }
     175      1.1  mrg   else
     176      1.1  mrg     {
                           /* t < n: b1 is short.  b0 < b1 is only possible when b0's
                              top n-t limbs are all zero.  */
     177      1.1  mrg       bs1[n] = mpn_add (bs1, b0, n, b1, t);
     178      1.1  mrg       if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
     179      1.1  mrg 	{
     180      1.1  mrg 	  mpn_sub_n (bsm1, b1, b0, t);
     181      1.1  mrg 	  MPN_ZERO (bsm1 + t, n - t);
     182      1.1  mrg 	  bflags = toom7_w3_neg;
     183      1.1  mrg 	}
     184      1.1  mrg       else
     185      1.1  mrg 	{
     186      1.1  mrg 	  mpn_sub (bsm1, b0, n, b1, t);
     187  1.1.1.2  mrg 	  bflags = (enum toom7_flags) 0;
     188      1.1  mrg 	}
     189      1.1  mrg     }
     190      1.1  mrg 
     191      1.1  mrg   /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
     192      1.1  mrg      bsm1 - b1 */
     193      1.1  mrg   mpn_add (bs2, bs1, n + 1, b1, t);
     194      1.1  mrg   if (bflags & toom7_w3_neg)
     195      1.1  mrg     {
                           /* bsm1 holds b1-b0, so bsm2 = b1-b0+b1 = -(b0-2b1): negative,
                              hence set w1_neg as well.  */
     196      1.1  mrg       bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
     197  1.1.1.2  mrg       bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
     198      1.1  mrg     }
     199      1.1  mrg   else
     200      1.1  mrg     {
     201      1.1  mrg       /* FIXME: Simplify this logic? */
                           /* bsm1 = b0-b1 >= 0; bsm2 = bsm1 - b1 may go negative, in
                              which case compute b1 - bsm1 and flag w1_neg.  */
     202      1.1  mrg       if (t < n)
     203      1.1  mrg 	{
     204      1.1  mrg 	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
     205      1.1  mrg 	    {
     206      1.1  mrg 	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
     207      1.1  mrg 	      MPN_ZERO (bsm2 + t, n + 1 - t);
     208  1.1.1.2  mrg 	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
     209      1.1  mrg 	    }
     210      1.1  mrg 	  else
     211      1.1  mrg 	    {
     212      1.1  mrg 	      ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
     213      1.1  mrg 	      bsm2[n] = 0;
     214      1.1  mrg 	    }
     215      1.1  mrg 	}
     216      1.1  mrg       else
     217      1.1  mrg 	{
     218      1.1  mrg 	  if (mpn_cmp (bsm1, b1, n) < 0)
     219      1.1  mrg 	    {
     220      1.1  mrg 	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
     221  1.1.1.2  mrg 	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
     222      1.1  mrg 	    }
     223      1.1  mrg 	  else
     224      1.1  mrg 	    {
     225  1.1.1.2  mrg 	      ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
     226      1.1  mrg 	    }
     227      1.1  mrg 	  bsm2[n] = 0;
     228      1.1  mrg 	}
     229      1.1  mrg     }
     230      1.1  mrg 
     231  1.1.1.2  mrg   /* Compute bsh, recycling bs1. bsh=bs1+b0;  */
                         /* bsh = 2 b0 + b1 = (b0 + b1) + b0.  */
     232  1.1.1.2  mrg   bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);
     233      1.1  mrg 
                         /* High-limb bounds match the ah/bh columns of the table
                            above (asm2's bound is the magnitude of -41..20).  */
     234      1.1  mrg   ASSERT (as1[n] <= 5);
     235      1.1  mrg   ASSERT (bs1[n] <= 1);
     236      1.1  mrg   ASSERT (asm1[n] <= 2);
     237      1.1  mrg   ASSERT (as2[n] <= 62);
     238      1.1  mrg   ASSERT (bs2[n] <= 2);
     239      1.1  mrg   ASSERT (asm2[n] <= 41);
     240      1.1  mrg   ASSERT (bsm2[n] <= 1);
     241      1.1  mrg   ASSERT (ash[n] <= 62);
     242      1.1  mrg   ASSERT (bsh[n] <= 2);
     243      1.1  mrg 
     244      1.1  mrg #define v0    pp				/* 2n */
     245      1.1  mrg #define v1    (pp + 2 * n)			/* 2n+1 */
     246      1.1  mrg #define vinf  (pp + 6 * n)			/* s+t */
     247      1.1  mrg #define v2    scratch				/* 2n+1 */
     248      1.1  mrg #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
     249      1.1  mrg #define vh    (scratch + 4 * n + 2)		/* 2n+1 */
     250      1.1  mrg #define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
     251      1.1  mrg #define scratch_out (scratch + 8 * n + 4)		/* 2n+1 */
     252      1.1  mrg   /* Total scratch need: 10*n+5 */
     253      1.1  mrg 
     254      1.1  mrg   /* Must be in allocation order, as they overwrite one limb beyond
     255      1.1  mrg    * 2n+1. */
     256      1.1  mrg   mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
     257      1.1  mrg   mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
     258      1.1  mrg   mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */
     259      1.1  mrg 
     260      1.1  mrg   /* vm1, 2n+1 limbs */
                         /* Multiply the low n limbs, then add the asm1[n] * bsm1 cross
                            term by hand (asm1[n] <= 2; bsm1 has no high limb).  */
     261      1.1  mrg   mpn_mul_n (vm1, asm1, bsm1, n);
     262      1.1  mrg   cy = 0;
     263      1.1  mrg   if (asm1[n] == 1)
     264      1.1  mrg     {
     265      1.1  mrg       cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
     266      1.1  mrg     }
     267      1.1  mrg   else if (asm1[n] == 2)
     268      1.1  mrg     {
     269      1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
     270      1.1  mrg       cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
     271      1.1  mrg #else
     272      1.1  mrg       cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
     273      1.1  mrg #endif
     274      1.1  mrg     }
     275      1.1  mrg   vm1[2 * n] = cy;
     276      1.1  mrg 
     277      1.1  mrg   /* v1, 2n+1 limbs */
                         /* Low product, then cross terms for the high limbs as1[n]
                            (<= 5, with fast paths for 1 and 2) and bs1[n] (<= 1).  */
     278      1.1  mrg   mpn_mul_n (v1, as1, bs1, n);
     279      1.1  mrg   if (as1[n] == 1)
     280      1.1  mrg     {
     281      1.1  mrg       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
     282      1.1  mrg     }
     283      1.1  mrg   else if (as1[n] == 2)
     284      1.1  mrg     {
     285      1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
     286      1.1  mrg       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
     287      1.1  mrg #else
     288      1.1  mrg       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
     289      1.1  mrg #endif
     290      1.1  mrg     }
     291      1.1  mrg   else if (as1[n] != 0)
     292      1.1  mrg     {
     293      1.1  mrg       cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
     294      1.1  mrg     }
     295      1.1  mrg   else
     296      1.1  mrg     cy = 0;
                         /* bs1[n] is at most 1, so its cross term is a single add of as1.  */
     297      1.1  mrg   if (bs1[n] != 0)
     298      1.1  mrg     cy += mpn_add_n (v1 + n, v1 + n, as1, n);
     299      1.1  mrg   v1[2 * n] = cy;
     300      1.1  mrg 
     301      1.1  mrg   mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */
     302      1.1  mrg 
     303      1.1  mrg   /* vinf, s+t limbs */
     304      1.1  mrg   if (s > t)  mpn_mul (vinf, a5, s, b1, t);
     305      1.1  mrg   else        mpn_mul (vinf, b1, t, a5, s);
     306      1.1  mrg 
                         /* XOR of the sign flags: a pointwise product is negative iff
                            exactly one of its two factors was negated.  */
     307  1.1.1.2  mrg   mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
     308      1.1  mrg 			     vm2, vm1, v2, vh, s + t, scratch_out);
     309      1.1  mrg 
     310      1.1  mrg   TMP_FREE;
     311      1.1  mrg }
    312