Home | History | Annotate | Line # | Download | only in generic
toom33_mul.c revision 1.1
      1  1.1  mrg /* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
      2  1.1  mrg    size.  Or more accurately, bn <= an < (3/2)bn.
      3  1.1  mrg 
      4  1.1  mrg    Contributed to the GNU project by Torbjorn Granlund.
      5  1.1  mrg    Additional improvements by Marco Bodrato.
      6  1.1  mrg 
      7  1.1  mrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      8  1.1  mrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      9  1.1  mrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     10  1.1  mrg 
     11  1.1  mrg Copyright 2006, 2007, 2008, 2010 Free Software Foundation, Inc.
     12  1.1  mrg 
     13  1.1  mrg This file is part of the GNU MP Library.
     14  1.1  mrg 
     15  1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     16  1.1  mrg it under the terms of the GNU Lesser General Public License as published by
     17  1.1  mrg the Free Software Foundation; either version 3 of the License, or (at your
     18  1.1  mrg option) any later version.
     19  1.1  mrg 
     20  1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     21  1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     22  1.1  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     23  1.1  mrg License for more details.
     24  1.1  mrg 
     25  1.1  mrg You should have received a copy of the GNU Lesser General Public License
     26  1.1  mrg along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     27  1.1  mrg 
     28  1.1  mrg 
     29  1.1  mrg #include "gmp.h"
     30  1.1  mrg #include "gmp-impl.h"
     31  1.1  mrg 
     32  1.1  mrg /* Evaluate in: -1, 0, +1, +2, +inf
     33  1.1  mrg 
     34  1.1  mrg   <-s--><--n--><--n--><--n-->
     35  1.1  mrg    ____ ______ ______ ______
     36  1.1  mrg   |_a3_|___a2_|___a1_|___a0_|
     37  1.1  mrg    |b3_|___b2_|___b1_|___b0_|
     38  1.1  mrg    <-t-><--n--><--n--><--n-->
     39  1.1  mrg 
     40  1.1  mrg   v0  =  a0         * b0          #   A(0)*B(0)
     41  1.1  mrg   v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
     42  1.1  mrg   vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
     43  1.1  mrg   v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
     44  1.1  mrg   vinf=          a2 *         b2  # A(inf)*B(inf)
     45  1.1  mrg */
     46  1.1  mrg 
#if TUNE_PROGRAM_BUILD
/* When building the tuning program the thresholds are runtime variables,
   so both branches of the recursion must be compiled in.  */
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
/* Compile-time reachability of each recursion branch, derived from the
   configured thresholds: if a branch can never be taken for the sizes
   this function handles, the constant 0 lets the compiler drop it.  */
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Balanced n x n recursive multiply used for the five point products:
   dispatch on n to mpn_mul_basecase, mpn_toom22_mul, or a recursive
   mpn_toom33_mul.  ws is scratch space for the recursive calls.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
/* Multiply {ap,an} by {bp,bn} using 3-way Toom-Cook, writing the an+bn
   limb product to pp.  Requires bn <= an < 3*bn (see file header), and
   scratch space from the caller (mpn_toom33_mul_itch-sized; layout is
   documented at the v0/v1/... defines below).  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero iff A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;			/* n-limb temporary holding a0+a2 resp. b0+b2 */
  mp_ptr as1, asm1, as2;	/* A evaluated at +1, -1, +2; n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B evaluated at +1, -1, +2; n+1 limbs each */

/* The three n-limb pieces of each operand (a2/b2 are only s/t limbs).  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  /* Split into pieces of n = ceil(an/3) limbs; the top pieces have
     s = an - 2n and t = bn - 2n limbs.  */
  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation-point buffers live partly in scratch, partly in regions
     of pp that do not yet hold results; each needs n+1 limbs.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0-a1+a2|, recording the sign of
     the latter in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);	/* gp = a0 + a2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: compute a1 - (a0+a2) so asm1 stays non-negative.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* add carry is in bit 1, sub borrow in bit 0 */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, using the already-computed as1
     where that saves work: as2 = 2*(as1 + a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);			/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*(above) */
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = b0+b1+b2 and bsm1 = |b0-b1+b2|, folding the sign of
     the latter into vm1_neg (the product's sign is the XOR).  */
  cy = mpn_add (gp, b0, n, b2, t);	/* gp = b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0 + 2*b1 + 4*b2, mirroring the as2 computation.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds from the file-header analysis (ah/bh columns).  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

/* Where the five point products go.  v0, v1, vinf are final result
   positions in pp; vm1 and v2 sit in scratch.  Sizes in limbs noted.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the n low limbs, then add in the cross terms for the
     (at most 1) high limbs asm1[n] and bsm1[n].  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs: n-limb product plus corrections for the high limbs
     as1[n], bs1[n] (each at most 2), added with shifts as needed.  */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
      /* as1[n] == 2: add 2*bs1 to the high half.  */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)x(n+1) product writes 2n+2 limbs, clobbering vinf[1]
     (= pp[4n+1]); save and restore it around the call.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Combine the five point values into the final 3n+s+t limb product;
     vinf0 passes along the vinf limb that v1 overwrote.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
    305