/* toom33_mul.c, revision 1.1.1.2 (imported GMP source; viewer/annotation
   header removed so the file is valid C). */
/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

     28      1.1  mrg 
#include "gmp.h"
#include "gmp-impl.h"

/* Toom-3 evaluation points: -1, 0, +1, +2, +inf.

  <-s--><--n--><--n--><--n-->
   ____ ______ ______ ______
  |_a3_|___a2_|___a1_|___a0_|
   |b3_|___b2_|___b1_|___b0_|
   <-t-><--n--><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/

/* Decide at compile time which recursive multiply paths can ever be taken,
   so dead branches of TOOM33_MUL_N_REC can be optimized away.  Tune and
   fat-binary builds must keep all paths alive since thresholds vary at
   run time.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Recursive balanced n x n multiply: dispatch to basecase, toom22, or
   toom33 (self-recursion) based on the size thresholds, pruned by the
   MAYBE_* compile-time flags above.  ws is scratch space.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
     75      1.1  mrg 
/* Multiply {ap,an} by {bp,bn} into {pp, an+bn} using Toom-3.
   Requires bn <= an < (3/2)bn (enforced via the s/t ASSERTs below).
   scratch must hold the temporaries laid out at the offsets used below
   (vm1, v2, asm1, bsm1, as1, plus scratch_out for recursive calls).  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* NOTE(review): shadows a global of the same name; presumably forces
     direct (non-cpuvec-redirected) calls in fat-binary builds — confirm
     against gmp-impl.h.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero when vm1 = A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;	/* A evaluated at +1, -1, +2 (n+1 limbs each) */
  mp_ptr bs1, bsm1, bs2;	/* B evaluated at +1, -1, +2 (n+1 limbs each) */

/* The three n-limb pieces of each operand (the top piece is short: s or t).  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;		/* size of top piece a2 */
  t = bn - 2 * n;		/* size of top piece b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation temporaries: as1/asm1/bsm1 live in scratch, while as2, bs1
     and bs2 temporarily use parts of pp not yet holding results.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;			/* gp holds a0+a2 (then b0+b2), n+1 limbs */

  vm1_neg = 0;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0-a1+a2|, tracking the sign of
     the latter in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: compute a1-(a0+a2) to keep asm1 nonnegative.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* add carry is in bit 1 of the combined cy */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);	/* propagate the subtract borrow (bit 0) */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0+2a1+4a2, built as 2*(as1+a2) - a0 or via addlsh1.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);			/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*(a1+2a2) */
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = b0+b1+b2 and bsm1 = |b0-b1+b2|; XOR the sign of
     B(-1) into vm1_neg so it reflects the sign of the product vm1.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0+2b1+4b2, mirroring the as2 computation.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds from the evaluation formulas (see file header).  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

/* Where each point product lands; v0/v1/vinf go straight into pp.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the n-limb bodies, then patch in the cross terms from
     the (at most 1) high limbs.  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs: n-limb product plus high-limb cross terms, where the
     high limbs as1[n], bs1[n] may be 0, 1 or 2.  */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
      /* as1[n] == 2: add 2*bs1.  */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
      /* bs1[n] == 2: add 2*as1.  */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)-limb v1 product overruns into vinf[1]; save and restore.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point values into the 6n-ish limb result in pp.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}