/* toom33_mul.c -- NetBSD src import of GMP, revision 1.1.1.3.  */
/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
     40      1.1  mrg #include "gmp.h"
     41      1.1  mrg #include "gmp-impl.h"
     42      1.1  mrg 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
     57      1.1  mrg 
     58  1.1.1.2  mrg #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
     59      1.1  mrg #define MAYBE_mul_basecase 1
     60      1.1  mrg #define MAYBE_mul_toom33   1
     61      1.1  mrg #else
     62      1.1  mrg #define MAYBE_mul_basecase						\
     63      1.1  mrg   (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
     64      1.1  mrg #define MAYBE_mul_toom33						\
     65      1.1  mrg   (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
     66      1.1  mrg #endif
     67      1.1  mrg 
     68      1.1  mrg /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
     69      1.1  mrg    multiplication at the infinity point. We may have
     70      1.1  mrg    MAYBE_mul_basecase == 0, and still get s just below
     71      1.1  mrg    MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
     72      1.1  mrg    s == 1 and mpn_toom22_mul will crash.
     73      1.1  mrg */
     74      1.1  mrg 
     75      1.1  mrg #define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
     76      1.1  mrg   do {									\
     77      1.1  mrg     if (MAYBE_mul_basecase						\
     78      1.1  mrg 	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
     79      1.1  mrg       mpn_mul_basecase (p, a, n, b, n);					\
     80      1.1  mrg     else if (! MAYBE_mul_toom33						\
     81      1.1  mrg 	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
     82      1.1  mrg       mpn_toom22_mul (p, a, n, b, n, ws);				\
     83      1.1  mrg     else								\
     84      1.1  mrg       mpn_toom33_mul (p, a, n, b, n, ws);				\
     85      1.1  mrg   } while (0)
     86      1.1  mrg 
/* Toom-3 multiplication: {pp, an+bn} = {ap,an} * {bp,bn}, with
   bn <= an < (3/2)bn.  Each operand is split into three blocks of n
   limbs (top blocks s resp. t limbs), the five point values from the
   header comment are computed, and the product is recovered by
   mpn_toom_interpolate_5pts.  scratch is workspace; the layout below
   uses limbs up to scratch_out = scratch + 5*n + 5 plus whatever the
   recursive calls need (size it via the itch function, not in view
   here).  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* NOTE(review): presumably pacifies the fat-binary cpuvec
     initialization check for the direct mpn_* calls below -- confirm
     against gmp-impl.h.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero iff vm1 = A(-1)*B(-1) < 0 */
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;	/* A evaluated at +1, -1, +2 (n+1 limbs each) */
  mp_ptr bs1, bsm1, bs2;	/* B evaluated at +1, -1, +2 (n+1 limbs each) */

/* The three blocks of each operand; a2 has s limbs, b2 has t limbs.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* block size: ceil(an/3) */

  s = an - 2 * n;		/* limbs in top block a2 */
  t = bn - 2 * n;		/* limbs in top block b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation buffers: asm1/bsm1/as1 live in scratch, while as2, bs1
     and bs2 overlay parts of pp that hold no live data yet.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;			/* gp = a0+a2 (later b0+b2), n+carry */

  vm1_neg = 0;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0+a2-a1|; A(-1) is negative
     exactly when a0+a2 < a1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* mpn_add_n_sub_n returns the add carry in bit 1 and the
	 subtract borrow in bit 0.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;		/* a1 > gp here, so no high limb */
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);	/* carry of a0+a2 plus carry of +a1 */
      asm1[n] = cy - (cy2 & 1);	/* carry of a0+a2 minus borrow of -a1 */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, as 2*(as1 + a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);		/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*as2 */
#else
  /* Generic path: as2 = as1 + a2, doubled, minus a0.  */
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1, same scheme as for the A operand.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;		/* B(-1) < 0 flips the sign of vm1 */
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0 + 2*b1 + 4*b2, same scheme as as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* Carry-limb bounds implied by the evaluation identities (also
     listed in the header comment: ah/bh columns).  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

/* Layout of the five point values across pp and scratch.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the n-limb parts, then fold in the cross terms
     asm1[n]*bsm1 and bsm1[n]*asm1 (each carry limb is 0 or 1).  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  /* Fold in as1[n]*bs1 and bs1[n]*as1; each carry limb is 0, 1 or 2,
     so add once, twice (addlsh1/addmul_1 by 2), or not at all.  */
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)x(n+1) product writes 2n+2 limbs at v1 = pp+2n and so
     clobbers vinf[1]; save it across the call.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Combine the five point values into the final 3n+s+t limb product.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
    317