/* toom33_mul.c — GNU MP Library source, revision 1.1.1.4
   (imported from a CVS annotate view; annotation residue stripped where
   edited).  */
/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

     40      1.1  mrg #include "gmp-impl.h"
     41      1.1  mrg 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
     56      1.1  mrg 
/* Decide at compile time which multiplication routines the recursion
   macro below can ever dispatch to.  For tune-program or fat builds the
   thresholds are runtime variables, so every path must be compiled in;
   otherwise the constant thresholds let dead branches be pruned.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
/* Basecase is reachable only if a recursive piece (about a third of an
   operand just below MUL_TOOM33_THRESHOLD) can fall below
   MUL_TOOM22_THRESHOLD.  */
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
/* Recursive toom33 is reachable only if a piece of an operand just
   below MUL_TOOM44_THRESHOLD can still reach MUL_TOOM33_THRESHOLD.  */
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif
     66      1.1  mrg 
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Recursively multiply the equal-sized operands {a,n} and {b,n} into
   {p,2n}, choosing the cheapest algorithm that the MAYBE_* settings
   above allow; ws is scratch space for the toom routines.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
     85      1.1  mrg 
/* Multiply {ap,an} by {bp,bn}, bn <= an < (3/2)bn, writing the an+bn
   limb product to pp.  Classic Toom-3: split each operand into three
   n-limb pieces, evaluate at 0, +1, -1, +2 and infinity, multiply the
   evaluations pointwise (recursively), then interpolate.  scratch must
   be sized per the caller's itch computation — presumably 5n+5 limbs
   plus recursion space, matching scratch_out below; confirm against
   gmp-impl.h.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* Shadows the like-named global; presumably tells fat-binary dispatch
     macros the cpuvec is already initialized so the check is skipped
     here -- confirm against gmp-impl.h.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;			/* 1 iff vm1 = A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;			/* temporary for the shared sums a0+a2, b0+b2 */
  mp_ptr as1, asm1, as2;	/* A(1), |A(-1)|, A(2): n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B(1), |B(-1)|, B(2): n+1 limbs each */

  /* The three pieces of each operand: a2 has s limbs, b2 has t limbs,
     all others n limbs.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* piece size: ceil(an/3) */

  s = an - 2 * n;		/* limbs in the top piece a2 */
  t = bn - 2 * n;		/* limbs in the top piece b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* The evaluation buffers live partly in scratch and partly in the
     product area pp, which is not yet live.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);	/* gp = a0 + a2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  /* mpn_add_n_sub_n returns 2*add_carry + sub_borrow.  */
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: form a1 - (a0+a2) instead and record the sign.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);	/* as1 = a0 + a1 + a2 */
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: store |A(-1)| = a1 - (a0+a2), record the sign.  */
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);	/* asm1 = a0 - a1 + a2 */
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2 a1 + 4 a2, as 2*(as1 + a2) - a0 or as
     a0 + 2*(a1 + 2 a2), depending on available native primitives.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);	/* as2 = as1 + a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* as2 = 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);	/* as2 = a1 + 2 a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* as2 = a0 + 2*as2 */
#else
  cy = mpn_add_n (as2, a2, as1, s);	/* as2 = as1 + a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);	/* as2 *= 2 */
  cy -= mpn_sub_n (as2, as2, a0, n);	/* as2 -= a0 */
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1 (mirror of the A evaluations above).  */
  cy = mpn_add (gp, b0, n, b2, t);	/* gp = b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      /* b0+b2 < b1: a second sign flip cancels a first, hence xor.  */
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);	/* bs1 = b0 + b1 + b2 */
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);	/* bsm1 = b0 - b1 + b2 */
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0 + 2 b1 + 4 b2 (mirror of as2).  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds, matching the table in the header comment.  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

  /* Layout of the five pointwise products.  v1 (2n+1 limbs at pp+2n)
     and vinf (at pp+4n) share limb pp[4n]; see vinf0 below.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Recurse on n limbs only, then add the high-limb cross products
     asm1[n]*{bsm1,n} and bsm1[n]*{asm1,n} at offset n; the first branch
     also accounts for asm1[n]*bsm1[n] (both high limbs are <= 1).  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs: recurse on n limbs, then add the high-limb
     corrections as1[n]*{bs1,n}, bs1[n]*{as1,n} and as1[n]*bs1[n]*B^n,
     with each high limb 0, 1 or 2 per the asserts above.  */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
      /* as1[n] == 2 */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
      /* bs1[n] == 2 */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The n+1 limb recursion writes 2n+2 limbs and so clobbers vinf[1];
     save and restore it (vinf[0] is preserved via vinf0).  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five evaluations into the final product in pp.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
    316