/* toom33_mul.c -- NetBSD src import of GMP's mpn/generic/toom33_mul.c,
   revision 1.1.1.4.  (Source-browser navigation chrome removed.)  */
/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
     38 
     39 
     40 #include "gmp-impl.h"
     41 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
     56 
/* Static reachability of the recursion paths in TOOM33_MUL_N_REC: when the
   MUL_* thresholds are compile-time constants, branches that can never be
   taken compile away.  Tuning and fat-binary builds must keep all paths
   alive, since their thresholds vary at run time.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
/* The recursive point products below are about an/3 limbs each, hence the
   factor of 3 in these threshold comparisons.  */
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif
     66 
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/
     73 
/* Recursive balanced n x n multiplication used for the five point products:
   dispatch to mul_basecase, toom22 or toom33 by size.  The MAYBE_* gates
   defined above let the compiler drop statically unreachable branches.
   ws is scratch space for the recursive toom calls.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
     85 
/* Multiply {ap,an} by {bp,bn} with 3-way Toom-Cook, writing the an+bn limb
   product to pp.  Balanced operands required: bn <= an < (3/2)bn.
   scratch is workspace; the pointer offsets below use 5*n+5 limbs locally
   (n as computed here) plus recursion space starting at scratch_out.
   Each operand is split into three n-limb parts (the most significant part
   being s resp. t limbs); the polynomials are evaluated at 0, +1, -1, +2
   and infinity, the five point products are formed recursively, and
   mpn_toom_interpolate_5pts recovers the result.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* NOTE(review): presumably shadows the identically-named global so that
     fat-binary builds skip run-time cpuvec initialization checks in the mpn
     calls below -- confirm against gmp-impl.h.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero iff A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;			/* temporary for a0+a2 resp. b0+b2 */
  mp_ptr as1, asm1, as2;	/* A at +1, -1 (absolute value), +2 */
  mp_ptr bs1, bsm1, bs2;	/* B at +1, -1 (absolute value), +2 */

  /* The three n-limb parts of each operand (see diagram above); a2 and b2
     are only s resp. t limbs long.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* part size: ceil(an/3) */

  s = an - 2 * n;		/* size of a2 */
  t = bn - 2 * n;		/* size of b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation storage; each point value is n+1 limbs.  as2, bs1 and bs2
     are placed in the product area pp, which is still free here.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0+a2-a1|; gp = a0+a2.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: take a1-(a0+a2) instead and flip the sign of vm1.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* add carry in bit 1, sub borrow in bit 0 */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);	/* gp's carry minus the subtract's borrow */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0+2*a1+4*a2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  /* as2 = 2*(as1+a2) - a0.  */
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  /* as2 = a0 + 2*(a1 + 2*a2).  */
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  /* as2 = 2*(as1+a2) - a0, via an explicit shift and subtract.  */
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = b0+b1+b2 and bsm1 = |b0+b2-b1|, mirroring the A side.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;		/* the two signs multiply */
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0+2*b1+4*b2, mirroring the A side.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds from the evaluation (see table at top of file).  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

  /* Layout of the five point products: v0, v1 and vinf overlap the result
     area pp; vm1 and v2 live in scratch.  scratch_out is what remains for
     the recursive calls.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply just the low n limbs recursively, then add in the cross
     products with the top limbs, which are at most 1 (asserted above).  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  /* Fold in as1[n]*bs1 and bs1[n]*as1; the top limbs are at most 2
     (asserted above), so each correction is a plain add or an
     add-of-double.  */
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The n+1 limb product spills one limb into vinf; save that limb and
     restore it after the recursive call.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Combine the five point values into the final an+bn limb product.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
    316