/* Scraper navigation header (not part of the original source):
   Home | History | Annotate | Line # | Download | only in generic
   toom33_mul.c revision 1.1.1.2 */
      1 /* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
      2    size.  Or more accurately, bn <= an < (3/2)bn.
      3 
      4    Contributed to the GNU project by Torbjorn Granlund.
      5    Additional improvements by Marco Bodrato.
      6 
      7    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      8    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      9    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     10 
     11 Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
     12 
     13 This file is part of the GNU MP Library.
     14 
     15 The GNU MP Library is free software; you can redistribute it and/or modify
     16 it under the terms of the GNU Lesser General Public License as published by
     17 the Free Software Foundation; either version 3 of the License, or (at your
     18 option) any later version.
     19 
     20 The GNU MP Library is distributed in the hope that it will be useful, but
     21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     23 License for more details.
     24 
     25 You should have received a copy of the GNU Lesser General Public License
     26 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     27 
     28 
     29 #include "gmp.h"
     30 #include "gmp-impl.h"
     31 
     32 /* Evaluate in: -1, 0, +1, +2, +inf
     33 
     34   <-s--><--n--><--n--><--n-->
     35    ____ ______ ______ ______
     36   |_a3_|___a2_|___a1_|___a0_|
     37    |b3_|___b2_|___b1_|___b0_|
     38    <-t-><--n--><--n--><--n-->
     39 
     40   v0  =  a0         * b0          #   A(0)*B(0)
     41   v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
     42   vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
     43   v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
     44   vinf=          a2 *         b2  # A(inf)*B(inf)
     45 */
     46 
/* Compile-time reachability of the recursion targets in TOOM33_MUL_N_REC,
   so that provably dead branches of the dispatch below fold away.  For
   tuning or fat-binary builds the thresholds are runtime variables, so
   every branch must be kept alive.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
/* Basecase is reachable only if n (about a third of an operand just below
   MUL_TOOM33_THRESHOLD) can fall under MUL_TOOM22_THRESHOLD.  */
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
/* A toom33 recursion is reachable only if n (about a third of an operand
   just below MUL_TOOM44_THRESHOLD) can still reach MUL_TOOM33_THRESHOLD.  */
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif
     56 
     57 /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
     58    multiplication at the infinity point. We may have
     59    MAYBE_mul_basecase == 0, and still get s just below
     60    MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
     61    s == 1 and mpn_toom22_mul will crash.
     62 */
     63 
/* Balanced n x n recursive multiplication used for the point-value
   products: p[0..2n-1] = {a,n} * {b,n}, with ws as scratch for the
   recursive algorithms.  Dispatches on n; branches that MAYBE_* prove
   unreachable are compiled out.  (See the FIXME above regarding the
   balanced call at the infinity point.)  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
     75 
/* Multiply {ap,an} by {bp,bn} with three-way Toom-Cook, writing the
   an+bn limb product to pp (vinf ends at pp + 4*n + s + t = pp + an + bn).
   Size constraints: an >= bn, and with n = ceil(an/3) both high parts
   must be non-empty (0 < s,t <= n; see the header comment, which states
   bn <= an < (3/2)bn).  scratch must provide at least 5*n+5 limbs plus
   the recursion space starting at scratch_out -- presumably sized by
   mpn_toom33_mul_itch; confirm against gmp-impl.h.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* NOTE(review): fat-binary builds route mpn_mul_basecase through a
     cpuvec dispatch; this local constant presumably shadows the global
     initialization flag so the recursion macro skips the runtime check
     -- confirm against gmp-impl.h / fat support code.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;			/* nonzero iff A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;			/* temporary for a0+a2 resp. b0+b2 */
  mp_ptr as1, asm1, as2;	/* A(1), |A(-1)|, A(2); n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B(1), |B(-1)|, B(2); n+1 limbs each */

  /* Split each operand into three n-limb parts; the high parts a2/b2
     hold only s resp. t significant limbs.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;		/* limbs in a2 */
  t = bn - 2 * n;		/* limbs in b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation values live partly in scratch and partly in the not yet
     written product area pp.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0-a1+a2|.  */
  cy = mpn_add (gp, a0, n, a2, s);	/* gp = a0 + a2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: asm1 gets a1-(a0+a2); record the sign flip.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* high bit of cy is the add carry */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);	/* add carry */
      asm1[n] = cy - (cy2 & 1);	/* propagate gp's carry minus sub borrow */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: asm1 gets a1-(a0+a2); record the sign flip.  */
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, via 2*(as1 + a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);			/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*as2 */
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = b0+b1+b2 and bsm1 = |b0-b1+b2|; a second sign flip on
     the B side cancels the one from the A side (vm1_neg ^= 1).  */
  cy = mpn_add (gp, b0, n, b2, t);	/* gp = b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0 + 2*b1 + 4*b2, via 2*(bs1 + b2) - b0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);	/* 2*bs2 - b0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);			/* b1 + 2*b2 */
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);	/* b0 + 2*bs2 */
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds from the evaluation formulas in the header.  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

  /* Layout of the five point-value products; v0/v1/vinf overwrite the
     evaluation values stored in pp above, so computation order below
     matters.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the n-limb low parts, then add in the contributions
     of the (at most 1) high limbs by hand.  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs: n x n product plus hand-made corrections for the
     high limbs, which are at most 2 here.  */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
      /* as1[n] == 2: add 2*bs1.  */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
      /* bs1[n] == 2: add 2*as1.  */
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)x(n+1) product writes 2n+2 limbs, clobbering vinf[1];
     save it across the call and restore it.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point values into pp; vinf0 supplies the limb of
     vinf that v1 overwrote.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
    306