/* toom3_sqr.c -- revision 1.1.1.1.8.1 */
/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

     28          1.1  mrg #include "gmp.h"
     29          1.1  mrg #include "gmp-impl.h"
     30          1.1  mrg 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2 #   A(0)^2
  v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
  vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
  v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
  vinf=          a2 ^2 # A(inf)^2
*/
     43          1.1  mrg 
     44  1.1.1.1.8.1  tls #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
     45          1.1  mrg #define MAYBE_sqr_basecase 1
     46          1.1  mrg #define MAYBE_sqr_toom3   1
     47          1.1  mrg #else
     48          1.1  mrg #define MAYBE_sqr_basecase						\
     49          1.1  mrg   (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
     50          1.1  mrg #define MAYBE_sqr_toom3							\
     51          1.1  mrg   (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
     52          1.1  mrg #endif
     53          1.1  mrg 
     54          1.1  mrg #define TOOM3_SQR_REC(p, a, n, ws)					\
     55          1.1  mrg   do {									\
     56          1.1  mrg     if (MAYBE_sqr_basecase						\
     57          1.1  mrg 	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
     58          1.1  mrg       mpn_sqr_basecase (p, a, n);					\
     59          1.1  mrg     else if (! MAYBE_sqr_toom3						\
     60          1.1  mrg 	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
     61          1.1  mrg       mpn_toom2_sqr (p, a, n, ws);					\
     62          1.1  mrg     else								\
     63          1.1  mrg       mpn_toom3_sqr (p, a, n, ws);					\
     64          1.1  mrg   } while (0)
     65          1.1  mrg 
     66          1.1  mrg void
     67          1.1  mrg mpn_toom3_sqr (mp_ptr pp,
     68          1.1  mrg 	       mp_srcptr ap, mp_size_t an,
     69          1.1  mrg 	       mp_ptr scratch)
     70          1.1  mrg {
     71  1.1.1.1.8.1  tls   const int __gmpn_cpuvec_initialized = 1;
     72          1.1  mrg   mp_size_t n, s;
     73          1.1  mrg   mp_limb_t cy, vinf0;
     74          1.1  mrg   mp_ptr gp;
     75          1.1  mrg   mp_ptr as1, asm1, as2;
     76          1.1  mrg 
     77          1.1  mrg #define a0  ap
     78          1.1  mrg #define a1  (ap + n)
     79          1.1  mrg #define a2  (ap + 2*n)
     80          1.1  mrg 
     81          1.1  mrg   n = (an + 2) / (size_t) 3;
     82          1.1  mrg 
     83          1.1  mrg   s = an - 2 * n;
     84          1.1  mrg 
     85          1.1  mrg   ASSERT (0 < s && s <= n);
     86          1.1  mrg 
     87          1.1  mrg   as1 = scratch + 4 * n + 4;
     88          1.1  mrg   asm1 = scratch + 2 * n + 2;
     89          1.1  mrg   as2 = pp + n + 1;
     90          1.1  mrg 
     91          1.1  mrg   gp = scratch;
     92          1.1  mrg 
     93          1.1  mrg   /* Compute as1 and asm1.  */
     94          1.1  mrg   cy = mpn_add (gp, a0, n, a2, s);
     95          1.1  mrg #if HAVE_NATIVE_mpn_add_n_sub_n
     96          1.1  mrg   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
     97          1.1  mrg     {
     98          1.1  mrg       cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
     99          1.1  mrg       as1[n] = cy >> 1;
    100          1.1  mrg       asm1[n] = 0;
    101          1.1  mrg     }
    102          1.1  mrg   else
    103          1.1  mrg     {
    104          1.1  mrg       mp_limb_t cy2;
    105          1.1  mrg       cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
    106          1.1  mrg       as1[n] = cy + (cy2 >> 1);
    107          1.1  mrg       asm1[n] = cy - (cy2 & 1);
    108          1.1  mrg     }
    109          1.1  mrg #else
    110          1.1  mrg   as1[n] = cy + mpn_add_n (as1, gp, a1, n);
    111          1.1  mrg   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    112          1.1  mrg     {
    113          1.1  mrg       mpn_sub_n (asm1, a1, gp, n);
    114          1.1  mrg       asm1[n] = 0;
    115          1.1  mrg     }
    116          1.1  mrg   else
    117          1.1  mrg     {
    118          1.1  mrg       cy -= mpn_sub_n (asm1, gp, a1, n);
    119          1.1  mrg       asm1[n] = cy;
    120          1.1  mrg     }
    121          1.1  mrg #endif
    122          1.1  mrg 
    123          1.1  mrg   /* Compute as2.  */
    124          1.1  mrg #if HAVE_NATIVE_mpn_rsblsh1_n
    125          1.1  mrg   cy = mpn_add_n (as2, a2, as1, s);
    126          1.1  mrg   if (s != n)
    127          1.1  mrg     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
    128          1.1  mrg   cy += as1[n];
    129          1.1  mrg   cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
    130          1.1  mrg #else
    131          1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
    132          1.1  mrg   cy  = mpn_addlsh1_n (as2, a1, a2, s);
    133          1.1  mrg   if (s != n)
    134          1.1  mrg     cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
    135          1.1  mrg   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
    136          1.1  mrg #else
    137          1.1  mrg   cy = mpn_add_n (as2, a2, as1, s);
    138          1.1  mrg   if (s != n)
    139          1.1  mrg     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
    140          1.1  mrg   cy += as1[n];
    141          1.1  mrg   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
    142          1.1  mrg   cy -= mpn_sub_n (as2, as2, a0, n);
    143          1.1  mrg #endif
    144          1.1  mrg #endif
    145          1.1  mrg   as2[n] = cy;
    146          1.1  mrg 
    147          1.1  mrg   ASSERT (as1[n] <= 2);
    148          1.1  mrg   ASSERT (asm1[n] <= 1);
    149          1.1  mrg 
    150          1.1  mrg #define v0    pp				/* 2n */
    151          1.1  mrg #define v1    (pp + 2 * n)			/* 2n+1 */
    152          1.1  mrg #define vinf  (pp + 4 * n)			/* s+s */
    153          1.1  mrg #define vm1   scratch				/* 2n+1 */
    154          1.1  mrg #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
    155          1.1  mrg #define scratch_out  (scratch + 5 * n + 5)
    156          1.1  mrg 
    157          1.1  mrg   /* vm1, 2n+1 limbs */
    158          1.1  mrg #ifdef SMALLER_RECURSION
    159          1.1  mrg   TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
    160          1.1  mrg   cy = 0;
    161          1.1  mrg   if (asm1[n] != 0)
    162          1.1  mrg     cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
    163          1.1  mrg   if (asm1[n] != 0)
    164          1.1  mrg     cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
    165          1.1  mrg   vm1[2 * n] = cy;
    166          1.1  mrg #else
    167          1.1  mrg   TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
    168          1.1  mrg #endif
    169          1.1  mrg 
    170          1.1  mrg   TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
    171          1.1  mrg 
    172          1.1  mrg   TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */
    173          1.1  mrg 
    174          1.1  mrg   vinf0 = vinf[0];				/* v1 overlaps with this */
    175          1.1  mrg 
    176          1.1  mrg #ifdef SMALLER_RECURSION
    177          1.1  mrg   /* v1, 2n+1 limbs */
    178          1.1  mrg   TOOM3_SQR_REC (v1, as1, n, scratch_out);
    179          1.1  mrg   if (as1[n] == 1)
    180          1.1  mrg     {
    181          1.1  mrg       cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    182          1.1  mrg     }
    183          1.1  mrg   else if (as1[n] != 0)
    184          1.1  mrg     {
    185          1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
    186          1.1  mrg       cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
    187          1.1  mrg #else
    188          1.1  mrg       cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
    189          1.1  mrg #endif
    190          1.1  mrg     }
    191          1.1  mrg   else
    192          1.1  mrg     cy = 0;
    193          1.1  mrg   if (as1[n] == 1)
    194          1.1  mrg     {
    195          1.1  mrg       cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    196          1.1  mrg     }
    197          1.1  mrg   else if (as1[n] != 0)
    198          1.1  mrg     {
    199          1.1  mrg #if HAVE_NATIVE_mpn_addlsh1_n
    200          1.1  mrg       cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
    201          1.1  mrg #else
    202          1.1  mrg       cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
    203          1.1  mrg #endif
    204          1.1  mrg     }
    205          1.1  mrg   v1[2 * n] = cy;
    206          1.1  mrg #else
    207          1.1  mrg   cy = vinf[1];
    208          1.1  mrg   TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
    209          1.1  mrg   vinf[1] = cy;
    210          1.1  mrg #endif
    211          1.1  mrg 
    212          1.1  mrg   TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */
    213          1.1  mrg 
    214          1.1  mrg   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
    215          1.1  mrg }
    216