/* toom3_sqr.c revision 1.1.1.1.8.1 (mpn/generic) -- web-viewer header
   retained as a comment so the file stays compilable. */
      1 /* mpn_toom3_sqr -- Square {ap,an}.
      2 
      3    Contributed to the GNU project by Torbjorn Granlund.
      4    Additional improvements by Marco Bodrato.
      5 
      6    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      7    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      8    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      9 
     10 Copyright 2006, 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc.
     11 
     12 This file is part of the GNU MP Library.
     13 
     14 The GNU MP Library is free software; you can redistribute it and/or modify
     15 it under the terms of the GNU Lesser General Public License as published by
     16 the Free Software Foundation; either version 3 of the License, or (at your
     17 option) any later version.
     18 
     19 The GNU MP Library is distributed in the hope that it will be useful, but
     20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     22 License for more details.
     23 
     24 You should have received a copy of the GNU Lesser General Public License
     25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     26 
     27 
     28 #include "gmp.h"
     29 #include "gmp-impl.h"
     30 
     31 /* Evaluate in: -1, 0, +1, +2, +inf
     32 
     33   <-s--><--n--><--n-->
     34    ____ ______ ______
     35   |_a2_|___a1_|___a0_|
     36 
     37   v0  =  a0         ^2 #   A(0)^2
     38   v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
     39   vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
     40   v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
     41   vinf=          a2 ^2 # A(inf)^2
     42 */
     43 
/* Compile-time reachability of the recursion paths in TOOM3_SQR_REC below.
   When tuning or building a fat binary the thresholds are runtime variables,
   so every path must be compiled in.  Otherwise a path is compiled only if
   the threshold ordering can reach it: since toom3 splits its operand in
   three (n ~= an/3), the basecase is reachable only if SQR_TOOM3_THRESHOLD/3
   can fall below SQR_TOOM2_THRESHOLD, and a recursive toom3 call only if
   toom4 has not already taken over at that size.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3   1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
     53 
/* Square {a,n} into {p,2n}, dispatching on n to the cheapest method whose
   path is compiled in (see MAYBE_sqr_* above); ws is scratch space passed
   down to the recursive toom routines.  */
#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
     65 
/* Square {ap,an} into {pp,2*an} by Toom-3: split ap into three pieces,
   square the polynomial evaluations at 0, +1, -1, +2 and inf (see the
   file-top diagram), then interpolate.  scratch provides working space;
   the layout below consumes limbs up to scratch + 5*n + 5, with the
   remainder (scratch_out) passed to the recursive squarings.
   NOTE(review): required scratch size should match mpn_toom3_sqr_itch --
   confirm against gmp-impl.h; not visible from this file.  */
void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  /* NOTE(review): presumably shadows the global fat-binary flag of the
     same name so that calls made from TOOM3_SQR_REC skip the cpuvec
     initialization test -- confirm against gmp-impl.h.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

  /* The three pieces of the operand: a0 and a1 of n limbs each, a2 of
     s limbs (0 < s <= n).  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;		/* size of high piece a2 */

  ASSERT (0 < s && s <= n);

  /* Evaluation points, each n+1 limbs.  as2 is parked in pp space that
     is not yet needed for a product.  */
  as1 = scratch + 4 * n + 4;	/* A(1)  = a0 + a1 + a2 */
  asm1 = scratch + 2 * n + 2;	/* A(-1) = a0 - a1 + a2, stored as |.| */
  as2 = pp + n + 1;		/* A(2)  = a0 + 2*a1 + 4*a2 */

  gp = scratch;			/* temporary for a0 + a2 */

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: form a1-(a0+a2) so asm1 is stored non-negative;
	 the sign is irrelevant since asm1 is only squared.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* add carry lives in bit 1 */
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);	/* add carries combined */
      asm1[n] = cy - (cy2 & 1);	/* sub borrow lives in bit 0 */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: negate the difference; squaring makes the sign
	 irrelevant.  */
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, reusing as1 where that saves work:
     as2 = 2*(as1 + a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);			/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*as2 */
#else
  /* Generic fallback: as2 = 2*(as1 + a2) - a0 via shift and subtract.  */
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Top-limb bounds stated in the file-top comment (ah <= 2, |ah| <= 1);
     the SMALLER_RECURSION fixups below rely on them.  */
  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);

  /* The five point squares: v0/v1/vinf land directly in pp, vm1/v2 in
     scratch.  Note v1 and vinf overlap (handled below via vinf0 and the
     saved vinf[1]).  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Square only the low n limbs, then patch in the top limb h = asm1[n]:
     (x + h*B^n)^2 = x^2 + 2*h*x*B^n + h^2*B^2n, and h <= 1 so h^2 = h.
     The two conditional adds below supply the 2*h*x term (x added twice
     into the high half); h itself enters cy for limb 2n.  */
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
#endif

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs.  Same top-limb patching as for vm1, but here
     h = as1[n] may be 1 or 2; for h == 2 the addlsh1/addmul_1 variants
     add 2*x per pass and 2*h contributes h^2 = 4 to cy.  */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* Squaring n+1 limbs writes 2n+2 limbs at v1, clobbering vinf[1]
     (v1 + 2n+1 == vinf + 1); save and restore it.  vinf[0] was already
     saved in vinf0 above.  */
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point values into pp; half == 0, and vinf0 is the
     saved low limb of vinf that v1 overwrote.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}
    216