Home | History | Annotate | Line # | Download | only in generic
toom_interpolate_6pts.c revision 1.1.1.1.8.1
      1 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
      2 
      3    Contributed to the GNU project by Marco Bodrato.
      4 
      5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      8 
      9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
     10 
     11 This file is part of the GNU MP Library.
     12 
     13 The GNU MP Library is free software; you can redistribute it and/or modify
     14 it under the terms of the GNU Lesser General Public License as published by
     15 the Free Software Foundation; either version 3 of the License, or (at your
     16 option) any later version.
     17 
     18 The GNU MP Library is distributed in the hope that it will be useful, but
     19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     21 License for more details.
     22 
     23 You should have received a copy of the GNU Lesser General Public License
     24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     25 
     26 #include "gmp.h"
     27 #include "gmp-impl.h"
     28 
     29 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
     30 #ifndef mpn_divexact_by3
     31 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
     32 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
     33 #else
     34 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
     35 #endif
     36 #endif
     37 
     38 /* Interpolation for Toom-3.5, using the evaluation points: infinity,
     39    1, -1, 2, -2. More precisely, we want to compute
     40    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
     41    six values
     42 
     43      w5 = f(0),
     44      w4 = f(-1),
     45      w3 = f(1)
     46      w2 = f(-2),
     47      w1 = f(2),
     48      w0 = limit at infinity of f(x) / x^5,
     49 
     50    The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
     51    {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
     52    {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
     53    significant limbs small). f(-1) and f(-2) may be negative, signs
     54    determined by the flag bits. All intermediate results are positive.
     55    Inputs are destroyed.
     56 
     57    Interpolation sequence was taken from the paper: "Integer and
     58    Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
     59    Some slight variations were introduced: adaptation to "gmp
     60    instruction set", and a final saving of an operation by interlacing
     61    interpolation and recomposition phases.
     62 */
     63 
     64 void
     65 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
     66 			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
     67 			   mp_size_t w0n)
     68 {
     69   mp_limb_t cy;
     70   /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
     71   mp_limb_t cy4, cy6, embankment;
     72 
     73   ASSERT( n > 0 );
     74   ASSERT( 2*n >= w0n && w0n > 0 );
     75 
     76 #define w5  pp					/* 2n   */
     77 #define w3  (pp + 2 * n)			/* 2n+1 */
     78 #define w0  (pp + 5 * n)			/* w0n  */
     79 
     80   /* Interpolate with sequence:
     81      W2 =(W1 - W2)>>2
     82      W1 =(W1 - W5)>>1
     83      W1 =(W1 - W2)>>1
     84      W4 =(W3 - W4)>>1
     85      W2 =(W2 - W4)/3
     86      W3 = W3 - W4 - W5
     87      W1 =(W1 - W3)/3
     88      // Last steps are mixed with recomposition...
     89      W2 = W2 - W0<<2
     90      W4 = W4 - W2
     91      W3 = W3 - W1
     92      W2 = W2 - W0
     93   */
     94 
     95   /* W2 =(W1 - W2)>>2 */
     96   if (flags & toom6_vm2_neg)
     97     mpn_add_n (w2, w1, w2, 2 * n + 1);
     98   else
     99     mpn_sub_n (w2, w1, w2, 2 * n + 1);
    100   mpn_rshift (w2, w2, 2 * n + 1, 2);
    101 
    102   /* W1 =(W1 - W5)>>1 */
    103   w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
    104   mpn_rshift (w1, w1, 2 * n + 1, 1);
    105 
    106   /* W1 =(W1 - W2)>>1 */
    107 #if HAVE_NATIVE_mpn_rsh1sub_n
    108   mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
    109 #else
    110   mpn_sub_n (w1, w1, w2, 2 * n + 1);
    111   mpn_rshift (w1, w1, 2 * n + 1, 1);
    112 #endif
    113 
    114   /* W4 =(W3 - W4)>>1 */
    115   if (flags & toom6_vm1_neg)
    116     {
    117 #if HAVE_NATIVE_mpn_rsh1add_n
    118       mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
    119 #else
    120       mpn_add_n (w4, w3, w4, 2 * n + 1);
    121       mpn_rshift (w4, w4, 2 * n + 1, 1);
    122 #endif
    123     }
    124   else
    125     {
    126 #if HAVE_NATIVE_mpn_rsh1sub_n
    127       mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
    128 #else
    129       mpn_sub_n (w4, w3, w4, 2 * n + 1);
    130       mpn_rshift (w4, w4, 2 * n + 1, 1);
    131 #endif
    132     }
    133 
    134   /* W2 =(W2 - W4)/3 */
    135   mpn_sub_n (w2, w2, w4, 2 * n + 1);
    136   mpn_divexact_by3 (w2, w2, 2 * n + 1);
    137 
    138   /* W3 = W3 - W4 - W5 */
    139   mpn_sub_n (w3, w3, w4, 2 * n + 1);
    140   w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
    141 
    142   /* W1 =(W1 - W3)/3 */
    143   mpn_sub_n (w1, w1, w3, 2 * n + 1);
    144   mpn_divexact_by3 (w1, w1, 2 * n + 1);
    145 
    146   /*
    147     [1 0 0 0 0 0;
    148      0 1 0 0 0 0;
    149      1 0 1 0 0 0;
    150      0 1 0 1 0 0;
    151      1 0 1 0 1 0;
    152      0 0 0 0 0 1]
    153 
    154     pp[] prior to operations:
    155      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
    156 
    157     summation scheme for remaining operations:
    158      |______________5|n_____4|n_____3|n_____2|n______|n______|pp
    159      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
    160 				    || H w4  | L w4  |
    161 		    || H w2  | L w2  |
    162 	    || H w1  | L w1  |
    163 			    ||-H w1  |-L w1  |
    164 		     |-H w0  |-L w0 ||-H w2  |-L w2  |
    165   */
    166   cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
    167   MPN_INCR_U (pp + 3 * n + 1, n, cy);
    168 
    169   /* W2 -= W0<<2 */
    170 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
    171 #if HAVE_NATIVE_mpn_sublsh2_n_ip1
    172   cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
    173 #else
    174   cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
    175 #endif
    176 #else
    177   /* {W4,2*n+1} is now free and can be overwritten. */
    178   cy = mpn_lshift(w4, w0, w0n, 2);
    179   cy+= mpn_sub_n(w2, w2, w4, w0n);
    180 #endif
    181   MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
    182 
    183   /* W4L = W4L - W2L */
    184   cy = mpn_sub_n (pp + n, pp + n, w2, n);
    185   MPN_DECR_U (w3, 2 * n + 1, cy);
    186 
    187   /* W3H = W3H + W2L */
    188   cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
    189   /* W1L + W2H */
    190   cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
    191   MPN_INCR_U (w1 + n, n + 1, cy);
    192 
    193   /* W0 = W0 + W1H */
    194   if (LIKELY (w0n > n))
    195     cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
    196   else
    197     cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
    198 
    199   /*
    200     summation scheme for the next operation:
    201      |...____5|n_____4|n_____3|n_____2|n______|n______|pp
    202      |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
    203 		     ...-w0___|-w1_w2 |
    204   */
    205   /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
    206   cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
    207 
    208   /* embankment is a "dirty trick" to avoid carry/borrow propagation
    209      beyond allocated memory */
    210   embankment = w0[w0n - 1] - 1;
    211   w0[w0n - 1] = 1;
    212   if (LIKELY (w0n > n)) {
    213     if (cy4 > cy6)
    214       MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
    215     else
    216       MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
    217     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
    218     MPN_INCR_U (w0 + n, w0n - n, cy6);
    219   } else {
    220     MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
    221     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
    222   }
    223   w0[w0n - 1] += embankment;
    224 
    225 #undef w5
    226 #undef w3
    227 #undef w0
    228 
    229 }
    230