/* mpn/generic/toom_interpolate_12pts.c — revision 1.1.1.4 */
      1 /* Interpolation for the algorithm Toom-Cook 6.5-way.
      2 
      3    Contributed to the GNU project by Marco Bodrato.
      4 
      5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      8 
      9 Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
     10 
     11 This file is part of the GNU MP Library.
     12 
     13 The GNU MP Library is free software; you can redistribute it and/or modify
     14 it under the terms of either:
     15 
     16   * the GNU Lesser General Public License as published by the Free
     17     Software Foundation; either version 3 of the License, or (at your
     18     option) any later version.
     19 
     20 or
     21 
     22   * the GNU General Public License as published by the Free Software
     23     Foundation; either version 2 of the License, or (at your option) any
     24     later version.
     25 
     26 or both in parallel, as here.
     27 
     28 The GNU MP Library is distributed in the hope that it will be useful, but
     29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     31 for more details.
     32 
     33 You should have received copies of the GNU General Public License and the
     34 GNU Lesser General Public License along with the GNU MP Library.  If not,
     35 see https://www.gnu.org/licenses/.  */
     36 
     37 
     38 #include "gmp-impl.h"
     39 
     40 
     41 #if HAVE_NATIVE_mpn_sublsh_n
     42 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
     43 #else
     44 static mp_limb_t
     45 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
     46 {
     47 #if USE_MUL_1 && 0
     48   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
     49 #else
     50   mp_limb_t __cy;
     51   __cy = mpn_lshift(ws,src,n,s);
     52   return    __cy + mpn_sub_n(dst,dst,ws,n);
     53 #endif
     54 }
     55 #endif
     56 
     57 #if HAVE_NATIVE_mpn_addlsh_n
     58 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
     59 #else
     60 static mp_limb_t
     61 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
     62 {
     63 #if USE_MUL_1 && 0
     64   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
     65 #else
     66   mp_limb_t __cy;
     67   __cy = mpn_lshift(ws,src,n,s);
     68   return    __cy + mpn_add_n(dst,dst,ws,n);
     69 #endif
     70 }
     71 #endif
     72 
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
/* {dst,nd} -= {src,ns} >> s  (nd >= ns, 0 < s < GMP_NUMB_BITS):
   first subtract the right-shifted low limb, then subtract the remaining
   ns-1 limbs shifted left by GMP_NUMB_BITS-s one limb position lower,
   propagating the borrow through the rest of dst.  ws is scratch for
   DO_mpn_sublsh_n.  */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif
     85 
     86 
/* Hard limits: the constant shift counts and divisors below assume limbs
   of a reasonable width; refuse to build otherwise.  */
#if GMP_NUMB_BITS < 21
#error Not implemented: Both sublsh_n(,,,20) should be corrected.
#endif

#if GMP_NUMB_BITS < 16
#error Not implemented: divexact_by42525 needs splitting.
#endif

#if GMP_NUMB_BITS < 12
#error Not implemented: Hard to adapt...
#endif

/* FIXME: tuneup should decide the best variant */
/* Tuning knobs: when set to 1, prefer a single mpn_{add,sub}mul_1 by a
   small constant over the equivalent sequence of plain and shifted
   add/sub calls in the interpolation code below.  */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif

/* Binary (2-adic) inverses of the odd divisors modulo 2^GMP_NUMB_BITS,
   used as the precomputed-inverse argument of mpn_pi1_bdiv_q_1 below.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

  /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif

/* Exact-division helpers.  Each divides {src,size} by the stated constant,
   picking the fastest available primitive: mpn_bdiv_dbm1 (for 255, when
   the limb width is a multiple of 8 bits), mpn_pi1_bdiv_q_1 with a
   precomputed inverse, or plain mpn_divexact_1.  The "x4" variants pass
   the odd factor plus a shift count of 2 (divisor * 4), as the fallback
   CNST_LIMB(...)<<2 forms show.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by9x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

#ifndef mpn_divexact_by2835x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif
    166 
    167 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
    168    points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
    169    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
    170    degree 11 (or 10), given the 12 (rsp. 11) values:
    171 
    172      r0 = limit at infinity of f(x) / x^11,
    173      r1 = f(4),f(-4),
    174      r2 = f(2),f(-2),
    175      r3 = f(1),f(-1),
    176      r4 = f(1/4),f(-1/4),
    177      r5 = f(1/2),f(-1/2),
    178      r6 = f(0).
    179 
    180    All couples of the form f(n),f(-n) must be already mixed with
    181    toom_couple_handling(f(n),...,f(-n),...)
    182 
    183    The result is stored in {pp, spt + 7*n (or 6*n)}.
    184    At entry, r6 is stored at {pp, 2n},
    185    r4 is stored at {pp + 3n, 3n + 1}.
    186    r2 is stored at {pp + 7n, 3n + 1}.
    187    r0 is stored at {pp +11n, spt}.
    188 
    189    The other values are 3n+1 limbs each (with most significant limbs small).
    190 
    191    Negative intermediate results are stored two-complemented.
    192    Inputs are destroyed.
    193 */
    194 
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;			/* 3n: the offset between value rows in pp */
  mp_size_t n3p1;		/* 3n+1: limb count of each interpolation row */
  n3 = 3 * n;
  n3p1 = n3 + 1;

/* The even-indexed values already live inside pp (see the layout in the
   block comment above); alias them for readability.  */
#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    /* Degree-11 case only: strip the contribution of r0 (the leading
       coefficient, {pp+11n, spt}) from the finite-point rows:
       r3 -= r0; r2 -= r0*2^10; r5 -= r0/2^2; r1 -= r0*2^20; r4 -= r0/2^4.  */
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };	/* NOTE(review): stray ';' — a harmless empty statement */

  /* Strip the contribution of r6 = f(0) = {pp, 2n}:
     r4 -= r6*2^20 (at offset n), r1 -= r6/2^4 (at offset n).  */
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

  /* (r1, r4) <- (r1 + r4, r4 - r1): sum and difference of the +-4 rows.  */
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  /* Same for the +-2 rows: r5 -= r6*2^10, r2 -= r6/2^2.  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

  /* (r2, r5) <- (r2 + r5, r5 - r2).  */
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  /* r3 -= r6 (at offset n).  */
  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

  /* r4 -= 257*r5; the fallback decomposes 257 = 1 + 2^8.  */
#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  /* Restore the two high bits of the top limb when the quotient is
     negative (presumably lost through the division-with-shift; the test
     checks the top three bits) — two's-complement sign fix-up.  */
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

  /* r5 += 60*r4; the fallback decomposes 60 = 2^6 - 2^2.  Then /255.  */
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  /* r2 -= 2^5 * r3.  */
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

  /* r1 -= 100*r2 (100 = 2^6 + 2^5 + 2^2), r1 -= 2^9 * r3, then /42525.  */
#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

  /* r2 -= 225*r1; the fallback decomposes 225 = 2^8 - 2^5 + 1.
     Then divide by 9*4.  */
#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  /* r4 = (r2 - r4) / 2; then r2 -= r4.
     NOTE(review): this uses #ifdef, while the other HAVE_NATIVE tests in
     this file use #if — equivalent only if the macro is never defined to
     0; confirm against the project's config.h conventions.  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  /* r5 = (r5 + r1) / 2.  (Same #ifdef-vs-#if note as above.)  */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r5, r5, r1, n3p1);
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  /* Add r5 (3n+1 limbs) into pp at offset n, one n-limb chunk at a time,
     propagating the carry through the following limbs.  */
  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  /* Add r3 into pp at offset 5n.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r1 into pp at offset 9n; in the degree-10 case (half == 0) only
     spt limbs above pp+10n remain, so the tail is truncated.  */
  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
    371