Home | History | Annotate | Line # | Download | only in generic
toom_interpolate_12pts.c revision 1.1.1.3
      1 /* Interpolation for the algorithm Toom-Cook 6.5-way.
      2 
      3    Contributed to the GNU project by Marco Bodrato.
      4 
      5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      8 
      9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
     10 
     11 This file is part of the GNU MP Library.
     12 
     13 The GNU MP Library is free software; you can redistribute it and/or modify
     14 it under the terms of either:
     15 
     16   * the GNU Lesser General Public License as published by the Free
     17     Software Foundation; either version 3 of the License, or (at your
     18     option) any later version.
     19 
     20 or
     21 
     22   * the GNU General Public License as published by the Free Software
     23     Foundation; either version 2 of the License, or (at your option) any
     24     later version.
     25 
     26 or both in parallel, as here.
     27 
     28 The GNU MP Library is distributed in the hope that it will be useful, but
     29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     31 for more details.
     32 
     33 You should have received copies of the GNU General Public License and the
     34 GNU Lesser General Public License along with the GNU MP Library.  If not,
     35 see https://www.gnu.org/licenses/.  */
     36 
     37 
     38 #include "gmp.h"
     39 #include "gmp-impl.h"
     40 
     41 
     42 #if HAVE_NATIVE_mpn_sublsh_n
     43 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
     44 #else
     45 static mp_limb_t
     46 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
     47 {
     48 #if USE_MUL_1 && 0
     49   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
     50 #else
     51   mp_limb_t __cy;
     52   __cy = mpn_lshift(ws,src,n,s);
     53   return    __cy + mpn_sub_n(dst,dst,ws,n);
     54 #endif
     55 }
     56 #endif
     57 
     58 #if HAVE_NATIVE_mpn_addlsh_n
     59 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
     60 #else
     61 static mp_limb_t
     62 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
     63 {
     64 #if USE_MUL_1 && 0
     65   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
     66 #else
     67   mp_limb_t __cy;
     68   __cy = mpn_lshift(ws,src,n,s);
     69   return    __cy + mpn_add_n(dst,dst,ws,n);
     70 #endif
     71 }
     72 #endif
     73 
     74 #if HAVE_NATIVE_mpn_subrsh
     75 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
     76 #else
     77 /* FIXME: This is not a correct definition, it assumes no carry */
     78 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
     79 do {									\
     80   mp_limb_t __cy;							\
     81   MPN_DECR_U (dst, nd, src[0] >> s);					\
     82   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
     83   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
     84 } while (0)
     85 #endif
     86 
     87 
     88 #if GMP_NUMB_BITS < 21
     89 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
     90 #endif
     91 
     92 #if GMP_NUMB_BITS < 16
     93 #error Not implemented: divexact_by42525 needs splitting.
     94 #endif
     95 
     96 #if GMP_NUMB_BITS < 12
     97 #error Not implemented: Hard to adapt...
     98 #endif
     99 
    100 /* FIXME: tuneup should decide the best variant */
    101 #ifndef AORSMUL_FASTER_AORS_AORSLSH
    102 #define AORSMUL_FASTER_AORS_AORSLSH 1
    103 #endif
    104 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
    105 #define AORSMUL_FASTER_AORS_2AORSLSH 1
    106 #endif
    107 #ifndef AORSMUL_FASTER_2AORSLSH
    108 #define AORSMUL_FASTER_2AORSLSH 1
    109 #endif
    110 #ifndef AORSMUL_FASTER_3AORSLSH
    111 #define AORSMUL_FASTER_3AORSLSH 1
    112 #endif
    113 
    114 #define BINVERT_9 \
    115   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
    116 
    117 #define BINVERT_255 \
    118   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
    119 
    120   /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
    121 #if GMP_LIMB_BITS == 32
    122 #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
    123 #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
    124 #else
    125 #if GMP_LIMB_BITS == 64
    126 #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
    127 #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
    128 #endif
    129 #endif
    130 
    131 #ifndef mpn_divexact_by255
    132 #if GMP_NUMB_BITS % 8 == 0
    133 #define mpn_divexact_by255(dst,src,size) \
    134   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
    135 #else
    136 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
    137 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
    138 #else
    139 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
    140 #endif
    141 #endif
    142 #endif
    143 
    144 #ifndef mpn_divexact_by9x4
    145 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
    146 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
    147 #else
    148 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
    149 #endif
    150 #endif
    151 
    152 #ifndef mpn_divexact_by42525
    153 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
    154 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
    155 #else
    156 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
    157 #endif
    158 #endif
    159 
    160 #ifndef mpn_divexact_by2835x4
    161 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
    162 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
    163 #else
    164 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
    165 #endif
    166 #endif
    167 
    168 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
    169    points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
    170    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
    171    degree 11 (or 10), given the 12 (rsp. 11) values:
    172 
    173      r0 = limit at infinity of f(x) / x^7,
    174      r1 = f(4),f(-4),
    175      r2 = f(2),f(-2),
    176      r3 = f(1),f(-1),
    177      r4 = f(1/4),f(-1/4),
    178      r5 = f(1/2),f(-1/2),
    179      r6 = f(0).
    180 
    181    All couples of the form f(n),f(-n) must be already mixed with
    182    toom_couple_handling(f(n),...,f(-n),...)
    183 
    184    The result is stored in {pp, spt + 7*n (or 6*n)}.
    185    At entry, r6 is stored at {pp, 2n},
    186    r4 is stored at {pp + 3n, 3n + 1}.
    187    r2 is stored at {pp + 7n, 3n + 1}.
    188    r0 is stored at {pp +11n, spt}.
    189 
    190    The other values are 3n+1 limbs each (with most significant limbs small).
    191 
    192    Negative intermediate results are stored two-complemented.
    193    Inputs are destroyed.
    194 */
    195 
    196 void
    197 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
    198 			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
    199 {
    200   mp_limb_t cy;
    201   mp_size_t n3;
    202   mp_size_t n3p1;
    203   n3 = 3 * n;
    204   n3p1 = n3 + 1;
    205 
    206 #define   r4    (pp + n3)			/* 3n+1 */
    207 #define   r2    (pp + 7 * n)			/* 3n+1 */
    208 #define   r0    (pp +11 * n)			/* s+t <= 2*n */
    209 
    210   /******************************* interpolation *****************************/
    211   if (half != 0) {
    212     cy = mpn_sub_n (r3, r3, r0, spt);
    213     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
    214 
    215     cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    216     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    217     DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
    218 
    219     cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    220     MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    221     DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
    222   };
    223 
    224   r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
    225   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
    226 
    227 #if HAVE_NATIVE_mpn_add_n_sub_n
    228   mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
    229 #else
    230   ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
    231   mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
    232   MP_PTR_SWAP(r1, wsi);
    233 #endif
    234 
    235   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
    236   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
    237 
    238 #if HAVE_NATIVE_mpn_add_n_sub_n
    239   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
    240 #else
    241   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
    242   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
    243   MP_PTR_SWAP(r5, wsi);
    244 #endif
    245 
    246   r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
    247 
    248 #if AORSMUL_FASTER_AORS_AORSLSH
    249   mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
    250 #else
    251   mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
    252   DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
    253 #endif
    254   /* A division by 2835x4 follows. Warning: the operand can be negative! */
    255   mpn_divexact_by2835x4(r4, r4, n3p1);
    256   if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    257     r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
    258 
    259 #if AORSMUL_FASTER_2AORSLSH
    260   mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
    261 #else
    262   DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
    263   DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
    264 #endif
    265   mpn_divexact_by255(r5, r5, n3p1);
    266 
    267   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
    268 
    269 #if AORSMUL_FASTER_3AORSLSH
    270   ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
    271 #else
    272   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
    273   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
    274   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
    275 #endif
    276   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
    277   mpn_divexact_by42525(r1, r1, n3p1);
    278 
    279 #if AORSMUL_FASTER_AORS_2AORSLSH
    280   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
    281 #else
    282   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
    283   ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
    284   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
    285 #endif
    286   mpn_divexact_by9x4(r2, r2, n3p1);
    287 
    288   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
    289 
    290   mpn_sub_n (r4, r2, r4, n3p1);
    291   ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
    292   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
    293 
    294   mpn_add_n (r5, r5, r1, n3p1);
    295   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
    296 
    297   /* last interpolation steps... */
    298   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
    299   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
    300   /* ... could be mixed with recomposition
    301 	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
    302   */
    303 
    304   /***************************** recomposition *******************************/
    305   /*
    306     pp[] prior to operations:
    307     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
    308 
    309     summation scheme for remaining operations:
    310     |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    311     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
    312 	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
    313   */
    314 
    315   cy = mpn_add_n (pp + n, pp + n, r5, n);
    316   cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
    317 #if HAVE_NATIVE_mpn_add_nc
    318   cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
    319 #else
    320   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
    321   cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
    322 #endif
    323   MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
    324 
    325   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
    326   cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
    327 #if HAVE_NATIVE_mpn_add_nc
    328   cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
    329 #else
    330   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
    331   cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
    332 #endif
    333   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
    334 
    335   pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
    336   if (half) {
    337     cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
    338 #if HAVE_NATIVE_mpn_add_nc
    339     if (LIKELY (spt > n)) {
    340       cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
    341       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    342     } else {
    343       ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    344     }
    345 #else
    346     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    347     if (LIKELY (spt > n)) {
    348       cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
    349       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    350     } else {
    351       ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    352     }
    353 #endif
    354   } else {
    355     ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
    356   }
    357 
    358 #undef   r0
    359 #undef   r2
    360 #undef   r4
    361 }
    362