Home | History | Annotate | Line # | Download | only in generic
toom43_mul.c revision 1.1.1.3
      1      1.1  mrg /* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
      2      1.1  mrg    times as large as bn.  Or more accurately, bn < an < 2 bn.
      3      1.1  mrg 
      4      1.1  mrg    Contributed to the GNU project by Marco Bodrato.
      5      1.1  mrg 
      6      1.1  mrg    The idea of applying toom to unbalanced multiplication is due to Marco
      7      1.1  mrg    Bodrato and Alberto Zanoni.
      8      1.1  mrg 
      9      1.1  mrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
     10      1.1  mrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     11      1.1  mrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
     12      1.1  mrg 
     13      1.1  mrg Copyright 2009 Free Software Foundation, Inc.
     14      1.1  mrg 
     15      1.1  mrg This file is part of the GNU MP Library.
     16      1.1  mrg 
     17      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     18  1.1.1.3  mrg it under the terms of either:
     19  1.1.1.3  mrg 
     20  1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
     21  1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
     22  1.1.1.3  mrg     option) any later version.
     23  1.1.1.3  mrg 
     24  1.1.1.3  mrg or
     25  1.1.1.3  mrg 
     26  1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
     27  1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
     28  1.1.1.3  mrg     later version.
     29  1.1.1.3  mrg 
     30  1.1.1.3  mrg or both in parallel, as here.
     31      1.1  mrg 
     32      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     33      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     34  1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     35  1.1.1.3  mrg for more details.
     36      1.1  mrg 
     37  1.1.1.3  mrg You should have received copies of the GNU General Public License and the
     38  1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     39  1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
     40      1.1  mrg 
     41      1.1  mrg 
     42      1.1  mrg #include "gmp.h"
     43      1.1  mrg #include "gmp-impl.h"
     44      1.1  mrg 
     45      1.1  mrg /* Evaluate in: -2, -1, 0, +1, +2, +inf
     46      1.1  mrg 
     47      1.1  mrg   <-s-><--n--><--n--><--n-->
     48      1.1  mrg    ___ ______ ______ ______
     49      1.1  mrg   |a3_|___a2_|___a1_|___a0_|
     50      1.1  mrg 	|_b2_|___b1_|___b0_|
     51      1.1  mrg 	<-t--><--n--><--n-->
     52      1.1  mrg 
     53      1.1  mrg   v0  =  a0             * b0          #   A(0)*B(0)
     54      1.1  mrg   v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3  bh <= 2
     55      1.1  mrg   vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh|<= 1
     56      1.1  mrg   v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14 bh <= 6
     57      1.1  mrg   vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9 |bh|<= 4
     58      1.1  mrg   vinf=              a3 *         b2  # A(inf)*B(inf)
     59      1.1  mrg */
     60      1.1  mrg 
     61      1.1  mrg void
     62      1.1  mrg mpn_toom43_mul (mp_ptr pp,
     63      1.1  mrg 		mp_srcptr ap, mp_size_t an,
     64      1.1  mrg 		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
     65      1.1  mrg {
     66      1.1  mrg   mp_size_t n, s, t;
     67      1.1  mrg   enum toom6_flags flags;
     68      1.1  mrg   mp_limb_t cy;
     69      1.1  mrg 
     70      1.1  mrg #define a0  ap
     71      1.1  mrg #define a1  (ap + n)
     72      1.1  mrg #define a2  (ap + 2 * n)
     73      1.1  mrg #define a3  (ap + 3 * n)
     74      1.1  mrg #define b0  bp
     75      1.1  mrg #define b1  (bp + n)
     76      1.1  mrg #define b2  (bp + 2 * n)
     77      1.1  mrg 
     78      1.1  mrg   n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
     79      1.1  mrg 
     80      1.1  mrg   s = an - 3 * n;
     81      1.1  mrg   t = bn - 2 * n;
     82      1.1  mrg 
     83      1.1  mrg   ASSERT (0 < s && s <= n);
     84      1.1  mrg   ASSERT (0 < t && t <= n);
     85      1.1  mrg 
     86      1.1  mrg   /* This is true whenever an >= 25 or bn >= 19, I think. It
     87      1.1  mrg      guarantees that we can fit 5 values of size n+1 in the product
     88      1.1  mrg      area. */
     89      1.1  mrg   ASSERT (s+t >= 5);
     90      1.1  mrg 
     91      1.1  mrg #define v0    pp				/* 2n */
     92      1.1  mrg #define vm1   (scratch)				/* 2n+1 */
     93      1.1  mrg #define v1    (pp + 2*n)			/* 2n+1 */
     94      1.1  mrg #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
     95      1.1  mrg #define v2    (scratch + 4 * n + 2)		/* 2n+1 */
     96      1.1  mrg #define vinf  (pp + 5 * n)			/* s+t */
     97      1.1  mrg #define bs1    pp				/* n+1 */
     98      1.1  mrg #define bsm1  (scratch + 2 * n + 2)		/* n+1 */
     99      1.1  mrg #define asm1  (scratch + 3 * n + 3)		/* n+1 */
    100      1.1  mrg #define asm2  (scratch + 4 * n + 4)		/* n+1 */
    101      1.1  mrg #define bsm2  (pp + n + 1)			/* n+1 */
    102      1.1  mrg #define bs2   (pp + 2 * n + 2)			/* n+1 */
    103      1.1  mrg #define as2   (pp + 3 * n + 3)			/* n+1 */
    104      1.1  mrg #define as1   (pp + 4 * n + 4)			/* n+1 */
    105      1.1  mrg 
    106      1.1  mrg   /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
    107      1.1  mrg      limb, because products will overwrite 2n+2 limbs. */
    108      1.1  mrg 
    109      1.1  mrg #define a0a2  scratch
    110      1.1  mrg #define b0b2  scratch
    111      1.1  mrg #define a1a3  asm1
    112      1.1  mrg #define b1d   bsm1
    113      1.1  mrg 
    114      1.1  mrg   /* Compute as2 and asm2.  */
    115  1.1.1.2  mrg   flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
    116      1.1  mrg 
    117      1.1  mrg   /* Compute bs2 and bsm2.  */
    118      1.1  mrg   b1d[n] = mpn_lshift (b1d, b1, n, 1);			/*       2b1      */
    119      1.1  mrg   cy  = mpn_lshift (b0b2, b2, t, 2);			/*  4b2           */
    120      1.1  mrg   cy += mpn_add_n (b0b2, b0b2, b0, t);			/*  4b2      + b0 */
    121      1.1  mrg   if (t != n)
    122      1.1  mrg     cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
    123      1.1  mrg   b0b2[n] = cy;
    124      1.1  mrg 
    125      1.1  mrg #if HAVE_NATIVE_mpn_add_n_sub_n
    126      1.1  mrg   if (mpn_cmp (b0b2, b1d, n+1) < 0)
    127      1.1  mrg     {
    128      1.1  mrg       mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
    129  1.1.1.2  mrg       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
    130      1.1  mrg     }
    131      1.1  mrg   else
    132      1.1  mrg     {
    133      1.1  mrg       mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
    134      1.1  mrg     }
    135      1.1  mrg #else
    136      1.1  mrg   mpn_add_n (bs2, b0b2, b1d, n+1);
    137      1.1  mrg   if (mpn_cmp (b0b2, b1d, n+1) < 0)
    138      1.1  mrg     {
    139      1.1  mrg       mpn_sub_n (bsm2, b1d, b0b2, n+1);
    140  1.1.1.2  mrg       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
    141      1.1  mrg     }
    142      1.1  mrg   else
    143      1.1  mrg     {
    144      1.1  mrg       mpn_sub_n (bsm2, b0b2, b1d, n+1);
    145      1.1  mrg     }
    146      1.1  mrg #endif
    147      1.1  mrg 
    148      1.1  mrg   /* Compute as1 and asm1.  */
    149  1.1.1.3  mrg   flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)));
    150      1.1  mrg 
    151      1.1  mrg   /* Compute bs1 and bsm1.  */
    152      1.1  mrg   bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
    153      1.1  mrg #if HAVE_NATIVE_mpn_add_n_sub_n
    154      1.1  mrg   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    155      1.1  mrg     {
    156      1.1  mrg       cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
    157      1.1  mrg       bs1[n] = cy >> 1;
    158  1.1.1.2  mrg       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
    159      1.1  mrg     }
    160      1.1  mrg   else
    161      1.1  mrg     {
    162      1.1  mrg       cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
    163      1.1  mrg       bs1[n] = bsm1[n] + (cy >> 1);
    164      1.1  mrg       bsm1[n]-= cy & 1;
    165      1.1  mrg     }
    166      1.1  mrg #else
    167      1.1  mrg   bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
    168      1.1  mrg   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    169      1.1  mrg     {
    170      1.1  mrg       mpn_sub_n (bsm1, b1, bsm1, n);
    171  1.1.1.2  mrg       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
    172      1.1  mrg     }
    173      1.1  mrg   else
    174      1.1  mrg     {
    175      1.1  mrg       bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
    176      1.1  mrg     }
    177      1.1  mrg #endif
    178      1.1  mrg 
    179      1.1  mrg   ASSERT (as1[n] <= 3);
    180      1.1  mrg   ASSERT (bs1[n] <= 2);
    181      1.1  mrg   ASSERT (asm1[n] <= 1);
    182      1.1  mrg   ASSERT (bsm1[n] <= 1);
    183      1.1  mrg   ASSERT (as2[n] <=14);
    184      1.1  mrg   ASSERT (bs2[n] <= 6);
    185      1.1  mrg   ASSERT (asm2[n] <= 9);
    186      1.1  mrg   ASSERT (bsm2[n] <= 4);
    187      1.1  mrg 
    188      1.1  mrg   /* vm1, 2n+1 limbs */
    189      1.1  mrg   mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */
    190      1.1  mrg 
    191      1.1  mrg   /* vm2, 2n+1 limbs */
    192      1.1  mrg   mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
    193      1.1  mrg 
    194      1.1  mrg   /* v2, 2n+1 limbs */
    195      1.1  mrg   mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
    196      1.1  mrg 
    197      1.1  mrg   /* v1, 2n+1 limbs */
    198      1.1  mrg   mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
    199      1.1  mrg 
    200      1.1  mrg   /* vinf, s+t limbs */   /* W0 */
    201      1.1  mrg   if (s > t)  mpn_mul (vinf, a3, s, b2, t);
    202      1.1  mrg   else        mpn_mul (vinf, b2, t, a3, s);
    203      1.1  mrg 
    204      1.1  mrg   /* v0, 2n limbs */
    205      1.1  mrg   mpn_mul_n (v0, ap, bp, n);  /* W5 */
    206      1.1  mrg 
    207      1.1  mrg   mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
    208      1.1  mrg 
    209      1.1  mrg #undef v0
    210      1.1  mrg #undef vm1
    211      1.1  mrg #undef v1
    212      1.1  mrg #undef vm2
    213      1.1  mrg #undef v2
    214      1.1  mrg #undef vinf
    215      1.1  mrg #undef bs1
    216      1.1  mrg #undef bs2
    217      1.1  mrg #undef bsm1
    218      1.1  mrg #undef bsm2
    219      1.1  mrg #undef asm1
    220      1.1  mrg #undef asm2
    221      1.1  mrg /* #undef as1 */
    222      1.1  mrg /* #undef as2 */
    223      1.1  mrg #undef a0a2
    224      1.1  mrg #undef b0b2
    225      1.1  mrg #undef a1a3
    226      1.1  mrg #undef b1d
    227      1.1  mrg #undef a0
    228      1.1  mrg #undef a1
    229      1.1  mrg #undef a2
    230      1.1  mrg #undef a3
    231      1.1  mrg #undef b0
    232      1.1  mrg #undef b1
    233      1.1  mrg #undef b2
    234      1.1  mrg }
    235