Home | History | Annotate | Line # | Download | only in generic
mulmid.c revision 1.1
      1  1.1  mrg /* mpn_mulmid -- middle product
      2  1.1  mrg 
      3  1.1  mrg    Contributed by David Harvey.
      4  1.1  mrg 
      5  1.1  mrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
      6  1.1  mrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      7  1.1  mrg    GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      8  1.1  mrg 
      9  1.1  mrg Copyright 2011 Free Software Foundation, Inc.
     10  1.1  mrg 
     11  1.1  mrg This file is part of the GNU MP Library.
     12  1.1  mrg 
     13  1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     14  1.1  mrg it under the terms of the GNU Lesser General Public License as published by
     15  1.1  mrg the Free Software Foundation; either version 3 of the License, or (at your
     16  1.1  mrg option) any later version.
     17  1.1  mrg 
     18  1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     19  1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     20  1.1  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     21  1.1  mrg License for more details.
     22  1.1  mrg 
     23  1.1  mrg You should have received a copy of the GNU Lesser General Public License
     24  1.1  mrg along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     25  1.1  mrg 
     26  1.1  mrg 
     27  1.1  mrg #include "gmp.h"
     28  1.1  mrg #include "gmp-impl.h"
     29  1.1  mrg 
     30  1.1  mrg 
     31  1.1  mrg #define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
     32  1.1  mrg 
     33  1.1  mrg 
     34  1.1  mrg void
     35  1.1  mrg mpn_mulmid (mp_ptr rp,
     36  1.1  mrg             mp_srcptr ap, mp_size_t an,
     37  1.1  mrg             mp_srcptr bp, mp_size_t bn)
     38  1.1  mrg {
     39  1.1  mrg   mp_size_t rn, k;
     40  1.1  mrg   mp_ptr scratch, temp;
     41  1.1  mrg 
     42  1.1  mrg   ASSERT (an >= bn);
     43  1.1  mrg   ASSERT (bn >= 1);
     44  1.1  mrg   ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
     45  1.1  mrg   ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
     46  1.1  mrg 
     47  1.1  mrg   if (bn < MULMID_TOOM42_THRESHOLD)
     48  1.1  mrg     {
     49  1.1  mrg       /* region not tall enough to make toom42 worthwhile for any portion */
     50  1.1  mrg 
     51  1.1  mrg       if (an < CHUNK)
     52  1.1  mrg 	{
     53  1.1  mrg 	  /* region not too wide either, just call basecase directly */
     54  1.1  mrg 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
     55  1.1  mrg 	  return;
     56  1.1  mrg 	}
     57  1.1  mrg 
     58  1.1  mrg       /* Region quite wide. For better locality, use basecase on chunks:
     59  1.1  mrg 
     60  1.1  mrg 	 AAABBBCC..
     61  1.1  mrg 	 .AAABBBCC.
     62  1.1  mrg 	 ..AAABBBCC
     63  1.1  mrg       */
     64  1.1  mrg 
     65  1.1  mrg       k = CHUNK - bn + 1;    /* number of diagonals per chunk */
     66  1.1  mrg 
     67  1.1  mrg       /* first chunk (marked A in the above diagram) */
     68  1.1  mrg       mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
     69  1.1  mrg 
     70  1.1  mrg       /* remaining chunks (B, C, etc) */
     71  1.1  mrg       an -= k;
     72  1.1  mrg 
     73  1.1  mrg       while (an >= CHUNK)
     74  1.1  mrg 	{
     75  1.1  mrg 	  mp_limb_t t0, t1, cy;
     76  1.1  mrg 	  ap += k, rp += k;
     77  1.1  mrg 	  t0 = rp[0], t1 = rp[1];
     78  1.1  mrg 	  mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
     79  1.1  mrg 	  ADDC_LIMB (cy, rp[0], rp[0], t0);    /* add back saved limbs */
     80  1.1  mrg 	  MPN_INCR_U (rp + 1, k + 1, t1 + cy);
     81  1.1  mrg 	  an -= k;
     82  1.1  mrg 	}
     83  1.1  mrg 
     84  1.1  mrg       if (an >= bn)
     85  1.1  mrg 	{
     86  1.1  mrg 	  /* last remaining chunk */
     87  1.1  mrg 	  mp_limb_t t0, t1, cy;
     88  1.1  mrg 	  ap += k, rp += k;
     89  1.1  mrg 	  t0 = rp[0], t1 = rp[1];
     90  1.1  mrg 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
     91  1.1  mrg 	  ADDC_LIMB (cy, rp[0], rp[0], t0);
     92  1.1  mrg 	  MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
     93  1.1  mrg 	}
     94  1.1  mrg 
     95  1.1  mrg       return;
     96  1.1  mrg     }
     97  1.1  mrg 
     98  1.1  mrg   /* region is tall enough for toom42 */
     99  1.1  mrg 
    100  1.1  mrg   rn = an - bn + 1;
    101  1.1  mrg 
    102  1.1  mrg   if (rn < MULMID_TOOM42_THRESHOLD)
    103  1.1  mrg     {
    104  1.1  mrg       /* region not wide enough to make toom42 worthwhile for any portion */
    105  1.1  mrg 
    106  1.1  mrg       TMP_DECL;
    107  1.1  mrg 
    108  1.1  mrg       if (bn < CHUNK)
    109  1.1  mrg 	{
    110  1.1  mrg 	  /* region not too tall either, just call basecase directly */
    111  1.1  mrg 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
    112  1.1  mrg 	  return;
    113  1.1  mrg 	}
    114  1.1  mrg 
    115  1.1  mrg       /* Region quite tall. For better locality, use basecase on chunks:
    116  1.1  mrg 
    117  1.1  mrg 	 AAAAA....
    118  1.1  mrg 	 .AAAAA...
    119  1.1  mrg 	 ..BBBBB..
    120  1.1  mrg 	 ...BBBBB.
    121  1.1  mrg 	 ....CCCCC
    122  1.1  mrg       */
    123  1.1  mrg 
    124  1.1  mrg       TMP_MARK;
    125  1.1  mrg 
    126  1.1  mrg       temp = TMP_ALLOC_LIMBS (rn + 2);
    127  1.1  mrg 
    128  1.1  mrg       /* first chunk (marked A in the above diagram) */
    129  1.1  mrg       bp += bn - CHUNK, an -= bn - CHUNK;
    130  1.1  mrg       mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
    131  1.1  mrg 
    132  1.1  mrg       /* remaining chunks (B, C, etc) */
    133  1.1  mrg       bn -= CHUNK;
    134  1.1  mrg 
    135  1.1  mrg       while (bn >= CHUNK)
    136  1.1  mrg 	{
    137  1.1  mrg 	  ap += CHUNK, bp -= CHUNK;
    138  1.1  mrg 	  mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
    139  1.1  mrg 	  mpn_add_n (rp, rp, temp, rn + 2);
    140  1.1  mrg 	  bn -= CHUNK;
    141  1.1  mrg 	}
    142  1.1  mrg 
    143  1.1  mrg       if (bn)
    144  1.1  mrg 	{
    145  1.1  mrg 	  /* last remaining chunk */
    146  1.1  mrg 	  ap += CHUNK, bp -= bn;
    147  1.1  mrg 	  mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
    148  1.1  mrg 	  mpn_add_n (rp, rp, temp, rn + 2);
    149  1.1  mrg 	}
    150  1.1  mrg 
    151  1.1  mrg       TMP_FREE;
    152  1.1  mrg       return;
    153  1.1  mrg     }
    154  1.1  mrg 
    155  1.1  mrg   /* we're definitely going to use toom42 somewhere */
    156  1.1  mrg 
    157  1.1  mrg   if (bn > rn)
    158  1.1  mrg     {
    159  1.1  mrg       /* slice region into chunks, use toom42 on all chunks except possibly
    160  1.1  mrg 	 the last:
    161  1.1  mrg 
    162  1.1  mrg          AA....
    163  1.1  mrg          .AA...
    164  1.1  mrg          ..BB..
    165  1.1  mrg          ...BB.
    166  1.1  mrg          ....CC
    167  1.1  mrg       */
    168  1.1  mrg 
    169  1.1  mrg       TMP_DECL;
    170  1.1  mrg       TMP_MARK;
    171  1.1  mrg 
    172  1.1  mrg       temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
    173  1.1  mrg       scratch = temp + rn + 2;
    174  1.1  mrg 
    175  1.1  mrg       /* first chunk (marked A in the above diagram) */
    176  1.1  mrg       bp += bn - rn;
    177  1.1  mrg       mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
    178  1.1  mrg 
    179  1.1  mrg       /* remaining chunks (B, C, etc) */
    180  1.1  mrg       bn -= rn;
    181  1.1  mrg 
    182  1.1  mrg       while (bn >= rn)
    183  1.1  mrg         {
    184  1.1  mrg           ap += rn, bp -= rn;
    185  1.1  mrg 	  mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
    186  1.1  mrg           mpn_add_n (rp, rp, temp, rn + 2);
    187  1.1  mrg           bn -= rn;
    188  1.1  mrg         }
    189  1.1  mrg 
    190  1.1  mrg       if (bn)
    191  1.1  mrg         {
    192  1.1  mrg           /* last remaining chunk */
    193  1.1  mrg           ap += rn, bp -= bn;
    194  1.1  mrg 	  mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
    195  1.1  mrg           mpn_add_n (rp, rp, temp, rn + 2);
    196  1.1  mrg         }
    197  1.1  mrg 
    198  1.1  mrg       TMP_FREE;
    199  1.1  mrg     }
    200  1.1  mrg   else
    201  1.1  mrg     {
    202  1.1  mrg       /* slice region into chunks, use toom42 on all chunks except possibly
    203  1.1  mrg 	 the last:
    204  1.1  mrg 
    205  1.1  mrg          AAABBBCC..
    206  1.1  mrg          .AAABBBCC.
    207  1.1  mrg          ..AAABBBCC
    208  1.1  mrg       */
    209  1.1  mrg 
    210  1.1  mrg       TMP_DECL;
    211  1.1  mrg       TMP_MARK;
    212  1.1  mrg 
    213  1.1  mrg       scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
    214  1.1  mrg 
    215  1.1  mrg       /* first chunk (marked A in the above diagram) */
    216  1.1  mrg       mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
    217  1.1  mrg 
    218  1.1  mrg       /* remaining chunks (B, C, etc) */
    219  1.1  mrg       rn -= bn;
    220  1.1  mrg 
    221  1.1  mrg       while (rn >= bn)
    222  1.1  mrg         {
    223  1.1  mrg 	  mp_limb_t t0, t1, cy;
    224  1.1  mrg           ap += bn, rp += bn;
    225  1.1  mrg           t0 = rp[0], t1 = rp[1];
    226  1.1  mrg           mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
    227  1.1  mrg 	  ADDC_LIMB (cy, rp[0], rp[0], t0);     /* add back saved limbs */
    228  1.1  mrg 	  MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
    229  1.1  mrg 	  rn -= bn;
    230  1.1  mrg         }
    231  1.1  mrg 
    232  1.1  mrg       TMP_FREE;
    233  1.1  mrg 
    234  1.1  mrg       if (rn)
    235  1.1  mrg         {
    236  1.1  mrg           /* last remaining chunk */
    237  1.1  mrg 	  mp_limb_t t0, t1, cy;
    238  1.1  mrg           ap += bn, rp += bn;
    239  1.1  mrg           t0 = rp[0], t1 = rp[1];
    240  1.1  mrg           mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
    241  1.1  mrg 	  ADDC_LIMB (cy, rp[0], rp[0], t0);
    242  1.1  mrg 	  MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
    243  1.1  mrg         }
    244  1.1  mrg     }
    245  1.1  mrg }
    246