Home | History | Annotate | Line # | Download | only in tune
tuneup.c revision 1.1.1.2
      1      1.1  mrg /* Create tuned thresholds for various algorithms.
      2      1.1  mrg 
      3  1.1.1.2  mrg Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010,
      4  1.1.1.2  mrg 2011, 2012 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg This file is part of the GNU MP Library.
      7      1.1  mrg 
      8      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
      9      1.1  mrg it under the terms of the GNU Lesser General Public License as published by
     10      1.1  mrg the Free Software Foundation; either version 3 of the License, or (at your
     11      1.1  mrg option) any later version.
     12      1.1  mrg 
     13      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     14      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15      1.1  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16      1.1  mrg License for more details.
     17      1.1  mrg 
     18      1.1  mrg You should have received a copy of the GNU Lesser General Public License
     19      1.1  mrg along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     20      1.1  mrg 
     21      1.1  mrg 
     22      1.1  mrg /* Usage: tuneup [-t] [-t] [-p precision]
     23      1.1  mrg 
     24      1.1  mrg    -t turns on some diagnostic traces, a second -t turns on more traces.
     25      1.1  mrg 
     26      1.1  mrg    Notes:
     27      1.1  mrg 
     28      1.1  mrg    The code here isn't a vision of loveliness, mainly because it's subject
     29      1.1  mrg    to ongoing changes according to new things wanting to be tuned, and
     30      1.1  mrg    practical requirements of systems tested.
     31      1.1  mrg 
     32      1.1  mrg    Sometimes running the program twice produces slightly different results.
     33      1.1  mrg    This is probably because there's so little separating algorithms near
     34      1.1  mrg    their crossover, and on that basis it should make little or no difference
     35      1.1  mrg    to the final speed of the relevant routines, but nothing has been done to
     36      1.1  mrg    check that carefully.
     37      1.1  mrg 
     38      1.1  mrg    Algorithm:
     39      1.1  mrg 
     40      1.1  mrg    The thresholds are determined as follows.  A crossover may not be a
     41      1.1  mrg    single size but rather a range where it oscillates between method A or
     42      1.1  mrg    method B faster.  If the threshold is set making B used where A is faster
     43      1.1  mrg    (or vice versa) that's bad.  Badness is the percentage time lost and
     44      1.1  mrg    total badness is the sum of this over all sizes measured.  The threshold
     45      1.1  mrg    is set to minimize total badness.
     46      1.1  mrg 
     47      1.1  mrg    Suppose, as sizes increase, method B becomes faster than method A.  The
     48      1.1  mrg    effect of the rule is that, as you look at increasing sizes, isolated
     49      1.1  mrg    points where B is faster are ignored, but when it's consistently faster,
     50      1.1  mrg    or faster on balance, then the threshold is set there.  The same result
     51      1.1  mrg    is obtained thinking in the other direction of A becoming faster at
     52      1.1  mrg    smaller sizes.
     53      1.1  mrg 
     54      1.1  mrg    In practice the thresholds tend to be chosen to bring on the next
     55      1.1  mrg    algorithm fairly quickly.
     56      1.1  mrg 
     57      1.1  mrg    This rule is attractive because it's got a basis in reason and is fairly
     58      1.1  mrg    easy to implement, but no work has been done to actually compare it in
     59      1.1  mrg    absolute terms to other possibilities.
     60      1.1  mrg 
     61      1.1  mrg    Implementation:
     62      1.1  mrg 
     63      1.1  mrg    In a normal library build the thresholds are constants.  To tune them
     64      1.1  mrg    selected objects are recompiled with the thresholds as global variables
     65      1.1  mrg    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
     66      1.1  mrg    the end of gmp-impl.h, and rules in tune/Makefile.am.
     67      1.1  mrg 
     68      1.1  mrg    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
     69      1.1  mrg    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
     70      1.1  mrg    level, but recurse into the basecase.
     71      1.1  mrg 
     72      1.1  mrg    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
     73      1.1  mrg    Other routines in turn will make use of both of those.  Naturally the
     74      1.1  mrg    dependants must be tuned first.
     75      1.1  mrg 
     76      1.1  mrg    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
     77      1.1  mrg    just a threshold based on comparing two routines (mpn_divrem_1 and
     78      1.1  mrg    mpn_divexact_1), and no further use of the value determined.
     79      1.1  mrg 
     80      1.1  mrg    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
     81      1.1  mrg    just comparisons between certain routines on representative data.
     82      1.1  mrg 
     83      1.1  mrg    Shortcuts are applied when native (assembler) versions of routines exist.
     84      1.1  mrg    For instance a native mpn_sqr_basecase is assumed to be always faster
     85      1.1  mrg    than mpn_mul_basecase, with no measuring.
     86      1.1  mrg 
     87      1.1  mrg    No attempt is made to tune within assembler routines, for instance
     88      1.1  mrg    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
     89      1.1  mrg    written and tuned all by hand.  Assembler routines that might have hard
     90      1.1  mrg    limits are recompiled though, to make them accept a bigger range of sizes
     91      1.1  mrg    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
     92      1.1  mrg 
     93      1.1  mrg    Limitations:
     94      1.1  mrg 
     95      1.1  mrg    The FFTs aren't subject to the same badness rule as the other thresholds,
     96      1.1  mrg    so each k is probably being brought on a touch early.  This isn't likely
     97      1.1  mrg    to make a difference, and the simpler probing means fewer tests.
     98      1.1  mrg 
     99      1.1  mrg */
    100      1.1  mrg 
    101      1.1  mrg #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
    102      1.1  mrg 
    103      1.1  mrg #include "config.h"
    104      1.1  mrg 
    105      1.1  mrg #include <math.h>
    106      1.1  mrg #include <stdio.h>
    107      1.1  mrg #include <stdlib.h>
    108      1.1  mrg #include <time.h>
    109      1.1  mrg #if HAVE_UNISTD_H
    110      1.1  mrg #include <unistd.h>
    111      1.1  mrg #endif
    112      1.1  mrg 
    113      1.1  mrg #include "gmp.h"
    114      1.1  mrg #include "gmp-impl.h"
    115      1.1  mrg #include "longlong.h"
    116      1.1  mrg 
    117      1.1  mrg #include "tests.h"
    118      1.1  mrg #include "speed.h"
    119      1.1  mrg 
    120      1.1  mrg #if !HAVE_DECL_OPTARG
    121      1.1  mrg extern char *optarg;
    122      1.1  mrg extern int optind, opterr;
    123      1.1  mrg #endif
    124      1.1  mrg 
    125      1.1  mrg 
    126      1.1  mrg #define DEFAULT_MAX_SIZE   1000  /* limbs */
    127      1.1  mrg 
    128      1.1  mrg #if WANT_FFT
    129      1.1  mrg mp_size_t  option_fft_max_size = 50000;  /* limbs */
    130      1.1  mrg #else
    131      1.1  mrg mp_size_t  option_fft_max_size = 0;
    132      1.1  mrg #endif
    133      1.1  mrg int        option_trace = 0;
    134      1.1  mrg int        option_fft_trace = 0;
    135      1.1  mrg struct speed_params  s;
    136      1.1  mrg 
    137      1.1  mrg struct dat_t {
    138      1.1  mrg   mp_size_t  size;
    139      1.1  mrg   double     d;
    140      1.1  mrg } *dat = NULL;
    141      1.1  mrg int  ndat = 0;
    142      1.1  mrg int  allocdat = 0;
    143      1.1  mrg 
    144      1.1  mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
    145      1.1  mrg    case use zero here, which for params.max_size means no limit.  */
    146      1.1  mrg #ifndef TUNE_SQR_TOOM2_MAX
    147      1.1  mrg #define TUNE_SQR_TOOM2_MAX  0
    148      1.1  mrg #endif
    149      1.1  mrg 
    150      1.1  mrg mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
    151      1.1  mrg mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
    152      1.1  mrg mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
    153      1.1  mrg mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
    154      1.1  mrg mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
    155      1.1  mrg mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
    156      1.1  mrg mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
    157      1.1  mrg mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
    158      1.1  mrg mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
    159  1.1.1.2  mrg mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
    160      1.1  mrg mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
    161      1.1  mrg mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
    162      1.1  mrg mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
    163      1.1  mrg mp_size_t  sqr_toom2_threshold
    164      1.1  mrg   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
    165      1.1  mrg mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
    166      1.1  mrg mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
    167      1.1  mrg mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
    168      1.1  mrg mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
    169      1.1  mrg mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
    170      1.1  mrg mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
    171      1.1  mrg mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
    172      1.1  mrg mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
    173      1.1  mrg mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
    174  1.1.1.2  mrg mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
    175      1.1  mrg mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
    176      1.1  mrg mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
    177  1.1.1.2  mrg mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
    178      1.1  mrg mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
    179      1.1  mrg mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
    180      1.1  mrg mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
    181      1.1  mrg mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
    182      1.1  mrg mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
    183      1.1  mrg mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
    184      1.1  mrg mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    185      1.1  mrg mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
    186      1.1  mrg mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    187      1.1  mrg mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
    188      1.1  mrg mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
    189      1.1  mrg mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
    190      1.1  mrg mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
    191      1.1  mrg mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
    192      1.1  mrg mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
    193      1.1  mrg mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
    194      1.1  mrg mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
    195      1.1  mrg mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
    196      1.1  mrg mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
    197  1.1.1.2  mrg mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
    198  1.1.1.2  mrg mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
    199      1.1  mrg mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
    200      1.1  mrg mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
    201      1.1  mrg mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
    202      1.1  mrg mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
    203      1.1  mrg mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
    204      1.1  mrg mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
    205  1.1.1.2  mrg int	   mod_1_1p_method		= 0;
    206      1.1  mrg mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    207      1.1  mrg mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    208      1.1  mrg mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
    209      1.1  mrg mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
    210      1.1  mrg mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
    211      1.1  mrg mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
    212      1.1  mrg mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
    213      1.1  mrg mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
    214      1.1  mrg mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
    215      1.1  mrg mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
    216  1.1.1.2  mrg mp_size_t  fac_odd_threshold            = 0;
    217  1.1.1.2  mrg mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
    218      1.1  mrg 
    219      1.1  mrg mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
    220      1.1  mrg mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
    221      1.1  mrg 
    222      1.1  mrg struct param_t {
    223      1.1  mrg   const char        *name;
    224      1.1  mrg   speed_function_t  function;
    225      1.1  mrg   speed_function_t  function2;
    226      1.1  mrg   double            step_factor;    /* how much to step relatively */
    227      1.1  mrg   int               step;           /* how much to step absolutely */
    228      1.1  mrg   double            function_fudge; /* multiplier for "function" speeds */
    229      1.1  mrg   int               stop_since_change;
    230      1.1  mrg   double            stop_factor;
    231      1.1  mrg   mp_size_t         min_size;
    232      1.1  mrg   int               min_is_always;
    233      1.1  mrg   mp_size_t         max_size;
    234      1.1  mrg   mp_size_t         check_size;
    235      1.1  mrg   mp_size_t         size_extra;
    236      1.1  mrg 
    237      1.1  mrg #define DATA_HIGH_LT_R  1
    238      1.1  mrg #define DATA_HIGH_GE_R  2
    239      1.1  mrg   int               data_high;
    240      1.1  mrg 
    241      1.1  mrg   int               noprint;
    242      1.1  mrg };
    243      1.1  mrg 
    244      1.1  mrg 
    245      1.1  mrg /* These are normally undefined when false, which suits "#if" fine.
    246      1.1  mrg    But give them zero values so they can be used in plain C "if"s.  */
    247      1.1  mrg #ifndef UDIV_PREINV_ALWAYS
    248      1.1  mrg #define UDIV_PREINV_ALWAYS 0
    249      1.1  mrg #endif
    250      1.1  mrg #ifndef HAVE_NATIVE_mpn_divexact_1
    251      1.1  mrg #define HAVE_NATIVE_mpn_divexact_1 0
    252      1.1  mrg #endif
    253      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_1
    254      1.1  mrg #define HAVE_NATIVE_mpn_divrem_1 0
    255      1.1  mrg #endif
    256      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_2
    257      1.1  mrg #define HAVE_NATIVE_mpn_divrem_2 0
    258      1.1  mrg #endif
    259      1.1  mrg #ifndef HAVE_NATIVE_mpn_mod_1
    260      1.1  mrg #define HAVE_NATIVE_mpn_mod_1 0
    261      1.1  mrg #endif
    262  1.1.1.2  mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p
    263  1.1.1.2  mrg #define HAVE_NATIVE_mpn_mod_1_1p 0
    264  1.1.1.2  mrg #endif
    265      1.1  mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
    266      1.1  mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
    267      1.1  mrg #endif
    268      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
    269      1.1  mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
    270      1.1  mrg #endif
    271      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
    272      1.1  mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
    273      1.1  mrg #endif
    274      1.1  mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
    275      1.1  mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
    276      1.1  mrg #endif
    277      1.1  mrg 
    278      1.1  mrg 
    279      1.1  mrg #define MAX3(a,b,c)  MAX (MAX (a, b), c)
    280      1.1  mrg 
    281      1.1  mrg mp_limb_t
    282      1.1  mrg randlimb_norm (void)
    283      1.1  mrg {
    284      1.1  mrg   mp_limb_t  n;
    285      1.1  mrg   mpn_random (&n, 1);
    286      1.1  mrg   n |= GMP_NUMB_HIGHBIT;
    287      1.1  mrg   return n;
    288      1.1  mrg }
    289      1.1  mrg 
    290      1.1  mrg #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
    291      1.1  mrg 
    292      1.1  mrg mp_limb_t
    293      1.1  mrg randlimb_half (void)
    294      1.1  mrg {
    295      1.1  mrg   mp_limb_t  n;
    296      1.1  mrg   mpn_random (&n, 1);
    297      1.1  mrg   n &= GMP_NUMB_HALFMASK;
    298      1.1  mrg   n += (n==0);
    299      1.1  mrg   return n;
    300      1.1  mrg }
    301      1.1  mrg 
    302      1.1  mrg 
    303      1.1  mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    304      1.1  mrg    if necessary.  */
    305      1.1  mrg void
    306      1.1  mrg add_dat (mp_size_t size, double d)
    307      1.1  mrg {
    308      1.1  mrg #define ALLOCDAT_STEP  500
    309      1.1  mrg 
    310      1.1  mrg   ASSERT_ALWAYS (ndat <= allocdat);
    311      1.1  mrg 
    312      1.1  mrg   if (ndat == allocdat)
    313      1.1  mrg     {
    314      1.1  mrg       dat = (struct dat_t *) __gmp_allocate_or_reallocate
    315      1.1  mrg         (dat, allocdat * sizeof(dat[0]),
    316      1.1  mrg          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
    317      1.1  mrg       allocdat += ALLOCDAT_STEP;
    318      1.1  mrg     }
    319      1.1  mrg 
    320      1.1  mrg   dat[ndat].size = size;
    321      1.1  mrg   dat[ndat].d = d;
    322      1.1  mrg   ndat++;
    323      1.1  mrg }
    324      1.1  mrg 
    325      1.1  mrg 
    326      1.1  mrg /* Return the threshold size based on the data accumulated. */
    327      1.1  mrg mp_size_t
    328      1.1  mrg analyze_dat (int final)
    329      1.1  mrg {
    330      1.1  mrg   double  x, min_x;
    331      1.1  mrg   int     j, min_j;
    332      1.1  mrg 
    333      1.1  mrg   /* If the threshold is set at dat[0].size, any positive values are bad. */
    334      1.1  mrg   x = 0.0;
    335      1.1  mrg   for (j = 0; j < ndat; j++)
    336      1.1  mrg     if (dat[j].d > 0.0)
    337      1.1  mrg       x += dat[j].d;
    338      1.1  mrg 
    339      1.1  mrg   if (option_trace >= 2 && final)
    340      1.1  mrg     {
    341      1.1  mrg       printf ("\n");
    342      1.1  mrg       printf ("x is the sum of the badness from setting thresh at given size\n");
    343      1.1  mrg       printf ("  (minimum x is sought)\n");
    344      1.1  mrg       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
    345      1.1  mrg     }
    346      1.1  mrg 
    347      1.1  mrg   min_x = x;
    348      1.1  mrg   min_j = 0;
    349      1.1  mrg 
    350      1.1  mrg 
    351      1.1  mrg   /* When stepping to the next dat[j].size, positive values are no longer
    352      1.1  mrg      bad (so subtracted), negative values become bad (so add the absolute
    353      1.1  mrg      value, meaning subtract). */
    354      1.1  mrg   for (j = 0; j < ndat; x -= dat[j].d, j++)
    355      1.1  mrg     {
    356      1.1  mrg       if (option_trace >= 2 && final)
    357      1.1  mrg         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
    358      1.1  mrg 
    359      1.1  mrg       if (x < min_x)
    360      1.1  mrg         {
    361      1.1  mrg           min_x = x;
    362      1.1  mrg           min_j = j;
    363      1.1  mrg         }
    364      1.1  mrg     }
    365      1.1  mrg 
    366      1.1  mrg   return min_j;
    367      1.1  mrg }
    368      1.1  mrg 
    369      1.1  mrg 
    370  1.1.1.2  mrg /* Measuring for recompiled mpn/generic/divrem_1.c, mpn/generic/mod_1.c
    371  1.1.1.2  mrg  * and mpz/fac_ui.c */
    372      1.1  mrg 
    373  1.1.1.2  mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
    374  1.1.1.2  mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
    375  1.1.1.2  mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long);
    376      1.1  mrg 
    377      1.1  mrg double
    378      1.1  mrg speed_mpn_mod_1_tune (struct speed_params *s)
    379      1.1  mrg {
    380      1.1  mrg   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
    381      1.1  mrg }
    382      1.1  mrg double
    383      1.1  mrg speed_mpn_divrem_1_tune (struct speed_params *s)
    384      1.1  mrg {
    385      1.1  mrg   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
    386      1.1  mrg }
    387  1.1.1.2  mrg double
    388  1.1.1.2  mrg speed_mpz_fac_ui_tune (struct speed_params *s)
    389  1.1.1.2  mrg {
    390  1.1.1.2  mrg   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
    391  1.1.1.2  mrg }
    392      1.1  mrg 
    393      1.1  mrg 
    394      1.1  mrg double
    395      1.1  mrg tuneup_measure (speed_function_t fun,
    396      1.1  mrg                 const struct param_t *param,
    397      1.1  mrg                 struct speed_params *s)
    398      1.1  mrg {
    399      1.1  mrg   static struct param_t  dummy;
    400      1.1  mrg   double   t;
    401      1.1  mrg   TMP_DECL;
    402      1.1  mrg 
    403      1.1  mrg   if (! param)
    404      1.1  mrg     param = &dummy;
    405      1.1  mrg 
    406      1.1  mrg   s->size += param->size_extra;
    407      1.1  mrg 
    408      1.1  mrg   TMP_MARK;
    409      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
    410      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
    411      1.1  mrg 
    412      1.1  mrg   mpn_random (s->xp, s->size);
    413      1.1  mrg   mpn_random (s->yp, s->size);
    414      1.1  mrg 
    415      1.1  mrg   switch (param->data_high) {
    416      1.1  mrg   case DATA_HIGH_LT_R:
    417      1.1  mrg     s->xp[s->size-1] %= s->r;
    418      1.1  mrg     s->yp[s->size-1] %= s->r;
    419      1.1  mrg     break;
    420      1.1  mrg   case DATA_HIGH_GE_R:
    421      1.1  mrg     s->xp[s->size-1] |= s->r;
    422      1.1  mrg     s->yp[s->size-1] |= s->r;
    423      1.1  mrg     break;
    424      1.1  mrg   }
    425      1.1  mrg 
    426      1.1  mrg   t = speed_measure (fun, s);
    427      1.1  mrg 
    428      1.1  mrg   s->size -= param->size_extra;
    429      1.1  mrg 
    430      1.1  mrg   TMP_FREE;
    431      1.1  mrg   return t;
    432      1.1  mrg }
    433      1.1  mrg 
    434      1.1  mrg 
    435      1.1  mrg #define PRINT_WIDTH  31
    436      1.1  mrg 
    437      1.1  mrg void
    438      1.1  mrg print_define_start (const char *name)
    439      1.1  mrg {
    440      1.1  mrg   printf ("#define %-*s  ", PRINT_WIDTH, name);
    441      1.1  mrg   if (option_trace)
    442      1.1  mrg     printf ("...\n");
    443      1.1  mrg }
    444      1.1  mrg 
    445      1.1  mrg void
    446      1.1  mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
    447      1.1  mrg {
    448      1.1  mrg   if (option_trace)
    449      1.1  mrg     printf ("#define %-*s  ", PRINT_WIDTH, name);
    450      1.1  mrg 
    451      1.1  mrg   if (value == MP_SIZE_T_MAX)
    452      1.1  mrg     printf ("MP_SIZE_T_MAX");
    453      1.1  mrg   else
    454      1.1  mrg     printf ("%5ld", (long) value);
    455      1.1  mrg 
    456      1.1  mrg   if (remark != NULL)
    457      1.1  mrg     printf ("  /* %s */", remark);
    458      1.1  mrg   printf ("\n");
    459      1.1  mrg   fflush (stdout);
    460      1.1  mrg }
    461      1.1  mrg 
    462      1.1  mrg void
    463      1.1  mrg print_define_end (const char *name, mp_size_t value)
    464      1.1  mrg {
    465      1.1  mrg   const char  *remark;
    466      1.1  mrg   if (value == MP_SIZE_T_MAX)
    467      1.1  mrg     remark = "never";
    468      1.1  mrg   else if (value == 0)
    469      1.1  mrg     remark = "always";
    470      1.1  mrg   else
    471      1.1  mrg     remark = NULL;
    472      1.1  mrg   print_define_end_remark (name, value, remark);
    473      1.1  mrg }
    474      1.1  mrg 
    475      1.1  mrg void
    476      1.1  mrg print_define (const char *name, mp_size_t value)
    477      1.1  mrg {
    478      1.1  mrg   print_define_start (name);
    479      1.1  mrg   print_define_end (name, value);
    480      1.1  mrg }
    481      1.1  mrg 
    482      1.1  mrg void
    483      1.1  mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
    484      1.1  mrg {
    485      1.1  mrg   print_define_start (name);
    486      1.1  mrg   print_define_end_remark (name, value, remark);
    487      1.1  mrg }
    488      1.1  mrg 
    489      1.1  mrg 
    490      1.1  mrg void
    491      1.1  mrg one (mp_size_t *threshold, struct param_t *param)
    492      1.1  mrg {
    493      1.1  mrg   int  since_positive, since_thresh_change;
    494      1.1  mrg   int  thresh_idx, new_thresh_idx;
    495      1.1  mrg 
    496      1.1  mrg #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
    497      1.1  mrg 
    498      1.1  mrg   DEFAULT (param->function_fudge, 1.0);
    499      1.1  mrg   DEFAULT (param->function2, param->function);
    500      1.1  mrg   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
    501      1.1  mrg   DEFAULT (param->step, 1);            /* small steps by default */
    502      1.1  mrg   DEFAULT (param->stop_since_change, 80);
    503      1.1  mrg   DEFAULT (param->stop_factor, 1.2);
    504      1.1  mrg   DEFAULT (param->min_size, 10);
    505      1.1  mrg   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
    506      1.1  mrg 
    507      1.1  mrg   if (param->check_size != 0)
    508      1.1  mrg     {
    509      1.1  mrg       double   t1, t2;
    510      1.1  mrg       s.size = param->check_size;
    511      1.1  mrg 
    512      1.1  mrg       *threshold = s.size+1;
    513      1.1  mrg       t1 = tuneup_measure (param->function, param, &s);
    514      1.1  mrg 
    515      1.1  mrg       *threshold = s.size;
    516      1.1  mrg       t2 = tuneup_measure (param->function2, param, &s);
    517      1.1  mrg       if (t1 == -1.0 || t2 == -1.0)
    518      1.1  mrg         {
    519      1.1  mrg           printf ("Oops, can't run both functions at size %ld\n",
    520      1.1  mrg                   (long) s.size);
    521      1.1  mrg           abort ();
    522      1.1  mrg         }
    523      1.1  mrg       t1 *= param->function_fudge;
    524      1.1  mrg 
    525      1.1  mrg       /* ask that t2 is at least 4% below t1 */
    526      1.1  mrg       if (t1 < t2*1.04)
    527      1.1  mrg         {
    528      1.1  mrg           if (option_trace)
    529      1.1  mrg             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
    530      1.1  mrg           *threshold = MP_SIZE_T_MAX;
    531      1.1  mrg           if (! param->noprint)
    532      1.1  mrg             print_define (param->name, *threshold);
    533      1.1  mrg           return;
    534      1.1  mrg         }
    535      1.1  mrg 
    536      1.1  mrg       if (option_trace >= 2)
    537      1.1  mrg         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
    538      1.1  mrg                 (long) s.size, t1, t2);
    539      1.1  mrg     }
    540      1.1  mrg 
    541      1.1  mrg   if (! param->noprint || option_trace)
    542      1.1  mrg     print_define_start (param->name);
    543      1.1  mrg 
    544      1.1  mrg   ndat = 0;
    545      1.1  mrg   since_positive = 0;
    546      1.1  mrg   since_thresh_change = 0;
    547      1.1  mrg   thresh_idx = 0;
    548      1.1  mrg 
    549      1.1  mrg   if (option_trace >= 2)
    550      1.1  mrg     {
    551      1.1  mrg       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
    552      1.1  mrg       printf ("              (seconds)    (seconds)    diff    thresh\n");
    553      1.1  mrg     }
    554      1.1  mrg 
    555      1.1  mrg   for (s.size = param->min_size;
    556      1.1  mrg        s.size < param->max_size;
    557      1.1  mrg        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
    558      1.1  mrg     {
    559      1.1  mrg       double   ti, tiplus1, d;
    560      1.1  mrg 
    561      1.1  mrg       /*
    562      1.1  mrg         FIXME: check minimum size requirements are met, possibly by just
    563      1.1  mrg         checking for the -1 returns from the speed functions.
    564      1.1  mrg       */
    565      1.1  mrg 
    566      1.1  mrg       /* using method A at this size */
    567      1.1  mrg       *threshold = s.size+1;
    568      1.1  mrg       ti = tuneup_measure (param->function, param, &s);
    569      1.1  mrg       if (ti == -1.0)
    570      1.1  mrg         abort ();
    571      1.1  mrg       ti *= param->function_fudge;
    572      1.1  mrg 
    573      1.1  mrg       /* using method B at this size */
    574      1.1  mrg       *threshold = s.size;
    575      1.1  mrg       tiplus1 = tuneup_measure (param->function2, param, &s);
    576      1.1  mrg       if (tiplus1 == -1.0)
    577      1.1  mrg         abort ();
    578      1.1  mrg 
    579      1.1  mrg       /* Calculate the fraction by which the one or the other routine is
    580      1.1  mrg          slower.  */
    581      1.1  mrg       if (tiplus1 >= ti)
    582      1.1  mrg         d = (tiplus1 - ti) / tiplus1;  /* negative */
    583      1.1  mrg       else
    584      1.1  mrg         d = (tiplus1 - ti) / ti;       /* positive */
    585      1.1  mrg 
    586      1.1  mrg       add_dat (s.size, d);
    587      1.1  mrg 
    588      1.1  mrg       new_thresh_idx = analyze_dat (0);
    589      1.1  mrg 
    590      1.1  mrg       if (option_trace >= 2)
    591      1.1  mrg         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
    592      1.1  mrg                 (long) s.size, ti, tiplus1, d,
    593      1.1  mrg                 ti > tiplus1 ? '#' : ' ',
    594      1.1  mrg                 (long) dat[new_thresh_idx].size);
    595      1.1  mrg 
    596      1.1  mrg       /* Stop if the last time method i was faster was more than a
    597      1.1  mrg          certain number of measurements ago.  */
    598      1.1  mrg #define STOP_SINCE_POSITIVE  200
    599      1.1  mrg       if (d >= 0)
    600      1.1  mrg         since_positive = 0;
    601      1.1  mrg       else
    602      1.1  mrg         if (++since_positive > STOP_SINCE_POSITIVE)
    603      1.1  mrg           {
    604      1.1  mrg             if (option_trace >= 1)
    605      1.1  mrg               printf ("stopped due to since_positive (%d)\n",
    606      1.1  mrg                       STOP_SINCE_POSITIVE);
    607      1.1  mrg             break;
    608      1.1  mrg           }
    609      1.1  mrg 
    610      1.1  mrg       /* Stop if method A has become slower by a certain factor. */
    611      1.1  mrg       if (ti >= tiplus1 * param->stop_factor)
    612      1.1  mrg         {
    613      1.1  mrg           if (option_trace >= 1)
    614      1.1  mrg             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
    615      1.1  mrg                     param->stop_factor);
    616      1.1  mrg           break;
    617      1.1  mrg         }
    618      1.1  mrg 
    619      1.1  mrg       /* Stop if the threshold implied hasn't changed in a certain
    620      1.1  mrg          number of measurements.  (It's this condition that usually
    621      1.1  mrg          stops the loop.) */
    622      1.1  mrg       if (thresh_idx != new_thresh_idx)
    623      1.1  mrg         since_thresh_change = 0, thresh_idx = new_thresh_idx;
    624      1.1  mrg       else
    625      1.1  mrg         if (++since_thresh_change > param->stop_since_change)
    626      1.1  mrg           {
    627      1.1  mrg             if (option_trace >= 1)
    628      1.1  mrg               printf ("stopped due to since_thresh_change (%d)\n",
    629      1.1  mrg                       param->stop_since_change);
    630      1.1  mrg             break;
    631      1.1  mrg           }
    632      1.1  mrg 
    633      1.1  mrg       /* Stop if the threshold implied is more than a certain number of
    634      1.1  mrg          measurements ago.  */
    635      1.1  mrg #define STOP_SINCE_AFTER   500
    636      1.1  mrg       if (ndat - thresh_idx > STOP_SINCE_AFTER)
    637      1.1  mrg         {
    638      1.1  mrg           if (option_trace >= 1)
    639      1.1  mrg             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
    640      1.1  mrg                     STOP_SINCE_AFTER);
    641      1.1  mrg           break;
    642      1.1  mrg         }
    643      1.1  mrg 
    644      1.1  mrg       /* Stop when the size limit is reached before the end of the
    645      1.1  mrg          crossover, but only show this as an error for >= the default max
    646      1.1  mrg          size.  FIXME: Maybe should make it a param choice whether this is
    647      1.1  mrg          an error.  */
    648      1.1  mrg       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
    649      1.1  mrg         {
    650      1.1  mrg           fprintf (stderr, "%s\n", param->name);
    651      1.1  mrg           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
    652      1.1  mrg                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    653      1.1  mrg           fprintf (stderr, "    max size reached before end of crossover\n");
    654      1.1  mrg           break;
    655      1.1  mrg         }
    656      1.1  mrg     }
    657      1.1  mrg 
    658      1.1  mrg   if (option_trace >= 1)
    659      1.1  mrg     printf ("sizes %ld to %ld total %d measurements\n",
    660      1.1  mrg             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    661      1.1  mrg 
    662      1.1  mrg   *threshold = dat[analyze_dat (1)].size;
    663      1.1  mrg 
    664      1.1  mrg   if (param->min_is_always)
    665      1.1  mrg     {
    666      1.1  mrg       if (*threshold == param->min_size)
    667      1.1  mrg         *threshold = 0;
    668      1.1  mrg     }
    669      1.1  mrg 
    670      1.1  mrg   if (! param->noprint || option_trace)
    671      1.1  mrg     print_define_end (param->name, *threshold);
    672      1.1  mrg }
    673      1.1  mrg 
    674      1.1  mrg 
    675      1.1  mrg /* Special probing for the fft thresholds.  The size restrictions on the
    676      1.1  mrg    FFTs mean the graph of time vs size has a step effect.  See this for
    677      1.1  mrg    example using
    678      1.1  mrg 
    679      1.1  mrg        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
    680      1.1  mrg        gnuplot foo.gnuplot
    681      1.1  mrg 
    682      1.1  mrg    The current approach is to compare routines at the midpoint of relevant
    683      1.1  mrg    steps.  Arguably a more sophisticated system of threshold data is wanted
    684      1.1  mrg    if this step effect remains. */
    685      1.1  mrg 
    686      1.1  mrg struct fft_param_t {
    687      1.1  mrg   const char        *table_name;
    688      1.1  mrg   const char        *threshold_name;
    689      1.1  mrg   const char        *modf_threshold_name;
    690      1.1  mrg   mp_size_t         *p_threshold;
    691      1.1  mrg   mp_size_t         *p_modf_threshold;
    692      1.1  mrg   mp_size_t         first_size;
    693      1.1  mrg   mp_size_t         max_size;
    694      1.1  mrg   speed_function_t  function;
    695      1.1  mrg   speed_function_t  mul_modf_function;
    696      1.1  mrg   speed_function_t  mul_function;
    697      1.1  mrg   mp_size_t         sqr;
    698      1.1  mrg };
    699      1.1  mrg 
    700      1.1  mrg 
    701      1.1  mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    702      1.1  mrg    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    703      1.1  mrg    of 2^(k-1) bits. */
    704      1.1  mrg 
    705      1.1  mrg mp_size_t
    706      1.1  mrg fft_step_size (int k)
    707      1.1  mrg {
    708      1.1  mrg   mp_size_t  step;
    709      1.1  mrg 
    710      1.1  mrg   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
    711      1.1  mrg   step *= (mp_size_t) 1 << k;
    712      1.1  mrg 
    713      1.1  mrg   if (step <= 0)
    714      1.1  mrg     {
    715      1.1  mrg       printf ("Can't handle k=%d\n", k);
    716      1.1  mrg       abort ();
    717      1.1  mrg     }
    718      1.1  mrg 
    719      1.1  mrg   return step;
    720      1.1  mrg }
    721      1.1  mrg 
    722      1.1  mrg mp_size_t
    723      1.1  mrg fft_next_size (mp_size_t pl, int k)
    724      1.1  mrg {
    725      1.1  mrg   mp_size_t  m = fft_step_size (k);
    726      1.1  mrg 
    727      1.1  mrg /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
    728      1.1  mrg 
    729      1.1  mrg   if (pl == 0 || (pl & (m-1)) != 0)
    730      1.1  mrg     pl = (pl | (m-1)) + 1;
    731      1.1  mrg 
    732      1.1  mrg /*    printf (" %ld\n", pl); */
    733      1.1  mrg   return pl;
    734      1.1  mrg }
    735      1.1  mrg 
    736      1.1  mrg #define NMAX_DEFAULT 1000000
    737      1.1  mrg #define MAX_REPS 25
    738      1.1  mrg #define MIN_REPS 5
    739      1.1  mrg 
    740      1.1  mrg static inline size_t
    741      1.1  mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
    742      1.1  mrg {
    743      1.1  mrg   unsigned int l = k;
    744      1.1  mrg 
    745      1.1  mrg   while (a % 2 == 0 && k > 0)
    746      1.1  mrg     {
    747      1.1  mrg       a >>= 1;
    748      1.1  mrg       k--;
    749      1.1  mrg     }
    750      1.1  mrg   return a << l;
    751      1.1  mrg }
    752      1.1  mrg 
    753      1.1  mrg mp_size_t
    754      1.1  mrg fftfill (mp_size_t pl, int k, int sqr)
    755      1.1  mrg {
    756      1.1  mrg   mp_size_t maxLK;
    757      1.1  mrg   mp_bitcnt_t N, Nprime, nprime, M;
    758      1.1  mrg 
    759      1.1  mrg   N = pl * GMP_NUMB_BITS;
    760      1.1  mrg   M = N >> k;
    761      1.1  mrg 
    762      1.1  mrg   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
    763      1.1  mrg 
    764      1.1  mrg   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
    765      1.1  mrg   nprime = Nprime / GMP_NUMB_BITS;
    766      1.1  mrg   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    767      1.1  mrg     {
    768      1.1  mrg       size_t K2;
    769      1.1  mrg       for (;;)
    770      1.1  mrg 	{
    771      1.1  mrg 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
    772      1.1  mrg 	  if ((nprime & (K2 - 1)) == 0)
    773      1.1  mrg 	    break;
    774      1.1  mrg 	  nprime = (nprime + K2 - 1) & -K2;
    775      1.1  mrg 	  Nprime = nprime * GMP_LIMB_BITS;
    776      1.1  mrg 	}
    777      1.1  mrg     }
    778      1.1  mrg   ASSERT_ALWAYS (nprime < pl);
    779      1.1  mrg 
    780      1.1  mrg   return Nprime;
    781      1.1  mrg }
    782      1.1  mrg 
    783      1.1  mrg static int
    784      1.1  mrg compare_double (const void *ap, const void *bp)
    785      1.1  mrg {
    786      1.1  mrg   double a = * (const double *) ap;
    787      1.1  mrg   double b = * (const double *) bp;
    788      1.1  mrg 
    789      1.1  mrg   if (a < b)
    790      1.1  mrg     return -1;
    791      1.1  mrg   else if (a > b)
    792      1.1  mrg     return 1;
    793      1.1  mrg   else
    794      1.1  mrg     return 0;
    795      1.1  mrg }
    796      1.1  mrg 
    797      1.1  mrg double
    798      1.1  mrg median (double *times, int n)
    799      1.1  mrg {
    800      1.1  mrg   qsort (times, n, sizeof (double), compare_double);
    801      1.1  mrg   return times[n/2];
    802      1.1  mrg }
    803      1.1  mrg 
    804      1.1  mrg #define FFT_CACHE_SIZE 25
    805      1.1  mrg typedef struct fft_cache
    806      1.1  mrg {
    807      1.1  mrg   mp_size_t n;
    808      1.1  mrg   double time;
    809      1.1  mrg } fft_cache_t;
    810      1.1  mrg 
    811      1.1  mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
    812      1.1  mrg 
    813      1.1  mrg double
    814      1.1  mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
    815      1.1  mrg 		int n_measurements)
    816      1.1  mrg {
    817      1.1  mrg   int i;
    818      1.1  mrg   double t, ttab[MAX_REPS];
    819      1.1  mrg 
    820      1.1  mrg   if (fft_cache[k].n == n)
    821      1.1  mrg     return fft_cache[k].time;
    822      1.1  mrg 
    823      1.1  mrg   for (i = 0; i < n_measurements; i++)
    824      1.1  mrg     {
    825      1.1  mrg       speed_starttime ();
    826      1.1  mrg       mpn_mul_fft (rp, n, ap, n, bp, n, k);
    827      1.1  mrg       ttab[i] = speed_endtime ();
    828      1.1  mrg     }
    829      1.1  mrg 
    830      1.1  mrg   t = median (ttab, n_measurements);
    831      1.1  mrg   fft_cache[k].n = n;
    832      1.1  mrg   fft_cache[k].time = t;
    833      1.1  mrg   return t;
    834      1.1  mrg }
    835      1.1  mrg 
    836      1.1  mrg #define INSERT_FFTTAB(idx, nval, kval)					\
    837      1.1  mrg   do {									\
    838      1.1  mrg     fft_tab[idx].n = nval;						\
    839      1.1  mrg     fft_tab[idx].k = kval;						\
    840      1.1  mrg     fft_tab[idx+1].n = -1;	/* sentinel */				\
    841      1.1  mrg     fft_tab[idx+1].k = -1;						\
    842      1.1  mrg   } while (0)
    843      1.1  mrg 
    844      1.1  mrg int
    845      1.1  mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
    846      1.1  mrg {
    847      1.1  mrg   mp_size_t n, n1, prev_n1;
    848      1.1  mrg   int k, best_k, last_best_k, kmax;
    849      1.1  mrg   int eff, prev_eff;
    850      1.1  mrg   double t0, t1;
    851      1.1  mrg   int n_measurements;
    852      1.1  mrg   mp_limb_t *ap, *bp, *rp;
    853      1.1  mrg   mp_size_t alloc;
    854      1.1  mrg   char *linepref;
    855      1.1  mrg   struct fft_table_nk *fft_tab;
    856      1.1  mrg 
    857      1.1  mrg   fft_tab = mpn_fft_table3[p->sqr];
    858      1.1  mrg 
    859      1.1  mrg   for (k = 0; k < FFT_CACHE_SIZE; k++)
    860      1.1  mrg     fft_cache[k].n = 0;
    861      1.1  mrg 
    862      1.1  mrg   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    863      1.1  mrg     {
    864      1.1  mrg       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
    865      1.1  mrg     }
    866      1.1  mrg 
    867      1.1  mrg   if (print)
    868      1.1  mrg     printf ("#define %s%*s", p->table_name, 38, "");
    869      1.1  mrg 
    870      1.1  mrg   if (idx == 0)
    871      1.1  mrg     {
    872      1.1  mrg       INSERT_FFTTAB (0, nmin, initial_k);
    873      1.1  mrg 
    874      1.1  mrg       if (print)
    875      1.1  mrg 	{
    876      1.1  mrg 	  printf ("\\\n  { ");
    877      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
    878      1.1  mrg 	  linepref = "    ";
    879      1.1  mrg 	}
    880      1.1  mrg 
    881      1.1  mrg       idx = 1;
    882      1.1  mrg     }
    883      1.1  mrg 
    884      1.1  mrg   ap = malloc (sizeof (mp_limb_t));
    885      1.1  mrg   if (p->sqr)
    886      1.1  mrg     bp = ap;
    887      1.1  mrg   else
    888      1.1  mrg     bp = malloc (sizeof (mp_limb_t));
    889      1.1  mrg   rp = malloc (sizeof (mp_limb_t));
    890      1.1  mrg   alloc = 1;
    891      1.1  mrg 
    892      1.1  mrg   /* Round n to comply to initial k value */
    893      1.1  mrg   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
    894      1.1  mrg 
    895      1.1  mrg   n_measurements = (18 - initial_k) | 1;
    896      1.1  mrg   n_measurements = MAX (n_measurements, MIN_REPS);
    897      1.1  mrg   n_measurements = MIN (n_measurements, MAX_REPS);
    898      1.1  mrg 
    899      1.1  mrg   last_best_k = initial_k;
    900      1.1  mrg   best_k = initial_k;
    901      1.1  mrg 
    902      1.1  mrg   while (n < nmax)
    903      1.1  mrg     {
    904      1.1  mrg       int start_k, end_k;
    905      1.1  mrg 
    906      1.1  mrg       /* Assume the current best k is best until we hit its next FFT step.  */
    907      1.1  mrg       t0 = 99999;
    908      1.1  mrg 
    909      1.1  mrg       prev_n1 = n + 1;
    910      1.1  mrg 
    911      1.1  mrg       start_k = MAX (4, best_k - 4);
    912      1.1  mrg       end_k = MIN (24, best_k + 4);
    913      1.1  mrg       for (k = start_k; k <= end_k; k++)
    914      1.1  mrg 	{
    915      1.1  mrg           n1 = mpn_fft_next_size (prev_n1, k);
    916      1.1  mrg 
    917      1.1  mrg 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
    918      1.1  mrg 
    919      1.1  mrg 	  if (eff < 70)		/* avoid measuring too slow fft:s */
    920      1.1  mrg 	    continue;
    921      1.1  mrg 
    922      1.1  mrg 	  if (n1 > alloc)
    923      1.1  mrg 	    {
    924      1.1  mrg 	      alloc = n1;
    925      1.1  mrg 	      if (p->sqr)
    926      1.1  mrg 		{
    927      1.1  mrg 		  ap = realloc (ap, sizeof (mp_limb_t));
    928      1.1  mrg 		  rp = realloc (rp, sizeof (mp_limb_t));
    929      1.1  mrg 		  ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
    930      1.1  mrg 		  mpn_random (ap, alloc);
    931      1.1  mrg 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
    932      1.1  mrg 		}
    933      1.1  mrg 	      else
    934      1.1  mrg 		{
    935      1.1  mrg 		  ap = realloc (ap, sizeof (mp_limb_t));
    936      1.1  mrg 		  bp = realloc (bp, sizeof (mp_limb_t));
    937      1.1  mrg 		  rp = realloc (rp, sizeof (mp_limb_t));
    938      1.1  mrg 		  ap = realloc (ap, alloc * sizeof (mp_limb_t));
    939      1.1  mrg 		  mpn_random (ap, alloc);
    940      1.1  mrg 		  bp = realloc (bp, alloc * sizeof (mp_limb_t));
    941      1.1  mrg 		  mpn_random (bp, alloc);
    942      1.1  mrg 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
    943      1.1  mrg 		}
    944      1.1  mrg 	    }
    945      1.1  mrg 
    946      1.1  mrg 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
    947      1.1  mrg 
    948      1.1  mrg 	  if (t1 * n_measurements > 0.3)
    949      1.1  mrg 	    n_measurements -= 2;
    950      1.1  mrg 	  n_measurements = MAX (n_measurements, MIN_REPS);
    951      1.1  mrg 
    952      1.1  mrg 	  if (t1 < t0)
    953      1.1  mrg 	    {
    954      1.1  mrg 	      best_k = k;
    955      1.1  mrg 	      t0 = t1;
    956      1.1  mrg 	    }
    957      1.1  mrg 	}
    958      1.1  mrg 
    959      1.1  mrg       n1 = mpn_fft_next_size (prev_n1, best_k);
    960      1.1  mrg 
    961      1.1  mrg       if (last_best_k != best_k)
    962      1.1  mrg 	{
    963      1.1  mrg 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
    964      1.1  mrg 
    965      1.1  mrg 	  if (idx >= FFT_TABLE3_SIZE)
    966      1.1  mrg 	    {
    967      1.1  mrg 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
    968      1.1  mrg 	      abort ();
    969      1.1  mrg 	    }
    970      1.1  mrg 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
    971      1.1  mrg 
    972      1.1  mrg 	  if (print)
    973      1.1  mrg 	    {
    974      1.1  mrg 	      printf (", ");
    975      1.1  mrg 	      if (idx % 4 == 0)
    976      1.1  mrg 		printf ("\\\n    ");
    977      1.1  mrg 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
    978      1.1  mrg 	    }
    979      1.1  mrg 
    980      1.1  mrg 	  if (option_trace >= 2)
    981      1.1  mrg 	    {
    982      1.1  mrg 	      printf ("{%lu,%u}\n", prev_n1, best_k);
    983      1.1  mrg 	      fflush (stdout);
    984      1.1  mrg 	    }
    985      1.1  mrg 
    986      1.1  mrg 	  last_best_k = best_k;
    987      1.1  mrg 	  idx++;
    988      1.1  mrg 	}
    989      1.1  mrg 
    990      1.1  mrg       for (;;)
    991      1.1  mrg 	{
    992      1.1  mrg 	  prev_n1 = n1;
    993      1.1  mrg 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
    994      1.1  mrg 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
    995      1.1  mrg 	  eff = fftfill (n1, best_k, p->sqr);
    996      1.1  mrg 
    997      1.1  mrg 	  if (eff != prev_eff)
    998      1.1  mrg 	    break;
    999      1.1  mrg 	}
   1000      1.1  mrg 
   1001      1.1  mrg       n = prev_n1;
   1002      1.1  mrg     }
   1003      1.1  mrg 
   1004      1.1  mrg   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
   1005      1.1  mrg   kmax = MIN (kmax, 25-1);
   1006      1.1  mrg   for (k = last_best_k + 1; k <= kmax; k++)
   1007      1.1  mrg     {
   1008      1.1  mrg       if (idx >= FFT_TABLE3_SIZE)
   1009      1.1  mrg 	{
   1010      1.1  mrg 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1011      1.1  mrg 	  abort ();
   1012      1.1  mrg 	}
   1013      1.1  mrg       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
   1014      1.1  mrg 
   1015      1.1  mrg       if (print)
   1016      1.1  mrg 	{
   1017      1.1  mrg 	  printf (", ");
   1018      1.1  mrg 	  if (idx % 4 == 0)
   1019      1.1  mrg 	    printf ("\\\n    ");
   1020      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1021      1.1  mrg 	}
   1022      1.1  mrg 
   1023      1.1  mrg       idx++;
   1024      1.1  mrg     }
   1025      1.1  mrg 
   1026      1.1  mrg   if (print)
   1027      1.1  mrg     printf (" }\n");
   1028      1.1  mrg 
   1029      1.1  mrg   free (ap);
   1030      1.1  mrg   if (! p->sqr)
   1031      1.1  mrg     free (bp);
   1032      1.1  mrg   free (rp);
   1033      1.1  mrg 
   1034      1.1  mrg   return idx;
   1035      1.1  mrg }
   1036      1.1  mrg 
   1037      1.1  mrg void
   1038      1.1  mrg fft (struct fft_param_t *p)
   1039      1.1  mrg {
   1040      1.1  mrg   mp_size_t  size;
   1041      1.1  mrg   int        k, idx, initial_k;
   1042      1.1  mrg 
   1043      1.1  mrg   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
   1044      1.1  mrg 
   1045      1.1  mrg #if 1
   1046      1.1  mrg   {
   1047      1.1  mrg     /* Use plain one() mechanism, for some reasonable initial values of k.  The
   1048      1.1  mrg        advantage is that we don't depend on mpn_fft_table3, which can therefore
   1049      1.1  mrg        leave it completely uninitialized.  */
   1050      1.1  mrg 
   1051      1.1  mrg     static struct param_t param;
   1052      1.1  mrg     mp_size_t thres, best_thres;
   1053      1.1  mrg     int best_k;
   1054      1.1  mrg     char buf[20];
   1055      1.1  mrg 
   1056      1.1  mrg     best_thres = MP_SIZE_T_MAX;
   1057      1.1  mrg     best_k = -1;
   1058      1.1  mrg 
   1059      1.1  mrg     for (k = 5; k <= 7; k++)
   1060      1.1  mrg       {
   1061      1.1  mrg 	param.name = p->modf_threshold_name;
   1062      1.1  mrg 	param.min_size = 100;
   1063      1.1  mrg 	param.max_size = 2000;
   1064      1.1  mrg 	param.function  = p->mul_function;
   1065      1.1  mrg 	param.step_factor = 0.0;
   1066      1.1  mrg 	param.step = 4;
   1067      1.1  mrg 	param.function2 = p->mul_modf_function;
   1068      1.1  mrg 	param.noprint = 1;
   1069      1.1  mrg 	s.r = k;
   1070      1.1  mrg 	one (&thres, &param);
   1071      1.1  mrg 	if (thres < best_thres)
   1072      1.1  mrg 	  {
   1073      1.1  mrg 	    best_thres = thres;
   1074      1.1  mrg 	    best_k = k;
   1075      1.1  mrg 	  }
   1076      1.1  mrg       }
   1077      1.1  mrg 
   1078      1.1  mrg     *(p->p_modf_threshold) = best_thres;
   1079      1.1  mrg     sprintf (buf, "k = %d", best_k);
   1080      1.1  mrg     print_define_remark (p->modf_threshold_name, best_thres, buf);
   1081      1.1  mrg     initial_k = best_k;
   1082      1.1  mrg   }
   1083      1.1  mrg #else
   1084      1.1  mrg   size = p->first_size;
   1085      1.1  mrg   for (;;)
   1086      1.1  mrg     {
   1087      1.1  mrg       double  tk, tm;
   1088      1.1  mrg 
   1089      1.1  mrg       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
   1090      1.1  mrg       k = mpn_fft_best_k (size, p->sqr);
   1091      1.1  mrg 
   1092      1.1  mrg       if (size >= p->max_size)
   1093      1.1  mrg         break;
   1094      1.1  mrg 
   1095      1.1  mrg       s.size = size + fft_step_size (k) / 2;
   1096      1.1  mrg       s.r = k;
   1097      1.1  mrg       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
   1098      1.1  mrg       if (tk == -1.0)
   1099      1.1  mrg         abort ();
   1100      1.1  mrg 
   1101      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1102      1.1  mrg       if (tm == -1.0)
   1103      1.1  mrg         abort ();
   1104      1.1  mrg 
   1105      1.1  mrg       if (option_trace >= 2)
   1106      1.1  mrg         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
   1107      1.1  mrg                 (long) size,
   1108      1.1  mrg                 (long) size + fft_step_size (k) / 2, k, tk,
   1109      1.1  mrg                 (long) s.size, tm);
   1110      1.1  mrg 
   1111      1.1  mrg       if (tk < tm)
   1112      1.1  mrg         {
   1113      1.1  mrg 	  *p->p_modf_threshold = s.size;
   1114      1.1  mrg 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
   1115      1.1  mrg 	  break;
   1116      1.1  mrg         }
   1117      1.1  mrg     }
   1118      1.1  mrg   initial_k = ?;
   1119      1.1  mrg #endif
   1120      1.1  mrg 
   1121      1.1  mrg   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
   1122      1.1  mrg 
   1123      1.1  mrg   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
   1124      1.1  mrg   printf ("#define %s_SIZE %d\n", p->table_name, idx);
   1125      1.1  mrg 
   1126      1.1  mrg   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
   1127      1.1  mrg 
   1128      1.1  mrg   size = 2 * *p->p_modf_threshold;	/* OK? */
   1129      1.1  mrg   for (;;)
   1130      1.1  mrg     {
   1131      1.1  mrg       double  tk, tm;
   1132      1.1  mrg       mp_size_t mulmod_size, mul_size;;
   1133      1.1  mrg 
   1134      1.1  mrg       if (size >= p->max_size)
   1135      1.1  mrg         break;
   1136      1.1  mrg 
   1137      1.1  mrg       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
   1138      1.1  mrg       mul_size = (size + mulmod_size) / 2;	/* middle of step */
   1139      1.1  mrg 
   1140      1.1  mrg       s.size = mulmod_size;
   1141      1.1  mrg       tk = tuneup_measure (p->function, NULL, &s);
   1142      1.1  mrg       if (tk == -1.0)
   1143      1.1  mrg         abort ();
   1144      1.1  mrg 
   1145      1.1  mrg       s.size = mul_size;
   1146      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1147      1.1  mrg       if (tm == -1.0)
   1148      1.1  mrg         abort ();
   1149      1.1  mrg 
   1150      1.1  mrg       if (option_trace >= 2)
   1151      1.1  mrg         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
   1152      1.1  mrg                 (long) size,
   1153      1.1  mrg                 (long) mulmod_size, tk,
   1154      1.1  mrg                 (long) mul_size, tm);
   1155      1.1  mrg 
   1156      1.1  mrg       size = mulmod_size;
   1157      1.1  mrg 
   1158      1.1  mrg       if (tk < tm)
   1159      1.1  mrg         {
   1160      1.1  mrg 	  *p->p_threshold = s.size;
   1161      1.1  mrg 	  print_define (p->threshold_name, *p->p_threshold);
   1162      1.1  mrg 	  break;
   1163      1.1  mrg         }
   1164      1.1  mrg     }
   1165      1.1  mrg }
   1166      1.1  mrg 
   1167      1.1  mrg 
   1168      1.1  mrg 
   1169      1.1  mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
   1170      1.1  mrg    giving wrong results.  */
   1171      1.1  mrg void
   1172      1.1  mrg tune_mul_n (void)
   1173      1.1  mrg {
   1174      1.1  mrg   static struct param_t  param;
   1175  1.1.1.2  mrg   mp_size_t next_toom_start;
   1176  1.1.1.2  mrg   int something_changed;
   1177      1.1  mrg 
   1178      1.1  mrg   param.function = speed_mpn_mul_n;
   1179      1.1  mrg 
   1180      1.1  mrg   param.name = "MUL_TOOM22_THRESHOLD";
   1181      1.1  mrg   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
   1182      1.1  mrg   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
   1183      1.1  mrg   one (&mul_toom22_threshold, &param);
   1184      1.1  mrg 
   1185  1.1.1.2  mrg   param.noprint = 1;
   1186  1.1.1.2  mrg 
   1187  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1188  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1189  1.1.1.2  mrg   something_changed = 1;
   1190  1.1.1.2  mrg   while (something_changed)
   1191  1.1.1.2  mrg     {
   1192  1.1.1.2  mrg       something_changed = 0;
   1193  1.1.1.2  mrg 
   1194  1.1.1.2  mrg 	next_toom_start = mul_toom22_threshold;
   1195  1.1.1.2  mrg 
   1196  1.1.1.2  mrg 	if (mul_toom33_threshold != 0)
   1197  1.1.1.2  mrg 	  {
   1198  1.1.1.2  mrg 	    param.name = "MUL_TOOM33_THRESHOLD";
   1199  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
   1200  1.1.1.2  mrg 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
   1201  1.1.1.2  mrg 	    one (&mul_toom33_threshold, &param);
   1202  1.1.1.2  mrg 
   1203  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
   1204  1.1.1.2  mrg 	      {
   1205  1.1.1.2  mrg 		mul_toom33_threshold = 0;
   1206  1.1.1.2  mrg 		something_changed = 1;
   1207  1.1.1.2  mrg 	      }
   1208  1.1.1.2  mrg 	  }
   1209  1.1.1.2  mrg 
   1210  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
   1211  1.1.1.2  mrg 
   1212  1.1.1.2  mrg 	if (mul_toom44_threshold != 0)
   1213  1.1.1.2  mrg 	  {
   1214  1.1.1.2  mrg 	    param.name = "MUL_TOOM44_THRESHOLD";
   1215  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
   1216  1.1.1.2  mrg 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
   1217  1.1.1.2  mrg 	    one (&mul_toom44_threshold, &param);
   1218  1.1.1.2  mrg 
   1219  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
   1220  1.1.1.2  mrg 	      {
   1221  1.1.1.2  mrg 		mul_toom44_threshold = 0;
   1222  1.1.1.2  mrg 		something_changed = 1;
   1223  1.1.1.2  mrg 	      }
   1224  1.1.1.2  mrg 	  }
   1225  1.1.1.2  mrg 
   1226  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
   1227  1.1.1.2  mrg 
   1228  1.1.1.2  mrg 	if (mul_toom6h_threshold != 0)
   1229  1.1.1.2  mrg 	  {
   1230  1.1.1.2  mrg 	    param.name = "MUL_TOOM6H_THRESHOLD";
   1231  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
   1232  1.1.1.2  mrg 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
   1233  1.1.1.2  mrg 	    one (&mul_toom6h_threshold, &param);
   1234  1.1.1.2  mrg 
   1235  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
   1236  1.1.1.2  mrg 	      {
   1237  1.1.1.2  mrg 		mul_toom6h_threshold = 0;
   1238  1.1.1.2  mrg 		something_changed = 1;
   1239  1.1.1.2  mrg 	      }
   1240  1.1.1.2  mrg 	  }
   1241  1.1.1.2  mrg 
   1242  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
   1243  1.1.1.2  mrg 
   1244  1.1.1.2  mrg 	if (mul_toom8h_threshold != 0)
   1245  1.1.1.2  mrg 	  {
   1246  1.1.1.2  mrg 	    param.name = "MUL_TOOM8H_THRESHOLD";
   1247  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
   1248  1.1.1.2  mrg 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
   1249  1.1.1.2  mrg 	    one (&mul_toom8h_threshold, &param);
   1250  1.1.1.2  mrg 
   1251  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
   1252  1.1.1.2  mrg 	      {
   1253  1.1.1.2  mrg 		mul_toom8h_threshold = 0;
   1254  1.1.1.2  mrg 		something_changed = 1;
   1255  1.1.1.2  mrg 	      }
   1256  1.1.1.2  mrg 	  }
   1257  1.1.1.2  mrg     }
   1258  1.1.1.2  mrg 
   1259  1.1.1.2  mrg     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
   1260  1.1.1.2  mrg     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
   1261  1.1.1.2  mrg     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
   1262  1.1.1.2  mrg     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
   1263      1.1  mrg 
   1264      1.1  mrg   /* disabled until tuned */
   1265      1.1  mrg   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1266      1.1  mrg }
   1267      1.1  mrg 
   1268      1.1  mrg void
   1269      1.1  mrg tune_mul (void)
   1270      1.1  mrg {
   1271      1.1  mrg   static struct param_t  param;
   1272      1.1  mrg   mp_size_t thres;
   1273      1.1  mrg 
   1274      1.1  mrg   param.noprint = 1;
   1275      1.1  mrg 
   1276      1.1  mrg   param.function = speed_mpn_toom32_for_toom43_mul;
   1277      1.1  mrg   param.function2 = speed_mpn_toom43_for_toom32_mul;
   1278      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
   1279  1.1.1.2  mrg   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
   1280      1.1  mrg   one (&thres, &param);
   1281  1.1.1.2  mrg   mul_toom32_to_toom43_threshold = thres * 17 / 24;
   1282      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
   1283      1.1  mrg 
   1284      1.1  mrg   param.function = speed_mpn_toom32_for_toom53_mul;
   1285      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom32_mul;
   1286      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
   1287  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
   1288      1.1  mrg   one (&thres, &param);
   1289  1.1.1.2  mrg   mul_toom32_to_toom53_threshold = thres * 19 / 30;
   1290      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
   1291      1.1  mrg 
   1292      1.1  mrg   param.function = speed_mpn_toom42_for_toom53_mul;
   1293      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom42_mul;
   1294      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
   1295  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
   1296      1.1  mrg   one (&thres, &param);
   1297  1.1.1.2  mrg   mul_toom42_to_toom53_threshold = thres * 11 / 20;
   1298      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
   1299      1.1  mrg 
   1300      1.1  mrg   param.function = speed_mpn_toom42_mul;
   1301      1.1  mrg   param.function2 = speed_mpn_toom63_mul;
   1302      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
   1303  1.1.1.2  mrg   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
   1304      1.1  mrg   one (&thres, &param);
   1305  1.1.1.2  mrg   mul_toom42_to_toom63_threshold = thres / 2;
   1306      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
   1307  1.1.1.2  mrg 
   1308  1.1.1.2  mrg   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
   1309  1.1.1.2  mrg   param.function = speed_mpn_toom43_for_toom54_mul;
   1310  1.1.1.2  mrg   param.function2 = speed_mpn_toom54_for_toom43_mul;
   1311  1.1.1.2  mrg   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
   1312  1.1.1.2  mrg   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
   1313  1.1.1.2  mrg   one (&thres, &param);
   1314  1.1.1.2  mrg   mul_toom43_to_toom54_threshold = thres * 5 / 6;
   1315  1.1.1.2  mrg   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
   1316      1.1  mrg }
   1317      1.1  mrg 
   1318      1.1  mrg 
   1319      1.1  mrg void
   1320      1.1  mrg tune_mullo (void)
   1321      1.1  mrg {
   1322      1.1  mrg   static struct param_t  param;
   1323      1.1  mrg 
   1324      1.1  mrg   param.function = speed_mpn_mullo_n;
   1325      1.1  mrg 
   1326      1.1  mrg   param.name = "MULLO_BASECASE_THRESHOLD";
   1327      1.1  mrg   param.min_size = 1;
   1328      1.1  mrg   param.min_is_always = 1;
   1329      1.1  mrg   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
   1330      1.1  mrg   param.stop_factor = 1.5;
   1331      1.1  mrg   param.noprint = 1;
   1332      1.1  mrg   one (&mullo_basecase_threshold, &param);
   1333      1.1  mrg 
   1334      1.1  mrg   param.name = "MULLO_DC_THRESHOLD";
   1335      1.1  mrg   param.min_size = 8;
   1336      1.1  mrg   param.min_is_always = 0;
   1337      1.1  mrg   param.max_size = 1000;
   1338      1.1  mrg   one (&mullo_dc_threshold, &param);
   1339      1.1  mrg 
   1340      1.1  mrg   if (mullo_basecase_threshold >= mullo_dc_threshold)
   1341      1.1  mrg     {
   1342      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
   1343      1.1  mrg       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
   1344      1.1  mrg     }
   1345      1.1  mrg   else
   1346      1.1  mrg     {
   1347      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
   1348      1.1  mrg       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
   1349      1.1  mrg     }
   1350      1.1  mrg 
   1351      1.1  mrg #if WANT_FFT
   1352      1.1  mrg   param.name = "MULLO_MUL_N_THRESHOLD";
   1353      1.1  mrg   param.min_size = mullo_dc_threshold;
   1354      1.1  mrg   param.max_size = 2 * mul_fft_threshold;
   1355      1.1  mrg   param.noprint = 0;
   1356      1.1  mrg   param.step_factor = 0.03;
   1357      1.1  mrg   one (&mullo_mul_n_threshold, &param);
   1358      1.1  mrg #else
   1359      1.1  mrg   print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
   1360      1.1  mrg                            "without FFT use mullo forever");
   1361      1.1  mrg #endif
   1362      1.1  mrg }
   1363      1.1  mrg 
   1364      1.1  mrg void
   1365  1.1.1.2  mrg tune_mulmid (void)
   1366  1.1.1.2  mrg {
   1367  1.1.1.2  mrg   static struct param_t  param;
   1368  1.1.1.2  mrg 
   1369  1.1.1.2  mrg   param.name = "MULMID_TOOM42_THRESHOLD";
   1370  1.1.1.2  mrg   param.function = speed_mpn_mulmid_n;
   1371  1.1.1.2  mrg   param.min_size = 4;
   1372  1.1.1.2  mrg   param.max_size = 100;
   1373  1.1.1.2  mrg   one (&mulmid_toom42_threshold, &param);
   1374  1.1.1.2  mrg }
   1375  1.1.1.2  mrg 
   1376  1.1.1.2  mrg void
   1377      1.1  mrg tune_mulmod_bnm1 (void)
   1378      1.1  mrg {
   1379      1.1  mrg   static struct param_t  param;
   1380      1.1  mrg 
   1381      1.1  mrg   param.name = "MULMOD_BNM1_THRESHOLD";
   1382      1.1  mrg   param.function = speed_mpn_mulmod_bnm1;
   1383      1.1  mrg   param.min_size = 4;
   1384      1.1  mrg   param.max_size = 100;
   1385      1.1  mrg   one (&mulmod_bnm1_threshold, &param);
   1386      1.1  mrg }
   1387      1.1  mrg 
   1388      1.1  mrg void
   1389      1.1  mrg tune_sqrmod_bnm1 (void)
   1390      1.1  mrg {
   1391      1.1  mrg   static struct param_t  param;
   1392      1.1  mrg 
   1393      1.1  mrg   param.name = "SQRMOD_BNM1_THRESHOLD";
   1394      1.1  mrg   param.function = speed_mpn_sqrmod_bnm1;
   1395      1.1  mrg   param.min_size = 4;
   1396      1.1  mrg   param.max_size = 100;
   1397      1.1  mrg   one (&sqrmod_bnm1_threshold, &param);
   1398      1.1  mrg }
   1399      1.1  mrg 
   1400      1.1  mrg 
   1401      1.1  mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
   1402      1.1  mrg    is faster only at size==2 then we don't want to bother with extra code
   1403      1.1  mrg    just for that.  Start karatsuba from 4 same as MUL above.  */
   1404      1.1  mrg 
   1405      1.1  mrg void
   1406      1.1  mrg tune_sqr (void)
   1407      1.1  mrg {
   1408      1.1  mrg   /* disabled until tuned */
   1409      1.1  mrg   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1410      1.1  mrg 
   1411      1.1  mrg   if (HAVE_NATIVE_mpn_sqr_basecase)
   1412      1.1  mrg     {
   1413      1.1  mrg       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
   1414      1.1  mrg       sqr_basecase_threshold = 0;
   1415      1.1  mrg     }
   1416      1.1  mrg   else
   1417      1.1  mrg     {
   1418      1.1  mrg       static struct param_t  param;
   1419      1.1  mrg       param.name = "SQR_BASECASE_THRESHOLD";
   1420      1.1  mrg       param.function = speed_mpn_sqr;
   1421      1.1  mrg       param.min_size = 3;
   1422      1.1  mrg       param.min_is_always = 1;
   1423      1.1  mrg       param.max_size = TUNE_SQR_TOOM2_MAX;
   1424      1.1  mrg       param.noprint = 1;
   1425      1.1  mrg       one (&sqr_basecase_threshold, &param);
   1426      1.1  mrg     }
   1427      1.1  mrg 
   1428      1.1  mrg   {
   1429      1.1  mrg     static struct param_t  param;
   1430      1.1  mrg     param.name = "SQR_TOOM2_THRESHOLD";
   1431      1.1  mrg     param.function = speed_mpn_sqr;
   1432      1.1  mrg     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
   1433      1.1  mrg     param.max_size = TUNE_SQR_TOOM2_MAX;
   1434      1.1  mrg     param.noprint = 1;
   1435      1.1  mrg     one (&sqr_toom2_threshold, &param);
   1436      1.1  mrg 
   1437      1.1  mrg     if (! HAVE_NATIVE_mpn_sqr_basecase
   1438      1.1  mrg         && sqr_toom2_threshold < sqr_basecase_threshold)
   1439      1.1  mrg       {
   1440      1.1  mrg         /* Karatsuba becomes faster than mul_basecase before
   1441      1.1  mrg            sqr_basecase does.  Arrange for the expression
   1442      1.1  mrg            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
   1443      1.1  mrg            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
   1444      1.1  mrg            SQR_TOOM2_THRESHOLD to zero, making
   1445      1.1  mrg            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
   1446      1.1  mrg 
   1447      1.1  mrg         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
   1448      1.1  mrg         SQR_TOOM2_THRESHOLD = 0;
   1449      1.1  mrg 
   1450      1.1  mrg         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
   1451      1.1  mrg                              "toom2");
   1452      1.1  mrg         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
   1453      1.1  mrg                              "never sqr_basecase");
   1454      1.1  mrg       }
   1455      1.1  mrg     else
   1456      1.1  mrg       {
   1457      1.1  mrg         if (! HAVE_NATIVE_mpn_sqr_basecase)
   1458      1.1  mrg           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
   1459      1.1  mrg         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
   1460      1.1  mrg       }
   1461      1.1  mrg   }
   1462      1.1  mrg 
   1463      1.1  mrg   {
   1464      1.1  mrg     static struct param_t  param;
   1465  1.1.1.2  mrg     mp_size_t next_toom_start;
   1466  1.1.1.2  mrg     int something_changed;
   1467      1.1  mrg 
   1468      1.1  mrg     param.function = speed_mpn_sqr;
   1469  1.1.1.2  mrg     param.noprint = 1;
   1470      1.1  mrg 
   1471  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1472  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1473  1.1.1.2  mrg     something_changed = 1;
   1474  1.1.1.2  mrg     while (something_changed)
   1475  1.1.1.2  mrg       {
   1476  1.1.1.2  mrg 	something_changed = 0;
   1477  1.1.1.2  mrg 
   1478  1.1.1.2  mrg 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
   1479  1.1.1.2  mrg 
   1480  1.1.1.2  mrg 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
   1481  1.1.1.2  mrg 	param.name = "SQR_TOOM3_THRESHOLD";
   1482  1.1.1.2  mrg 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
   1483  1.1.1.2  mrg 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
   1484  1.1.1.2  mrg 	one (&sqr_toom3_threshold, &param);
   1485  1.1.1.2  mrg 
   1486  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
   1487  1.1.1.2  mrg 
   1488  1.1.1.2  mrg 	if (sqr_toom4_threshold != 0)
   1489  1.1.1.2  mrg 	  {
   1490  1.1.1.2  mrg 	    param.name = "SQR_TOOM4_THRESHOLD";
   1491  1.1.1.2  mrg 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
   1492  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
   1493  1.1.1.2  mrg 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
   1494  1.1.1.2  mrg 	    one (&sqr_toom4_threshold, &param);
   1495  1.1.1.2  mrg 
   1496  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
   1497  1.1.1.2  mrg 	      {
   1498  1.1.1.2  mrg 		sqr_toom4_threshold = 0;
   1499  1.1.1.2  mrg 		something_changed = 1;
   1500  1.1.1.2  mrg 	      }
   1501  1.1.1.2  mrg 	  }
   1502  1.1.1.2  mrg 
   1503  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
   1504  1.1.1.2  mrg 
   1505  1.1.1.2  mrg 	if (sqr_toom6_threshold != 0)
   1506  1.1.1.2  mrg 	  {
   1507  1.1.1.2  mrg 	    param.name = "SQR_TOOM6_THRESHOLD";
   1508  1.1.1.2  mrg 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
   1509  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
   1510  1.1.1.2  mrg 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
   1511  1.1.1.2  mrg 	    one (&sqr_toom6_threshold, &param);
   1512  1.1.1.2  mrg 
   1513  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
   1514  1.1.1.2  mrg 	      {
   1515  1.1.1.2  mrg 		sqr_toom6_threshold = 0;
   1516  1.1.1.2  mrg 		something_changed = 1;
   1517  1.1.1.2  mrg 	      }
   1518  1.1.1.2  mrg 	  }
   1519  1.1.1.2  mrg 
   1520  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
   1521  1.1.1.2  mrg 
   1522  1.1.1.2  mrg 	if (sqr_toom8_threshold != 0)
   1523  1.1.1.2  mrg 	  {
   1524  1.1.1.2  mrg 	    param.name = "SQR_TOOM8_THRESHOLD";
   1525  1.1.1.2  mrg 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
   1526  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
   1527  1.1.1.2  mrg 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
   1528  1.1.1.2  mrg 	    one (&sqr_toom8_threshold, &param);
   1529  1.1.1.2  mrg 
   1530  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
   1531  1.1.1.2  mrg 	      {
   1532  1.1.1.2  mrg 		sqr_toom8_threshold = 0;
   1533  1.1.1.2  mrg 		something_changed = 1;
   1534  1.1.1.2  mrg 	      }
   1535  1.1.1.2  mrg 	  }
   1536  1.1.1.2  mrg       }
   1537  1.1.1.2  mrg 
   1538  1.1.1.2  mrg     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
   1539  1.1.1.2  mrg     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
   1540  1.1.1.2  mrg     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
   1541  1.1.1.2  mrg     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
   1542      1.1  mrg   }
   1543      1.1  mrg }
   1544      1.1  mrg 
   1545      1.1  mrg 
   1546      1.1  mrg void
   1547      1.1  mrg tune_dc_div (void)
   1548      1.1  mrg {
   1549      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1550      1.1  mrg   {
   1551      1.1  mrg     static struct param_t  param;
   1552      1.1  mrg     param.name = "DC_DIV_QR_THRESHOLD";
   1553      1.1  mrg     param.function = speed_mpn_sbpi1_div_qr;
   1554      1.1  mrg     param.function2 = speed_mpn_dcpi1_div_qr;
   1555      1.1  mrg     param.min_size = 6;
   1556      1.1  mrg     one (&dc_div_qr_threshold, &param);
   1557      1.1  mrg   }
   1558      1.1  mrg   {
   1559      1.1  mrg     static struct param_t  param;
   1560      1.1  mrg     param.name = "DC_DIVAPPR_Q_THRESHOLD";
   1561      1.1  mrg     param.function = speed_mpn_sbpi1_divappr_q;
   1562      1.1  mrg     param.function2 = speed_mpn_dcpi1_divappr_q;
   1563      1.1  mrg     param.min_size = 6;
   1564      1.1  mrg     one (&dc_divappr_q_threshold, &param);
   1565      1.1  mrg   }
   1566      1.1  mrg }
   1567      1.1  mrg 
   1568      1.1  mrg static double
   1569      1.1  mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
   1570      1.1  mrg {
   1571      1.1  mrg   if (s->size < DC_DIV_QR_THRESHOLD)
   1572      1.1  mrg     return speed_mpn_sbpi1_div_qr (s);
   1573      1.1  mrg   else
   1574      1.1  mrg     return speed_mpn_dcpi1_div_qr (s);
   1575      1.1  mrg }
   1576      1.1  mrg 
   1577      1.1  mrg void
   1578      1.1  mrg tune_mu_div (void)
   1579      1.1  mrg {
   1580      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1581      1.1  mrg   {
   1582      1.1  mrg     static struct param_t  param;
   1583      1.1  mrg     param.name = "MU_DIV_QR_THRESHOLD";
   1584      1.1  mrg     param.function = speed_mpn_dcpi1_div_qr;
   1585      1.1  mrg     param.function2 = speed_mpn_mu_div_qr;
   1586  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1587      1.1  mrg     param.max_size = 5000;
   1588      1.1  mrg     param.step_factor = 0.02;
   1589      1.1  mrg     one (&mu_div_qr_threshold, &param);
   1590      1.1  mrg   }
   1591      1.1  mrg   {
   1592      1.1  mrg     static struct param_t  param;
   1593      1.1  mrg     param.name = "MU_DIVAPPR_Q_THRESHOLD";
   1594      1.1  mrg     param.function = speed_mpn_dcpi1_divappr_q;
   1595      1.1  mrg     param.function2 = speed_mpn_mu_divappr_q;
   1596  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1597      1.1  mrg     param.max_size = 5000;
   1598      1.1  mrg     param.step_factor = 0.02;
   1599      1.1  mrg     one (&mu_divappr_q_threshold, &param);
   1600      1.1  mrg   }
   1601      1.1  mrg   {
   1602      1.1  mrg     static struct param_t  param;
   1603      1.1  mrg     param.name = "MUPI_DIV_QR_THRESHOLD";
   1604      1.1  mrg     param.function = speed_mpn_sbordcpi1_div_qr;
   1605      1.1  mrg     param.function2 = speed_mpn_mupi_div_qr;
   1606      1.1  mrg     param.min_size = 6;
   1607      1.1  mrg     param.min_is_always = 1;
   1608      1.1  mrg     param.max_size = 1000;
   1609      1.1  mrg     param.step_factor = 0.02;
   1610      1.1  mrg     one (&mupi_div_qr_threshold, &param);
   1611      1.1  mrg   }
   1612      1.1  mrg }
   1613      1.1  mrg 
   1614      1.1  mrg void
   1615      1.1  mrg tune_dc_bdiv (void)
   1616      1.1  mrg {
   1617      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1618      1.1  mrg   {
   1619      1.1  mrg     static struct param_t  param;
   1620      1.1  mrg     param.name = "DC_BDIV_QR_THRESHOLD";
   1621      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_qr;
   1622      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_qr;
   1623      1.1  mrg     param.min_size = 4;
   1624      1.1  mrg     one (&dc_bdiv_qr_threshold, &param);
   1625      1.1  mrg   }
   1626      1.1  mrg   {
   1627      1.1  mrg     static struct param_t  param;
   1628      1.1  mrg     param.name = "DC_BDIV_Q_THRESHOLD";
   1629      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_q;
   1630      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_q;
   1631      1.1  mrg     param.min_size = 4;
   1632      1.1  mrg     one (&dc_bdiv_q_threshold, &param);
   1633      1.1  mrg   }
   1634      1.1  mrg }
   1635      1.1  mrg 
   1636      1.1  mrg void
   1637      1.1  mrg tune_mu_bdiv (void)
   1638      1.1  mrg {
   1639      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1640      1.1  mrg   {
   1641      1.1  mrg     static struct param_t  param;
   1642      1.1  mrg     param.name = "MU_BDIV_QR_THRESHOLD";
   1643      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_qr;
   1644      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_qr;
   1645  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1646      1.1  mrg     param.max_size = 5000;
   1647      1.1  mrg     param.step_factor = 0.02;
   1648      1.1  mrg     one (&mu_bdiv_qr_threshold, &param);
   1649      1.1  mrg   }
   1650      1.1  mrg   {
   1651      1.1  mrg     static struct param_t  param;
   1652      1.1  mrg     param.name = "MU_BDIV_Q_THRESHOLD";
   1653      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_q;
   1654      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_q;
   1655  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1656      1.1  mrg     param.max_size = 5000;
   1657      1.1  mrg     param.step_factor = 0.02;
   1658      1.1  mrg     one (&mu_bdiv_q_threshold, &param);
   1659      1.1  mrg   }
   1660      1.1  mrg }
   1661      1.1  mrg 
   1662      1.1  mrg void
   1663      1.1  mrg tune_invertappr (void)
   1664      1.1  mrg {
   1665      1.1  mrg   static struct param_t  param;
   1666      1.1  mrg 
   1667      1.1  mrg   param.function = speed_mpn_ni_invertappr;
   1668      1.1  mrg   param.name = "INV_MULMOD_BNM1_THRESHOLD";
   1669      1.1  mrg   param.min_size = 4;
   1670      1.1  mrg   one (&inv_mulmod_bnm1_threshold, &param);
   1671      1.1  mrg 
   1672      1.1  mrg   param.function = speed_mpn_invertappr;
   1673      1.1  mrg   param.name = "INV_NEWTON_THRESHOLD";
   1674      1.1  mrg   param.min_size = 3;
   1675      1.1  mrg   one (&inv_newton_threshold, &param);
   1676      1.1  mrg }
   1677      1.1  mrg 
   1678      1.1  mrg void
   1679      1.1  mrg tune_invert (void)
   1680      1.1  mrg {
   1681      1.1  mrg   static struct param_t  param;
   1682      1.1  mrg 
   1683      1.1  mrg   param.function = speed_mpn_invert;
   1684      1.1  mrg   param.name = "INV_APPR_THRESHOLD";
   1685      1.1  mrg   param.min_size = 3;
   1686      1.1  mrg   one (&inv_appr_threshold, &param);
   1687      1.1  mrg }
   1688      1.1  mrg 
   1689      1.1  mrg void
   1690      1.1  mrg tune_binvert (void)
   1691      1.1  mrg {
   1692      1.1  mrg   static struct param_t  param;
   1693      1.1  mrg 
   1694      1.1  mrg   param.function = speed_mpn_binvert;
   1695      1.1  mrg   param.name = "BINV_NEWTON_THRESHOLD";
   1696      1.1  mrg   param.min_size = 8;		/* pointless with smaller operands */
   1697      1.1  mrg   one (&binv_newton_threshold, &param);
   1698      1.1  mrg }
   1699      1.1  mrg 
   1700      1.1  mrg void
   1701      1.1  mrg tune_redc (void)
   1702      1.1  mrg {
   1703      1.1  mrg #define TUNE_REDC_2_MAX 100
   1704      1.1  mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
   1705      1.1  mrg #define WANT_REDC_2 1
   1706      1.1  mrg #endif
   1707      1.1  mrg 
   1708      1.1  mrg #if WANT_REDC_2
   1709      1.1  mrg   {
   1710      1.1  mrg     static struct param_t  param;
   1711      1.1  mrg     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1712      1.1  mrg     param.function = speed_mpn_redc_1;
   1713      1.1  mrg     param.function2 = speed_mpn_redc_2;
   1714      1.1  mrg     param.min_size = 1;
   1715      1.1  mrg     param.min_is_always = 1;
   1716      1.1  mrg     param.max_size = TUNE_REDC_2_MAX;
   1717      1.1  mrg     param.noprint = 1;
   1718  1.1.1.2  mrg     param.stop_factor = 1.5;
   1719      1.1  mrg     one (&redc_1_to_redc_2_threshold, &param);
   1720      1.1  mrg   }
   1721      1.1  mrg   {
   1722      1.1  mrg     static struct param_t  param;
   1723      1.1  mrg     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
   1724      1.1  mrg     param.function = speed_mpn_redc_2;
   1725      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1726      1.1  mrg     param.min_size = 16;
   1727      1.1  mrg     param.noprint = 1;
   1728      1.1  mrg     one (&redc_2_to_redc_n_threshold, &param);
   1729      1.1  mrg   }
   1730  1.1.1.2  mrg   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
   1731      1.1  mrg     {
   1732  1.1.1.2  mrg       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
   1733  1.1.1.2  mrg 
   1734  1.1.1.2  mrg       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
   1735  1.1.1.2  mrg 	 REDC_1_TO_REDC_2_THRESHOLD.  */
   1736  1.1.1.2  mrg       {
   1737  1.1.1.2  mrg 	static struct param_t  param;
   1738  1.1.1.2  mrg 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1739  1.1.1.2  mrg 	param.function = speed_mpn_redc_1;
   1740  1.1.1.2  mrg 	param.function2 = speed_mpn_redc_n;
   1741  1.1.1.2  mrg 	param.min_size = 16;
   1742  1.1.1.2  mrg 	param.noprint = 1;
   1743  1.1.1.2  mrg 	one (&redc_1_to_redc_2_threshold, &param);
   1744  1.1.1.2  mrg       }
   1745      1.1  mrg     }
   1746  1.1.1.2  mrg   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
   1747  1.1.1.2  mrg   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
   1748      1.1  mrg #else
   1749      1.1  mrg   {
   1750      1.1  mrg     static struct param_t  param;
   1751      1.1  mrg     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
   1752      1.1  mrg     param.function = speed_mpn_redc_1;
   1753      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1754      1.1  mrg     param.min_size = 16;
   1755      1.1  mrg     one (&redc_1_to_redc_n_threshold, &param);
   1756      1.1  mrg   }
   1757      1.1  mrg #endif
   1758      1.1  mrg }
   1759      1.1  mrg 
   1760      1.1  mrg void
   1761      1.1  mrg tune_matrix22_mul (void)
   1762      1.1  mrg {
   1763      1.1  mrg   static struct param_t  param;
   1764      1.1  mrg   param.name = "MATRIX22_STRASSEN_THRESHOLD";
   1765      1.1  mrg   param.function = speed_mpn_matrix22_mul;
   1766      1.1  mrg   param.min_size = 2;
   1767      1.1  mrg   one (&matrix22_strassen_threshold, &param);
   1768      1.1  mrg }
   1769      1.1  mrg 
   1770      1.1  mrg void
   1771      1.1  mrg tune_hgcd (void)
   1772      1.1  mrg {
   1773      1.1  mrg   static struct param_t  param;
   1774      1.1  mrg   param.name = "HGCD_THRESHOLD";
   1775      1.1  mrg   param.function = speed_mpn_hgcd;
   1776      1.1  mrg   /* We seem to get strange results for small sizes */
   1777      1.1  mrg   param.min_size = 30;
   1778      1.1  mrg   one (&hgcd_threshold, &param);
   1779      1.1  mrg }
   1780      1.1  mrg 
   1781      1.1  mrg void
   1782  1.1.1.2  mrg tune_hgcd_appr (void)
   1783  1.1.1.2  mrg {
   1784  1.1.1.2  mrg   static struct param_t  param;
   1785  1.1.1.2  mrg   param.name = "HGCD_APPR_THRESHOLD";
   1786  1.1.1.2  mrg   param.function = speed_mpn_hgcd_appr;
   1787  1.1.1.2  mrg   /* We seem to get strange results for small sizes */
   1788  1.1.1.2  mrg   param.min_size = 50;
   1789  1.1.1.2  mrg   param.stop_since_change = 150;
   1790  1.1.1.2  mrg   one (&hgcd_appr_threshold, &param);
   1791  1.1.1.2  mrg }
   1792  1.1.1.2  mrg 
   1793  1.1.1.2  mrg void
   1794  1.1.1.2  mrg tune_hgcd_reduce (void)
   1795  1.1.1.2  mrg {
   1796  1.1.1.2  mrg   static struct param_t  param;
   1797  1.1.1.2  mrg   param.name = "HGCD_REDUCE_THRESHOLD";
   1798  1.1.1.2  mrg   param.function = speed_mpn_hgcd_reduce;
   1799  1.1.1.2  mrg   param.min_size = 30;
   1800  1.1.1.2  mrg   param.max_size = 7000;
   1801  1.1.1.2  mrg   param.step_factor = 0.04;
   1802  1.1.1.2  mrg   one (&hgcd_reduce_threshold, &param);
   1803  1.1.1.2  mrg }
   1804  1.1.1.2  mrg 
   1805  1.1.1.2  mrg void
   1806      1.1  mrg tune_gcd_dc (void)
   1807      1.1  mrg {
   1808      1.1  mrg   static struct param_t  param;
   1809      1.1  mrg   param.name = "GCD_DC_THRESHOLD";
   1810      1.1  mrg   param.function = speed_mpn_gcd;
   1811      1.1  mrg   param.min_size = hgcd_threshold;
   1812      1.1  mrg   param.max_size = 3000;
   1813      1.1  mrg   param.step_factor = 0.02;
   1814      1.1  mrg   one (&gcd_dc_threshold, &param);
   1815      1.1  mrg }
   1816      1.1  mrg 
   1817      1.1  mrg void
   1818      1.1  mrg tune_gcdext_dc (void)
   1819      1.1  mrg {
   1820      1.1  mrg   static struct param_t  param;
   1821      1.1  mrg   param.name = "GCDEXT_DC_THRESHOLD";
   1822      1.1  mrg   param.function = speed_mpn_gcdext;
   1823      1.1  mrg   param.min_size = hgcd_threshold;
   1824      1.1  mrg   param.max_size = 3000;
   1825      1.1  mrg   param.step_factor = 0.02;
   1826      1.1  mrg   one (&gcdext_dc_threshold, &param);
   1827      1.1  mrg }
   1828      1.1  mrg 
   1829  1.1.1.2  mrg /* In tune_powm_sec we compute the table used by the win_size function.  The
   1830  1.1.1.2  mrg    cutoff points are in exponent bits, disregarding other operand sizes.  It is
   1831  1.1.1.2  mrg    not possible to use the one framework since it currently uses a granilarity
   1832  1.1.1.2  mrg    of full limbs.
   1833  1.1.1.2  mrg */
   1834  1.1.1.2  mrg 
   1835  1.1.1.2  mrg /* This win_size replaces the variant in the powm code, allowing us to
   1836  1.1.1.2  mrg    control k in the k-ary algorithms.  */
   1837  1.1.1.2  mrg int winsize;
   1838  1.1.1.2  mrg int
   1839  1.1.1.2  mrg win_size (mp_bitcnt_t eb)
   1840  1.1.1.2  mrg {
   1841  1.1.1.2  mrg   return winsize;
   1842  1.1.1.2  mrg }
   1843  1.1.1.2  mrg 
   1844  1.1.1.2  mrg void
   1845  1.1.1.2  mrg tune_powm_sec (void)
   1846  1.1.1.2  mrg {
   1847  1.1.1.2  mrg   mp_size_t n;
   1848  1.1.1.2  mrg   int k, i;
   1849  1.1.1.2  mrg   mp_size_t itch;
   1850  1.1.1.2  mrg   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
   1851  1.1.1.2  mrg   const int n_max = 3000 / GMP_NUMB_BITS;
   1852  1.1.1.2  mrg   const int n_measurements = 5;
   1853  1.1.1.2  mrg   mp_ptr rp, bp, ep, mp, tp;
   1854  1.1.1.2  mrg   double ttab[n_measurements], tk, tkp1;
   1855  1.1.1.2  mrg   TMP_DECL;
   1856  1.1.1.2  mrg   TMP_MARK;
   1857  1.1.1.2  mrg 
   1858  1.1.1.2  mrg   possible_nbits_cutoff = 0;
   1859  1.1.1.2  mrg 
   1860  1.1.1.2  mrg   k = 1;
   1861  1.1.1.2  mrg 
   1862  1.1.1.2  mrg   winsize = 10;			/* the itch function needs this */
   1863  1.1.1.2  mrg   itch = mpn_powm_sec_itch (n_max, n_max, n_max);
   1864  1.1.1.2  mrg 
   1865  1.1.1.2  mrg   rp = TMP_ALLOC_LIMBS (n_max);
   1866  1.1.1.2  mrg   bp = TMP_ALLOC_LIMBS (n_max);
   1867  1.1.1.2  mrg   ep = TMP_ALLOC_LIMBS (n_max);
   1868  1.1.1.2  mrg   mp = TMP_ALLOC_LIMBS (n_max);
   1869  1.1.1.2  mrg   tp = TMP_ALLOC_LIMBS (itch);
   1870  1.1.1.2  mrg 
   1871  1.1.1.2  mrg   mpn_random (bp, n_max);
   1872  1.1.1.2  mrg   mpn_random (mp, n_max);
   1873  1.1.1.2  mrg   mp[0] |= 1;
   1874  1.1.1.2  mrg 
   1875  1.1.1.2  mrg /* How about taking the M operand size into account?
   1876  1.1.1.2  mrg 
   1877  1.1.1.2  mrg    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
   1878  1.1.1.2  mrg    B = O(M)).
   1879  1.1.1.2  mrg 
   1880  1.1.1.2  mrg    Using k-ary and no sliding window, the precomputation will need time
   1881  1.1.1.2  mrg    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
   1882  1.1.1.2  mrg    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
   1883  1.1.1.2  mrg 
   1884  1.1.1.2  mrg    An operation R=powm_sec(B,E,N) will take time like powm.
   1885  1.1.1.2  mrg 
   1886  1.1.1.2  mrg    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
   1887  1.1.1.2  mrg    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
   1888  1.1.1.2  mrg    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
   1889  1.1.1.2  mrg    table reads, respectively.  */
   1890  1.1.1.2  mrg 
   1891  1.1.1.2  mrg   printf ("#define POWM_SEC_TABLE  ");
   1892  1.1.1.2  mrg 
   1893  1.1.1.2  mrg   for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; )
   1894  1.1.1.2  mrg     {
   1895  1.1.1.2  mrg       n = (nbits - 1) / GMP_NUMB_BITS + 1;
   1896  1.1.1.2  mrg 
   1897  1.1.1.2  mrg       /* Generate E such that sliding-window for k and k+1 works equally
   1898  1.1.1.2  mrg 	 well/poorly (but sliding is not used in powm_sec, of course). */
   1899  1.1.1.2  mrg       for (i = 0; i < n; i++)
   1900  1.1.1.2  mrg 	ep[i] = ~CNST_LIMB(0);
   1901  1.1.1.2  mrg 
   1902  1.1.1.2  mrg       /* Truncate E to be exactly nbits large.  */
   1903  1.1.1.2  mrg       if (nbits % GMP_NUMB_BITS != 0)
   1904  1.1.1.2  mrg 	mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS);
   1905  1.1.1.2  mrg       ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS;
   1906  1.1.1.2  mrg 
   1907  1.1.1.2  mrg       winsize = k;
   1908  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   1909  1.1.1.2  mrg 	{
   1910  1.1.1.2  mrg 	  speed_starttime ();
   1911  1.1.1.2  mrg 	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
   1912  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   1913  1.1.1.2  mrg 	}
   1914  1.1.1.2  mrg       tk = median (ttab, n_measurements);
   1915  1.1.1.2  mrg 
   1916  1.1.1.2  mrg       winsize = k + 1;
   1917  1.1.1.2  mrg       speed_starttime ();
   1918  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   1919  1.1.1.2  mrg 	{
   1920  1.1.1.2  mrg 	  speed_starttime ();
   1921  1.1.1.2  mrg 	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
   1922  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   1923  1.1.1.2  mrg 	}
   1924  1.1.1.2  mrg       tkp1 = median (ttab, n_measurements);
   1925  1.1.1.2  mrg /*
   1926  1.1.1.2  mrg       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
   1927  1.1.1.2  mrg       printf ("   %10.5f  %10.5f\n", tk, tkp1);
   1928  1.1.1.2  mrg */
   1929  1.1.1.2  mrg       if (tkp1 < tk)
   1930  1.1.1.2  mrg 	{
   1931  1.1.1.2  mrg 	  if (possible_nbits_cutoff)
   1932  1.1.1.2  mrg 	    {
   1933  1.1.1.2  mrg 	      /* Two consecutive sizes indicate k increase, obey.  */
   1934  1.1.1.2  mrg 	      if (k > 1)
   1935  1.1.1.2  mrg 		printf (",");
   1936  1.1.1.2  mrg 	      printf ("%ld", (long) possible_nbits_cutoff);
   1937  1.1.1.2  mrg 	      k++;
   1938  1.1.1.2  mrg 	      possible_nbits_cutoff = 0;
   1939  1.1.1.2  mrg 	    }
   1940  1.1.1.2  mrg 	  else
   1941  1.1.1.2  mrg 	    {
   1942  1.1.1.2  mrg 	      /* One measurement indicate k increase, save nbits for further
   1943  1.1.1.2  mrg 		 consideration.  */
   1944  1.1.1.2  mrg 	      possible_nbits_cutoff = nbits;
   1945  1.1.1.2  mrg 	    }
   1946  1.1.1.2  mrg 	}
   1947  1.1.1.2  mrg       else
   1948  1.1.1.2  mrg 	possible_nbits_cutoff = 0;
   1949  1.1.1.2  mrg 
   1950  1.1.1.2  mrg       nbits_next = nbits * 65 / 64;
   1951  1.1.1.2  mrg       nbits = nbits_next + (nbits_next == nbits);
   1952  1.1.1.2  mrg     }
   1953  1.1.1.2  mrg   printf ("\n");
   1954  1.1.1.2  mrg   TMP_FREE;
   1955  1.1.1.2  mrg }
   1956  1.1.1.2  mrg 
   1957      1.1  mrg 
   1958      1.1  mrg /* size_extra==1 reflects the fact that with high<divisor one division is
   1959      1.1  mrg    always skipped.  Forcing high<divisor while testing ensures consistency
   1960      1.1  mrg    while stepping through sizes, ie. that size-1 divides will be done each
   1961      1.1  mrg    time.
   1962      1.1  mrg 
   1963      1.1  mrg    min_size==2 and min_is_always are used so that if plain division is only
   1964      1.1  mrg    better at size==1 then don't bother including that code just for that
   1965      1.1  mrg    case, instead go with preinv always and get a size saving.  */
   1966      1.1  mrg 
   1967      1.1  mrg #define DIV_1_PARAMS                    \
   1968      1.1  mrg   param.check_size = 256;               \
   1969      1.1  mrg   param.min_size = 2;                   \
   1970      1.1  mrg   param.min_is_always = 1;              \
   1971      1.1  mrg   param.data_high = DATA_HIGH_LT_R;     \
   1972      1.1  mrg   param.size_extra = 1;                 \
   1973      1.1  mrg   param.stop_factor = 2.0;
   1974      1.1  mrg 
   1975      1.1  mrg 
   1976  1.1.1.2  mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
   1977      1.1  mrg 
   1978      1.1  mrg void
   1979      1.1  mrg tune_divrem_1 (void)
   1980      1.1  mrg {
   1981      1.1  mrg   /* plain version by default */
   1982      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
   1983      1.1  mrg 
   1984      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   1985      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   1986      1.1  mrg      appear in gmp-mparam.h.  */
   1987      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   1988      1.1  mrg     return;
   1989      1.1  mrg 
   1990      1.1  mrg   if (GMP_NAIL_BITS != 0)
   1991      1.1  mrg     {
   1992      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   1993      1.1  mrg                            "no preinv with nails");
   1994      1.1  mrg       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   1995      1.1  mrg                            "no preinv with nails");
   1996      1.1  mrg       return;
   1997      1.1  mrg     }
   1998      1.1  mrg 
   1999      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2000      1.1  mrg     {
   2001      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
   2002      1.1  mrg       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
   2003      1.1  mrg       return;
   2004      1.1  mrg     }
   2005      1.1  mrg 
   2006      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
   2007      1.1  mrg 
   2008      1.1  mrg   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
   2009      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2010      1.1  mrg      is more important. */
   2011      1.1  mrg   {
   2012      1.1  mrg     static struct param_t  param;
   2013      1.1  mrg     param.name = "DIVREM_1_NORM_THRESHOLD";
   2014      1.1  mrg     DIV_1_PARAMS;
   2015      1.1  mrg     s.r = randlimb_norm ();
   2016      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2017      1.1  mrg     one (&divrem_1_norm_threshold, &param);
   2018      1.1  mrg   }
   2019      1.1  mrg   {
   2020      1.1  mrg     static struct param_t  param;
   2021      1.1  mrg     param.name = "DIVREM_1_UNNORM_THRESHOLD";
   2022      1.1  mrg     DIV_1_PARAMS;
   2023      1.1  mrg     s.r = randlimb_half ();
   2024      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2025      1.1  mrg     one (&divrem_1_unnorm_threshold, &param);
   2026      1.1  mrg   }
   2027      1.1  mrg }
   2028      1.1  mrg 
   2029      1.1  mrg 
   2030      1.1  mrg void
   2031      1.1  mrg tune_mod_1 (void)
   2032      1.1  mrg {
   2033      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2034      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2035      1.1  mrg      appear in gmp-mparam.h.  */
   2036      1.1  mrg   if (HAVE_NATIVE_mpn_mod_1)
   2037      1.1  mrg     return;
   2038      1.1  mrg 
   2039      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2040      1.1  mrg     {
   2041      1.1  mrg       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2042      1.1  mrg                            "no preinv with nails");
   2043      1.1  mrg       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2044      1.1  mrg                            "no preinv with nails");
   2045      1.1  mrg       return;
   2046      1.1  mrg     }
   2047      1.1  mrg 
   2048  1.1.1.2  mrg   if (!HAVE_NATIVE_mpn_mod_1_1p)
   2049  1.1.1.2  mrg     {
   2050  1.1.1.2  mrg       static struct param_t  param;
   2051  1.1.1.2  mrg       double   t1, t2;
   2052  1.1.1.2  mrg 
   2053  1.1.1.2  mrg       s.size = 10;
   2054  1.1.1.2  mrg       s.r = randlimb_half ();
   2055  1.1.1.2  mrg 
   2056  1.1.1.2  mrg       t1 = tuneup_measure (speed_mpn_mod_1_1_1, &param, &s);
   2057  1.1.1.2  mrg       t2 = tuneup_measure (speed_mpn_mod_1_1_2, &param, &s);
   2058  1.1.1.2  mrg 
   2059  1.1.1.2  mrg       if (t1 == -1.0 || t2 == -1.0)
   2060  1.1.1.2  mrg 	{
   2061  1.1.1.2  mrg 	  printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n",
   2062  1.1.1.2  mrg 		  (long) s.size);
   2063  1.1.1.2  mrg 	  abort ();
   2064  1.1.1.2  mrg 	}
   2065  1.1.1.2  mrg       mod_1_1p_method = (t1 < t2) ? 1 : 2;
   2066  1.1.1.2  mrg       print_define ("MOD_1_1P_METHOD", mod_1_1p_method);
   2067  1.1.1.2  mrg     }
   2068  1.1.1.2  mrg 
   2069      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2070      1.1  mrg     {
   2071      1.1  mrg       print_define ("MOD_1_NORM_THRESHOLD", 0L);
   2072      1.1  mrg       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
   2073      1.1  mrg     }
   2074      1.1  mrg   else
   2075      1.1  mrg     {
   2076      1.1  mrg       {
   2077      1.1  mrg 	static struct param_t  param;
   2078      1.1  mrg 	param.name = "MOD_1_NORM_THRESHOLD";
   2079      1.1  mrg 	DIV_1_PARAMS;
   2080      1.1  mrg 	s.r = randlimb_norm ();
   2081      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2082      1.1  mrg 	one (&mod_1_norm_threshold, &param);
   2083      1.1  mrg       }
   2084      1.1  mrg       {
   2085      1.1  mrg 	static struct param_t  param;
   2086      1.1  mrg 	param.name = "MOD_1_UNNORM_THRESHOLD";
   2087      1.1  mrg 	DIV_1_PARAMS;
   2088      1.1  mrg 	s.r = randlimb_half ();
   2089      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2090      1.1  mrg 	one (&mod_1_unnorm_threshold, &param);
   2091      1.1  mrg       }
   2092      1.1  mrg     }
   2093      1.1  mrg   {
   2094      1.1  mrg     static struct param_t  param;
   2095      1.1  mrg 
   2096      1.1  mrg     param.check_size = 256;
   2097      1.1  mrg 
   2098      1.1  mrg     s.r = randlimb_norm ();
   2099      1.1  mrg     param.function = speed_mpn_mod_1_tune;
   2100      1.1  mrg 
   2101      1.1  mrg     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
   2102      1.1  mrg     param.min_size = 2;
   2103      1.1  mrg     one (&mod_1n_to_mod_1_1_threshold, &param);
   2104      1.1  mrg   }
   2105  1.1.1.2  mrg 
   2106      1.1  mrg   {
   2107      1.1  mrg     static struct param_t  param;
   2108      1.1  mrg 
   2109      1.1  mrg     param.check_size = 256;
   2110  1.1.1.2  mrg     s.r = randlimb_half ();
   2111      1.1  mrg     param.noprint = 1;
   2112      1.1  mrg 
   2113  1.1.1.2  mrg     param.function = speed_mpn_mod_1_1;
   2114  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_2;
   2115  1.1.1.2  mrg     param.min_is_always = 1;
   2116      1.1  mrg     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
   2117  1.1.1.2  mrg     param.min_size = 2;
   2118      1.1  mrg     one (&mod_1_1_to_mod_1_2_threshold, &param);
   2119      1.1  mrg 
   2120  1.1.1.2  mrg     param.function = speed_mpn_mod_1_2;
   2121  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_4;
   2122  1.1.1.2  mrg     param.min_is_always = 1;
   2123      1.1  mrg     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
   2124  1.1.1.2  mrg     param.min_size = 1;
   2125      1.1  mrg     one (&mod_1_2_to_mod_1_4_threshold, &param);
   2126      1.1  mrg 
   2127  1.1.1.2  mrg     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
   2128      1.1  mrg       {
   2129  1.1.1.2  mrg 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
   2130      1.1  mrg 	mod_1_2_to_mod_1_4_threshold = 0;
   2131      1.1  mrg 
   2132  1.1.1.2  mrg 	param.function = speed_mpn_mod_1_1;
   2133  1.1.1.2  mrg 	param.function2 = speed_mpn_mod_1_4;
   2134  1.1.1.2  mrg 	param.min_is_always = 1;
   2135  1.1.1.2  mrg 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
   2136  1.1.1.2  mrg 	param.min_size = 2;
   2137      1.1  mrg 	one (&mod_1_1_to_mod_1_2_threshold, &param);
   2138      1.1  mrg       }
   2139  1.1.1.2  mrg 
   2140  1.1.1.2  mrg     param.function = speed_mpn_mod_1_tune;
   2141  1.1.1.2  mrg     param.function2 = NULL;
   2142  1.1.1.2  mrg     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
   2143  1.1.1.2  mrg     param.min_size = 2;
   2144  1.1.1.2  mrg     param.min_is_always = 0;
   2145  1.1.1.2  mrg     one (&mod_1u_to_mod_1_1_threshold, &param);
   2146  1.1.1.2  mrg 
   2147  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
   2148  1.1.1.2  mrg       mod_1_1_to_mod_1_2_threshold = 0;
   2149  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
   2150  1.1.1.2  mrg       mod_1_2_to_mod_1_4_threshold = 0;
   2151  1.1.1.2  mrg 
   2152  1.1.1.2  mrg     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
   2153  1.1.1.2  mrg     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
   2154  1.1.1.2  mrg 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
   2155  1.1.1.2  mrg     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
   2156  1.1.1.2  mrg 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
   2157      1.1  mrg   }
   2158      1.1  mrg 
   2159      1.1  mrg   {
   2160      1.1  mrg     static struct param_t  param;
   2161      1.1  mrg 
   2162      1.1  mrg     param.check_size = 256;
   2163      1.1  mrg 
   2164      1.1  mrg     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
   2165      1.1  mrg     s.r = randlimb_norm ();
   2166      1.1  mrg     param.function = speed_mpn_preinv_mod_1;
   2167      1.1  mrg     param.function2 = speed_mpn_mod_1_tune;
   2168      1.1  mrg     param.min_size = 1;
   2169      1.1  mrg     one (&preinv_mod_1_to_mod_1_threshold, &param);
   2170      1.1  mrg   }
   2171      1.1  mrg }
   2172      1.1  mrg 
   2173      1.1  mrg 
   2174      1.1  mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
   2175      1.1  mrg    imply that udiv_qrnnd_preinv is worth using, but it seems most
   2176      1.1  mrg    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
   2177      1.1  mrg    directly.  */
   2178      1.1  mrg 
   2179      1.1  mrg void
   2180      1.1  mrg tune_preinv_divrem_1 (void)
   2181      1.1  mrg {
   2182      1.1  mrg   static struct param_t  param;
   2183      1.1  mrg   speed_function_t  divrem_1;
   2184      1.1  mrg   const char        *divrem_1_name;
   2185      1.1  mrg   double            t1, t2;
   2186      1.1  mrg 
   2187      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2188      1.1  mrg     {
   2189      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
   2190      1.1  mrg       return;
   2191      1.1  mrg     }
   2192      1.1  mrg 
   2193      1.1  mrg   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
   2194      1.1  mrg      it's faster than mpn_divrem_1.  */
   2195      1.1  mrg   if (HAVE_NATIVE_mpn_preinv_divrem_1)
   2196      1.1  mrg     {
   2197      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
   2198      1.1  mrg       return;
   2199      1.1  mrg     }
   2200      1.1  mrg 
   2201      1.1  mrg   /* If udiv_qrnnd_preinv is the only division method then of course
   2202      1.1  mrg      mpn_preinv_divrem_1 should be used.  */
   2203      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2204      1.1  mrg     {
   2205      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
   2206      1.1  mrg       return;
   2207      1.1  mrg     }
   2208      1.1  mrg 
   2209      1.1  mrg   /* If we've got an assembler version of mpn_divrem_1, then compare against
   2210      1.1  mrg      that, not the mpn_divrem_1_div generic C.  */
   2211      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2212      1.1  mrg     {
   2213      1.1  mrg       divrem_1 = speed_mpn_divrem_1;
   2214      1.1  mrg       divrem_1_name = "mpn_divrem_1";
   2215      1.1  mrg     }
   2216      1.1  mrg   else
   2217      1.1  mrg     {
   2218      1.1  mrg       divrem_1 = speed_mpn_divrem_1_div;
   2219      1.1  mrg       divrem_1_name = "mpn_divrem_1_div";
   2220      1.1  mrg     }
   2221      1.1  mrg 
   2222      1.1  mrg   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
   2223      1.1  mrg   s.size = 200;                     /* generous but not too big */
   2224      1.1  mrg   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
   2225      1.1  mrg      since in general that's probably most common, though in fact for a
   2226      1.1  mrg      64-bit limb mp_bases[10].big_base is normalized.  */
   2227      1.1  mrg   s.r = urandom() & (GMP_NUMB_MASK >> 4);
   2228      1.1  mrg   if (s.r == 0) s.r = 123;
   2229      1.1  mrg 
   2230      1.1  mrg   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
   2231      1.1  mrg   t2 = tuneup_measure (divrem_1, &param, &s);
   2232      1.1  mrg   if (t1 == -1.0 || t2 == -1.0)
   2233      1.1  mrg     {
   2234      1.1  mrg       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
   2235      1.1  mrg               divrem_1_name, (long) s.size);
   2236      1.1  mrg       abort ();
   2237      1.1  mrg     }
   2238      1.1  mrg   if (option_trace >= 1)
   2239      1.1  mrg     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
   2240      1.1  mrg             (long) s.size, t1, divrem_1_name, t2);
   2241      1.1  mrg 
   2242      1.1  mrg   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
   2243      1.1  mrg }
   2244      1.1  mrg 
   2245      1.1  mrg 
   2246      1.1  mrg 
   2247      1.1  mrg void
   2248      1.1  mrg tune_divrem_2 (void)
   2249      1.1  mrg {
   2250      1.1  mrg   static struct param_t  param;
   2251      1.1  mrg 
   2252      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2253      1.1  mrg      the results in the .asm file, and there's no need for such thresholds
   2254      1.1  mrg      to appear in gmp-mparam.h.  */
   2255      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_2)
   2256      1.1  mrg     return;
   2257      1.1  mrg 
   2258      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2259      1.1  mrg     {
   2260      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
   2261      1.1  mrg                            "no preinv with nails");
   2262      1.1  mrg       return;
   2263      1.1  mrg     }
   2264      1.1  mrg 
   2265      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2266      1.1  mrg     {
   2267      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
   2268      1.1  mrg       return;
   2269      1.1  mrg     }
   2270      1.1  mrg 
   2271      1.1  mrg   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
   2272      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2273      1.1  mrg      is more important.
   2274      1.1  mrg 
   2275      1.1  mrg      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
   2276      1.1  mrg      code space if plain division is better only at size==2 or size==3. */
   2277      1.1  mrg   param.name = "DIVREM_2_THRESHOLD";
   2278      1.1  mrg   param.check_size = 256;
   2279      1.1  mrg   param.min_size = 4;
   2280      1.1  mrg   param.min_is_always = 1;
   2281      1.1  mrg   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
   2282      1.1  mrg   param.stop_factor = 2.0;
   2283      1.1  mrg 
   2284      1.1  mrg   s.r = randlimb_norm ();
   2285      1.1  mrg   param.function = speed_mpn_divrem_2;
   2286      1.1  mrg   one (&divrem_2_threshold, &param);
   2287      1.1  mrg }
   2288      1.1  mrg 
   2289  1.1.1.2  mrg void
   2290  1.1.1.2  mrg tune_div_qr_2 (void)
   2291  1.1.1.2  mrg {
   2292  1.1.1.2  mrg   static struct param_t  param;
   2293  1.1.1.2  mrg   param.name = "DIV_QR_2_PI2_THRESHOLD";
   2294  1.1.1.2  mrg   param.function = speed_mpn_div_qr_2n;
   2295  1.1.1.2  mrg   param.check_size = 500;
   2296  1.1.1.2  mrg   param.min_size = 4;
   2297  1.1.1.2  mrg   one (&div_qr_2_pi2_threshold, &param);
   2298  1.1.1.2  mrg }
   2299      1.1  mrg 
   2300      1.1  mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
   2301      1.1  mrg    tune for that.  Its speed can differ on odd or even divisor, so take an
   2302      1.1  mrg    average threshold for the two.
   2303      1.1  mrg 
   2304      1.1  mrg    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
   2305      1.1  mrg    might not vary that way, but don't test this since high<divisor isn't
   2306      1.1  mrg    expected to occur often with small divisors.  */
   2307      1.1  mrg 
   2308      1.1  mrg void
   2309      1.1  mrg tune_divexact_1 (void)
   2310      1.1  mrg {
   2311      1.1  mrg   static struct param_t  param;
   2312      1.1  mrg   mp_size_t  thresh[2], average;
   2313      1.1  mrg   int        low, i;
   2314      1.1  mrg 
   2315      1.1  mrg   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
   2316      1.1  mrg      full mpn_divrem_1.  */
   2317      1.1  mrg   if (HAVE_NATIVE_mpn_divexact_1)
   2318      1.1  mrg     {
   2319      1.1  mrg       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
   2320      1.1  mrg       return;
   2321      1.1  mrg     }
   2322      1.1  mrg 
   2323      1.1  mrg   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
   2324      1.1  mrg 
   2325      1.1  mrg   param.name = "DIVEXACT_1_THRESHOLD";
   2326      1.1  mrg   param.data_high = DATA_HIGH_GE_R;
   2327      1.1  mrg   param.check_size = 256;
   2328      1.1  mrg   param.min_size = 2;
   2329      1.1  mrg   param.stop_factor = 1.5;
   2330      1.1  mrg   param.function  = tuned_speed_mpn_divrem_1;
   2331      1.1  mrg   param.function2 = speed_mpn_divexact_1;
   2332      1.1  mrg   param.noprint = 1;
   2333      1.1  mrg 
   2334      1.1  mrg   print_define_start (param.name);
   2335      1.1  mrg 
   2336      1.1  mrg   for (low = 0; low <= 1; low++)
   2337      1.1  mrg     {
   2338      1.1  mrg       s.r = randlimb_half();
   2339      1.1  mrg       if (low == 0)
   2340      1.1  mrg         s.r |= 1;
   2341      1.1  mrg       else
   2342      1.1  mrg         s.r &= ~CNST_LIMB(7);
   2343      1.1  mrg 
   2344      1.1  mrg       one (&thresh[low], &param);
   2345      1.1  mrg       if (option_trace)
   2346      1.1  mrg         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
   2347      1.1  mrg 
   2348      1.1  mrg       if (thresh[low] == MP_SIZE_T_MAX)
   2349      1.1  mrg         {
   2350      1.1  mrg           average = MP_SIZE_T_MAX;
   2351      1.1  mrg           goto divexact_1_done;
   2352      1.1  mrg         }
   2353      1.1  mrg     }
   2354      1.1  mrg 
   2355      1.1  mrg   if (option_trace)
   2356      1.1  mrg     {
   2357      1.1  mrg       printf ("average of:");
   2358      1.1  mrg       for (i = 0; i < numberof(thresh); i++)
   2359      1.1  mrg         printf (" %ld", (long) thresh[i]);
   2360      1.1  mrg       printf ("\n");
   2361      1.1  mrg     }
   2362      1.1  mrg 
   2363      1.1  mrg   average = 0;
   2364      1.1  mrg   for (i = 0; i < numberof(thresh); i++)
   2365      1.1  mrg     average += thresh[i];
   2366      1.1  mrg   average /= numberof(thresh);
   2367      1.1  mrg 
   2368      1.1  mrg   /* If divexact turns out to be better as early as 3 limbs, then use it
   2369      1.1  mrg      always, so as to reduce code size and conditional jumps.  */
   2370      1.1  mrg   if (average <= 3)
   2371      1.1  mrg     average = 0;
   2372      1.1  mrg 
   2373      1.1  mrg  divexact_1_done:
   2374      1.1  mrg   print_define_end (param.name, average);
   2375      1.1  mrg }
   2376      1.1  mrg 
   2377      1.1  mrg 
   2378      1.1  mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
   2379      1.1  mrg    same as mpn_mod_1, but this might not be true of an assembler
   2380      1.1  mrg    implementation.  The threshold used is an average based on data where a
   2381      1.1  mrg    divide can be skipped and where it can't.
   2382      1.1  mrg 
   2383      1.1  mrg    If modexact turns out to be better as early as 3 limbs, then use it
   2384      1.1  mrg    always, so as to reduce code size and conditional jumps.  */
   2385      1.1  mrg 
   2386      1.1  mrg void
   2387      1.1  mrg tune_modexact_1_odd (void)
   2388      1.1  mrg {
   2389      1.1  mrg   static struct param_t  param;
   2390      1.1  mrg   mp_size_t  thresh_lt, thresh_ge, average;
   2391      1.1  mrg 
   2392      1.1  mrg #if 0
   2393      1.1  mrg   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
   2394      1.1  mrg      of a full mpn_mod_1.  */
   2395      1.1  mrg   if (HAVE_NATIVE_mpn_modexact_1_odd)
   2396      1.1  mrg     {
   2397      1.1  mrg       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
   2398      1.1  mrg       return;
   2399      1.1  mrg     }
   2400      1.1  mrg #endif
   2401      1.1  mrg 
   2402      1.1  mrg   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
   2403      1.1  mrg   param.check_size = 256;
   2404      1.1  mrg   param.min_size = 2;
   2405      1.1  mrg   param.stop_factor = 1.5;
   2406      1.1  mrg   param.function  = speed_mpn_modexact_1c_odd;
   2407      1.1  mrg   param.function2 = speed_mpn_mod_1_tune;
   2408      1.1  mrg   param.noprint = 1;
   2409      1.1  mrg   s.r = randlimb_half () | 1;
   2410      1.1  mrg 
   2411      1.1  mrg   print_define_start (param.name);
   2412      1.1  mrg 
   2413      1.1  mrg   param.data_high = DATA_HIGH_LT_R;
   2414      1.1  mrg   one (&thresh_lt, &param);
   2415      1.1  mrg   if (option_trace)
   2416      1.1  mrg     printf ("lt thresh %ld\n", (long) thresh_lt);
   2417      1.1  mrg 
   2418      1.1  mrg   average = thresh_lt;
   2419      1.1  mrg   if (thresh_lt != MP_SIZE_T_MAX)
   2420      1.1  mrg     {
   2421      1.1  mrg       param.data_high = DATA_HIGH_GE_R;
   2422      1.1  mrg       one (&thresh_ge, &param);
   2423      1.1  mrg       if (option_trace)
   2424      1.1  mrg         printf ("ge thresh %ld\n", (long) thresh_ge);
   2425      1.1  mrg 
   2426      1.1  mrg       if (thresh_ge != MP_SIZE_T_MAX)
   2427      1.1  mrg         {
   2428      1.1  mrg           average = (thresh_ge + thresh_lt) / 2;
   2429      1.1  mrg           if (thresh_ge <= 3)
   2430      1.1  mrg             average = 0;
   2431      1.1  mrg         }
   2432      1.1  mrg     }
   2433      1.1  mrg 
   2434      1.1  mrg   print_define_end (param.name, average);
   2435      1.1  mrg }
   2436      1.1  mrg 
   2437      1.1  mrg 
   2438      1.1  mrg void
   2439      1.1  mrg tune_jacobi_base (void)
   2440      1.1  mrg {
   2441      1.1  mrg   static struct param_t  param;
   2442  1.1.1.2  mrg   double   t1, t2, t3, t4;
   2443      1.1  mrg   int      method;
   2444      1.1  mrg 
   2445      1.1  mrg   s.size = GMP_LIMB_BITS * 3 / 4;
   2446      1.1  mrg 
   2447      1.1  mrg   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
   2448      1.1  mrg   if (option_trace >= 1)
   2449      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
   2450      1.1  mrg 
   2451      1.1  mrg   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
   2452      1.1  mrg   if (option_trace >= 1)
   2453      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
   2454      1.1  mrg 
   2455      1.1  mrg   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
   2456      1.1  mrg   if (option_trace >= 1)
   2457      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
   2458      1.1  mrg 
   2459  1.1.1.2  mrg   t4 = tuneup_measure (speed_mpn_jacobi_base_4, &param, &s);
   2460  1.1.1.2  mrg   if (option_trace >= 1)
   2461  1.1.1.2  mrg     printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
   2462  1.1.1.2  mrg 
   2463  1.1.1.2  mrg   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
   2464      1.1  mrg     {
   2465      1.1  mrg       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
   2466      1.1  mrg               (long) s.size);
   2467      1.1  mrg       abort ();
   2468      1.1  mrg     }
   2469      1.1  mrg 
   2470  1.1.1.2  mrg   if (t1 < t2 && t1 < t3 && t1 < t4)
   2471      1.1  mrg     method = 1;
   2472  1.1.1.2  mrg   else if (t2 < t3 && t2 < t4)
   2473      1.1  mrg     method = 2;
   2474  1.1.1.2  mrg   else if (t3 < t4)
   2475      1.1  mrg     method = 3;
   2476  1.1.1.2  mrg   else
   2477  1.1.1.2  mrg     method = 4;
   2478      1.1  mrg 
   2479      1.1  mrg   print_define ("JACOBI_BASE_METHOD", method);
   2480      1.1  mrg }
   2481      1.1  mrg 
   2482      1.1  mrg 
   2483      1.1  mrg void
   2484      1.1  mrg tune_get_str (void)
   2485      1.1  mrg {
   2486      1.1  mrg   /* Tune for decimal, it being most common.  Some rough testing suggests
   2487      1.1  mrg      other bases are different, but not by very much.  */
   2488      1.1  mrg   s.r = 10;
   2489      1.1  mrg   {
   2490      1.1  mrg     static struct param_t  param;
   2491      1.1  mrg     GET_STR_PRECOMPUTE_THRESHOLD = 0;
   2492      1.1  mrg     param.name = "GET_STR_DC_THRESHOLD";
   2493      1.1  mrg     param.function = speed_mpn_get_str;
   2494      1.1  mrg     param.min_size = 4;
   2495      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2496      1.1  mrg     one (&get_str_dc_threshold, &param);
   2497      1.1  mrg   }
   2498      1.1  mrg   {
   2499      1.1  mrg     static struct param_t  param;
   2500      1.1  mrg     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
   2501      1.1  mrg     param.function = speed_mpn_get_str;
   2502      1.1  mrg     param.min_size = GET_STR_DC_THRESHOLD;
   2503      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2504      1.1  mrg     one (&get_str_precompute_threshold, &param);
   2505      1.1  mrg   }
   2506      1.1  mrg }
   2507      1.1  mrg 
   2508      1.1  mrg 
   2509      1.1  mrg double
   2510      1.1  mrg speed_mpn_pre_set_str (struct speed_params *s)
   2511      1.1  mrg {
   2512      1.1  mrg   unsigned char *str;
   2513      1.1  mrg   mp_ptr     wp;
   2514      1.1  mrg   mp_size_t  wn;
   2515      1.1  mrg   unsigned   i;
   2516      1.1  mrg   int        base;
   2517      1.1  mrg   double     t;
   2518      1.1  mrg   mp_ptr powtab_mem, tp;
   2519      1.1  mrg   powers_t powtab[GMP_LIMB_BITS];
   2520      1.1  mrg   mp_size_t un;
   2521      1.1  mrg   int chars_per_limb;
   2522      1.1  mrg   TMP_DECL;
   2523      1.1  mrg 
   2524      1.1  mrg   SPEED_RESTRICT_COND (s->size >= 1);
   2525      1.1  mrg 
   2526      1.1  mrg   base = s->r == 0 ? 10 : s->r;
   2527      1.1  mrg   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
   2528      1.1  mrg 
   2529      1.1  mrg   TMP_MARK;
   2530      1.1  mrg 
   2531      1.1  mrg   str = TMP_ALLOC (s->size);
   2532      1.1  mrg   for (i = 0; i < s->size; i++)
   2533      1.1  mrg     str[i] = s->xp[i] % base;
   2534      1.1  mrg 
   2535  1.1.1.2  mrg   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
   2536      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
   2537      1.1  mrg 
   2538      1.1  mrg   /* use this during development to check wn is big enough */
   2539      1.1  mrg   /*
   2540      1.1  mrg   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
   2541      1.1  mrg   */
   2542      1.1  mrg 
   2543      1.1  mrg   speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
   2544      1.1  mrg   speed_operand_dst (s, wp, wn);
   2545      1.1  mrg   speed_cache_fill (s);
   2546      1.1  mrg 
   2547      1.1  mrg   chars_per_limb = mp_bases[base].chars_per_limb;
   2548      1.1  mrg   un = s->size / chars_per_limb + 1;
   2549      1.1  mrg   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
   2550      1.1  mrg   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
   2551      1.1  mrg   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
   2552      1.1  mrg 
   2553      1.1  mrg   speed_starttime ();
   2554      1.1  mrg   i = s->reps;
   2555      1.1  mrg   do
   2556      1.1  mrg     {
   2557      1.1  mrg       mpn_pre_set_str (wp, str, s->size, powtab, tp);
   2558      1.1  mrg     }
   2559      1.1  mrg   while (--i != 0);
   2560      1.1  mrg   t = speed_endtime ();
   2561      1.1  mrg 
   2562      1.1  mrg   TMP_FREE;
   2563      1.1  mrg   return t;
   2564      1.1  mrg }
   2565      1.1  mrg 
   2566      1.1  mrg void
   2567      1.1  mrg tune_set_str (void)
   2568      1.1  mrg {
   2569      1.1  mrg   s.r = 10;  /* decimal */
   2570      1.1  mrg   {
   2571      1.1  mrg     static struct param_t  param;
   2572      1.1  mrg     SET_STR_PRECOMPUTE_THRESHOLD = 0;
   2573      1.1  mrg     param.step_factor = 0.01;
   2574      1.1  mrg     param.name = "SET_STR_DC_THRESHOLD";
   2575      1.1  mrg     param.function = speed_mpn_pre_set_str;
   2576      1.1  mrg     param.min_size = 100;
   2577      1.1  mrg     param.max_size = 50000;
   2578      1.1  mrg     one (&set_str_dc_threshold, &param);
   2579      1.1  mrg   }
   2580      1.1  mrg   {
   2581      1.1  mrg     static struct param_t  param;
   2582      1.1  mrg     param.step_factor = 0.02;
   2583      1.1  mrg     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
   2584      1.1  mrg     param.function = speed_mpn_set_str;
   2585      1.1  mrg     param.min_size = SET_STR_DC_THRESHOLD;
   2586      1.1  mrg     param.max_size = 100000;
   2587      1.1  mrg     one (&set_str_precompute_threshold, &param);
   2588      1.1  mrg   }
   2589      1.1  mrg }
   2590      1.1  mrg 
   2591      1.1  mrg 
   2592      1.1  mrg void
   2593      1.1  mrg tune_fft_mul (void)
   2594      1.1  mrg {
   2595      1.1  mrg   static struct fft_param_t  param;
   2596      1.1  mrg 
   2597      1.1  mrg   if (option_fft_max_size == 0)
   2598      1.1  mrg     return;
   2599      1.1  mrg 
   2600      1.1  mrg   param.table_name          = "MUL_FFT_TABLE3";
   2601      1.1  mrg   param.threshold_name      = "MUL_FFT_THRESHOLD";
   2602      1.1  mrg   param.p_threshold         = &mul_fft_threshold;
   2603      1.1  mrg   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
   2604      1.1  mrg   param.p_modf_threshold    = &mul_fft_modf_threshold;
   2605      1.1  mrg   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
   2606      1.1  mrg   param.max_size            = option_fft_max_size;
   2607      1.1  mrg   param.function            = speed_mpn_fft_mul;
   2608      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft;
   2609      1.1  mrg   param.mul_function        = speed_mpn_mul_n;
   2610      1.1  mrg   param.sqr = 0;
   2611      1.1  mrg   fft (&param);
   2612      1.1  mrg }
   2613      1.1  mrg 
   2614      1.1  mrg 
   2615      1.1  mrg void
   2616      1.1  mrg tune_fft_sqr (void)
   2617      1.1  mrg {
   2618      1.1  mrg   static struct fft_param_t  param;
   2619      1.1  mrg 
   2620      1.1  mrg   if (option_fft_max_size == 0)
   2621      1.1  mrg     return;
   2622      1.1  mrg 
   2623      1.1  mrg   param.table_name          = "SQR_FFT_TABLE3";
   2624      1.1  mrg   param.threshold_name      = "SQR_FFT_THRESHOLD";
   2625      1.1  mrg   param.p_threshold         = &sqr_fft_threshold;
   2626      1.1  mrg   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
   2627      1.1  mrg   param.p_modf_threshold    = &sqr_fft_modf_threshold;
   2628      1.1  mrg   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
   2629      1.1  mrg   param.max_size            = option_fft_max_size;
   2630      1.1  mrg   param.function            = speed_mpn_fft_sqr;
   2631      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
   2632      1.1  mrg   param.mul_function        = speed_mpn_sqr;
   2633      1.1  mrg   param.sqr = 1;
   2634      1.1  mrg   fft (&param);
   2635      1.1  mrg }
   2636      1.1  mrg 
   2637      1.1  mrg void
   2638  1.1.1.2  mrg tune_fac_ui (void)
   2639  1.1.1.2  mrg {
   2640  1.1.1.2  mrg   static struct param_t  param;
   2641  1.1.1.2  mrg 
   2642  1.1.1.2  mrg   param.function = speed_mpz_fac_ui_tune;
   2643  1.1.1.2  mrg 
   2644  1.1.1.2  mrg   param.name = "FAC_DSC_THRESHOLD";
   2645  1.1.1.2  mrg   param.min_size = 70;
   2646  1.1.1.2  mrg   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
   2647  1.1.1.2  mrg   one (&fac_dsc_threshold, &param);
   2648  1.1.1.2  mrg 
   2649  1.1.1.2  mrg   param.name = "FAC_ODD_THRESHOLD";
   2650  1.1.1.2  mrg   param.min_size = 22;
   2651  1.1.1.2  mrg   param.stop_factor = 1.7;
   2652  1.1.1.2  mrg   param.min_is_always = 1;
   2653  1.1.1.2  mrg   one (&fac_odd_threshold, &param);
   2654  1.1.1.2  mrg }
   2655  1.1.1.2  mrg 
   2656  1.1.1.2  mrg void
   2657      1.1  mrg all (void)
   2658      1.1  mrg {
   2659      1.1  mrg   time_t  start_time, end_time;
   2660      1.1  mrg   TMP_DECL;
   2661      1.1  mrg 
   2662      1.1  mrg   TMP_MARK;
   2663      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
   2664      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
   2665      1.1  mrg 
   2666      1.1  mrg   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
   2667      1.1  mrg   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
   2668      1.1  mrg 
   2669      1.1  mrg   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
   2670      1.1  mrg 
   2671      1.1  mrg   speed_time_init ();
   2672      1.1  mrg   fprintf (stderr, "Using: %s\n", speed_time_string);
   2673      1.1  mrg 
   2674      1.1  mrg   fprintf (stderr, "speed_precision %d", speed_precision);
   2675      1.1  mrg   if (speed_unittime == 1.0)
   2676      1.1  mrg     fprintf (stderr, ", speed_unittime 1 cycle");
   2677      1.1  mrg   else
   2678      1.1  mrg     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
   2679      1.1  mrg   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
   2680      1.1  mrg     fprintf (stderr, ", CPU freq unknown\n");
   2681      1.1  mrg   else
   2682      1.1  mrg     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
   2683      1.1  mrg 
   2684      1.1  mrg   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
   2685      1.1  mrg            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
   2686      1.1  mrg   fprintf (stderr, "\n");
   2687      1.1  mrg 
   2688      1.1  mrg   time (&start_time);
   2689      1.1  mrg   {
   2690      1.1  mrg     struct tm  *tp;
   2691      1.1  mrg     tp = localtime (&start_time);
   2692      1.1  mrg     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
   2693      1.1  mrg             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
   2694      1.1  mrg 
   2695      1.1  mrg #ifdef __GNUC__
   2696      1.1  mrg     /* gcc sub-minor version doesn't seem to come through as a define */
   2697      1.1  mrg     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
   2698      1.1  mrg #define PRINTED_COMPILER
   2699      1.1  mrg #endif
   2700      1.1  mrg #if defined (__SUNPRO_C)
   2701      1.1  mrg     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
   2702      1.1  mrg #define PRINTED_COMPILER
   2703      1.1  mrg #endif
   2704      1.1  mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
   2705      1.1  mrg     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
   2706      1.1  mrg     printf ("MIPSpro C %d.%d.%d */\n",
   2707      1.1  mrg 	    _COMPILER_VERSION / 100,
   2708      1.1  mrg 	    _COMPILER_VERSION / 10 % 10,
   2709      1.1  mrg 	    _COMPILER_VERSION % 10);
   2710      1.1  mrg #define PRINTED_COMPILER
   2711      1.1  mrg #endif
   2712      1.1  mrg #if defined (__DECC) && defined (__DECC_VER)
   2713      1.1  mrg     printf ("DEC C %d */\n", __DECC_VER);
   2714      1.1  mrg #define PRINTED_COMPILER
   2715      1.1  mrg #endif
   2716      1.1  mrg #if ! defined (PRINTED_COMPILER)
   2717      1.1  mrg     printf ("system compiler */\n");
   2718      1.1  mrg #endif
   2719      1.1  mrg   }
   2720      1.1  mrg   printf ("\n");
   2721      1.1  mrg 
   2722      1.1  mrg   tune_divrem_1 ();
   2723      1.1  mrg   tune_mod_1 ();
   2724      1.1  mrg   tune_preinv_divrem_1 ();
   2725  1.1.1.2  mrg #if 0
   2726      1.1  mrg   tune_divrem_2 ();
   2727  1.1.1.2  mrg #endif
   2728  1.1.1.2  mrg   tune_div_qr_2 ();
   2729      1.1  mrg   tune_divexact_1 ();
   2730      1.1  mrg   tune_modexact_1_odd ();
   2731      1.1  mrg   printf("\n");
   2732      1.1  mrg 
   2733      1.1  mrg   tune_mul_n ();
   2734      1.1  mrg   printf("\n");
   2735      1.1  mrg 
   2736      1.1  mrg   tune_mul ();
   2737      1.1  mrg   printf("\n");
   2738      1.1  mrg 
   2739      1.1  mrg   tune_sqr ();
   2740      1.1  mrg   printf("\n");
   2741      1.1  mrg 
   2742  1.1.1.2  mrg   tune_mulmid ();
   2743  1.1.1.2  mrg   printf("\n");
   2744  1.1.1.2  mrg 
   2745      1.1  mrg   tune_mulmod_bnm1 ();
   2746      1.1  mrg   tune_sqrmod_bnm1 ();
   2747      1.1  mrg   printf("\n");
   2748      1.1  mrg 
   2749      1.1  mrg   tune_fft_mul ();
   2750      1.1  mrg   printf("\n");
   2751      1.1  mrg 
   2752      1.1  mrg   tune_fft_sqr ();
   2753      1.1  mrg   printf ("\n");
   2754      1.1  mrg 
   2755      1.1  mrg   tune_mullo ();
   2756      1.1  mrg   printf("\n");
   2757      1.1  mrg 
   2758      1.1  mrg   tune_dc_div ();
   2759      1.1  mrg   tune_dc_bdiv ();
   2760      1.1  mrg 
   2761      1.1  mrg   printf("\n");
   2762      1.1  mrg   tune_invertappr ();
   2763      1.1  mrg   tune_invert ();
   2764      1.1  mrg   printf("\n");
   2765      1.1  mrg 
   2766      1.1  mrg   tune_binvert ();
   2767      1.1  mrg   tune_redc ();
   2768      1.1  mrg   printf("\n");
   2769      1.1  mrg 
   2770      1.1  mrg   tune_mu_div ();
   2771      1.1  mrg   tune_mu_bdiv ();
   2772      1.1  mrg   printf("\n");
   2773      1.1  mrg 
   2774  1.1.1.2  mrg   tune_powm_sec ();
   2775  1.1.1.2  mrg   printf("\n");
   2776  1.1.1.2  mrg 
   2777      1.1  mrg   tune_matrix22_mul ();
   2778      1.1  mrg   tune_hgcd ();
   2779  1.1.1.2  mrg   tune_hgcd_appr ();
   2780  1.1.1.2  mrg   tune_hgcd_reduce();
   2781      1.1  mrg   tune_gcd_dc ();
   2782      1.1  mrg   tune_gcdext_dc ();
   2783      1.1  mrg   tune_jacobi_base ();
   2784      1.1  mrg   printf("\n");
   2785      1.1  mrg 
   2786      1.1  mrg   tune_get_str ();
   2787      1.1  mrg   tune_set_str ();
   2788      1.1  mrg   printf("\n");
   2789      1.1  mrg 
   2790  1.1.1.2  mrg   tune_fac_ui ();
   2791  1.1.1.2  mrg   printf("\n");
   2792  1.1.1.2  mrg 
   2793      1.1  mrg   time (&end_time);
   2794      1.1  mrg   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
   2795      1.1  mrg           (long) (end_time - start_time));
   2796      1.1  mrg 
   2797      1.1  mrg   TMP_FREE;
   2798      1.1  mrg }
   2799      1.1  mrg 
   2800      1.1  mrg 
   2801      1.1  mrg int
   2802      1.1  mrg main (int argc, char *argv[])
   2803      1.1  mrg {
   2804      1.1  mrg   int  opt;
   2805      1.1  mrg 
   2806      1.1  mrg   /* Unbuffered so if output is redirected to a file it isn't lost if the
   2807      1.1  mrg      program is killed part way through.  */
   2808      1.1  mrg   setbuf (stdout, NULL);
   2809      1.1  mrg   setbuf (stderr, NULL);
   2810      1.1  mrg 
   2811      1.1  mrg   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
   2812      1.1  mrg     {
   2813      1.1  mrg       switch (opt) {
   2814      1.1  mrg       case 'f':
   2815      1.1  mrg         if (optarg[0] == 't')
   2816      1.1  mrg           option_fft_trace = 2;
   2817      1.1  mrg         else
   2818      1.1  mrg           option_fft_max_size = atol (optarg);
   2819      1.1  mrg         break;
   2820      1.1  mrg       case 'o':
   2821      1.1  mrg         speed_option_set (optarg);
   2822      1.1  mrg         break;
   2823      1.1  mrg       case 'p':
   2824      1.1  mrg         speed_precision = atoi (optarg);
   2825      1.1  mrg         break;
   2826      1.1  mrg       case 't':
   2827      1.1  mrg         option_trace++;
   2828      1.1  mrg         break;
   2829      1.1  mrg       case '?':
   2830      1.1  mrg         exit(1);
   2831      1.1  mrg       }
   2832      1.1  mrg     }
   2833      1.1  mrg 
   2834      1.1  mrg   all ();
   2835      1.1  mrg   exit (0);
   2836      1.1  mrg }
   2837