Home | History | Annotate | Line # | Download | only in tune
tuneup.c revision 1.1.1.3
      1      1.1  mrg /* Create tuned thresholds for various algorithms.
      2      1.1  mrg 
      3  1.1.1.3  mrg Copyright 1999-2003, 2005, 2006, 2008-2012 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU MP Library.
      6      1.1  mrg 
      7      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg it under the terms of either:
      9  1.1.1.3  mrg 
     10  1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg     option) any later version.
     13  1.1.1.3  mrg 
     14  1.1.1.3  mrg or
     15  1.1.1.3  mrg 
     16  1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg     later version.
     19  1.1.1.3  mrg 
     20  1.1.1.3  mrg or both in parallel, as here.
     21      1.1  mrg 
     22      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg for more details.
     26      1.1  mrg 
     27  1.1.1.3  mrg You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg /* Usage: tuneup [-t] [-t] [-p precision]
     33      1.1  mrg 
     34      1.1  mrg    -t turns on some diagnostic traces, a second -t turns on more traces.
     35      1.1  mrg 
     36      1.1  mrg    Notes:
     37      1.1  mrg 
     38      1.1  mrg    The code here isn't a vision of loveliness, mainly because it's subject
     39      1.1  mrg    to ongoing changes according to new things wanting to be tuned, and
     40      1.1  mrg    practical requirements of systems tested.
     41      1.1  mrg 
     42      1.1  mrg    Sometimes running the program twice produces slightly different results.
     43      1.1  mrg    This is probably because there's so little separating algorithms near
     44      1.1  mrg    their crossover, and on that basis it should make little or no difference
     45      1.1  mrg    to the final speed of the relevant routines, but nothing has been done to
     46      1.1  mrg    check that carefully.
     47      1.1  mrg 
     48      1.1  mrg    Algorithm:
     49      1.1  mrg 
     50      1.1  mrg    The thresholds are determined as follows.  A crossover may not be a
     51      1.1  mrg    single size but rather a range where it oscillates between method A or
     52      1.1  mrg    method B faster.  If the threshold is set making B used where A is faster
     53      1.1  mrg    (or vice versa) that's bad.  Badness is the percentage time lost and
     54      1.1  mrg    total badness is the sum of this over all sizes measured.  The threshold
     55      1.1  mrg    is set to minimize total badness.
     56      1.1  mrg 
     57      1.1  mrg    Suppose, as sizes increase, method B becomes faster than method A.  The
     58      1.1  mrg    effect of the rule is that, as you look at increasing sizes, isolated
     59      1.1  mrg    points where B is faster are ignored, but when it's consistently faster,
     60      1.1  mrg    or faster on balance, then the threshold is set there.  The same result
     61      1.1  mrg    is obtained thinking in the other direction of A becoming faster at
     62      1.1  mrg    smaller sizes.
     63      1.1  mrg 
     64      1.1  mrg    In practice the thresholds tend to be chosen to bring on the next
     65      1.1  mrg    algorithm fairly quickly.
     66      1.1  mrg 
     67      1.1  mrg    This rule is attractive because it's got a basis in reason and is fairly
     68      1.1  mrg    easy to implement, but no work has been done to actually compare it in
     69      1.1  mrg    absolute terms to other possibilities.
     70      1.1  mrg 
     71      1.1  mrg    Implementation:
     72      1.1  mrg 
     73      1.1  mrg    In a normal library build the thresholds are constants.  To tune them
     74      1.1  mrg    selected objects are recompiled with the thresholds as global variables
     75      1.1  mrg    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
     76      1.1  mrg    the end of gmp-impl.h, and rules in tune/Makefile.am.
     77      1.1  mrg 
     78      1.1  mrg    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
     79      1.1  mrg    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
     80      1.1  mrg    level, but recurse into the basecase.
     81      1.1  mrg 
     82      1.1  mrg    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
     83      1.1  mrg    Other routines in turn will make use of both of those.  Naturally the
     84      1.1  mrg    dependants must be tuned first.
     85      1.1  mrg 
     86      1.1  mrg    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
     87      1.1  mrg    just a threshold based on comparing two routines (mpn_divrem_1 and
     88      1.1  mrg    mpn_divexact_1), and no further use of the value determined.
     89      1.1  mrg 
     90      1.1  mrg    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
     91      1.1  mrg    just comparisons between certain routines on representative data.
     92      1.1  mrg 
     93      1.1  mrg    Shortcuts are applied when native (assembler) versions of routines exist.
     94      1.1  mrg    For instance a native mpn_sqr_basecase is assumed to be always faster
     95      1.1  mrg    than mpn_mul_basecase, with no measuring.
     96      1.1  mrg 
     97      1.1  mrg    No attempt is made to tune within assembler routines, for instance
     98      1.1  mrg    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
     99      1.1  mrg    written and tuned all by hand.  Assembler routines that might have hard
    100      1.1  mrg    limits are recompiled though, to make them accept a bigger range of sizes
    101      1.1  mrg    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
    102      1.1  mrg 
    103      1.1  mrg    Limitations:
    104      1.1  mrg 
    105      1.1  mrg    The FFTs aren't subject to the same badness rule as the other thresholds,
    106      1.1  mrg    so each k is probably being brought on a touch early.  This isn't likely
    107      1.1  mrg    to make a difference, and the simpler probing means fewer tests.
    108      1.1  mrg 
    109      1.1  mrg */
    110      1.1  mrg 
    111      1.1  mrg #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
    112      1.1  mrg 
    113      1.1  mrg #include "config.h"
    114      1.1  mrg 
    115      1.1  mrg #include <math.h>
    116      1.1  mrg #include <stdio.h>
    117      1.1  mrg #include <stdlib.h>
    118      1.1  mrg #include <time.h>
    119      1.1  mrg #if HAVE_UNISTD_H
    120      1.1  mrg #include <unistd.h>
    121      1.1  mrg #endif
    122      1.1  mrg 
    123      1.1  mrg #include "gmp.h"
    124      1.1  mrg #include "gmp-impl.h"
    125      1.1  mrg #include "longlong.h"
    126      1.1  mrg 
    127      1.1  mrg #include "tests.h"
    128      1.1  mrg #include "speed.h"
    129      1.1  mrg 
    130      1.1  mrg #if !HAVE_DECL_OPTARG
    131      1.1  mrg extern char *optarg;
    132      1.1  mrg extern int optind, opterr;
    133      1.1  mrg #endif
    134      1.1  mrg 
    135      1.1  mrg 
    136      1.1  mrg #define DEFAULT_MAX_SIZE   1000  /* limbs */
    137      1.1  mrg 
    138      1.1  mrg #if WANT_FFT
    139      1.1  mrg mp_size_t  option_fft_max_size = 50000;  /* limbs */
    140      1.1  mrg #else
    141      1.1  mrg mp_size_t  option_fft_max_size = 0;
    142      1.1  mrg #endif
    143      1.1  mrg int        option_trace = 0;
    144      1.1  mrg int        option_fft_trace = 0;
    145      1.1  mrg struct speed_params  s;
    146      1.1  mrg 
    147      1.1  mrg struct dat_t {
    148      1.1  mrg   mp_size_t  size;
    149      1.1  mrg   double     d;
    150      1.1  mrg } *dat = NULL;
    151      1.1  mrg int  ndat = 0;
    152      1.1  mrg int  allocdat = 0;
    153      1.1  mrg 
    154      1.1  mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
    155      1.1  mrg    case use zero here, which for params.max_size means no limit.  */
    156      1.1  mrg #ifndef TUNE_SQR_TOOM2_MAX
    157      1.1  mrg #define TUNE_SQR_TOOM2_MAX  0
    158      1.1  mrg #endif
    159      1.1  mrg 
    160      1.1  mrg mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
    161      1.1  mrg mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
    162      1.1  mrg mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
    163      1.1  mrg mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
    164      1.1  mrg mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
    165      1.1  mrg mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
    166      1.1  mrg mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
    167      1.1  mrg mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
    168      1.1  mrg mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
    169  1.1.1.2  mrg mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
    170      1.1  mrg mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
    171      1.1  mrg mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
    172      1.1  mrg mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
    173      1.1  mrg mp_size_t  sqr_toom2_threshold
    174      1.1  mrg   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
    175      1.1  mrg mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
    176      1.1  mrg mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
    177      1.1  mrg mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
    178      1.1  mrg mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
    179      1.1  mrg mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
    180      1.1  mrg mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
    181      1.1  mrg mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
    182      1.1  mrg mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
    183      1.1  mrg mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
    184  1.1.1.3  mrg mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
    185  1.1.1.3  mrg mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
    186  1.1.1.3  mrg mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
    187  1.1.1.2  mrg mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
    188      1.1  mrg mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
    189      1.1  mrg mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
    190  1.1.1.2  mrg mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
    191      1.1  mrg mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
    192      1.1  mrg mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
    193      1.1  mrg mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
    194      1.1  mrg mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
    195      1.1  mrg mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
    196      1.1  mrg mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
    197      1.1  mrg mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    198      1.1  mrg mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
    199      1.1  mrg mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    200      1.1  mrg mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
    201      1.1  mrg mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
    202      1.1  mrg mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
    203      1.1  mrg mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
    204      1.1  mrg mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
    205      1.1  mrg mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
    206      1.1  mrg mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
    207      1.1  mrg mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
    208      1.1  mrg mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
    209      1.1  mrg mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
    210  1.1.1.2  mrg mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
    211  1.1.1.2  mrg mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
    212      1.1  mrg mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
    213      1.1  mrg mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
    214  1.1.1.3  mrg int	   div_qr_1n_pi1_method		= 0;
    215  1.1.1.3  mrg mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
    216  1.1.1.3  mrg mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
    217      1.1  mrg mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
    218      1.1  mrg mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
    219      1.1  mrg mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
    220      1.1  mrg mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
    221  1.1.1.2  mrg int	   mod_1_1p_method		= 0;
    222      1.1  mrg mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    223      1.1  mrg mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    224      1.1  mrg mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
    225      1.1  mrg mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
    226      1.1  mrg mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
    227      1.1  mrg mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
    228      1.1  mrg mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
    229      1.1  mrg mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
    230      1.1  mrg mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
    231      1.1  mrg mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
    232  1.1.1.2  mrg mp_size_t  fac_odd_threshold            = 0;
    233  1.1.1.2  mrg mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
    234      1.1  mrg 
    235      1.1  mrg mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
    236      1.1  mrg mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
    237      1.1  mrg 
    238      1.1  mrg struct param_t {
    239      1.1  mrg   const char        *name;
    240      1.1  mrg   speed_function_t  function;
    241      1.1  mrg   speed_function_t  function2;
    242      1.1  mrg   double            step_factor;    /* how much to step relatively */
    243      1.1  mrg   int               step;           /* how much to step absolutely */
    244      1.1  mrg   double            function_fudge; /* multiplier for "function" speeds */
    245      1.1  mrg   int               stop_since_change;
    246      1.1  mrg   double            stop_factor;
    247      1.1  mrg   mp_size_t         min_size;
    248      1.1  mrg   int               min_is_always;
    249      1.1  mrg   mp_size_t         max_size;
    250      1.1  mrg   mp_size_t         check_size;
    251      1.1  mrg   mp_size_t         size_extra;
    252      1.1  mrg 
    253      1.1  mrg #define DATA_HIGH_LT_R  1
    254      1.1  mrg #define DATA_HIGH_GE_R  2
    255      1.1  mrg   int               data_high;
    256      1.1  mrg 
    257      1.1  mrg   int               noprint;
    258      1.1  mrg };
    259      1.1  mrg 
    260      1.1  mrg 
    261      1.1  mrg /* These are normally undefined when false, which suits "#if" fine.
    262      1.1  mrg    But give them zero values so they can be used in plain C "if"s.  */
    263      1.1  mrg #ifndef UDIV_PREINV_ALWAYS
    264      1.1  mrg #define UDIV_PREINV_ALWAYS 0
    265      1.1  mrg #endif
    266      1.1  mrg #ifndef HAVE_NATIVE_mpn_divexact_1
    267      1.1  mrg #define HAVE_NATIVE_mpn_divexact_1 0
    268      1.1  mrg #endif
    269  1.1.1.3  mrg #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
    270  1.1.1.3  mrg #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
    271  1.1.1.3  mrg #endif
    272      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_1
    273      1.1  mrg #define HAVE_NATIVE_mpn_divrem_1 0
    274      1.1  mrg #endif
    275      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_2
    276      1.1  mrg #define HAVE_NATIVE_mpn_divrem_2 0
    277      1.1  mrg #endif
    278      1.1  mrg #ifndef HAVE_NATIVE_mpn_mod_1
    279      1.1  mrg #define HAVE_NATIVE_mpn_mod_1 0
    280      1.1  mrg #endif
    281  1.1.1.2  mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p
    282  1.1.1.2  mrg #define HAVE_NATIVE_mpn_mod_1_1p 0
    283  1.1.1.2  mrg #endif
    284      1.1  mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
    285      1.1  mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
    286      1.1  mrg #endif
    287      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
    288      1.1  mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
    289      1.1  mrg #endif
    290      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
    291      1.1  mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
    292      1.1  mrg #endif
    293      1.1  mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
    294      1.1  mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
    295      1.1  mrg #endif
    296      1.1  mrg 
    297      1.1  mrg 
    298      1.1  mrg #define MAX3(a,b,c)  MAX (MAX (a, b), c)
    299      1.1  mrg 
    300      1.1  mrg mp_limb_t
    301      1.1  mrg randlimb_norm (void)
    302      1.1  mrg {
    303      1.1  mrg   mp_limb_t  n;
    304      1.1  mrg   mpn_random (&n, 1);
    305      1.1  mrg   n |= GMP_NUMB_HIGHBIT;
    306      1.1  mrg   return n;
    307      1.1  mrg }
    308      1.1  mrg 
    309      1.1  mrg #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
    310      1.1  mrg 
    311      1.1  mrg mp_limb_t
    312      1.1  mrg randlimb_half (void)
    313      1.1  mrg {
    314      1.1  mrg   mp_limb_t  n;
    315      1.1  mrg   mpn_random (&n, 1);
    316      1.1  mrg   n &= GMP_NUMB_HALFMASK;
    317      1.1  mrg   n += (n==0);
    318      1.1  mrg   return n;
    319      1.1  mrg }
    320      1.1  mrg 
    321      1.1  mrg 
    322      1.1  mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    323      1.1  mrg    if necessary.  */
    324      1.1  mrg void
    325      1.1  mrg add_dat (mp_size_t size, double d)
    326      1.1  mrg {
    327      1.1  mrg #define ALLOCDAT_STEP  500
    328      1.1  mrg 
    329      1.1  mrg   ASSERT_ALWAYS (ndat <= allocdat);
    330      1.1  mrg 
    331      1.1  mrg   if (ndat == allocdat)
    332      1.1  mrg     {
    333      1.1  mrg       dat = (struct dat_t *) __gmp_allocate_or_reallocate
    334      1.1  mrg         (dat, allocdat * sizeof(dat[0]),
    335      1.1  mrg          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
    336      1.1  mrg       allocdat += ALLOCDAT_STEP;
    337      1.1  mrg     }
    338      1.1  mrg 
    339      1.1  mrg   dat[ndat].size = size;
    340      1.1  mrg   dat[ndat].d = d;
    341      1.1  mrg   ndat++;
    342      1.1  mrg }
    343      1.1  mrg 
    344      1.1  mrg 
    345      1.1  mrg /* Return the threshold size based on the data accumulated. */
    346      1.1  mrg mp_size_t
    347      1.1  mrg analyze_dat (int final)
    348      1.1  mrg {
    349      1.1  mrg   double  x, min_x;
    350      1.1  mrg   int     j, min_j;
    351      1.1  mrg 
    352      1.1  mrg   /* If the threshold is set at dat[0].size, any positive values are bad. */
    353      1.1  mrg   x = 0.0;
    354      1.1  mrg   for (j = 0; j < ndat; j++)
    355      1.1  mrg     if (dat[j].d > 0.0)
    356      1.1  mrg       x += dat[j].d;
    357      1.1  mrg 
    358      1.1  mrg   if (option_trace >= 2 && final)
    359      1.1  mrg     {
    360      1.1  mrg       printf ("\n");
    361      1.1  mrg       printf ("x is the sum of the badness from setting thresh at given size\n");
    362      1.1  mrg       printf ("  (minimum x is sought)\n");
    363      1.1  mrg       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
    364      1.1  mrg     }
    365      1.1  mrg 
    366      1.1  mrg   min_x = x;
    367      1.1  mrg   min_j = 0;
    368      1.1  mrg 
    369      1.1  mrg 
    370      1.1  mrg   /* When stepping to the next dat[j].size, positive values are no longer
    371      1.1  mrg      bad (so subtracted), negative values become bad (so add the absolute
    372      1.1  mrg      value, meaning subtract). */
    373      1.1  mrg   for (j = 0; j < ndat; x -= dat[j].d, j++)
    374      1.1  mrg     {
    375      1.1  mrg       if (option_trace >= 2 && final)
    376      1.1  mrg         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
    377      1.1  mrg 
    378      1.1  mrg       if (x < min_x)
    379      1.1  mrg         {
    380      1.1  mrg           min_x = x;
    381      1.1  mrg           min_j = j;
    382      1.1  mrg         }
    383      1.1  mrg     }
    384      1.1  mrg 
    385      1.1  mrg   return min_j;
    386      1.1  mrg }
    387      1.1  mrg 
    388      1.1  mrg 
    389  1.1.1.3  mrg /* Measuring for recompiled mpn/generic/div_qr_1.c,
    390  1.1.1.3  mrg  * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
    391  1.1.1.3  mrg 
    392  1.1.1.3  mrg mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
    393  1.1.1.3  mrg 
    394  1.1.1.3  mrg #if defined (__cplusplus)
    395  1.1.1.3  mrg extern "C" {
    396  1.1.1.3  mrg #endif
    397      1.1  mrg 
    398  1.1.1.2  mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
    399  1.1.1.2  mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
    400  1.1.1.2  mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long);
    401      1.1  mrg 
    402  1.1.1.3  mrg #if defined (__cplusplus)
    403  1.1.1.3  mrg }
    404  1.1.1.3  mrg #endif
    405  1.1.1.3  mrg 
    406      1.1  mrg double
    407      1.1  mrg speed_mpn_mod_1_tune (struct speed_params *s)
    408      1.1  mrg {
    409      1.1  mrg   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
    410      1.1  mrg }
    411      1.1  mrg double
    412      1.1  mrg speed_mpn_divrem_1_tune (struct speed_params *s)
    413      1.1  mrg {
    414      1.1  mrg   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
    415      1.1  mrg }
    416  1.1.1.2  mrg double
    417  1.1.1.2  mrg speed_mpz_fac_ui_tune (struct speed_params *s)
    418  1.1.1.2  mrg {
    419  1.1.1.2  mrg   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
    420  1.1.1.2  mrg }
    421  1.1.1.3  mrg double
    422  1.1.1.3  mrg speed_mpn_div_qr_1_tune (struct speed_params *s)
    423  1.1.1.3  mrg {
    424  1.1.1.3  mrg   SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
    425  1.1.1.3  mrg }
    426      1.1  mrg 
    427      1.1  mrg double
    428      1.1  mrg tuneup_measure (speed_function_t fun,
    429      1.1  mrg                 const struct param_t *param,
    430      1.1  mrg                 struct speed_params *s)
    431      1.1  mrg {
    432      1.1  mrg   static struct param_t  dummy;
    433      1.1  mrg   double   t;
    434      1.1  mrg   TMP_DECL;
    435      1.1  mrg 
    436      1.1  mrg   if (! param)
    437      1.1  mrg     param = &dummy;
    438      1.1  mrg 
    439      1.1  mrg   s->size += param->size_extra;
    440      1.1  mrg 
    441      1.1  mrg   TMP_MARK;
    442      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
    443      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
    444      1.1  mrg 
    445      1.1  mrg   mpn_random (s->xp, s->size);
    446      1.1  mrg   mpn_random (s->yp, s->size);
    447      1.1  mrg 
    448      1.1  mrg   switch (param->data_high) {
    449      1.1  mrg   case DATA_HIGH_LT_R:
    450      1.1  mrg     s->xp[s->size-1] %= s->r;
    451      1.1  mrg     s->yp[s->size-1] %= s->r;
    452      1.1  mrg     break;
    453      1.1  mrg   case DATA_HIGH_GE_R:
    454      1.1  mrg     s->xp[s->size-1] |= s->r;
    455      1.1  mrg     s->yp[s->size-1] |= s->r;
    456      1.1  mrg     break;
    457      1.1  mrg   }
    458      1.1  mrg 
    459      1.1  mrg   t = speed_measure (fun, s);
    460      1.1  mrg 
    461      1.1  mrg   s->size -= param->size_extra;
    462      1.1  mrg 
    463      1.1  mrg   TMP_FREE;
    464      1.1  mrg   return t;
    465      1.1  mrg }
    466      1.1  mrg 
    467      1.1  mrg 
    468      1.1  mrg #define PRINT_WIDTH  31
    469      1.1  mrg 
    470      1.1  mrg void
    471      1.1  mrg print_define_start (const char *name)
    472      1.1  mrg {
    473      1.1  mrg   printf ("#define %-*s  ", PRINT_WIDTH, name);
    474      1.1  mrg   if (option_trace)
    475      1.1  mrg     printf ("...\n");
    476      1.1  mrg }
    477      1.1  mrg 
    478      1.1  mrg void
    479      1.1  mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
    480      1.1  mrg {
    481      1.1  mrg   if (option_trace)
    482      1.1  mrg     printf ("#define %-*s  ", PRINT_WIDTH, name);
    483      1.1  mrg 
    484      1.1  mrg   if (value == MP_SIZE_T_MAX)
    485      1.1  mrg     printf ("MP_SIZE_T_MAX");
    486      1.1  mrg   else
    487      1.1  mrg     printf ("%5ld", (long) value);
    488      1.1  mrg 
    489      1.1  mrg   if (remark != NULL)
    490      1.1  mrg     printf ("  /* %s */", remark);
    491      1.1  mrg   printf ("\n");
    492      1.1  mrg   fflush (stdout);
    493      1.1  mrg }
    494      1.1  mrg 
    495      1.1  mrg void
    496      1.1  mrg print_define_end (const char *name, mp_size_t value)
    497      1.1  mrg {
    498      1.1  mrg   const char  *remark;
    499      1.1  mrg   if (value == MP_SIZE_T_MAX)
    500      1.1  mrg     remark = "never";
    501      1.1  mrg   else if (value == 0)
    502      1.1  mrg     remark = "always";
    503      1.1  mrg   else
    504      1.1  mrg     remark = NULL;
    505      1.1  mrg   print_define_end_remark (name, value, remark);
    506      1.1  mrg }
    507      1.1  mrg 
    508      1.1  mrg void
    509      1.1  mrg print_define (const char *name, mp_size_t value)
    510      1.1  mrg {
    511      1.1  mrg   print_define_start (name);
    512      1.1  mrg   print_define_end (name, value);
    513      1.1  mrg }
    514      1.1  mrg 
    515      1.1  mrg void
    516      1.1  mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
    517      1.1  mrg {
    518      1.1  mrg   print_define_start (name);
    519      1.1  mrg   print_define_end_remark (name, value, remark);
    520      1.1  mrg }
    521      1.1  mrg 
    522      1.1  mrg 
    523      1.1  mrg void
    524      1.1  mrg one (mp_size_t *threshold, struct param_t *param)
    525      1.1  mrg {
    526      1.1  mrg   int  since_positive, since_thresh_change;
    527      1.1  mrg   int  thresh_idx, new_thresh_idx;
    528      1.1  mrg 
    529      1.1  mrg #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
    530      1.1  mrg 
    531      1.1  mrg   DEFAULT (param->function_fudge, 1.0);
    532      1.1  mrg   DEFAULT (param->function2, param->function);
    533      1.1  mrg   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
    534      1.1  mrg   DEFAULT (param->step, 1);            /* small steps by default */
    535      1.1  mrg   DEFAULT (param->stop_since_change, 80);
    536      1.1  mrg   DEFAULT (param->stop_factor, 1.2);
    537      1.1  mrg   DEFAULT (param->min_size, 10);
    538      1.1  mrg   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
    539      1.1  mrg 
    540      1.1  mrg   if (param->check_size != 0)
    541      1.1  mrg     {
    542      1.1  mrg       double   t1, t2;
    543      1.1  mrg       s.size = param->check_size;
    544      1.1  mrg 
    545      1.1  mrg       *threshold = s.size+1;
    546      1.1  mrg       t1 = tuneup_measure (param->function, param, &s);
    547      1.1  mrg 
    548      1.1  mrg       *threshold = s.size;
    549      1.1  mrg       t2 = tuneup_measure (param->function2, param, &s);
    550      1.1  mrg       if (t1 == -1.0 || t2 == -1.0)
    551      1.1  mrg         {
    552      1.1  mrg           printf ("Oops, can't run both functions at size %ld\n",
    553      1.1  mrg                   (long) s.size);
    554      1.1  mrg           abort ();
    555      1.1  mrg         }
    556      1.1  mrg       t1 *= param->function_fudge;
    557      1.1  mrg 
    558      1.1  mrg       /* ask that t2 is at least 4% below t1 */
    559      1.1  mrg       if (t1 < t2*1.04)
    560      1.1  mrg         {
    561      1.1  mrg           if (option_trace)
    562      1.1  mrg             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
    563      1.1  mrg           *threshold = MP_SIZE_T_MAX;
    564      1.1  mrg           if (! param->noprint)
    565      1.1  mrg             print_define (param->name, *threshold);
    566      1.1  mrg           return;
    567      1.1  mrg         }
    568      1.1  mrg 
    569      1.1  mrg       if (option_trace >= 2)
    570      1.1  mrg         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
    571      1.1  mrg                 (long) s.size, t1, t2);
    572      1.1  mrg     }
    573      1.1  mrg 
    574      1.1  mrg   if (! param->noprint || option_trace)
    575      1.1  mrg     print_define_start (param->name);
    576      1.1  mrg 
    577      1.1  mrg   ndat = 0;
    578      1.1  mrg   since_positive = 0;
    579      1.1  mrg   since_thresh_change = 0;
    580      1.1  mrg   thresh_idx = 0;
    581      1.1  mrg 
    582      1.1  mrg   if (option_trace >= 2)
    583      1.1  mrg     {
    584      1.1  mrg       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
    585      1.1  mrg       printf ("              (seconds)    (seconds)    diff    thresh\n");
    586      1.1  mrg     }
    587      1.1  mrg 
    588      1.1  mrg   for (s.size = param->min_size;
    589      1.1  mrg        s.size < param->max_size;
    590      1.1  mrg        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
    591      1.1  mrg     {
    592      1.1  mrg       double   ti, tiplus1, d;
    593      1.1  mrg 
    594      1.1  mrg       /*
    595      1.1  mrg         FIXME: check minimum size requirements are met, possibly by just
    596      1.1  mrg         checking for the -1 returns from the speed functions.
    597      1.1  mrg       */
    598      1.1  mrg 
    599      1.1  mrg       /* using method A at this size */
    600      1.1  mrg       *threshold = s.size+1;
    601      1.1  mrg       ti = tuneup_measure (param->function, param, &s);
    602      1.1  mrg       if (ti == -1.0)
    603      1.1  mrg         abort ();
    604      1.1  mrg       ti *= param->function_fudge;
    605      1.1  mrg 
    606      1.1  mrg       /* using method B at this size */
    607      1.1  mrg       *threshold = s.size;
    608      1.1  mrg       tiplus1 = tuneup_measure (param->function2, param, &s);
    609      1.1  mrg       if (tiplus1 == -1.0)
    610      1.1  mrg         abort ();
    611      1.1  mrg 
    612      1.1  mrg       /* Calculate the fraction by which the one or the other routine is
    613      1.1  mrg          slower.  */
    614      1.1  mrg       if (tiplus1 >= ti)
    615      1.1  mrg         d = (tiplus1 - ti) / tiplus1;  /* negative */
    616      1.1  mrg       else
    617      1.1  mrg         d = (tiplus1 - ti) / ti;       /* positive */
    618      1.1  mrg 
    619      1.1  mrg       add_dat (s.size, d);
    620      1.1  mrg 
    621      1.1  mrg       new_thresh_idx = analyze_dat (0);
    622      1.1  mrg 
    623      1.1  mrg       if (option_trace >= 2)
    624      1.1  mrg         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
    625      1.1  mrg                 (long) s.size, ti, tiplus1, d,
    626      1.1  mrg                 ti > tiplus1 ? '#' : ' ',
    627      1.1  mrg                 (long) dat[new_thresh_idx].size);
    628      1.1  mrg 
    629      1.1  mrg       /* Stop if the last time method i was faster was more than a
    630      1.1  mrg          certain number of measurements ago.  */
    631      1.1  mrg #define STOP_SINCE_POSITIVE  200
    632      1.1  mrg       if (d >= 0)
    633      1.1  mrg         since_positive = 0;
    634      1.1  mrg       else
    635      1.1  mrg         if (++since_positive > STOP_SINCE_POSITIVE)
    636      1.1  mrg           {
    637      1.1  mrg             if (option_trace >= 1)
    638      1.1  mrg               printf ("stopped due to since_positive (%d)\n",
    639      1.1  mrg                       STOP_SINCE_POSITIVE);
    640      1.1  mrg             break;
    641      1.1  mrg           }
    642      1.1  mrg 
    643      1.1  mrg       /* Stop if method A has become slower by a certain factor. */
    644      1.1  mrg       if (ti >= tiplus1 * param->stop_factor)
    645      1.1  mrg         {
    646      1.1  mrg           if (option_trace >= 1)
    647      1.1  mrg             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
    648      1.1  mrg                     param->stop_factor);
    649      1.1  mrg           break;
    650      1.1  mrg         }
    651      1.1  mrg 
    652      1.1  mrg       /* Stop if the threshold implied hasn't changed in a certain
    653      1.1  mrg          number of measurements.  (It's this condition that usually
    654      1.1  mrg          stops the loop.) */
    655      1.1  mrg       if (thresh_idx != new_thresh_idx)
    656      1.1  mrg         since_thresh_change = 0, thresh_idx = new_thresh_idx;
    657      1.1  mrg       else
    658      1.1  mrg         if (++since_thresh_change > param->stop_since_change)
    659      1.1  mrg           {
    660      1.1  mrg             if (option_trace >= 1)
    661      1.1  mrg               printf ("stopped due to since_thresh_change (%d)\n",
    662      1.1  mrg                       param->stop_since_change);
    663      1.1  mrg             break;
    664      1.1  mrg           }
    665      1.1  mrg 
    666      1.1  mrg       /* Stop if the threshold implied is more than a certain number of
    667      1.1  mrg          measurements ago.  */
    668      1.1  mrg #define STOP_SINCE_AFTER   500
    669      1.1  mrg       if (ndat - thresh_idx > STOP_SINCE_AFTER)
    670      1.1  mrg         {
    671      1.1  mrg           if (option_trace >= 1)
    672      1.1  mrg             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
    673      1.1  mrg                     STOP_SINCE_AFTER);
    674      1.1  mrg           break;
    675      1.1  mrg         }
    676      1.1  mrg 
    677      1.1  mrg       /* Stop when the size limit is reached before the end of the
    678      1.1  mrg          crossover, but only show this as an error for >= the default max
    679      1.1  mrg          size.  FIXME: Maybe should make it a param choice whether this is
    680      1.1  mrg          an error.  */
    681      1.1  mrg       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
    682      1.1  mrg         {
    683      1.1  mrg           fprintf (stderr, "%s\n", param->name);
    684      1.1  mrg           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
    685      1.1  mrg                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    686      1.1  mrg           fprintf (stderr, "    max size reached before end of crossover\n");
    687      1.1  mrg           break;
    688      1.1  mrg         }
    689      1.1  mrg     }
    690      1.1  mrg 
    691      1.1  mrg   if (option_trace >= 1)
    692      1.1  mrg     printf ("sizes %ld to %ld total %d measurements\n",
    693      1.1  mrg             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    694      1.1  mrg 
    695      1.1  mrg   *threshold = dat[analyze_dat (1)].size;
    696      1.1  mrg 
    697      1.1  mrg   if (param->min_is_always)
    698      1.1  mrg     {
    699      1.1  mrg       if (*threshold == param->min_size)
    700      1.1  mrg         *threshold = 0;
    701      1.1  mrg     }
    702      1.1  mrg 
    703      1.1  mrg   if (! param->noprint || option_trace)
    704      1.1  mrg     print_define_end (param->name, *threshold);
    705      1.1  mrg }
    706      1.1  mrg 
    707      1.1  mrg 
    708      1.1  mrg /* Special probing for the fft thresholds.  The size restrictions on the
    709      1.1  mrg    FFTs mean the graph of time vs size has a step effect.  See this for
    710      1.1  mrg    example using
    711      1.1  mrg 
    712      1.1  mrg        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
    713      1.1  mrg        gnuplot foo.gnuplot
    714      1.1  mrg 
    715      1.1  mrg    The current approach is to compare routines at the midpoint of relevant
    716      1.1  mrg    steps.  Arguably a more sophisticated system of threshold data is wanted
    717      1.1  mrg    if this step effect remains. */
    718      1.1  mrg 
    719      1.1  mrg struct fft_param_t {
    720      1.1  mrg   const char        *table_name;
    721      1.1  mrg   const char        *threshold_name;
    722      1.1  mrg   const char        *modf_threshold_name;
    723      1.1  mrg   mp_size_t         *p_threshold;
    724      1.1  mrg   mp_size_t         *p_modf_threshold;
    725      1.1  mrg   mp_size_t         first_size;
    726      1.1  mrg   mp_size_t         max_size;
    727      1.1  mrg   speed_function_t  function;
    728      1.1  mrg   speed_function_t  mul_modf_function;
    729      1.1  mrg   speed_function_t  mul_function;
    730      1.1  mrg   mp_size_t         sqr;
    731      1.1  mrg };
    732      1.1  mrg 
    733      1.1  mrg 
    734      1.1  mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    735      1.1  mrg    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    736      1.1  mrg    of 2^(k-1) bits. */
    737      1.1  mrg 
    738      1.1  mrg mp_size_t
    739      1.1  mrg fft_step_size (int k)
    740      1.1  mrg {
    741      1.1  mrg   mp_size_t  step;
    742      1.1  mrg 
    743      1.1  mrg   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
    744      1.1  mrg   step *= (mp_size_t) 1 << k;
    745      1.1  mrg 
    746      1.1  mrg   if (step <= 0)
    747      1.1  mrg     {
    748      1.1  mrg       printf ("Can't handle k=%d\n", k);
    749      1.1  mrg       abort ();
    750      1.1  mrg     }
    751      1.1  mrg 
    752      1.1  mrg   return step;
    753      1.1  mrg }
    754      1.1  mrg 
    755      1.1  mrg mp_size_t
    756      1.1  mrg fft_next_size (mp_size_t pl, int k)
    757      1.1  mrg {
    758      1.1  mrg   mp_size_t  m = fft_step_size (k);
    759      1.1  mrg 
    760      1.1  mrg /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
    761      1.1  mrg 
    762      1.1  mrg   if (pl == 0 || (pl & (m-1)) != 0)
    763      1.1  mrg     pl = (pl | (m-1)) + 1;
    764      1.1  mrg 
    765      1.1  mrg /*    printf (" %ld\n", pl); */
    766      1.1  mrg   return pl;
    767      1.1  mrg }
    768      1.1  mrg 
    769      1.1  mrg #define NMAX_DEFAULT 1000000
    770      1.1  mrg #define MAX_REPS 25
    771      1.1  mrg #define MIN_REPS 5
    772      1.1  mrg 
    773      1.1  mrg static inline size_t
    774      1.1  mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
    775      1.1  mrg {
    776      1.1  mrg   unsigned int l = k;
    777      1.1  mrg 
    778      1.1  mrg   while (a % 2 == 0 && k > 0)
    779      1.1  mrg     {
    780      1.1  mrg       a >>= 1;
    781      1.1  mrg       k--;
    782      1.1  mrg     }
    783      1.1  mrg   return a << l;
    784      1.1  mrg }
    785      1.1  mrg 
    786      1.1  mrg mp_size_t
    787      1.1  mrg fftfill (mp_size_t pl, int k, int sqr)
    788      1.1  mrg {
    789      1.1  mrg   mp_size_t maxLK;
    790      1.1  mrg   mp_bitcnt_t N, Nprime, nprime, M;
    791      1.1  mrg 
    792      1.1  mrg   N = pl * GMP_NUMB_BITS;
    793      1.1  mrg   M = N >> k;
    794      1.1  mrg 
    795      1.1  mrg   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
    796      1.1  mrg 
    797      1.1  mrg   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
    798      1.1  mrg   nprime = Nprime / GMP_NUMB_BITS;
    799      1.1  mrg   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    800      1.1  mrg     {
    801      1.1  mrg       size_t K2;
    802      1.1  mrg       for (;;)
    803      1.1  mrg 	{
    804      1.1  mrg 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
    805      1.1  mrg 	  if ((nprime & (K2 - 1)) == 0)
    806      1.1  mrg 	    break;
    807      1.1  mrg 	  nprime = (nprime + K2 - 1) & -K2;
    808      1.1  mrg 	  Nprime = nprime * GMP_LIMB_BITS;
    809      1.1  mrg 	}
    810      1.1  mrg     }
    811      1.1  mrg   ASSERT_ALWAYS (nprime < pl);
    812      1.1  mrg 
    813      1.1  mrg   return Nprime;
    814      1.1  mrg }
    815      1.1  mrg 
    816      1.1  mrg static int
    817      1.1  mrg compare_double (const void *ap, const void *bp)
    818      1.1  mrg {
    819      1.1  mrg   double a = * (const double *) ap;
    820      1.1  mrg   double b = * (const double *) bp;
    821      1.1  mrg 
    822      1.1  mrg   if (a < b)
    823      1.1  mrg     return -1;
    824      1.1  mrg   else if (a > b)
    825      1.1  mrg     return 1;
    826      1.1  mrg   else
    827      1.1  mrg     return 0;
    828      1.1  mrg }
    829      1.1  mrg 
    830      1.1  mrg double
    831      1.1  mrg median (double *times, int n)
    832      1.1  mrg {
    833      1.1  mrg   qsort (times, n, sizeof (double), compare_double);
    834      1.1  mrg   return times[n/2];
    835      1.1  mrg }
    836      1.1  mrg 
    837      1.1  mrg #define FFT_CACHE_SIZE 25
    838      1.1  mrg typedef struct fft_cache
    839      1.1  mrg {
    840      1.1  mrg   mp_size_t n;
    841      1.1  mrg   double time;
    842      1.1  mrg } fft_cache_t;
    843      1.1  mrg 
    844      1.1  mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
    845      1.1  mrg 
    846      1.1  mrg double
    847      1.1  mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
    848      1.1  mrg 		int n_measurements)
    849      1.1  mrg {
    850      1.1  mrg   int i;
    851      1.1  mrg   double t, ttab[MAX_REPS];
    852      1.1  mrg 
    853      1.1  mrg   if (fft_cache[k].n == n)
    854      1.1  mrg     return fft_cache[k].time;
    855      1.1  mrg 
    856      1.1  mrg   for (i = 0; i < n_measurements; i++)
    857      1.1  mrg     {
    858      1.1  mrg       speed_starttime ();
    859      1.1  mrg       mpn_mul_fft (rp, n, ap, n, bp, n, k);
    860      1.1  mrg       ttab[i] = speed_endtime ();
    861      1.1  mrg     }
    862      1.1  mrg 
    863      1.1  mrg   t = median (ttab, n_measurements);
    864      1.1  mrg   fft_cache[k].n = n;
    865      1.1  mrg   fft_cache[k].time = t;
    866      1.1  mrg   return t;
    867      1.1  mrg }
    868      1.1  mrg 
    869      1.1  mrg #define INSERT_FFTTAB(idx, nval, kval)					\
    870      1.1  mrg   do {									\
    871      1.1  mrg     fft_tab[idx].n = nval;						\
    872      1.1  mrg     fft_tab[idx].k = kval;						\
    873  1.1.1.3  mrg     fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
    874  1.1.1.3  mrg     fft_tab[idx+1].k = (1 <<  5) - 1;					\
    875      1.1  mrg   } while (0)
    876      1.1  mrg 
    877      1.1  mrg int
    878      1.1  mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
    879      1.1  mrg {
    880      1.1  mrg   mp_size_t n, n1, prev_n1;
    881      1.1  mrg   int k, best_k, last_best_k, kmax;
    882      1.1  mrg   int eff, prev_eff;
    883      1.1  mrg   double t0, t1;
    884      1.1  mrg   int n_measurements;
    885      1.1  mrg   mp_limb_t *ap, *bp, *rp;
    886      1.1  mrg   mp_size_t alloc;
    887      1.1  mrg   struct fft_table_nk *fft_tab;
    888      1.1  mrg 
    889      1.1  mrg   fft_tab = mpn_fft_table3[p->sqr];
    890      1.1  mrg 
    891      1.1  mrg   for (k = 0; k < FFT_CACHE_SIZE; k++)
    892      1.1  mrg     fft_cache[k].n = 0;
    893      1.1  mrg 
    894      1.1  mrg   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    895      1.1  mrg     {
    896      1.1  mrg       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
    897      1.1  mrg     }
    898      1.1  mrg 
    899      1.1  mrg   if (print)
    900      1.1  mrg     printf ("#define %s%*s", p->table_name, 38, "");
    901      1.1  mrg 
    902      1.1  mrg   if (idx == 0)
    903      1.1  mrg     {
    904      1.1  mrg       INSERT_FFTTAB (0, nmin, initial_k);
    905      1.1  mrg 
    906      1.1  mrg       if (print)
    907      1.1  mrg 	{
    908      1.1  mrg 	  printf ("\\\n  { ");
    909      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
    910      1.1  mrg 	}
    911      1.1  mrg 
    912      1.1  mrg       idx = 1;
    913      1.1  mrg     }
    914      1.1  mrg 
    915  1.1.1.3  mrg   ap = (mp_ptr) malloc (sizeof (mp_limb_t));
    916      1.1  mrg   if (p->sqr)
    917      1.1  mrg     bp = ap;
    918      1.1  mrg   else
    919  1.1.1.3  mrg     bp = (mp_ptr) malloc (sizeof (mp_limb_t));
    920  1.1.1.3  mrg   rp = (mp_ptr) malloc (sizeof (mp_limb_t));
    921      1.1  mrg   alloc = 1;
    922      1.1  mrg 
    923      1.1  mrg   /* Round n to comply to initial k value */
    924      1.1  mrg   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
    925      1.1  mrg 
    926      1.1  mrg   n_measurements = (18 - initial_k) | 1;
    927      1.1  mrg   n_measurements = MAX (n_measurements, MIN_REPS);
    928      1.1  mrg   n_measurements = MIN (n_measurements, MAX_REPS);
    929      1.1  mrg 
    930      1.1  mrg   last_best_k = initial_k;
    931      1.1  mrg   best_k = initial_k;
    932      1.1  mrg 
    933      1.1  mrg   while (n < nmax)
    934      1.1  mrg     {
    935      1.1  mrg       int start_k, end_k;
    936      1.1  mrg 
    937      1.1  mrg       /* Assume the current best k is best until we hit its next FFT step.  */
    938      1.1  mrg       t0 = 99999;
    939      1.1  mrg 
    940      1.1  mrg       prev_n1 = n + 1;
    941      1.1  mrg 
    942      1.1  mrg       start_k = MAX (4, best_k - 4);
    943      1.1  mrg       end_k = MIN (24, best_k + 4);
    944      1.1  mrg       for (k = start_k; k <= end_k; k++)
    945      1.1  mrg 	{
    946      1.1  mrg           n1 = mpn_fft_next_size (prev_n1, k);
    947      1.1  mrg 
    948      1.1  mrg 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
    949      1.1  mrg 
    950      1.1  mrg 	  if (eff < 70)		/* avoid measuring too slow fft:s */
    951      1.1  mrg 	    continue;
    952      1.1  mrg 
    953      1.1  mrg 	  if (n1 > alloc)
    954      1.1  mrg 	    {
    955      1.1  mrg 	      alloc = n1;
    956      1.1  mrg 	      if (p->sqr)
    957      1.1  mrg 		{
    958  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
    959  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
    960  1.1.1.3  mrg 		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
    961      1.1  mrg 		  mpn_random (ap, alloc);
    962  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
    963      1.1  mrg 		}
    964      1.1  mrg 	      else
    965      1.1  mrg 		{
    966  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
    967  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
    968  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
    969  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
    970      1.1  mrg 		  mpn_random (ap, alloc);
    971  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
    972      1.1  mrg 		  mpn_random (bp, alloc);
    973  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
    974      1.1  mrg 		}
    975      1.1  mrg 	    }
    976      1.1  mrg 
    977      1.1  mrg 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
    978      1.1  mrg 
    979      1.1  mrg 	  if (t1 * n_measurements > 0.3)
    980      1.1  mrg 	    n_measurements -= 2;
    981      1.1  mrg 	  n_measurements = MAX (n_measurements, MIN_REPS);
    982      1.1  mrg 
    983      1.1  mrg 	  if (t1 < t0)
    984      1.1  mrg 	    {
    985      1.1  mrg 	      best_k = k;
    986      1.1  mrg 	      t0 = t1;
    987      1.1  mrg 	    }
    988      1.1  mrg 	}
    989      1.1  mrg 
    990      1.1  mrg       n1 = mpn_fft_next_size (prev_n1, best_k);
    991      1.1  mrg 
    992      1.1  mrg       if (last_best_k != best_k)
    993      1.1  mrg 	{
    994      1.1  mrg 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
    995      1.1  mrg 
    996      1.1  mrg 	  if (idx >= FFT_TABLE3_SIZE)
    997      1.1  mrg 	    {
    998      1.1  mrg 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
    999      1.1  mrg 	      abort ();
   1000      1.1  mrg 	    }
   1001      1.1  mrg 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
   1002      1.1  mrg 
   1003      1.1  mrg 	  if (print)
   1004      1.1  mrg 	    {
   1005      1.1  mrg 	      printf (", ");
   1006      1.1  mrg 	      if (idx % 4 == 0)
   1007      1.1  mrg 		printf ("\\\n    ");
   1008      1.1  mrg 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1009      1.1  mrg 	    }
   1010      1.1  mrg 
   1011      1.1  mrg 	  if (option_trace >= 2)
   1012      1.1  mrg 	    {
   1013      1.1  mrg 	      printf ("{%lu,%u}\n", prev_n1, best_k);
   1014      1.1  mrg 	      fflush (stdout);
   1015      1.1  mrg 	    }
   1016      1.1  mrg 
   1017      1.1  mrg 	  last_best_k = best_k;
   1018      1.1  mrg 	  idx++;
   1019      1.1  mrg 	}
   1020      1.1  mrg 
   1021      1.1  mrg       for (;;)
   1022      1.1  mrg 	{
   1023      1.1  mrg 	  prev_n1 = n1;
   1024      1.1  mrg 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
   1025      1.1  mrg 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
   1026      1.1  mrg 	  eff = fftfill (n1, best_k, p->sqr);
   1027      1.1  mrg 
   1028      1.1  mrg 	  if (eff != prev_eff)
   1029      1.1  mrg 	    break;
   1030      1.1  mrg 	}
   1031      1.1  mrg 
   1032      1.1  mrg       n = prev_n1;
   1033      1.1  mrg     }
   1034      1.1  mrg 
   1035      1.1  mrg   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
   1036      1.1  mrg   kmax = MIN (kmax, 25-1);
   1037      1.1  mrg   for (k = last_best_k + 1; k <= kmax; k++)
   1038      1.1  mrg     {
   1039      1.1  mrg       if (idx >= FFT_TABLE3_SIZE)
   1040      1.1  mrg 	{
   1041      1.1  mrg 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1042      1.1  mrg 	  abort ();
   1043      1.1  mrg 	}
   1044      1.1  mrg       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
   1045      1.1  mrg 
   1046      1.1  mrg       if (print)
   1047      1.1  mrg 	{
   1048      1.1  mrg 	  printf (", ");
   1049      1.1  mrg 	  if (idx % 4 == 0)
   1050      1.1  mrg 	    printf ("\\\n    ");
   1051      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1052      1.1  mrg 	}
   1053      1.1  mrg 
   1054      1.1  mrg       idx++;
   1055      1.1  mrg     }
   1056      1.1  mrg 
   1057      1.1  mrg   if (print)
   1058      1.1  mrg     printf (" }\n");
   1059      1.1  mrg 
   1060      1.1  mrg   free (ap);
   1061      1.1  mrg   if (! p->sqr)
   1062      1.1  mrg     free (bp);
   1063      1.1  mrg   free (rp);
   1064      1.1  mrg 
   1065      1.1  mrg   return idx;
   1066      1.1  mrg }
   1067      1.1  mrg 
   1068      1.1  mrg void
   1069      1.1  mrg fft (struct fft_param_t *p)
   1070      1.1  mrg {
   1071      1.1  mrg   mp_size_t  size;
   1072      1.1  mrg   int        k, idx, initial_k;
   1073      1.1  mrg 
   1074      1.1  mrg   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
   1075      1.1  mrg 
   1076      1.1  mrg #if 1
   1077      1.1  mrg   {
   1078      1.1  mrg     /* Use plain one() mechanism, for some reasonable initial values of k.  The
   1079      1.1  mrg        advantage is that we don't depend on mpn_fft_table3, which can therefore
   1080      1.1  mrg        leave it completely uninitialized.  */
   1081      1.1  mrg 
   1082      1.1  mrg     static struct param_t param;
   1083      1.1  mrg     mp_size_t thres, best_thres;
   1084      1.1  mrg     int best_k;
   1085      1.1  mrg     char buf[20];
   1086      1.1  mrg 
   1087      1.1  mrg     best_thres = MP_SIZE_T_MAX;
   1088      1.1  mrg     best_k = -1;
   1089      1.1  mrg 
   1090      1.1  mrg     for (k = 5; k <= 7; k++)
   1091      1.1  mrg       {
   1092      1.1  mrg 	param.name = p->modf_threshold_name;
   1093      1.1  mrg 	param.min_size = 100;
   1094      1.1  mrg 	param.max_size = 2000;
   1095      1.1  mrg 	param.function  = p->mul_function;
   1096      1.1  mrg 	param.step_factor = 0.0;
   1097      1.1  mrg 	param.step = 4;
   1098      1.1  mrg 	param.function2 = p->mul_modf_function;
   1099      1.1  mrg 	param.noprint = 1;
   1100      1.1  mrg 	s.r = k;
   1101      1.1  mrg 	one (&thres, &param);
   1102      1.1  mrg 	if (thres < best_thres)
   1103      1.1  mrg 	  {
   1104      1.1  mrg 	    best_thres = thres;
   1105      1.1  mrg 	    best_k = k;
   1106      1.1  mrg 	  }
   1107      1.1  mrg       }
   1108      1.1  mrg 
   1109      1.1  mrg     *(p->p_modf_threshold) = best_thres;
   1110      1.1  mrg     sprintf (buf, "k = %d", best_k);
   1111      1.1  mrg     print_define_remark (p->modf_threshold_name, best_thres, buf);
   1112      1.1  mrg     initial_k = best_k;
   1113      1.1  mrg   }
   1114      1.1  mrg #else
   1115      1.1  mrg   size = p->first_size;
   1116      1.1  mrg   for (;;)
   1117      1.1  mrg     {
   1118      1.1  mrg       double  tk, tm;
   1119      1.1  mrg 
   1120      1.1  mrg       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
   1121      1.1  mrg       k = mpn_fft_best_k (size, p->sqr);
   1122      1.1  mrg 
   1123      1.1  mrg       if (size >= p->max_size)
   1124      1.1  mrg         break;
   1125      1.1  mrg 
   1126      1.1  mrg       s.size = size + fft_step_size (k) / 2;
   1127      1.1  mrg       s.r = k;
   1128      1.1  mrg       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
   1129      1.1  mrg       if (tk == -1.0)
   1130      1.1  mrg         abort ();
   1131      1.1  mrg 
   1132      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1133      1.1  mrg       if (tm == -1.0)
   1134      1.1  mrg         abort ();
   1135      1.1  mrg 
   1136      1.1  mrg       if (option_trace >= 2)
   1137      1.1  mrg         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
   1138      1.1  mrg                 (long) size,
   1139      1.1  mrg                 (long) size + fft_step_size (k) / 2, k, tk,
   1140      1.1  mrg                 (long) s.size, tm);
   1141      1.1  mrg 
   1142      1.1  mrg       if (tk < tm)
   1143      1.1  mrg         {
   1144      1.1  mrg 	  *p->p_modf_threshold = s.size;
   1145      1.1  mrg 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
   1146      1.1  mrg 	  break;
   1147      1.1  mrg         }
   1148      1.1  mrg     }
   1149      1.1  mrg   initial_k = ?;
   1150      1.1  mrg #endif
   1151      1.1  mrg 
   1152      1.1  mrg   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
   1153      1.1  mrg 
   1154      1.1  mrg   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
   1155      1.1  mrg   printf ("#define %s_SIZE %d\n", p->table_name, idx);
   1156      1.1  mrg 
   1157      1.1  mrg   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
   1158      1.1  mrg 
   1159      1.1  mrg   size = 2 * *p->p_modf_threshold;	/* OK? */
   1160      1.1  mrg   for (;;)
   1161      1.1  mrg     {
   1162      1.1  mrg       double  tk, tm;
   1163      1.1  mrg       mp_size_t mulmod_size, mul_size;;
   1164      1.1  mrg 
   1165      1.1  mrg       if (size >= p->max_size)
   1166      1.1  mrg         break;
   1167      1.1  mrg 
   1168      1.1  mrg       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
   1169      1.1  mrg       mul_size = (size + mulmod_size) / 2;	/* middle of step */
   1170      1.1  mrg 
   1171      1.1  mrg       s.size = mulmod_size;
   1172      1.1  mrg       tk = tuneup_measure (p->function, NULL, &s);
   1173      1.1  mrg       if (tk == -1.0)
   1174      1.1  mrg         abort ();
   1175      1.1  mrg 
   1176      1.1  mrg       s.size = mul_size;
   1177      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1178      1.1  mrg       if (tm == -1.0)
   1179      1.1  mrg         abort ();
   1180      1.1  mrg 
   1181      1.1  mrg       if (option_trace >= 2)
   1182      1.1  mrg         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
   1183      1.1  mrg                 (long) size,
   1184      1.1  mrg                 (long) mulmod_size, tk,
   1185      1.1  mrg                 (long) mul_size, tm);
   1186      1.1  mrg 
   1187      1.1  mrg       size = mulmod_size;
   1188      1.1  mrg 
   1189      1.1  mrg       if (tk < tm)
   1190      1.1  mrg         {
   1191      1.1  mrg 	  *p->p_threshold = s.size;
   1192      1.1  mrg 	  print_define (p->threshold_name, *p->p_threshold);
   1193      1.1  mrg 	  break;
   1194      1.1  mrg         }
   1195      1.1  mrg     }
   1196      1.1  mrg }
   1197      1.1  mrg 
   1198      1.1  mrg 
   1199      1.1  mrg 
   1200      1.1  mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
   1201      1.1  mrg    giving wrong results.  */
   1202      1.1  mrg void
   1203      1.1  mrg tune_mul_n (void)
   1204      1.1  mrg {
   1205      1.1  mrg   static struct param_t  param;
   1206  1.1.1.2  mrg   mp_size_t next_toom_start;
   1207  1.1.1.2  mrg   int something_changed;
   1208      1.1  mrg 
   1209      1.1  mrg   param.function = speed_mpn_mul_n;
   1210      1.1  mrg 
   1211      1.1  mrg   param.name = "MUL_TOOM22_THRESHOLD";
   1212      1.1  mrg   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
   1213      1.1  mrg   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
   1214      1.1  mrg   one (&mul_toom22_threshold, &param);
   1215      1.1  mrg 
   1216  1.1.1.2  mrg   param.noprint = 1;
   1217  1.1.1.2  mrg 
   1218  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1219  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1220  1.1.1.2  mrg   something_changed = 1;
   1221  1.1.1.2  mrg   while (something_changed)
   1222  1.1.1.2  mrg     {
   1223  1.1.1.2  mrg       something_changed = 0;
   1224  1.1.1.2  mrg 
   1225  1.1.1.2  mrg 	next_toom_start = mul_toom22_threshold;
   1226  1.1.1.2  mrg 
   1227  1.1.1.2  mrg 	if (mul_toom33_threshold != 0)
   1228  1.1.1.2  mrg 	  {
   1229  1.1.1.2  mrg 	    param.name = "MUL_TOOM33_THRESHOLD";
   1230  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
   1231  1.1.1.2  mrg 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
   1232  1.1.1.2  mrg 	    one (&mul_toom33_threshold, &param);
   1233  1.1.1.2  mrg 
   1234  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
   1235  1.1.1.2  mrg 	      {
   1236  1.1.1.2  mrg 		mul_toom33_threshold = 0;
   1237  1.1.1.2  mrg 		something_changed = 1;
   1238  1.1.1.2  mrg 	      }
   1239  1.1.1.2  mrg 	  }
   1240  1.1.1.2  mrg 
   1241  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
   1242  1.1.1.2  mrg 
   1243  1.1.1.2  mrg 	if (mul_toom44_threshold != 0)
   1244  1.1.1.2  mrg 	  {
   1245  1.1.1.2  mrg 	    param.name = "MUL_TOOM44_THRESHOLD";
   1246  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
   1247  1.1.1.2  mrg 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
   1248  1.1.1.2  mrg 	    one (&mul_toom44_threshold, &param);
   1249  1.1.1.2  mrg 
   1250  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
   1251  1.1.1.2  mrg 	      {
   1252  1.1.1.2  mrg 		mul_toom44_threshold = 0;
   1253  1.1.1.2  mrg 		something_changed = 1;
   1254  1.1.1.2  mrg 	      }
   1255  1.1.1.2  mrg 	  }
   1256  1.1.1.2  mrg 
   1257  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
   1258  1.1.1.2  mrg 
   1259  1.1.1.2  mrg 	if (mul_toom6h_threshold != 0)
   1260  1.1.1.2  mrg 	  {
   1261  1.1.1.2  mrg 	    param.name = "MUL_TOOM6H_THRESHOLD";
   1262  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
   1263  1.1.1.2  mrg 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
   1264  1.1.1.2  mrg 	    one (&mul_toom6h_threshold, &param);
   1265  1.1.1.2  mrg 
   1266  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
   1267  1.1.1.2  mrg 	      {
   1268  1.1.1.2  mrg 		mul_toom6h_threshold = 0;
   1269  1.1.1.2  mrg 		something_changed = 1;
   1270  1.1.1.2  mrg 	      }
   1271  1.1.1.2  mrg 	  }
   1272  1.1.1.2  mrg 
   1273  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
   1274  1.1.1.2  mrg 
   1275  1.1.1.2  mrg 	if (mul_toom8h_threshold != 0)
   1276  1.1.1.2  mrg 	  {
   1277  1.1.1.2  mrg 	    param.name = "MUL_TOOM8H_THRESHOLD";
   1278  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
   1279  1.1.1.2  mrg 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
   1280  1.1.1.2  mrg 	    one (&mul_toom8h_threshold, &param);
   1281  1.1.1.2  mrg 
   1282  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
   1283  1.1.1.2  mrg 	      {
   1284  1.1.1.2  mrg 		mul_toom8h_threshold = 0;
   1285  1.1.1.2  mrg 		something_changed = 1;
   1286  1.1.1.2  mrg 	      }
   1287  1.1.1.2  mrg 	  }
   1288  1.1.1.2  mrg     }
   1289  1.1.1.2  mrg 
   1290  1.1.1.2  mrg     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
   1291  1.1.1.2  mrg     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
   1292  1.1.1.2  mrg     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
   1293  1.1.1.2  mrg     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
   1294      1.1  mrg 
   1295      1.1  mrg   /* disabled until tuned */
   1296      1.1  mrg   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1297      1.1  mrg }
   1298      1.1  mrg 
   1299      1.1  mrg void
   1300      1.1  mrg tune_mul (void)
   1301      1.1  mrg {
   1302      1.1  mrg   static struct param_t  param;
   1303      1.1  mrg   mp_size_t thres;
   1304      1.1  mrg 
   1305      1.1  mrg   param.noprint = 1;
   1306      1.1  mrg 
   1307      1.1  mrg   param.function = speed_mpn_toom32_for_toom43_mul;
   1308      1.1  mrg   param.function2 = speed_mpn_toom43_for_toom32_mul;
   1309      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
   1310  1.1.1.2  mrg   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
   1311      1.1  mrg   one (&thres, &param);
   1312  1.1.1.2  mrg   mul_toom32_to_toom43_threshold = thres * 17 / 24;
   1313      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
   1314      1.1  mrg 
   1315      1.1  mrg   param.function = speed_mpn_toom32_for_toom53_mul;
   1316      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom32_mul;
   1317      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
   1318  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
   1319      1.1  mrg   one (&thres, &param);
   1320  1.1.1.2  mrg   mul_toom32_to_toom53_threshold = thres * 19 / 30;
   1321      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
   1322      1.1  mrg 
   1323      1.1  mrg   param.function = speed_mpn_toom42_for_toom53_mul;
   1324      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom42_mul;
   1325      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
   1326  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
   1327      1.1  mrg   one (&thres, &param);
   1328  1.1.1.2  mrg   mul_toom42_to_toom53_threshold = thres * 11 / 20;
   1329      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
   1330      1.1  mrg 
   1331      1.1  mrg   param.function = speed_mpn_toom42_mul;
   1332      1.1  mrg   param.function2 = speed_mpn_toom63_mul;
   1333      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
   1334  1.1.1.2  mrg   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
   1335      1.1  mrg   one (&thres, &param);
   1336  1.1.1.2  mrg   mul_toom42_to_toom63_threshold = thres / 2;
   1337      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
   1338  1.1.1.2  mrg 
   1339  1.1.1.2  mrg   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
   1340  1.1.1.2  mrg   param.function = speed_mpn_toom43_for_toom54_mul;
   1341  1.1.1.2  mrg   param.function2 = speed_mpn_toom54_for_toom43_mul;
   1342  1.1.1.2  mrg   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
   1343  1.1.1.2  mrg   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
   1344  1.1.1.2  mrg   one (&thres, &param);
   1345  1.1.1.2  mrg   mul_toom43_to_toom54_threshold = thres * 5 / 6;
   1346  1.1.1.2  mrg   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
   1347      1.1  mrg }
   1348      1.1  mrg 
   1349      1.1  mrg 
   1350      1.1  mrg void
   1351      1.1  mrg tune_mullo (void)
   1352      1.1  mrg {
   1353      1.1  mrg   static struct param_t  param;
   1354      1.1  mrg 
   1355      1.1  mrg   param.function = speed_mpn_mullo_n;
   1356      1.1  mrg 
   1357      1.1  mrg   param.name = "MULLO_BASECASE_THRESHOLD";
   1358      1.1  mrg   param.min_size = 1;
   1359      1.1  mrg   param.min_is_always = 1;
   1360      1.1  mrg   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
   1361      1.1  mrg   param.stop_factor = 1.5;
   1362      1.1  mrg   param.noprint = 1;
   1363      1.1  mrg   one (&mullo_basecase_threshold, &param);
   1364      1.1  mrg 
   1365      1.1  mrg   param.name = "MULLO_DC_THRESHOLD";
   1366      1.1  mrg   param.min_size = 8;
   1367      1.1  mrg   param.min_is_always = 0;
   1368      1.1  mrg   param.max_size = 1000;
   1369      1.1  mrg   one (&mullo_dc_threshold, &param);
   1370      1.1  mrg 
   1371      1.1  mrg   if (mullo_basecase_threshold >= mullo_dc_threshold)
   1372      1.1  mrg     {
   1373      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
   1374      1.1  mrg       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
   1375      1.1  mrg     }
   1376      1.1  mrg   else
   1377      1.1  mrg     {
   1378      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
   1379      1.1  mrg       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
   1380      1.1  mrg     }
   1381      1.1  mrg 
   1382  1.1.1.3  mrg   if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
   1383  1.1.1.3  mrg     {
   1384  1.1.1.3  mrg       param.name = "MULLO_MUL_N_THRESHOLD";
   1385  1.1.1.3  mrg       param.min_size = mullo_dc_threshold;
   1386  1.1.1.3  mrg       param.max_size = 2 * mul_fft_threshold;
   1387  1.1.1.3  mrg       param.noprint = 0;
   1388  1.1.1.3  mrg       param.step_factor = 0.03;
   1389  1.1.1.3  mrg       one (&mullo_mul_n_threshold, &param);
   1390  1.1.1.3  mrg     }
   1391  1.1.1.3  mrg   else
   1392  1.1.1.3  mrg     print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
   1393  1.1.1.3  mrg 			 "without FFT use mullo forever");
   1394  1.1.1.3  mrg }
   1395  1.1.1.3  mrg 
   1396  1.1.1.3  mrg void
   1397  1.1.1.3  mrg tune_sqrlo (void)
   1398  1.1.1.3  mrg {
   1399  1.1.1.3  mrg   static struct param_t  param;
   1400  1.1.1.3  mrg 
   1401  1.1.1.3  mrg   param.function = speed_mpn_sqrlo;
   1402  1.1.1.3  mrg 
   1403  1.1.1.3  mrg   param.name = "SQRLO_BASECASE_THRESHOLD";
   1404  1.1.1.3  mrg   param.min_size = 1;
   1405  1.1.1.3  mrg   param.min_is_always = 1;
   1406  1.1.1.3  mrg   param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
   1407  1.1.1.3  mrg   param.stop_factor = 1.5;
   1408  1.1.1.3  mrg   param.noprint = 1;
   1409  1.1.1.3  mrg   one (&sqrlo_basecase_threshold, &param);
   1410  1.1.1.3  mrg 
   1411  1.1.1.3  mrg   param.name = "SQRLO_DC_THRESHOLD";
   1412  1.1.1.3  mrg   param.min_size = 8;
   1413  1.1.1.3  mrg   param.min_is_always = 0;
   1414  1.1.1.3  mrg   param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
   1415  1.1.1.3  mrg   one (&sqrlo_dc_threshold, &param);
   1416  1.1.1.3  mrg 
   1417  1.1.1.3  mrg   if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
   1418  1.1.1.3  mrg     {
   1419  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
   1420  1.1.1.3  mrg       print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
   1421  1.1.1.3  mrg     }
   1422  1.1.1.3  mrg   else
   1423  1.1.1.3  mrg     {
   1424  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
   1425  1.1.1.3  mrg       print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
   1426  1.1.1.3  mrg     }
   1427  1.1.1.3  mrg 
   1428  1.1.1.3  mrg   if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
   1429  1.1.1.3  mrg     {
   1430  1.1.1.3  mrg       param.name = "SQRLO_SQR_THRESHOLD";
   1431  1.1.1.3  mrg       param.min_size = sqrlo_dc_threshold;
   1432  1.1.1.3  mrg       param.max_size = 2 * sqr_fft_threshold;
   1433  1.1.1.3  mrg       param.noprint = 0;
   1434  1.1.1.3  mrg       param.step_factor = 0.03;
   1435  1.1.1.3  mrg       one (&sqrlo_sqr_threshold, &param);
   1436  1.1.1.3  mrg     }
   1437  1.1.1.3  mrg   else
   1438  1.1.1.3  mrg     print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
   1439  1.1.1.3  mrg 			 "without FFT use sqrlo forever");
   1440      1.1  mrg }
   1441      1.1  mrg 
   1442      1.1  mrg void
   1443  1.1.1.2  mrg tune_mulmid (void)
   1444  1.1.1.2  mrg {
   1445  1.1.1.2  mrg   static struct param_t  param;
   1446  1.1.1.2  mrg 
   1447  1.1.1.2  mrg   param.name = "MULMID_TOOM42_THRESHOLD";
   1448  1.1.1.2  mrg   param.function = speed_mpn_mulmid_n;
   1449  1.1.1.2  mrg   param.min_size = 4;
   1450  1.1.1.2  mrg   param.max_size = 100;
   1451  1.1.1.2  mrg   one (&mulmid_toom42_threshold, &param);
   1452  1.1.1.2  mrg }
   1453  1.1.1.2  mrg 
   1454  1.1.1.2  mrg void
   1455      1.1  mrg tune_mulmod_bnm1 (void)
   1456      1.1  mrg {
   1457      1.1  mrg   static struct param_t  param;
   1458      1.1  mrg 
   1459      1.1  mrg   param.name = "MULMOD_BNM1_THRESHOLD";
   1460      1.1  mrg   param.function = speed_mpn_mulmod_bnm1;
   1461      1.1  mrg   param.min_size = 4;
   1462      1.1  mrg   param.max_size = 100;
   1463      1.1  mrg   one (&mulmod_bnm1_threshold, &param);
   1464      1.1  mrg }
   1465      1.1  mrg 
   1466      1.1  mrg void
   1467      1.1  mrg tune_sqrmod_bnm1 (void)
   1468      1.1  mrg {
   1469      1.1  mrg   static struct param_t  param;
   1470      1.1  mrg 
   1471      1.1  mrg   param.name = "SQRMOD_BNM1_THRESHOLD";
   1472      1.1  mrg   param.function = speed_mpn_sqrmod_bnm1;
   1473      1.1  mrg   param.min_size = 4;
   1474      1.1  mrg   param.max_size = 100;
   1475      1.1  mrg   one (&sqrmod_bnm1_threshold, &param);
   1476      1.1  mrg }
   1477      1.1  mrg 
   1478      1.1  mrg 
   1479      1.1  mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
   1480      1.1  mrg    is faster only at size==2 then we don't want to bother with extra code
   1481      1.1  mrg    just for that.  Start karatsuba from 4 same as MUL above.  */
   1482      1.1  mrg 
   1483      1.1  mrg void
   1484      1.1  mrg tune_sqr (void)
   1485      1.1  mrg {
   1486      1.1  mrg   /* disabled until tuned */
   1487      1.1  mrg   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1488      1.1  mrg 
   1489      1.1  mrg   if (HAVE_NATIVE_mpn_sqr_basecase)
   1490      1.1  mrg     {
   1491      1.1  mrg       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
   1492      1.1  mrg       sqr_basecase_threshold = 0;
   1493      1.1  mrg     }
   1494      1.1  mrg   else
   1495      1.1  mrg     {
   1496      1.1  mrg       static struct param_t  param;
   1497      1.1  mrg       param.name = "SQR_BASECASE_THRESHOLD";
   1498      1.1  mrg       param.function = speed_mpn_sqr;
   1499      1.1  mrg       param.min_size = 3;
   1500      1.1  mrg       param.min_is_always = 1;
   1501      1.1  mrg       param.max_size = TUNE_SQR_TOOM2_MAX;
   1502      1.1  mrg       param.noprint = 1;
   1503      1.1  mrg       one (&sqr_basecase_threshold, &param);
   1504      1.1  mrg     }
   1505      1.1  mrg 
   1506      1.1  mrg   {
   1507      1.1  mrg     static struct param_t  param;
   1508      1.1  mrg     param.name = "SQR_TOOM2_THRESHOLD";
   1509      1.1  mrg     param.function = speed_mpn_sqr;
   1510      1.1  mrg     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
   1511      1.1  mrg     param.max_size = TUNE_SQR_TOOM2_MAX;
   1512      1.1  mrg     param.noprint = 1;
   1513      1.1  mrg     one (&sqr_toom2_threshold, &param);
   1514      1.1  mrg 
   1515      1.1  mrg     if (! HAVE_NATIVE_mpn_sqr_basecase
   1516      1.1  mrg         && sqr_toom2_threshold < sqr_basecase_threshold)
   1517      1.1  mrg       {
   1518      1.1  mrg         /* Karatsuba becomes faster than mul_basecase before
   1519      1.1  mrg            sqr_basecase does.  Arrange for the expression
   1520      1.1  mrg            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
   1521      1.1  mrg            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
   1522      1.1  mrg            SQR_TOOM2_THRESHOLD to zero, making
   1523      1.1  mrg            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
   1524      1.1  mrg 
   1525      1.1  mrg         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
   1526      1.1  mrg         SQR_TOOM2_THRESHOLD = 0;
   1527      1.1  mrg 
   1528      1.1  mrg         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
   1529      1.1  mrg                              "toom2");
   1530      1.1  mrg         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
   1531      1.1  mrg                              "never sqr_basecase");
   1532      1.1  mrg       }
   1533      1.1  mrg     else
   1534      1.1  mrg       {
   1535      1.1  mrg         if (! HAVE_NATIVE_mpn_sqr_basecase)
   1536      1.1  mrg           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
   1537      1.1  mrg         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
   1538      1.1  mrg       }
   1539      1.1  mrg   }
   1540      1.1  mrg 
   1541      1.1  mrg   {
   1542      1.1  mrg     static struct param_t  param;
   1543  1.1.1.2  mrg     mp_size_t next_toom_start;
   1544  1.1.1.2  mrg     int something_changed;
   1545      1.1  mrg 
   1546      1.1  mrg     param.function = speed_mpn_sqr;
   1547  1.1.1.2  mrg     param.noprint = 1;
   1548      1.1  mrg 
   1549  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1550  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1551  1.1.1.2  mrg     something_changed = 1;
   1552  1.1.1.2  mrg     while (something_changed)
   1553  1.1.1.2  mrg       {
   1554  1.1.1.2  mrg 	something_changed = 0;
   1555  1.1.1.2  mrg 
   1556  1.1.1.2  mrg 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
   1557  1.1.1.2  mrg 
   1558  1.1.1.2  mrg 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
   1559  1.1.1.2  mrg 	param.name = "SQR_TOOM3_THRESHOLD";
   1560  1.1.1.2  mrg 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
   1561  1.1.1.2  mrg 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
   1562  1.1.1.2  mrg 	one (&sqr_toom3_threshold, &param);
   1563  1.1.1.2  mrg 
   1564  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
   1565  1.1.1.2  mrg 
   1566  1.1.1.2  mrg 	if (sqr_toom4_threshold != 0)
   1567  1.1.1.2  mrg 	  {
   1568  1.1.1.2  mrg 	    param.name = "SQR_TOOM4_THRESHOLD";
   1569  1.1.1.2  mrg 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
   1570  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
   1571  1.1.1.2  mrg 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
   1572  1.1.1.2  mrg 	    one (&sqr_toom4_threshold, &param);
   1573  1.1.1.2  mrg 
   1574  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
   1575  1.1.1.2  mrg 	      {
   1576  1.1.1.2  mrg 		sqr_toom4_threshold = 0;
   1577  1.1.1.2  mrg 		something_changed = 1;
   1578  1.1.1.2  mrg 	      }
   1579  1.1.1.2  mrg 	  }
   1580  1.1.1.2  mrg 
   1581  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
   1582  1.1.1.2  mrg 
   1583  1.1.1.2  mrg 	if (sqr_toom6_threshold != 0)
   1584  1.1.1.2  mrg 	  {
   1585  1.1.1.2  mrg 	    param.name = "SQR_TOOM6_THRESHOLD";
   1586  1.1.1.2  mrg 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
   1587  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
   1588  1.1.1.2  mrg 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
   1589  1.1.1.2  mrg 	    one (&sqr_toom6_threshold, &param);
   1590  1.1.1.2  mrg 
   1591  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
   1592  1.1.1.2  mrg 	      {
   1593  1.1.1.2  mrg 		sqr_toom6_threshold = 0;
   1594  1.1.1.2  mrg 		something_changed = 1;
   1595  1.1.1.2  mrg 	      }
   1596  1.1.1.2  mrg 	  }
   1597  1.1.1.2  mrg 
   1598  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
   1599  1.1.1.2  mrg 
   1600  1.1.1.2  mrg 	if (sqr_toom8_threshold != 0)
   1601  1.1.1.2  mrg 	  {
   1602  1.1.1.2  mrg 	    param.name = "SQR_TOOM8_THRESHOLD";
   1603  1.1.1.2  mrg 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
   1604  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
   1605  1.1.1.2  mrg 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
   1606  1.1.1.2  mrg 	    one (&sqr_toom8_threshold, &param);
   1607  1.1.1.2  mrg 
   1608  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
   1609  1.1.1.2  mrg 	      {
   1610  1.1.1.2  mrg 		sqr_toom8_threshold = 0;
   1611  1.1.1.2  mrg 		something_changed = 1;
   1612  1.1.1.2  mrg 	      }
   1613  1.1.1.2  mrg 	  }
   1614  1.1.1.2  mrg       }
   1615  1.1.1.2  mrg 
   1616  1.1.1.2  mrg     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
   1617  1.1.1.2  mrg     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
   1618  1.1.1.2  mrg     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
   1619  1.1.1.2  mrg     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
   1620      1.1  mrg   }
   1621      1.1  mrg }
   1622      1.1  mrg 
   1623      1.1  mrg 
   1624      1.1  mrg void
   1625      1.1  mrg tune_dc_div (void)
   1626      1.1  mrg {
   1627      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1628      1.1  mrg   {
   1629      1.1  mrg     static struct param_t  param;
   1630      1.1  mrg     param.name = "DC_DIV_QR_THRESHOLD";
   1631      1.1  mrg     param.function = speed_mpn_sbpi1_div_qr;
   1632      1.1  mrg     param.function2 = speed_mpn_dcpi1_div_qr;
   1633      1.1  mrg     param.min_size = 6;
   1634      1.1  mrg     one (&dc_div_qr_threshold, &param);
   1635      1.1  mrg   }
   1636      1.1  mrg   {
   1637      1.1  mrg     static struct param_t  param;
   1638      1.1  mrg     param.name = "DC_DIVAPPR_Q_THRESHOLD";
   1639      1.1  mrg     param.function = speed_mpn_sbpi1_divappr_q;
   1640      1.1  mrg     param.function2 = speed_mpn_dcpi1_divappr_q;
   1641      1.1  mrg     param.min_size = 6;
   1642      1.1  mrg     one (&dc_divappr_q_threshold, &param);
   1643      1.1  mrg   }
   1644      1.1  mrg }
   1645      1.1  mrg 
   1646      1.1  mrg static double
   1647      1.1  mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
   1648      1.1  mrg {
   1649      1.1  mrg   if (s->size < DC_DIV_QR_THRESHOLD)
   1650      1.1  mrg     return speed_mpn_sbpi1_div_qr (s);
   1651      1.1  mrg   else
   1652      1.1  mrg     return speed_mpn_dcpi1_div_qr (s);
   1653      1.1  mrg }
   1654      1.1  mrg 
   1655      1.1  mrg void
   1656      1.1  mrg tune_mu_div (void)
   1657      1.1  mrg {
   1658      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1659      1.1  mrg   {
   1660      1.1  mrg     static struct param_t  param;
   1661      1.1  mrg     param.name = "MU_DIV_QR_THRESHOLD";
   1662      1.1  mrg     param.function = speed_mpn_dcpi1_div_qr;
   1663      1.1  mrg     param.function2 = speed_mpn_mu_div_qr;
   1664  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1665      1.1  mrg     param.max_size = 5000;
   1666      1.1  mrg     param.step_factor = 0.02;
   1667      1.1  mrg     one (&mu_div_qr_threshold, &param);
   1668      1.1  mrg   }
   1669      1.1  mrg   {
   1670      1.1  mrg     static struct param_t  param;
   1671      1.1  mrg     param.name = "MU_DIVAPPR_Q_THRESHOLD";
   1672      1.1  mrg     param.function = speed_mpn_dcpi1_divappr_q;
   1673      1.1  mrg     param.function2 = speed_mpn_mu_divappr_q;
   1674  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1675      1.1  mrg     param.max_size = 5000;
   1676      1.1  mrg     param.step_factor = 0.02;
   1677      1.1  mrg     one (&mu_divappr_q_threshold, &param);
   1678      1.1  mrg   }
   1679      1.1  mrg   {
   1680      1.1  mrg     static struct param_t  param;
   1681      1.1  mrg     param.name = "MUPI_DIV_QR_THRESHOLD";
   1682      1.1  mrg     param.function = speed_mpn_sbordcpi1_div_qr;
   1683      1.1  mrg     param.function2 = speed_mpn_mupi_div_qr;
   1684      1.1  mrg     param.min_size = 6;
   1685      1.1  mrg     param.min_is_always = 1;
   1686      1.1  mrg     param.max_size = 1000;
   1687      1.1  mrg     param.step_factor = 0.02;
   1688      1.1  mrg     one (&mupi_div_qr_threshold, &param);
   1689      1.1  mrg   }
   1690      1.1  mrg }
   1691      1.1  mrg 
   1692      1.1  mrg void
   1693      1.1  mrg tune_dc_bdiv (void)
   1694      1.1  mrg {
   1695      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1696      1.1  mrg   {
   1697      1.1  mrg     static struct param_t  param;
   1698      1.1  mrg     param.name = "DC_BDIV_QR_THRESHOLD";
   1699      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_qr;
   1700      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_qr;
   1701      1.1  mrg     param.min_size = 4;
   1702      1.1  mrg     one (&dc_bdiv_qr_threshold, &param);
   1703      1.1  mrg   }
   1704      1.1  mrg   {
   1705      1.1  mrg     static struct param_t  param;
   1706      1.1  mrg     param.name = "DC_BDIV_Q_THRESHOLD";
   1707      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_q;
   1708      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_q;
   1709      1.1  mrg     param.min_size = 4;
   1710      1.1  mrg     one (&dc_bdiv_q_threshold, &param);
   1711      1.1  mrg   }
   1712      1.1  mrg }
   1713      1.1  mrg 
   1714      1.1  mrg void
   1715      1.1  mrg tune_mu_bdiv (void)
   1716      1.1  mrg {
   1717      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1718      1.1  mrg   {
   1719      1.1  mrg     static struct param_t  param;
   1720      1.1  mrg     param.name = "MU_BDIV_QR_THRESHOLD";
   1721      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_qr;
   1722      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_qr;
   1723  1.1.1.3  mrg     param.min_size = dc_bdiv_qr_threshold;
   1724      1.1  mrg     param.max_size = 5000;
   1725      1.1  mrg     param.step_factor = 0.02;
   1726      1.1  mrg     one (&mu_bdiv_qr_threshold, &param);
   1727      1.1  mrg   }
   1728      1.1  mrg   {
   1729      1.1  mrg     static struct param_t  param;
   1730      1.1  mrg     param.name = "MU_BDIV_Q_THRESHOLD";
   1731      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_q;
   1732      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_q;
   1733  1.1.1.3  mrg     param.min_size = dc_bdiv_q_threshold;
   1734      1.1  mrg     param.max_size = 5000;
   1735      1.1  mrg     param.step_factor = 0.02;
   1736      1.1  mrg     one (&mu_bdiv_q_threshold, &param);
   1737      1.1  mrg   }
   1738      1.1  mrg }
   1739      1.1  mrg 
   1740      1.1  mrg void
   1741      1.1  mrg tune_invertappr (void)
   1742      1.1  mrg {
   1743      1.1  mrg   static struct param_t  param;
   1744      1.1  mrg 
   1745      1.1  mrg   param.function = speed_mpn_ni_invertappr;
   1746      1.1  mrg   param.name = "INV_MULMOD_BNM1_THRESHOLD";
   1747  1.1.1.3  mrg   param.min_size = 5;
   1748      1.1  mrg   one (&inv_mulmod_bnm1_threshold, &param);
   1749      1.1  mrg 
   1750      1.1  mrg   param.function = speed_mpn_invertappr;
   1751      1.1  mrg   param.name = "INV_NEWTON_THRESHOLD";
   1752  1.1.1.3  mrg   param.min_size = 5;
   1753      1.1  mrg   one (&inv_newton_threshold, &param);
   1754      1.1  mrg }
   1755      1.1  mrg 
   1756      1.1  mrg void
   1757      1.1  mrg tune_invert (void)
   1758      1.1  mrg {
   1759      1.1  mrg   static struct param_t  param;
   1760      1.1  mrg 
   1761      1.1  mrg   param.function = speed_mpn_invert;
   1762      1.1  mrg   param.name = "INV_APPR_THRESHOLD";
   1763  1.1.1.3  mrg   param.min_size = 5;
   1764      1.1  mrg   one (&inv_appr_threshold, &param);
   1765      1.1  mrg }
   1766      1.1  mrg 
   1767      1.1  mrg void
   1768      1.1  mrg tune_binvert (void)
   1769      1.1  mrg {
   1770      1.1  mrg   static struct param_t  param;
   1771      1.1  mrg 
   1772      1.1  mrg   param.function = speed_mpn_binvert;
   1773      1.1  mrg   param.name = "BINV_NEWTON_THRESHOLD";
   1774      1.1  mrg   param.min_size = 8;		/* pointless with smaller operands */
   1775      1.1  mrg   one (&binv_newton_threshold, &param);
   1776      1.1  mrg }
   1777      1.1  mrg 
   1778      1.1  mrg void
   1779      1.1  mrg tune_redc (void)
   1780      1.1  mrg {
   1781      1.1  mrg #define TUNE_REDC_2_MAX 100
   1782      1.1  mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
   1783      1.1  mrg #define WANT_REDC_2 1
   1784      1.1  mrg #endif
   1785      1.1  mrg 
   1786      1.1  mrg #if WANT_REDC_2
   1787      1.1  mrg   {
   1788      1.1  mrg     static struct param_t  param;
   1789      1.1  mrg     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1790      1.1  mrg     param.function = speed_mpn_redc_1;
   1791      1.1  mrg     param.function2 = speed_mpn_redc_2;
   1792      1.1  mrg     param.min_size = 1;
   1793      1.1  mrg     param.min_is_always = 1;
   1794      1.1  mrg     param.max_size = TUNE_REDC_2_MAX;
   1795      1.1  mrg     param.noprint = 1;
   1796  1.1.1.2  mrg     param.stop_factor = 1.5;
   1797      1.1  mrg     one (&redc_1_to_redc_2_threshold, &param);
   1798      1.1  mrg   }
   1799      1.1  mrg   {
   1800      1.1  mrg     static struct param_t  param;
   1801      1.1  mrg     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
   1802      1.1  mrg     param.function = speed_mpn_redc_2;
   1803      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1804      1.1  mrg     param.min_size = 16;
   1805      1.1  mrg     param.noprint = 1;
   1806      1.1  mrg     one (&redc_2_to_redc_n_threshold, &param);
   1807      1.1  mrg   }
   1808  1.1.1.2  mrg   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
   1809      1.1  mrg     {
   1810  1.1.1.2  mrg       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
   1811  1.1.1.2  mrg 
   1812  1.1.1.2  mrg       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
   1813  1.1.1.2  mrg 	 REDC_1_TO_REDC_2_THRESHOLD.  */
   1814  1.1.1.2  mrg       {
   1815  1.1.1.2  mrg 	static struct param_t  param;
   1816  1.1.1.2  mrg 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1817  1.1.1.2  mrg 	param.function = speed_mpn_redc_1;
   1818  1.1.1.2  mrg 	param.function2 = speed_mpn_redc_n;
   1819  1.1.1.2  mrg 	param.min_size = 16;
   1820  1.1.1.2  mrg 	param.noprint = 1;
   1821  1.1.1.2  mrg 	one (&redc_1_to_redc_2_threshold, &param);
   1822  1.1.1.2  mrg       }
   1823      1.1  mrg     }
   1824  1.1.1.2  mrg   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
   1825  1.1.1.2  mrg   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
   1826      1.1  mrg #else
   1827      1.1  mrg   {
   1828      1.1  mrg     static struct param_t  param;
   1829      1.1  mrg     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
   1830      1.1  mrg     param.function = speed_mpn_redc_1;
   1831      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1832      1.1  mrg     param.min_size = 16;
   1833      1.1  mrg     one (&redc_1_to_redc_n_threshold, &param);
   1834      1.1  mrg   }
   1835      1.1  mrg #endif
   1836      1.1  mrg }
   1837      1.1  mrg 
   1838      1.1  mrg void
   1839      1.1  mrg tune_matrix22_mul (void)
   1840      1.1  mrg {
   1841      1.1  mrg   static struct param_t  param;
   1842      1.1  mrg   param.name = "MATRIX22_STRASSEN_THRESHOLD";
   1843      1.1  mrg   param.function = speed_mpn_matrix22_mul;
   1844      1.1  mrg   param.min_size = 2;
   1845      1.1  mrg   one (&matrix22_strassen_threshold, &param);
   1846      1.1  mrg }
   1847      1.1  mrg 
   1848      1.1  mrg void
   1849      1.1  mrg tune_hgcd (void)
   1850      1.1  mrg {
   1851      1.1  mrg   static struct param_t  param;
   1852      1.1  mrg   param.name = "HGCD_THRESHOLD";
   1853      1.1  mrg   param.function = speed_mpn_hgcd;
   1854      1.1  mrg   /* We seem to get strange results for small sizes */
   1855      1.1  mrg   param.min_size = 30;
   1856      1.1  mrg   one (&hgcd_threshold, &param);
   1857      1.1  mrg }
   1858      1.1  mrg 
   1859      1.1  mrg void
   1860  1.1.1.2  mrg tune_hgcd_appr (void)
   1861  1.1.1.2  mrg {
   1862  1.1.1.2  mrg   static struct param_t  param;
   1863  1.1.1.2  mrg   param.name = "HGCD_APPR_THRESHOLD";
   1864  1.1.1.2  mrg   param.function = speed_mpn_hgcd_appr;
   1865  1.1.1.2  mrg   /* We seem to get strange results for small sizes */
   1866  1.1.1.2  mrg   param.min_size = 50;
   1867  1.1.1.2  mrg   param.stop_since_change = 150;
   1868  1.1.1.2  mrg   one (&hgcd_appr_threshold, &param);
   1869  1.1.1.2  mrg }
   1870  1.1.1.2  mrg 
   1871  1.1.1.2  mrg void
   1872  1.1.1.2  mrg tune_hgcd_reduce (void)
   1873  1.1.1.2  mrg {
   1874  1.1.1.2  mrg   static struct param_t  param;
   1875  1.1.1.2  mrg   param.name = "HGCD_REDUCE_THRESHOLD";
   1876  1.1.1.2  mrg   param.function = speed_mpn_hgcd_reduce;
   1877  1.1.1.2  mrg   param.min_size = 30;
   1878  1.1.1.2  mrg   param.max_size = 7000;
   1879  1.1.1.2  mrg   param.step_factor = 0.04;
   1880  1.1.1.2  mrg   one (&hgcd_reduce_threshold, &param);
   1881  1.1.1.2  mrg }
   1882  1.1.1.2  mrg 
   1883  1.1.1.2  mrg void
   1884      1.1  mrg tune_gcd_dc (void)
   1885      1.1  mrg {
   1886      1.1  mrg   static struct param_t  param;
   1887      1.1  mrg   param.name = "GCD_DC_THRESHOLD";
   1888      1.1  mrg   param.function = speed_mpn_gcd;
   1889      1.1  mrg   param.min_size = hgcd_threshold;
   1890      1.1  mrg   param.max_size = 3000;
   1891      1.1  mrg   param.step_factor = 0.02;
   1892      1.1  mrg   one (&gcd_dc_threshold, &param);
   1893      1.1  mrg }
   1894      1.1  mrg 
   1895      1.1  mrg void
   1896      1.1  mrg tune_gcdext_dc (void)
   1897      1.1  mrg {
   1898      1.1  mrg   static struct param_t  param;
   1899      1.1  mrg   param.name = "GCDEXT_DC_THRESHOLD";
   1900      1.1  mrg   param.function = speed_mpn_gcdext;
   1901      1.1  mrg   param.min_size = hgcd_threshold;
   1902      1.1  mrg   param.max_size = 3000;
   1903      1.1  mrg   param.step_factor = 0.02;
   1904      1.1  mrg   one (&gcdext_dc_threshold, &param);
   1905      1.1  mrg }
   1906      1.1  mrg 
   1907  1.1.1.2  mrg /* In tune_powm_sec we compute the table used by the win_size function.  The
   1908  1.1.1.2  mrg    cutoff points are in exponent bits, disregarding other operand sizes.  It is
   1909  1.1.1.3  mrg    not possible to use the one framework since it currently uses a granularity
   1910  1.1.1.2  mrg    of full limbs.
   1911  1.1.1.2  mrg */
   1912  1.1.1.2  mrg 
   1913  1.1.1.2  mrg /* This win_size replaces the variant in the powm code, allowing us to
   1914  1.1.1.2  mrg    control k in the k-ary algorithms.  */
   1915  1.1.1.2  mrg int winsize;
   1916  1.1.1.2  mrg int
   1917  1.1.1.2  mrg win_size (mp_bitcnt_t eb)
   1918  1.1.1.2  mrg {
   1919  1.1.1.2  mrg   return winsize;
   1920  1.1.1.2  mrg }
   1921  1.1.1.2  mrg 
   1922  1.1.1.2  mrg void
   1923  1.1.1.2  mrg tune_powm_sec (void)
   1924  1.1.1.2  mrg {
   1925  1.1.1.2  mrg   mp_size_t n;
   1926  1.1.1.2  mrg   int k, i;
   1927  1.1.1.2  mrg   mp_size_t itch;
   1928  1.1.1.2  mrg   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
   1929  1.1.1.2  mrg   const int n_max = 3000 / GMP_NUMB_BITS;
   1930  1.1.1.2  mrg   const int n_measurements = 5;
   1931  1.1.1.2  mrg   mp_ptr rp, bp, ep, mp, tp;
   1932  1.1.1.2  mrg   double ttab[n_measurements], tk, tkp1;
   1933  1.1.1.2  mrg   TMP_DECL;
   1934  1.1.1.2  mrg   TMP_MARK;
   1935  1.1.1.2  mrg 
   1936  1.1.1.2  mrg   possible_nbits_cutoff = 0;
   1937  1.1.1.2  mrg 
   1938  1.1.1.2  mrg   k = 1;
   1939  1.1.1.2  mrg 
   1940  1.1.1.2  mrg   winsize = 10;			/* the itch function needs this */
   1941  1.1.1.3  mrg   itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
   1942  1.1.1.2  mrg 
   1943  1.1.1.2  mrg   rp = TMP_ALLOC_LIMBS (n_max);
   1944  1.1.1.2  mrg   bp = TMP_ALLOC_LIMBS (n_max);
   1945  1.1.1.2  mrg   ep = TMP_ALLOC_LIMBS (n_max);
   1946  1.1.1.2  mrg   mp = TMP_ALLOC_LIMBS (n_max);
   1947  1.1.1.2  mrg   tp = TMP_ALLOC_LIMBS (itch);
   1948  1.1.1.2  mrg 
   1949  1.1.1.2  mrg   mpn_random (bp, n_max);
   1950  1.1.1.2  mrg   mpn_random (mp, n_max);
   1951  1.1.1.2  mrg   mp[0] |= 1;
   1952  1.1.1.2  mrg 
   1953  1.1.1.2  mrg /* How about taking the M operand size into account?
   1954  1.1.1.2  mrg 
   1955  1.1.1.2  mrg    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
   1956  1.1.1.2  mrg    B = O(M)).
   1957  1.1.1.2  mrg 
   1958  1.1.1.2  mrg    Using k-ary and no sliding window, the precomputation will need time
   1959  1.1.1.2  mrg    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
   1960  1.1.1.2  mrg    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
   1961  1.1.1.2  mrg 
   1962  1.1.1.2  mrg    An operation R=powm_sec(B,E,N) will take time like powm.
   1963  1.1.1.2  mrg 
   1964  1.1.1.2  mrg    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
   1965  1.1.1.2  mrg    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
   1966  1.1.1.2  mrg    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
   1967  1.1.1.2  mrg    table reads, respectively.  */
   1968  1.1.1.2  mrg 
   1969  1.1.1.2  mrg   printf ("#define POWM_SEC_TABLE  ");
   1970  1.1.1.2  mrg 
   1971  1.1.1.3  mrg   /* For nbits == 1, we should always use k == 1, so no need to tune
   1972  1.1.1.3  mrg      that. Starting with nbits == 2 also ensure that nbits always is
   1973  1.1.1.3  mrg      larger than the windowsize k+1. */
   1974  1.1.1.3  mrg   for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
   1975  1.1.1.2  mrg     {
   1976  1.1.1.2  mrg       n = (nbits - 1) / GMP_NUMB_BITS + 1;
   1977  1.1.1.2  mrg 
   1978  1.1.1.2  mrg       /* Generate E such that sliding-window for k and k+1 works equally
   1979  1.1.1.2  mrg 	 well/poorly (but sliding is not used in powm_sec, of course). */
   1980  1.1.1.2  mrg       for (i = 0; i < n; i++)
   1981  1.1.1.2  mrg 	ep[i] = ~CNST_LIMB(0);
   1982  1.1.1.2  mrg 
   1983  1.1.1.2  mrg       winsize = k;
   1984  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   1985  1.1.1.2  mrg 	{
   1986  1.1.1.2  mrg 	  speed_starttime ();
   1987  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   1988  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   1989  1.1.1.2  mrg 	}
   1990  1.1.1.2  mrg       tk = median (ttab, n_measurements);
   1991  1.1.1.2  mrg 
   1992  1.1.1.2  mrg       winsize = k + 1;
   1993  1.1.1.2  mrg       speed_starttime ();
   1994  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   1995  1.1.1.2  mrg 	{
   1996  1.1.1.2  mrg 	  speed_starttime ();
   1997  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   1998  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   1999  1.1.1.2  mrg 	}
   2000  1.1.1.2  mrg       tkp1 = median (ttab, n_measurements);
   2001  1.1.1.2  mrg /*
   2002  1.1.1.2  mrg       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
   2003  1.1.1.2  mrg       printf ("   %10.5f  %10.5f\n", tk, tkp1);
   2004  1.1.1.2  mrg */
   2005  1.1.1.2  mrg       if (tkp1 < tk)
   2006  1.1.1.2  mrg 	{
   2007  1.1.1.2  mrg 	  if (possible_nbits_cutoff)
   2008  1.1.1.2  mrg 	    {
   2009  1.1.1.2  mrg 	      /* Two consecutive sizes indicate k increase, obey.  */
   2010  1.1.1.3  mrg 
   2011  1.1.1.3  mrg 	      /* Must always have x[k] >= k */
   2012  1.1.1.3  mrg 	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
   2013  1.1.1.3  mrg 
   2014  1.1.1.2  mrg 	      if (k > 1)
   2015  1.1.1.2  mrg 		printf (",");
   2016  1.1.1.2  mrg 	      printf ("%ld", (long) possible_nbits_cutoff);
   2017  1.1.1.2  mrg 	      k++;
   2018  1.1.1.2  mrg 	      possible_nbits_cutoff = 0;
   2019  1.1.1.2  mrg 	    }
   2020  1.1.1.2  mrg 	  else
   2021  1.1.1.2  mrg 	    {
   2022  1.1.1.2  mrg 	      /* One measurement indicate k increase, save nbits for further
   2023  1.1.1.2  mrg 		 consideration.  */
   2024  1.1.1.3  mrg 	      /* The new larger k gets used for sizes > the cutoff
   2025  1.1.1.3  mrg 		 value, hence the cutoff should be one less than the
   2026  1.1.1.3  mrg 		 smallest size where it gives a speedup. */
   2027  1.1.1.3  mrg 	      possible_nbits_cutoff = nbits - 1;
   2028  1.1.1.2  mrg 	    }
   2029  1.1.1.2  mrg 	}
   2030  1.1.1.2  mrg       else
   2031  1.1.1.2  mrg 	possible_nbits_cutoff = 0;
   2032  1.1.1.2  mrg 
   2033  1.1.1.2  mrg       nbits_next = nbits * 65 / 64;
   2034  1.1.1.2  mrg       nbits = nbits_next + (nbits_next == nbits);
   2035  1.1.1.2  mrg     }
   2036  1.1.1.2  mrg   printf ("\n");
   2037  1.1.1.2  mrg   TMP_FREE;
   2038  1.1.1.2  mrg }
   2039  1.1.1.2  mrg 
   2040      1.1  mrg 
   2041      1.1  mrg /* size_extra==1 reflects the fact that with high<divisor one division is
   2042      1.1  mrg    always skipped.  Forcing high<divisor while testing ensures consistency
   2043      1.1  mrg    while stepping through sizes, ie. that size-1 divides will be done each
   2044      1.1  mrg    time.
   2045      1.1  mrg 
   2046      1.1  mrg    min_size==2 and min_is_always are used so that if plain division is only
   2047      1.1  mrg    better at size==1 then don't bother including that code just for that
   2048      1.1  mrg    case, instead go with preinv always and get a size saving.  */
   2049      1.1  mrg 
   2050      1.1  mrg #define DIV_1_PARAMS                    \
   2051      1.1  mrg   param.check_size = 256;               \
   2052      1.1  mrg   param.min_size = 2;                   \
   2053      1.1  mrg   param.min_is_always = 1;              \
   2054      1.1  mrg   param.data_high = DATA_HIGH_LT_R;     \
   2055      1.1  mrg   param.size_extra = 1;                 \
   2056      1.1  mrg   param.stop_factor = 2.0;
   2057      1.1  mrg 
   2058      1.1  mrg 
   2059  1.1.1.2  mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
   2060      1.1  mrg 
   2061      1.1  mrg void
   2062      1.1  mrg tune_divrem_1 (void)
   2063      1.1  mrg {
   2064      1.1  mrg   /* plain version by default */
   2065      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
   2066      1.1  mrg 
   2067      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2068      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2069      1.1  mrg      appear in gmp-mparam.h.  */
   2070      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2071      1.1  mrg     return;
   2072      1.1  mrg 
   2073      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2074      1.1  mrg     {
   2075      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2076      1.1  mrg                            "no preinv with nails");
   2077      1.1  mrg       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2078      1.1  mrg                            "no preinv with nails");
   2079      1.1  mrg       return;
   2080      1.1  mrg     }
   2081      1.1  mrg 
   2082      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2083      1.1  mrg     {
   2084      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
   2085      1.1  mrg       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
   2086      1.1  mrg       return;
   2087      1.1  mrg     }
   2088      1.1  mrg 
   2089      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
   2090      1.1  mrg 
   2091      1.1  mrg   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
   2092      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2093      1.1  mrg      is more important. */
   2094      1.1  mrg   {
   2095      1.1  mrg     static struct param_t  param;
   2096      1.1  mrg     param.name = "DIVREM_1_NORM_THRESHOLD";
   2097      1.1  mrg     DIV_1_PARAMS;
   2098      1.1  mrg     s.r = randlimb_norm ();
   2099      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2100      1.1  mrg     one (&divrem_1_norm_threshold, &param);
   2101      1.1  mrg   }
   2102      1.1  mrg   {
   2103      1.1  mrg     static struct param_t  param;
   2104      1.1  mrg     param.name = "DIVREM_1_UNNORM_THRESHOLD";
   2105      1.1  mrg     DIV_1_PARAMS;
   2106      1.1  mrg     s.r = randlimb_half ();
   2107      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2108      1.1  mrg     one (&divrem_1_unnorm_threshold, &param);
   2109      1.1  mrg   }
   2110      1.1  mrg }
   2111      1.1  mrg 
   2112  1.1.1.3  mrg void
   2113  1.1.1.3  mrg tune_div_qr_1 (void)
   2114  1.1.1.3  mrg {
   2115  1.1.1.3  mrg   static struct param_t  param;
   2116  1.1.1.3  mrg   double            t1, t2;
   2117  1.1.1.3  mrg 
   2118  1.1.1.3  mrg   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
   2119  1.1.1.3  mrg     {
   2120  1.1.1.3  mrg       static struct param_t  param;
   2121  1.1.1.3  mrg       double   t1, t2;
   2122  1.1.1.3  mrg 
   2123  1.1.1.3  mrg       s.size = 10;
   2124  1.1.1.3  mrg       s.r = randlimb_norm ();
   2125  1.1.1.3  mrg 
   2126  1.1.1.3  mrg       t1 = tuneup_measure (speed_mpn_div_qr_1n_pi1_1, &param, &s);
   2127  1.1.1.3  mrg       t2 = tuneup_measure (speed_mpn_div_qr_1n_pi1_2, &param, &s);
   2128  1.1.1.3  mrg 
   2129  1.1.1.3  mrg       if (t1 == -1.0 || t2 == -1.0)
   2130  1.1.1.3  mrg 	{
   2131  1.1.1.3  mrg 	  printf ("Oops, can't measure all mpn_div_qr_1n_pi1 methods at %ld\n",
   2132  1.1.1.3  mrg 		  (long) s.size);
   2133  1.1.1.3  mrg 	  abort ();
   2134  1.1.1.3  mrg 	}
   2135  1.1.1.3  mrg       div_qr_1n_pi1_method = (t1 < t2) ? 1 : 2;
   2136  1.1.1.3  mrg       print_define ("DIV_QR_1N_PI1_METHOD", div_qr_1n_pi1_method);
   2137  1.1.1.3  mrg     }
   2138  1.1.1.3  mrg 
   2139  1.1.1.3  mrg   {
   2140  1.1.1.3  mrg     static struct param_t  param;
   2141  1.1.1.3  mrg     param.name = "DIV_QR_1_NORM_THRESHOLD";
   2142  1.1.1.3  mrg     DIV_1_PARAMS;
   2143  1.1.1.3  mrg     param.min_size = 1;
   2144  1.1.1.3  mrg     param.min_is_always = 0;
   2145  1.1.1.3  mrg     s.r = randlimb_norm ();
   2146  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2147  1.1.1.3  mrg     one (&div_qr_1_norm_threshold, &param);
   2148  1.1.1.3  mrg   }
   2149  1.1.1.3  mrg   {
   2150  1.1.1.3  mrg     static struct param_t  param;
   2151  1.1.1.3  mrg     param.name = "DIV_QR_1_UNNORM_THRESHOLD";
   2152  1.1.1.3  mrg     DIV_1_PARAMS;
   2153  1.1.1.3  mrg     param.min_size = 1;
   2154  1.1.1.3  mrg     param.min_is_always = 0;
   2155  1.1.1.3  mrg     s.r = randlimb_half();
   2156  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2157  1.1.1.3  mrg     one (&div_qr_1_unnorm_threshold, &param);
   2158  1.1.1.3  mrg   }
   2159  1.1.1.3  mrg }
   2160  1.1.1.3  mrg 
   2161      1.1  mrg 
   2162      1.1  mrg void
   2163      1.1  mrg tune_mod_1 (void)
   2164      1.1  mrg {
   2165      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2166      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2167      1.1  mrg      appear in gmp-mparam.h.  */
   2168      1.1  mrg   if (HAVE_NATIVE_mpn_mod_1)
   2169      1.1  mrg     return;
   2170      1.1  mrg 
   2171      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2172      1.1  mrg     {
   2173      1.1  mrg       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2174      1.1  mrg                            "no preinv with nails");
   2175      1.1  mrg       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2176      1.1  mrg                            "no preinv with nails");
   2177      1.1  mrg       return;
   2178      1.1  mrg     }
   2179      1.1  mrg 
   2180  1.1.1.2  mrg   if (!HAVE_NATIVE_mpn_mod_1_1p)
   2181  1.1.1.2  mrg     {
   2182  1.1.1.2  mrg       static struct param_t  param;
   2183  1.1.1.2  mrg       double   t1, t2;
   2184  1.1.1.2  mrg 
   2185  1.1.1.2  mrg       s.size = 10;
   2186  1.1.1.2  mrg       s.r = randlimb_half ();
   2187  1.1.1.2  mrg 
   2188  1.1.1.2  mrg       t1 = tuneup_measure (speed_mpn_mod_1_1_1, &param, &s);
   2189  1.1.1.2  mrg       t2 = tuneup_measure (speed_mpn_mod_1_1_2, &param, &s);
   2190  1.1.1.2  mrg 
   2191  1.1.1.2  mrg       if (t1 == -1.0 || t2 == -1.0)
   2192  1.1.1.2  mrg 	{
   2193  1.1.1.2  mrg 	  printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n",
   2194  1.1.1.2  mrg 		  (long) s.size);
   2195  1.1.1.2  mrg 	  abort ();
   2196  1.1.1.2  mrg 	}
   2197  1.1.1.2  mrg       mod_1_1p_method = (t1 < t2) ? 1 : 2;
   2198  1.1.1.2  mrg       print_define ("MOD_1_1P_METHOD", mod_1_1p_method);
   2199  1.1.1.2  mrg     }
   2200  1.1.1.2  mrg 
   2201      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2202      1.1  mrg     {
   2203      1.1  mrg       print_define ("MOD_1_NORM_THRESHOLD", 0L);
   2204      1.1  mrg       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
   2205      1.1  mrg     }
   2206      1.1  mrg   else
   2207      1.1  mrg     {
   2208      1.1  mrg       {
   2209      1.1  mrg 	static struct param_t  param;
   2210      1.1  mrg 	param.name = "MOD_1_NORM_THRESHOLD";
   2211      1.1  mrg 	DIV_1_PARAMS;
   2212      1.1  mrg 	s.r = randlimb_norm ();
   2213      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2214      1.1  mrg 	one (&mod_1_norm_threshold, &param);
   2215      1.1  mrg       }
   2216      1.1  mrg       {
   2217      1.1  mrg 	static struct param_t  param;
   2218      1.1  mrg 	param.name = "MOD_1_UNNORM_THRESHOLD";
   2219      1.1  mrg 	DIV_1_PARAMS;
   2220      1.1  mrg 	s.r = randlimb_half ();
   2221      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2222      1.1  mrg 	one (&mod_1_unnorm_threshold, &param);
   2223      1.1  mrg       }
   2224      1.1  mrg     }
   2225      1.1  mrg   {
   2226      1.1  mrg     static struct param_t  param;
   2227      1.1  mrg 
   2228      1.1  mrg     param.check_size = 256;
   2229      1.1  mrg 
   2230      1.1  mrg     s.r = randlimb_norm ();
   2231      1.1  mrg     param.function = speed_mpn_mod_1_tune;
   2232      1.1  mrg 
   2233      1.1  mrg     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
   2234      1.1  mrg     param.min_size = 2;
   2235      1.1  mrg     one (&mod_1n_to_mod_1_1_threshold, &param);
   2236      1.1  mrg   }
   2237  1.1.1.2  mrg 
   2238      1.1  mrg   {
   2239      1.1  mrg     static struct param_t  param;
   2240      1.1  mrg 
   2241      1.1  mrg     param.check_size = 256;
   2242  1.1.1.2  mrg     s.r = randlimb_half ();
   2243      1.1  mrg     param.noprint = 1;
   2244      1.1  mrg 
   2245  1.1.1.2  mrg     param.function = speed_mpn_mod_1_1;
   2246  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_2;
   2247  1.1.1.2  mrg     param.min_is_always = 1;
   2248      1.1  mrg     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
   2249  1.1.1.2  mrg     param.min_size = 2;
   2250      1.1  mrg     one (&mod_1_1_to_mod_1_2_threshold, &param);
   2251      1.1  mrg 
   2252  1.1.1.2  mrg     param.function = speed_mpn_mod_1_2;
   2253  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_4;
   2254  1.1.1.2  mrg     param.min_is_always = 1;
   2255      1.1  mrg     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
   2256  1.1.1.2  mrg     param.min_size = 1;
   2257      1.1  mrg     one (&mod_1_2_to_mod_1_4_threshold, &param);
   2258      1.1  mrg 
   2259  1.1.1.2  mrg     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
   2260      1.1  mrg       {
   2261  1.1.1.2  mrg 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
   2262      1.1  mrg 	mod_1_2_to_mod_1_4_threshold = 0;
   2263      1.1  mrg 
   2264  1.1.1.2  mrg 	param.function = speed_mpn_mod_1_1;
   2265  1.1.1.2  mrg 	param.function2 = speed_mpn_mod_1_4;
   2266  1.1.1.2  mrg 	param.min_is_always = 1;
   2267  1.1.1.2  mrg 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
   2268  1.1.1.2  mrg 	param.min_size = 2;
   2269      1.1  mrg 	one (&mod_1_1_to_mod_1_2_threshold, &param);
   2270      1.1  mrg       }
   2271  1.1.1.2  mrg 
   2272  1.1.1.2  mrg     param.function = speed_mpn_mod_1_tune;
   2273  1.1.1.2  mrg     param.function2 = NULL;
   2274  1.1.1.2  mrg     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
   2275  1.1.1.2  mrg     param.min_size = 2;
   2276  1.1.1.2  mrg     param.min_is_always = 0;
   2277  1.1.1.2  mrg     one (&mod_1u_to_mod_1_1_threshold, &param);
   2278  1.1.1.2  mrg 
   2279  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
   2280  1.1.1.2  mrg       mod_1_1_to_mod_1_2_threshold = 0;
   2281  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
   2282  1.1.1.2  mrg       mod_1_2_to_mod_1_4_threshold = 0;
   2283  1.1.1.2  mrg 
   2284  1.1.1.2  mrg     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
   2285  1.1.1.2  mrg     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
   2286  1.1.1.2  mrg 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
   2287  1.1.1.2  mrg     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
   2288  1.1.1.2  mrg 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
   2289      1.1  mrg   }
   2290      1.1  mrg 
   2291      1.1  mrg   {
   2292      1.1  mrg     static struct param_t  param;
   2293      1.1  mrg 
   2294      1.1  mrg     param.check_size = 256;
   2295      1.1  mrg 
   2296      1.1  mrg     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
   2297      1.1  mrg     s.r = randlimb_norm ();
   2298      1.1  mrg     param.function = speed_mpn_preinv_mod_1;
   2299      1.1  mrg     param.function2 = speed_mpn_mod_1_tune;
   2300      1.1  mrg     param.min_size = 1;
   2301      1.1  mrg     one (&preinv_mod_1_to_mod_1_threshold, &param);
   2302      1.1  mrg   }
   2303      1.1  mrg }
   2304      1.1  mrg 
   2305      1.1  mrg 
   2306      1.1  mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
   2307      1.1  mrg    imply that udiv_qrnnd_preinv is worth using, but it seems most
   2308      1.1  mrg    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
   2309      1.1  mrg    directly.  */
   2310      1.1  mrg 
   2311      1.1  mrg void
   2312      1.1  mrg tune_preinv_divrem_1 (void)
   2313      1.1  mrg {
   2314      1.1  mrg   static struct param_t  param;
   2315      1.1  mrg   speed_function_t  divrem_1;
   2316      1.1  mrg   const char        *divrem_1_name;
   2317      1.1  mrg   double            t1, t2;
   2318      1.1  mrg 
   2319      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2320      1.1  mrg     {
   2321      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
   2322      1.1  mrg       return;
   2323      1.1  mrg     }
   2324      1.1  mrg 
   2325      1.1  mrg   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
   2326      1.1  mrg      it's faster than mpn_divrem_1.  */
   2327      1.1  mrg   if (HAVE_NATIVE_mpn_preinv_divrem_1)
   2328      1.1  mrg     {
   2329      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
   2330      1.1  mrg       return;
   2331      1.1  mrg     }
   2332      1.1  mrg 
   2333      1.1  mrg   /* If udiv_qrnnd_preinv is the only division method then of course
   2334      1.1  mrg      mpn_preinv_divrem_1 should be used.  */
   2335      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2336      1.1  mrg     {
   2337      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
   2338      1.1  mrg       return;
   2339      1.1  mrg     }
   2340      1.1  mrg 
   2341      1.1  mrg   /* If we've got an assembler version of mpn_divrem_1, then compare against
   2342      1.1  mrg      that, not the mpn_divrem_1_div generic C.  */
   2343      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2344      1.1  mrg     {
   2345      1.1  mrg       divrem_1 = speed_mpn_divrem_1;
   2346      1.1  mrg       divrem_1_name = "mpn_divrem_1";
   2347      1.1  mrg     }
   2348      1.1  mrg   else
   2349      1.1  mrg     {
   2350      1.1  mrg       divrem_1 = speed_mpn_divrem_1_div;
   2351      1.1  mrg       divrem_1_name = "mpn_divrem_1_div";
   2352      1.1  mrg     }
   2353      1.1  mrg 
   2354      1.1  mrg   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
   2355      1.1  mrg   s.size = 200;                     /* generous but not too big */
   2356      1.1  mrg   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
   2357      1.1  mrg      since in general that's probably most common, though in fact for a
   2358      1.1  mrg      64-bit limb mp_bases[10].big_base is normalized.  */
   2359      1.1  mrg   s.r = urandom() & (GMP_NUMB_MASK >> 4);
   2360      1.1  mrg   if (s.r == 0) s.r = 123;
   2361      1.1  mrg 
   2362      1.1  mrg   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
   2363      1.1  mrg   t2 = tuneup_measure (divrem_1, &param, &s);
   2364      1.1  mrg   if (t1 == -1.0 || t2 == -1.0)
   2365      1.1  mrg     {
   2366      1.1  mrg       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
   2367      1.1  mrg               divrem_1_name, (long) s.size);
   2368      1.1  mrg       abort ();
   2369      1.1  mrg     }
   2370      1.1  mrg   if (option_trace >= 1)
   2371      1.1  mrg     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
   2372      1.1  mrg             (long) s.size, t1, divrem_1_name, t2);
   2373      1.1  mrg 
   2374      1.1  mrg   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
   2375      1.1  mrg }
   2376      1.1  mrg 
   2377      1.1  mrg 
   2378      1.1  mrg 
   2379      1.1  mrg void
   2380      1.1  mrg tune_divrem_2 (void)
   2381      1.1  mrg {
   2382      1.1  mrg   static struct param_t  param;
   2383      1.1  mrg 
   2384      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2385      1.1  mrg      the results in the .asm file, and there's no need for such thresholds
   2386      1.1  mrg      to appear in gmp-mparam.h.  */
   2387      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_2)
   2388      1.1  mrg     return;
   2389      1.1  mrg 
   2390      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2391      1.1  mrg     {
   2392      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
   2393      1.1  mrg                            "no preinv with nails");
   2394      1.1  mrg       return;
   2395      1.1  mrg     }
   2396      1.1  mrg 
   2397      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2398      1.1  mrg     {
   2399      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
   2400      1.1  mrg       return;
   2401      1.1  mrg     }
   2402      1.1  mrg 
   2403      1.1  mrg   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
   2404      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2405      1.1  mrg      is more important.
   2406      1.1  mrg 
   2407      1.1  mrg      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
   2408      1.1  mrg      code space if plain division is better only at size==2 or size==3. */
   2409      1.1  mrg   param.name = "DIVREM_2_THRESHOLD";
   2410      1.1  mrg   param.check_size = 256;
   2411      1.1  mrg   param.min_size = 4;
   2412      1.1  mrg   param.min_is_always = 1;
   2413      1.1  mrg   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
   2414      1.1  mrg   param.stop_factor = 2.0;
   2415      1.1  mrg 
   2416      1.1  mrg   s.r = randlimb_norm ();
   2417      1.1  mrg   param.function = speed_mpn_divrem_2;
   2418      1.1  mrg   one (&divrem_2_threshold, &param);
   2419      1.1  mrg }
   2420      1.1  mrg 
   2421  1.1.1.2  mrg void
   2422  1.1.1.2  mrg tune_div_qr_2 (void)
   2423  1.1.1.2  mrg {
   2424  1.1.1.2  mrg   static struct param_t  param;
   2425  1.1.1.2  mrg   param.name = "DIV_QR_2_PI2_THRESHOLD";
   2426  1.1.1.2  mrg   param.function = speed_mpn_div_qr_2n;
   2427  1.1.1.2  mrg   param.check_size = 500;
   2428  1.1.1.2  mrg   param.min_size = 4;
   2429  1.1.1.2  mrg   one (&div_qr_2_pi2_threshold, &param);
   2430  1.1.1.2  mrg }
   2431      1.1  mrg 
   2432      1.1  mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
   2433      1.1  mrg    tune for that.  Its speed can differ on odd or even divisor, so take an
   2434      1.1  mrg    average threshold for the two.
   2435      1.1  mrg 
   2436      1.1  mrg    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
   2437      1.1  mrg    might not vary that way, but don't test this since high<divisor isn't
   2438      1.1  mrg    expected to occur often with small divisors.  */
   2439      1.1  mrg 
   2440      1.1  mrg void
   2441      1.1  mrg tune_divexact_1 (void)
   2442      1.1  mrg {
   2443      1.1  mrg   static struct param_t  param;
   2444      1.1  mrg   mp_size_t  thresh[2], average;
   2445      1.1  mrg   int        low, i;
   2446      1.1  mrg 
   2447      1.1  mrg   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
   2448      1.1  mrg      full mpn_divrem_1.  */
   2449      1.1  mrg   if (HAVE_NATIVE_mpn_divexact_1)
   2450      1.1  mrg     {
   2451      1.1  mrg       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
   2452      1.1  mrg       return;
   2453      1.1  mrg     }
   2454      1.1  mrg 
   2455      1.1  mrg   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
   2456      1.1  mrg 
   2457      1.1  mrg   param.name = "DIVEXACT_1_THRESHOLD";
   2458      1.1  mrg   param.data_high = DATA_HIGH_GE_R;
   2459      1.1  mrg   param.check_size = 256;
   2460      1.1  mrg   param.min_size = 2;
   2461      1.1  mrg   param.stop_factor = 1.5;
   2462      1.1  mrg   param.function  = tuned_speed_mpn_divrem_1;
   2463      1.1  mrg   param.function2 = speed_mpn_divexact_1;
   2464      1.1  mrg   param.noprint = 1;
   2465      1.1  mrg 
   2466      1.1  mrg   print_define_start (param.name);
   2467      1.1  mrg 
   2468      1.1  mrg   for (low = 0; low <= 1; low++)
   2469      1.1  mrg     {
   2470      1.1  mrg       s.r = randlimb_half();
   2471      1.1  mrg       if (low == 0)
   2472      1.1  mrg         s.r |= 1;
   2473      1.1  mrg       else
   2474      1.1  mrg         s.r &= ~CNST_LIMB(7);
   2475      1.1  mrg 
   2476      1.1  mrg       one (&thresh[low], &param);
   2477      1.1  mrg       if (option_trace)
   2478      1.1  mrg         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
   2479      1.1  mrg 
   2480      1.1  mrg       if (thresh[low] == MP_SIZE_T_MAX)
   2481      1.1  mrg         {
   2482      1.1  mrg           average = MP_SIZE_T_MAX;
   2483      1.1  mrg           goto divexact_1_done;
   2484      1.1  mrg         }
   2485      1.1  mrg     }
   2486      1.1  mrg 
   2487      1.1  mrg   if (option_trace)
   2488      1.1  mrg     {
   2489      1.1  mrg       printf ("average of:");
   2490      1.1  mrg       for (i = 0; i < numberof(thresh); i++)
   2491      1.1  mrg         printf (" %ld", (long) thresh[i]);
   2492      1.1  mrg       printf ("\n");
   2493      1.1  mrg     }
   2494      1.1  mrg 
   2495      1.1  mrg   average = 0;
   2496      1.1  mrg   for (i = 0; i < numberof(thresh); i++)
   2497      1.1  mrg     average += thresh[i];
   2498      1.1  mrg   average /= numberof(thresh);
   2499      1.1  mrg 
   2500      1.1  mrg   /* If divexact turns out to be better as early as 3 limbs, then use it
   2501      1.1  mrg      always, so as to reduce code size and conditional jumps.  */
   2502      1.1  mrg   if (average <= 3)
   2503      1.1  mrg     average = 0;
   2504      1.1  mrg 
   2505      1.1  mrg  divexact_1_done:
   2506      1.1  mrg   print_define_end (param.name, average);
   2507      1.1  mrg }
   2508      1.1  mrg 
   2509      1.1  mrg 
   2510      1.1  mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
   2511      1.1  mrg    same as mpn_mod_1, but this might not be true of an assembler
   2512      1.1  mrg    implementation.  The threshold used is an average based on data where a
   2513      1.1  mrg    divide can be skipped and where it can't.
   2514      1.1  mrg 
   2515      1.1  mrg    If modexact turns out to be better as early as 3 limbs, then use it
   2516      1.1  mrg    always, so as to reduce code size and conditional jumps.  */
   2517      1.1  mrg 
   2518      1.1  mrg void
   2519      1.1  mrg tune_modexact_1_odd (void)
   2520      1.1  mrg {
   2521      1.1  mrg   static struct param_t  param;
   2522      1.1  mrg   mp_size_t  thresh_lt, thresh_ge, average;
   2523      1.1  mrg 
   2524      1.1  mrg #if 0
   2525      1.1  mrg   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
   2526      1.1  mrg      of a full mpn_mod_1.  */
   2527      1.1  mrg   if (HAVE_NATIVE_mpn_modexact_1_odd)
   2528      1.1  mrg     {
   2529      1.1  mrg       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
   2530      1.1  mrg       return;
   2531      1.1  mrg     }
   2532      1.1  mrg #endif
   2533      1.1  mrg 
   2534      1.1  mrg   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
   2535      1.1  mrg   param.check_size = 256;
   2536      1.1  mrg   param.min_size = 2;
   2537      1.1  mrg   param.stop_factor = 1.5;
   2538      1.1  mrg   param.function  = speed_mpn_modexact_1c_odd;
   2539      1.1  mrg   param.function2 = speed_mpn_mod_1_tune;
   2540      1.1  mrg   param.noprint = 1;
   2541      1.1  mrg   s.r = randlimb_half () | 1;
   2542      1.1  mrg 
   2543      1.1  mrg   print_define_start (param.name);
   2544      1.1  mrg 
   2545      1.1  mrg   param.data_high = DATA_HIGH_LT_R;
   2546      1.1  mrg   one (&thresh_lt, &param);
   2547      1.1  mrg   if (option_trace)
   2548      1.1  mrg     printf ("lt thresh %ld\n", (long) thresh_lt);
   2549      1.1  mrg 
   2550      1.1  mrg   average = thresh_lt;
   2551      1.1  mrg   if (thresh_lt != MP_SIZE_T_MAX)
   2552      1.1  mrg     {
   2553      1.1  mrg       param.data_high = DATA_HIGH_GE_R;
   2554      1.1  mrg       one (&thresh_ge, &param);
   2555      1.1  mrg       if (option_trace)
   2556      1.1  mrg         printf ("ge thresh %ld\n", (long) thresh_ge);
   2557      1.1  mrg 
   2558      1.1  mrg       if (thresh_ge != MP_SIZE_T_MAX)
   2559      1.1  mrg         {
   2560      1.1  mrg           average = (thresh_ge + thresh_lt) / 2;
   2561      1.1  mrg           if (thresh_ge <= 3)
   2562      1.1  mrg             average = 0;
   2563      1.1  mrg         }
   2564      1.1  mrg     }
   2565      1.1  mrg 
   2566      1.1  mrg   print_define_end (param.name, average);
   2567      1.1  mrg }
   2568      1.1  mrg 
   2569      1.1  mrg 
   2570      1.1  mrg void
   2571      1.1  mrg tune_jacobi_base (void)
   2572      1.1  mrg {
   2573      1.1  mrg   static struct param_t  param;
   2574  1.1.1.2  mrg   double   t1, t2, t3, t4;
   2575      1.1  mrg   int      method;
   2576      1.1  mrg 
   2577      1.1  mrg   s.size = GMP_LIMB_BITS * 3 / 4;
   2578      1.1  mrg 
   2579      1.1  mrg   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
   2580      1.1  mrg   if (option_trace >= 1)
   2581      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
   2582      1.1  mrg 
   2583      1.1  mrg   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
   2584      1.1  mrg   if (option_trace >= 1)
   2585      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
   2586      1.1  mrg 
   2587      1.1  mrg   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
   2588      1.1  mrg   if (option_trace >= 1)
   2589      1.1  mrg     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
   2590      1.1  mrg 
   2591  1.1.1.2  mrg   t4 = tuneup_measure (speed_mpn_jacobi_base_4, &param, &s);
   2592  1.1.1.2  mrg   if (option_trace >= 1)
   2593  1.1.1.2  mrg     printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
   2594  1.1.1.2  mrg 
   2595  1.1.1.2  mrg   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
   2596      1.1  mrg     {
   2597      1.1  mrg       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
   2598      1.1  mrg               (long) s.size);
   2599      1.1  mrg       abort ();
   2600      1.1  mrg     }
   2601      1.1  mrg 
   2602  1.1.1.2  mrg   if (t1 < t2 && t1 < t3 && t1 < t4)
   2603      1.1  mrg     method = 1;
   2604  1.1.1.2  mrg   else if (t2 < t3 && t2 < t4)
   2605      1.1  mrg     method = 2;
   2606  1.1.1.2  mrg   else if (t3 < t4)
   2607      1.1  mrg     method = 3;
   2608  1.1.1.2  mrg   else
   2609  1.1.1.2  mrg     method = 4;
   2610      1.1  mrg 
   2611      1.1  mrg   print_define ("JACOBI_BASE_METHOD", method);
   2612      1.1  mrg }
   2613      1.1  mrg 
   2614      1.1  mrg 
   2615      1.1  mrg void
   2616      1.1  mrg tune_get_str (void)
   2617      1.1  mrg {
   2618      1.1  mrg   /* Tune for decimal, it being most common.  Some rough testing suggests
   2619      1.1  mrg      other bases are different, but not by very much.  */
   2620      1.1  mrg   s.r = 10;
   2621      1.1  mrg   {
   2622      1.1  mrg     static struct param_t  param;
   2623      1.1  mrg     GET_STR_PRECOMPUTE_THRESHOLD = 0;
   2624      1.1  mrg     param.name = "GET_STR_DC_THRESHOLD";
   2625      1.1  mrg     param.function = speed_mpn_get_str;
   2626      1.1  mrg     param.min_size = 4;
   2627      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2628      1.1  mrg     one (&get_str_dc_threshold, &param);
   2629      1.1  mrg   }
   2630      1.1  mrg   {
   2631      1.1  mrg     static struct param_t  param;
   2632      1.1  mrg     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
   2633      1.1  mrg     param.function = speed_mpn_get_str;
   2634      1.1  mrg     param.min_size = GET_STR_DC_THRESHOLD;
   2635      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2636      1.1  mrg     one (&get_str_precompute_threshold, &param);
   2637      1.1  mrg   }
   2638      1.1  mrg }
   2639      1.1  mrg 
   2640      1.1  mrg 
   2641      1.1  mrg double
   2642      1.1  mrg speed_mpn_pre_set_str (struct speed_params *s)
   2643      1.1  mrg {
   2644      1.1  mrg   unsigned char *str;
   2645      1.1  mrg   mp_ptr     wp;
   2646      1.1  mrg   mp_size_t  wn;
   2647      1.1  mrg   unsigned   i;
   2648      1.1  mrg   int        base;
   2649      1.1  mrg   double     t;
   2650      1.1  mrg   mp_ptr powtab_mem, tp;
   2651      1.1  mrg   powers_t powtab[GMP_LIMB_BITS];
   2652      1.1  mrg   mp_size_t un;
   2653      1.1  mrg   int chars_per_limb;
   2654      1.1  mrg   TMP_DECL;
   2655      1.1  mrg 
   2656      1.1  mrg   SPEED_RESTRICT_COND (s->size >= 1);
   2657      1.1  mrg 
   2658      1.1  mrg   base = s->r == 0 ? 10 : s->r;
   2659      1.1  mrg   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
   2660      1.1  mrg 
   2661      1.1  mrg   TMP_MARK;
   2662      1.1  mrg 
   2663  1.1.1.3  mrg   str = (unsigned char *) TMP_ALLOC (s->size);
   2664      1.1  mrg   for (i = 0; i < s->size; i++)
   2665      1.1  mrg     str[i] = s->xp[i] % base;
   2666      1.1  mrg 
   2667  1.1.1.2  mrg   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
   2668      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
   2669      1.1  mrg 
   2670      1.1  mrg   /* use this during development to check wn is big enough */
   2671      1.1  mrg   /*
   2672      1.1  mrg   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
   2673      1.1  mrg   */
   2674      1.1  mrg 
   2675  1.1.1.3  mrg   speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
   2676      1.1  mrg   speed_operand_dst (s, wp, wn);
   2677      1.1  mrg   speed_cache_fill (s);
   2678      1.1  mrg 
   2679      1.1  mrg   chars_per_limb = mp_bases[base].chars_per_limb;
   2680      1.1  mrg   un = s->size / chars_per_limb + 1;
   2681      1.1  mrg   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
   2682      1.1  mrg   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
   2683      1.1  mrg   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
   2684      1.1  mrg 
   2685      1.1  mrg   speed_starttime ();
   2686      1.1  mrg   i = s->reps;
   2687      1.1  mrg   do
   2688      1.1  mrg     {
   2689      1.1  mrg       mpn_pre_set_str (wp, str, s->size, powtab, tp);
   2690      1.1  mrg     }
   2691      1.1  mrg   while (--i != 0);
   2692      1.1  mrg   t = speed_endtime ();
   2693      1.1  mrg 
   2694      1.1  mrg   TMP_FREE;
   2695      1.1  mrg   return t;
   2696      1.1  mrg }
   2697      1.1  mrg 
   2698      1.1  mrg void
   2699      1.1  mrg tune_set_str (void)
   2700      1.1  mrg {
   2701      1.1  mrg   s.r = 10;  /* decimal */
   2702      1.1  mrg   {
   2703      1.1  mrg     static struct param_t  param;
   2704      1.1  mrg     SET_STR_PRECOMPUTE_THRESHOLD = 0;
   2705      1.1  mrg     param.step_factor = 0.01;
   2706      1.1  mrg     param.name = "SET_STR_DC_THRESHOLD";
   2707      1.1  mrg     param.function = speed_mpn_pre_set_str;
   2708      1.1  mrg     param.min_size = 100;
   2709      1.1  mrg     param.max_size = 50000;
   2710      1.1  mrg     one (&set_str_dc_threshold, &param);
   2711      1.1  mrg   }
   2712      1.1  mrg   {
   2713      1.1  mrg     static struct param_t  param;
   2714      1.1  mrg     param.step_factor = 0.02;
   2715      1.1  mrg     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
   2716      1.1  mrg     param.function = speed_mpn_set_str;
   2717      1.1  mrg     param.min_size = SET_STR_DC_THRESHOLD;
   2718      1.1  mrg     param.max_size = 100000;
   2719      1.1  mrg     one (&set_str_precompute_threshold, &param);
   2720      1.1  mrg   }
   2721      1.1  mrg }
   2722      1.1  mrg 
   2723      1.1  mrg 
   2724      1.1  mrg void
   2725      1.1  mrg tune_fft_mul (void)
   2726      1.1  mrg {
   2727      1.1  mrg   static struct fft_param_t  param;
   2728      1.1  mrg 
   2729      1.1  mrg   if (option_fft_max_size == 0)
   2730      1.1  mrg     return;
   2731      1.1  mrg 
   2732      1.1  mrg   param.table_name          = "MUL_FFT_TABLE3";
   2733      1.1  mrg   param.threshold_name      = "MUL_FFT_THRESHOLD";
   2734      1.1  mrg   param.p_threshold         = &mul_fft_threshold;
   2735      1.1  mrg   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
   2736      1.1  mrg   param.p_modf_threshold    = &mul_fft_modf_threshold;
   2737      1.1  mrg   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
   2738      1.1  mrg   param.max_size            = option_fft_max_size;
   2739      1.1  mrg   param.function            = speed_mpn_fft_mul;
   2740      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft;
   2741      1.1  mrg   param.mul_function        = speed_mpn_mul_n;
   2742      1.1  mrg   param.sqr = 0;
   2743      1.1  mrg   fft (&param);
   2744      1.1  mrg }
   2745      1.1  mrg 
   2746      1.1  mrg 
   2747      1.1  mrg void
   2748      1.1  mrg tune_fft_sqr (void)
   2749      1.1  mrg {
   2750      1.1  mrg   static struct fft_param_t  param;
   2751      1.1  mrg 
   2752      1.1  mrg   if (option_fft_max_size == 0)
   2753      1.1  mrg     return;
   2754      1.1  mrg 
   2755      1.1  mrg   param.table_name          = "SQR_FFT_TABLE3";
   2756      1.1  mrg   param.threshold_name      = "SQR_FFT_THRESHOLD";
   2757      1.1  mrg   param.p_threshold         = &sqr_fft_threshold;
   2758      1.1  mrg   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
   2759      1.1  mrg   param.p_modf_threshold    = &sqr_fft_modf_threshold;
   2760      1.1  mrg   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
   2761      1.1  mrg   param.max_size            = option_fft_max_size;
   2762      1.1  mrg   param.function            = speed_mpn_fft_sqr;
   2763      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
   2764      1.1  mrg   param.mul_function        = speed_mpn_sqr;
   2765      1.1  mrg   param.sqr = 1;
   2766      1.1  mrg   fft (&param);
   2767      1.1  mrg }
   2768      1.1  mrg 
   2769      1.1  mrg void
   2770  1.1.1.2  mrg tune_fac_ui (void)
   2771  1.1.1.2  mrg {
   2772  1.1.1.2  mrg   static struct param_t  param;
   2773  1.1.1.2  mrg 
   2774  1.1.1.2  mrg   param.function = speed_mpz_fac_ui_tune;
   2775  1.1.1.2  mrg 
   2776  1.1.1.2  mrg   param.name = "FAC_DSC_THRESHOLD";
   2777  1.1.1.2  mrg   param.min_size = 70;
   2778  1.1.1.2  mrg   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
   2779  1.1.1.2  mrg   one (&fac_dsc_threshold, &param);
   2780  1.1.1.2  mrg 
   2781  1.1.1.2  mrg   param.name = "FAC_ODD_THRESHOLD";
   2782  1.1.1.2  mrg   param.min_size = 22;
   2783  1.1.1.2  mrg   param.stop_factor = 1.7;
   2784  1.1.1.2  mrg   param.min_is_always = 1;
   2785  1.1.1.2  mrg   one (&fac_odd_threshold, &param);
   2786  1.1.1.2  mrg }
   2787  1.1.1.2  mrg 
   2788  1.1.1.2  mrg void
   2789      1.1  mrg all (void)
   2790      1.1  mrg {
   2791      1.1  mrg   time_t  start_time, end_time;
   2792      1.1  mrg   TMP_DECL;
   2793      1.1  mrg 
   2794      1.1  mrg   TMP_MARK;
   2795      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
   2796      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
   2797      1.1  mrg 
   2798      1.1  mrg   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
   2799      1.1  mrg   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
   2800      1.1  mrg 
   2801      1.1  mrg   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
   2802      1.1  mrg 
   2803      1.1  mrg   speed_time_init ();
   2804      1.1  mrg   fprintf (stderr, "Using: %s\n", speed_time_string);
   2805      1.1  mrg 
   2806      1.1  mrg   fprintf (stderr, "speed_precision %d", speed_precision);
   2807      1.1  mrg   if (speed_unittime == 1.0)
   2808      1.1  mrg     fprintf (stderr, ", speed_unittime 1 cycle");
   2809      1.1  mrg   else
   2810      1.1  mrg     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
   2811      1.1  mrg   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
   2812      1.1  mrg     fprintf (stderr, ", CPU freq unknown\n");
   2813      1.1  mrg   else
   2814      1.1  mrg     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
   2815      1.1  mrg 
   2816      1.1  mrg   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
   2817      1.1  mrg            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
   2818      1.1  mrg   fprintf (stderr, "\n");
   2819      1.1  mrg 
   2820      1.1  mrg   time (&start_time);
   2821      1.1  mrg   {
   2822      1.1  mrg     struct tm  *tp;
   2823      1.1  mrg     tp = localtime (&start_time);
   2824      1.1  mrg     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
   2825      1.1  mrg             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
   2826      1.1  mrg 
   2827      1.1  mrg #ifdef __GNUC__
   2828      1.1  mrg     /* gcc sub-minor version doesn't seem to come through as a define */
   2829      1.1  mrg     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
   2830      1.1  mrg #define PRINTED_COMPILER
   2831      1.1  mrg #endif
   2832      1.1  mrg #if defined (__SUNPRO_C)
   2833      1.1  mrg     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
   2834      1.1  mrg #define PRINTED_COMPILER
   2835      1.1  mrg #endif
   2836      1.1  mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
   2837      1.1  mrg     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
   2838      1.1  mrg     printf ("MIPSpro C %d.%d.%d */\n",
   2839      1.1  mrg 	    _COMPILER_VERSION / 100,
   2840      1.1  mrg 	    _COMPILER_VERSION / 10 % 10,
   2841      1.1  mrg 	    _COMPILER_VERSION % 10);
   2842      1.1  mrg #define PRINTED_COMPILER
   2843      1.1  mrg #endif
   2844      1.1  mrg #if defined (__DECC) && defined (__DECC_VER)
   2845      1.1  mrg     printf ("DEC C %d */\n", __DECC_VER);
   2846      1.1  mrg #define PRINTED_COMPILER
   2847      1.1  mrg #endif
   2848      1.1  mrg #if ! defined (PRINTED_COMPILER)
   2849      1.1  mrg     printf ("system compiler */\n");
   2850      1.1  mrg #endif
   2851      1.1  mrg   }
   2852      1.1  mrg   printf ("\n");
   2853      1.1  mrg 
   2854      1.1  mrg   tune_divrem_1 ();
   2855      1.1  mrg   tune_mod_1 ();
   2856      1.1  mrg   tune_preinv_divrem_1 ();
   2857  1.1.1.3  mrg   tune_div_qr_1 ();
   2858  1.1.1.2  mrg #if 0
   2859      1.1  mrg   tune_divrem_2 ();
   2860  1.1.1.2  mrg #endif
   2861  1.1.1.2  mrg   tune_div_qr_2 ();
   2862      1.1  mrg   tune_divexact_1 ();
   2863      1.1  mrg   tune_modexact_1_odd ();
   2864      1.1  mrg   printf("\n");
   2865      1.1  mrg 
   2866      1.1  mrg   tune_mul_n ();
   2867      1.1  mrg   printf("\n");
   2868      1.1  mrg 
   2869      1.1  mrg   tune_mul ();
   2870      1.1  mrg   printf("\n");
   2871      1.1  mrg 
   2872      1.1  mrg   tune_sqr ();
   2873      1.1  mrg   printf("\n");
   2874      1.1  mrg 
   2875  1.1.1.2  mrg   tune_mulmid ();
   2876  1.1.1.2  mrg   printf("\n");
   2877  1.1.1.2  mrg 
   2878      1.1  mrg   tune_mulmod_bnm1 ();
   2879      1.1  mrg   tune_sqrmod_bnm1 ();
   2880      1.1  mrg   printf("\n");
   2881      1.1  mrg 
   2882      1.1  mrg   tune_fft_mul ();
   2883      1.1  mrg   printf("\n");
   2884      1.1  mrg 
   2885      1.1  mrg   tune_fft_sqr ();
   2886      1.1  mrg   printf ("\n");
   2887      1.1  mrg 
   2888      1.1  mrg   tune_mullo ();
   2889  1.1.1.3  mrg   tune_sqrlo ();
   2890      1.1  mrg   printf("\n");
   2891      1.1  mrg 
   2892      1.1  mrg   tune_dc_div ();
   2893      1.1  mrg   tune_dc_bdiv ();
   2894      1.1  mrg 
   2895      1.1  mrg   printf("\n");
   2896      1.1  mrg   tune_invertappr ();
   2897      1.1  mrg   tune_invert ();
   2898      1.1  mrg   printf("\n");
   2899      1.1  mrg 
   2900      1.1  mrg   tune_binvert ();
   2901      1.1  mrg   tune_redc ();
   2902      1.1  mrg   printf("\n");
   2903      1.1  mrg 
   2904      1.1  mrg   tune_mu_div ();
   2905      1.1  mrg   tune_mu_bdiv ();
   2906      1.1  mrg   printf("\n");
   2907      1.1  mrg 
   2908  1.1.1.2  mrg   tune_powm_sec ();
   2909  1.1.1.2  mrg   printf("\n");
   2910  1.1.1.2  mrg 
   2911  1.1.1.3  mrg   tune_get_str ();
   2912  1.1.1.3  mrg   tune_set_str ();
   2913  1.1.1.3  mrg   printf("\n");
   2914  1.1.1.3  mrg 
   2915  1.1.1.3  mrg   tune_fac_ui ();
   2916  1.1.1.3  mrg   printf("\n");
   2917  1.1.1.3  mrg 
   2918      1.1  mrg   tune_matrix22_mul ();
   2919      1.1  mrg   tune_hgcd ();
   2920  1.1.1.2  mrg   tune_hgcd_appr ();
   2921  1.1.1.2  mrg   tune_hgcd_reduce();
   2922      1.1  mrg   tune_gcd_dc ();
   2923      1.1  mrg   tune_gcdext_dc ();
   2924      1.1  mrg   tune_jacobi_base ();
   2925      1.1  mrg   printf("\n");
   2926      1.1  mrg 
   2927      1.1  mrg   time (&end_time);
   2928      1.1  mrg   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
   2929      1.1  mrg           (long) (end_time - start_time));
   2930      1.1  mrg 
   2931      1.1  mrg   TMP_FREE;
   2932      1.1  mrg }
   2933      1.1  mrg 
   2934      1.1  mrg 
   2935      1.1  mrg int
   2936      1.1  mrg main (int argc, char *argv[])
   2937      1.1  mrg {
   2938      1.1  mrg   int  opt;
   2939      1.1  mrg 
   2940      1.1  mrg   /* Unbuffered so if output is redirected to a file it isn't lost if the
   2941      1.1  mrg      program is killed part way through.  */
   2942      1.1  mrg   setbuf (stdout, NULL);
   2943      1.1  mrg   setbuf (stderr, NULL);
   2944      1.1  mrg 
   2945      1.1  mrg   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
   2946      1.1  mrg     {
   2947      1.1  mrg       switch (opt) {
   2948      1.1  mrg       case 'f':
   2949      1.1  mrg         if (optarg[0] == 't')
   2950      1.1  mrg           option_fft_trace = 2;
   2951      1.1  mrg         else
   2952      1.1  mrg           option_fft_max_size = atol (optarg);
   2953      1.1  mrg         break;
   2954      1.1  mrg       case 'o':
   2955      1.1  mrg         speed_option_set (optarg);
   2956      1.1  mrg         break;
   2957      1.1  mrg       case 'p':
   2958      1.1  mrg         speed_precision = atoi (optarg);
   2959      1.1  mrg         break;
   2960      1.1  mrg       case 't':
   2961      1.1  mrg         option_trace++;
   2962      1.1  mrg         break;
   2963      1.1  mrg       case '?':
   2964      1.1  mrg         exit(1);
   2965      1.1  mrg       }
   2966      1.1  mrg     }
   2967      1.1  mrg 
   2968      1.1  mrg   all ();
   2969      1.1  mrg   exit (0);
   2970      1.1  mrg }
   2971