Home | History | Annotate | Line # | Download | only in tune
tuneup.c revision 1.1.1.5
      1      1.1  mrg /* Create tuned thresholds for various algorithms.
      2      1.1  mrg 
      3  1.1.1.4  mrg Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU MP Library.
      6      1.1  mrg 
      7      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg it under the terms of either:
      9  1.1.1.3  mrg 
     10  1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg     option) any later version.
     13  1.1.1.3  mrg 
     14  1.1.1.3  mrg or
     15  1.1.1.3  mrg 
     16  1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg     later version.
     19  1.1.1.3  mrg 
     20  1.1.1.3  mrg or both in parallel, as here.
     21      1.1  mrg 
     22      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg for more details.
     26      1.1  mrg 
     27  1.1.1.3  mrg You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg /* Usage: tuneup [-t] [-t] [-p precision]
     33      1.1  mrg 
     34      1.1  mrg    -t turns on some diagnostic traces, a second -t turns on more traces.
     35      1.1  mrg 
     36      1.1  mrg    Notes:
     37      1.1  mrg 
     38      1.1  mrg    The code here isn't a vision of loveliness, mainly because it's subject
     39      1.1  mrg    to ongoing changes according to new things wanting to be tuned, and
     40      1.1  mrg    practical requirements of systems tested.
     41      1.1  mrg 
     42      1.1  mrg    Sometimes running the program twice produces slightly different results.
     43      1.1  mrg    This is probably because there's so little separating algorithms near
     44      1.1  mrg    their crossover, and on that basis it should make little or no difference
     45      1.1  mrg    to the final speed of the relevant routines, but nothing has been done to
     46      1.1  mrg    check that carefully.
     47      1.1  mrg 
     48      1.1  mrg    Algorithm:
     49      1.1  mrg 
     50      1.1  mrg    The thresholds are determined as follows.  A crossover may not be a
     51      1.1  mrg    single size but rather a range where it oscillates between method A or
     52      1.1  mrg    method B faster.  If the threshold is set making B used where A is faster
     53      1.1  mrg    (or vice versa) that's bad.  Badness is the percentage time lost and
     54      1.1  mrg    total badness is the sum of this over all sizes measured.  The threshold
     55      1.1  mrg    is set to minimize total badness.
     56      1.1  mrg 
     57      1.1  mrg    Suppose, as sizes increase, method B becomes faster than method A.  The
     58      1.1  mrg    effect of the rule is that, as you look at increasing sizes, isolated
     59      1.1  mrg    points where B is faster are ignored, but when it's consistently faster,
     60      1.1  mrg    or faster on balance, then the threshold is set there.  The same result
     61      1.1  mrg    is obtained thinking in the other direction of A becoming faster at
     62      1.1  mrg    smaller sizes.
     63      1.1  mrg 
     64      1.1  mrg    In practice the thresholds tend to be chosen to bring on the next
     65      1.1  mrg    algorithm fairly quickly.
     66      1.1  mrg 
     67      1.1  mrg    This rule is attractive because it's got a basis in reason and is fairly
     68      1.1  mrg    easy to implement, but no work has been done to actually compare it in
     69      1.1  mrg    absolute terms to other possibilities.
     70      1.1  mrg 
     71      1.1  mrg    Implementation:
     72      1.1  mrg 
     73      1.1  mrg    In a normal library build the thresholds are constants.  To tune them
     74      1.1  mrg    selected objects are recompiled with the thresholds as global variables
     75      1.1  mrg    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
     76      1.1  mrg    the end of gmp-impl.h, and rules in tune/Makefile.am.
     77      1.1  mrg 
     78      1.1  mrg    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
     79      1.1  mrg    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
     80      1.1  mrg    level, but recurse into the basecase.
     81      1.1  mrg 
     82      1.1  mrg    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
     83      1.1  mrg    Other routines in turn will make use of both of those.  Naturally the
     84      1.1  mrg    dependants must be tuned first.
     85      1.1  mrg 
     86      1.1  mrg    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
     87      1.1  mrg    just a threshold based on comparing two routines (mpn_divrem_1 and
     88      1.1  mrg    mpn_divexact_1), and no further use of the value determined.
     89      1.1  mrg 
     90      1.1  mrg    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
     91      1.1  mrg    just comparisons between certain routines on representative data.
     92      1.1  mrg 
     93      1.1  mrg    Shortcuts are applied when native (assembler) versions of routines exist.
     94      1.1  mrg    For instance a native mpn_sqr_basecase is assumed to be always faster
     95      1.1  mrg    than mpn_mul_basecase, with no measuring.
     96      1.1  mrg 
     97      1.1  mrg    No attempt is made to tune within assembler routines, for instance
     98      1.1  mrg    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
     99      1.1  mrg    written and tuned all by hand.  Assembler routines that might have hard
    100      1.1  mrg    limits are recompiled though, to make them accept a bigger range of sizes
    101      1.1  mrg    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
    102      1.1  mrg 
    103      1.1  mrg    Limitations:
    104      1.1  mrg 
    105      1.1  mrg    The FFTs aren't subject to the same badness rule as the other thresholds,
    106      1.1  mrg    so each k is probably being brought on a touch early.  This isn't likely
    107      1.1  mrg    to make a difference, and the simpler probing means fewer tests.
    108      1.1  mrg 
    109      1.1  mrg */
    110      1.1  mrg 
    111      1.1  mrg #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
    112      1.1  mrg 
    113      1.1  mrg #include "config.h"
    114      1.1  mrg 
    115      1.1  mrg #include <math.h>
    116      1.1  mrg #include <stdio.h>
    117      1.1  mrg #include <stdlib.h>
    118      1.1  mrg #include <time.h>
    119      1.1  mrg #if HAVE_UNISTD_H
    120      1.1  mrg #include <unistd.h>
    121      1.1  mrg #endif
    122      1.1  mrg 
    123      1.1  mrg #include "gmp-impl.h"
    124      1.1  mrg #include "longlong.h"
    125      1.1  mrg 
    126      1.1  mrg #include "tests.h"
    127      1.1  mrg #include "speed.h"
    128      1.1  mrg 
    129      1.1  mrg #if !HAVE_DECL_OPTARG
    130      1.1  mrg extern char *optarg;
    131      1.1  mrg extern int optind, opterr;
    132      1.1  mrg #endif
    133      1.1  mrg 
    134      1.1  mrg 
    135      1.1  mrg #define DEFAULT_MAX_SIZE   1000  /* limbs */
    136      1.1  mrg 
    137      1.1  mrg #if WANT_FFT
    138      1.1  mrg mp_size_t  option_fft_max_size = 50000;  /* limbs */
    139      1.1  mrg #else
    140      1.1  mrg mp_size_t  option_fft_max_size = 0;
    141      1.1  mrg #endif
    142      1.1  mrg int        option_trace = 0;
    143      1.1  mrg int        option_fft_trace = 0;
    144      1.1  mrg struct speed_params  s;
    145      1.1  mrg 
    146      1.1  mrg struct dat_t {
    147      1.1  mrg   mp_size_t  size;
    148      1.1  mrg   double     d;
    149      1.1  mrg } *dat = NULL;
    150      1.1  mrg int  ndat = 0;
    151      1.1  mrg int  allocdat = 0;
    152      1.1  mrg 
    153      1.1  mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
    154      1.1  mrg    case use zero here, which for params.max_size means no limit.  */
    155      1.1  mrg #ifndef TUNE_SQR_TOOM2_MAX
    156      1.1  mrg #define TUNE_SQR_TOOM2_MAX  0
    157      1.1  mrg #endif
    158      1.1  mrg 
    159      1.1  mrg mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
    160      1.1  mrg mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
    161      1.1  mrg mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
    162      1.1  mrg mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
    163      1.1  mrg mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
    164      1.1  mrg mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
    165      1.1  mrg mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
    166      1.1  mrg mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
    167      1.1  mrg mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
    168  1.1.1.2  mrg mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
    169      1.1  mrg mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
    170      1.1  mrg mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
    171      1.1  mrg mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
    172      1.1  mrg mp_size_t  sqr_toom2_threshold
    173      1.1  mrg   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
    174      1.1  mrg mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
    175      1.1  mrg mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
    176      1.1  mrg mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
    177      1.1  mrg mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
    178      1.1  mrg mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
    179      1.1  mrg mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
    180      1.1  mrg mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
    181      1.1  mrg mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
    182      1.1  mrg mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
    183  1.1.1.3  mrg mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
    184  1.1.1.3  mrg mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
    185  1.1.1.3  mrg mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
    186  1.1.1.2  mrg mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
    187      1.1  mrg mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
    188      1.1  mrg mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
    189  1.1.1.2  mrg mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
    190      1.1  mrg mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
    191      1.1  mrg mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
    192      1.1  mrg mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
    193      1.1  mrg mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
    194      1.1  mrg mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
    195      1.1  mrg mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
    196      1.1  mrg mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    197      1.1  mrg mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
    198      1.1  mrg mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    199      1.1  mrg mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
    200      1.1  mrg mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
    201      1.1  mrg mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
    202      1.1  mrg mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
    203      1.1  mrg mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
    204      1.1  mrg mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
    205      1.1  mrg mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
    206      1.1  mrg mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
    207      1.1  mrg mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
    208      1.1  mrg mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
    209  1.1.1.2  mrg mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
    210  1.1.1.2  mrg mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
    211      1.1  mrg mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
    212      1.1  mrg mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
    213  1.1.1.3  mrg int	   div_qr_1n_pi1_method		= 0;
    214  1.1.1.3  mrg mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
    215  1.1.1.3  mrg mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
    216      1.1  mrg mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
    217      1.1  mrg mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
    218      1.1  mrg mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
    219      1.1  mrg mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
    220  1.1.1.2  mrg int	   mod_1_1p_method		= 0;
    221      1.1  mrg mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    222      1.1  mrg mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    223      1.1  mrg mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
    224      1.1  mrg mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
    225      1.1  mrg mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
    226      1.1  mrg mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
    227      1.1  mrg mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
    228      1.1  mrg mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
    229      1.1  mrg mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
    230      1.1  mrg mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
    231  1.1.1.2  mrg mp_size_t  fac_odd_threshold            = 0;
    232  1.1.1.2  mrg mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
    233      1.1  mrg 
    234      1.1  mrg mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
    235      1.1  mrg mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
    236      1.1  mrg 
    237      1.1  mrg struct param_t {
    238      1.1  mrg   const char        *name;
    239      1.1  mrg   speed_function_t  function;
    240      1.1  mrg   speed_function_t  function2;
    241      1.1  mrg   double            step_factor;    /* how much to step relatively */
    242      1.1  mrg   int               step;           /* how much to step absolutely */
    243      1.1  mrg   double            function_fudge; /* multiplier for "function" speeds */
    244      1.1  mrg   int               stop_since_change;
    245      1.1  mrg   double            stop_factor;
    246      1.1  mrg   mp_size_t         min_size;
    247      1.1  mrg   int               min_is_always;
    248      1.1  mrg   mp_size_t         max_size;
    249      1.1  mrg   mp_size_t         check_size;
    250      1.1  mrg   mp_size_t         size_extra;
    251      1.1  mrg 
    252      1.1  mrg #define DATA_HIGH_LT_R  1
    253      1.1  mrg #define DATA_HIGH_GE_R  2
    254      1.1  mrg   int               data_high;
    255      1.1  mrg 
    256      1.1  mrg   int               noprint;
    257      1.1  mrg };
    258      1.1  mrg 
    259      1.1  mrg 
    260      1.1  mrg /* These are normally undefined when false, which suits "#if" fine.
    261      1.1  mrg    But give them zero values so they can be used in plain C "if"s.  */
    262      1.1  mrg #ifndef UDIV_PREINV_ALWAYS
    263      1.1  mrg #define UDIV_PREINV_ALWAYS 0
    264      1.1  mrg #endif
    265      1.1  mrg #ifndef HAVE_NATIVE_mpn_divexact_1
    266      1.1  mrg #define HAVE_NATIVE_mpn_divexact_1 0
    267      1.1  mrg #endif
    268  1.1.1.3  mrg #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
    269  1.1.1.3  mrg #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
    270  1.1.1.3  mrg #endif
    271      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_1
    272      1.1  mrg #define HAVE_NATIVE_mpn_divrem_1 0
    273      1.1  mrg #endif
    274      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_2
    275      1.1  mrg #define HAVE_NATIVE_mpn_divrem_2 0
    276      1.1  mrg #endif
    277      1.1  mrg #ifndef HAVE_NATIVE_mpn_mod_1
    278      1.1  mrg #define HAVE_NATIVE_mpn_mod_1 0
    279      1.1  mrg #endif
    280  1.1.1.2  mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p
    281  1.1.1.2  mrg #define HAVE_NATIVE_mpn_mod_1_1p 0
    282  1.1.1.2  mrg #endif
    283      1.1  mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
    284      1.1  mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
    285      1.1  mrg #endif
    286      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
    287      1.1  mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
    288      1.1  mrg #endif
    289      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
    290      1.1  mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
    291      1.1  mrg #endif
    292      1.1  mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
    293      1.1  mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
    294      1.1  mrg #endif
    295      1.1  mrg 
    296      1.1  mrg 
    297      1.1  mrg #define MAX3(a,b,c)  MAX (MAX (a, b), c)
    298      1.1  mrg 
    299      1.1  mrg mp_limb_t
    300      1.1  mrg randlimb_norm (void)
    301      1.1  mrg {
    302      1.1  mrg   mp_limb_t  n;
    303      1.1  mrg   mpn_random (&n, 1);
    304      1.1  mrg   n |= GMP_NUMB_HIGHBIT;
    305      1.1  mrg   return n;
    306      1.1  mrg }
    307      1.1  mrg 
    308      1.1  mrg #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
    309      1.1  mrg 
    310      1.1  mrg mp_limb_t
    311      1.1  mrg randlimb_half (void)
    312      1.1  mrg {
    313      1.1  mrg   mp_limb_t  n;
    314      1.1  mrg   mpn_random (&n, 1);
    315      1.1  mrg   n &= GMP_NUMB_HALFMASK;
    316      1.1  mrg   n += (n==0);
    317      1.1  mrg   return n;
    318      1.1  mrg }
    319      1.1  mrg 
    320      1.1  mrg 
    321      1.1  mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    322      1.1  mrg    if necessary.  */
    323      1.1  mrg void
    324      1.1  mrg add_dat (mp_size_t size, double d)
    325      1.1  mrg {
    326      1.1  mrg #define ALLOCDAT_STEP  500
    327      1.1  mrg 
    328      1.1  mrg   ASSERT_ALWAYS (ndat <= allocdat);
    329      1.1  mrg 
    330      1.1  mrg   if (ndat == allocdat)
    331      1.1  mrg     {
    332      1.1  mrg       dat = (struct dat_t *) __gmp_allocate_or_reallocate
    333      1.1  mrg         (dat, allocdat * sizeof(dat[0]),
    334      1.1  mrg          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
    335      1.1  mrg       allocdat += ALLOCDAT_STEP;
    336      1.1  mrg     }
    337      1.1  mrg 
    338      1.1  mrg   dat[ndat].size = size;
    339      1.1  mrg   dat[ndat].d = d;
    340      1.1  mrg   ndat++;
    341      1.1  mrg }
    342      1.1  mrg 
    343      1.1  mrg 
    344      1.1  mrg /* Return the threshold size based on the data accumulated. */
    345      1.1  mrg mp_size_t
    346      1.1  mrg analyze_dat (int final)
    347      1.1  mrg {
    348      1.1  mrg   double  x, min_x;
    349      1.1  mrg   int     j, min_j;
    350      1.1  mrg 
    351      1.1  mrg   /* If the threshold is set at dat[0].size, any positive values are bad. */
    352      1.1  mrg   x = 0.0;
    353      1.1  mrg   for (j = 0; j < ndat; j++)
    354      1.1  mrg     if (dat[j].d > 0.0)
    355      1.1  mrg       x += dat[j].d;
    356      1.1  mrg 
    357      1.1  mrg   if (option_trace >= 2 && final)
    358      1.1  mrg     {
    359      1.1  mrg       printf ("\n");
    360      1.1  mrg       printf ("x is the sum of the badness from setting thresh at given size\n");
    361      1.1  mrg       printf ("  (minimum x is sought)\n");
    362      1.1  mrg       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
    363      1.1  mrg     }
    364      1.1  mrg 
    365      1.1  mrg   min_x = x;
    366      1.1  mrg   min_j = 0;
    367      1.1  mrg 
    368      1.1  mrg 
    369      1.1  mrg   /* When stepping to the next dat[j].size, positive values are no longer
    370      1.1  mrg      bad (so subtracted), negative values become bad (so add the absolute
    371      1.1  mrg      value, meaning subtract). */
    372      1.1  mrg   for (j = 0; j < ndat; x -= dat[j].d, j++)
    373      1.1  mrg     {
    374      1.1  mrg       if (option_trace >= 2 && final)
    375      1.1  mrg         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
    376      1.1  mrg 
    377      1.1  mrg       if (x < min_x)
    378      1.1  mrg         {
    379      1.1  mrg           min_x = x;
    380      1.1  mrg           min_j = j;
    381      1.1  mrg         }
    382      1.1  mrg     }
    383      1.1  mrg 
    384      1.1  mrg   return min_j;
    385      1.1  mrg }
    386      1.1  mrg 
    387      1.1  mrg 
    388  1.1.1.3  mrg /* Measuring for recompiled mpn/generic/div_qr_1.c,
    389  1.1.1.3  mrg  * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
    390  1.1.1.3  mrg 
    391  1.1.1.3  mrg mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
    392  1.1.1.3  mrg 
    393  1.1.1.3  mrg #if defined (__cplusplus)
    394  1.1.1.3  mrg extern "C" {
    395  1.1.1.3  mrg #endif
    396      1.1  mrg 
    397  1.1.1.2  mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
    398  1.1.1.2  mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
    399  1.1.1.2  mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long);
    400      1.1  mrg 
    401  1.1.1.3  mrg #if defined (__cplusplus)
    402  1.1.1.3  mrg }
    403  1.1.1.3  mrg #endif
    404  1.1.1.3  mrg 
    405      1.1  mrg double
    406      1.1  mrg speed_mpn_mod_1_tune (struct speed_params *s)
    407      1.1  mrg {
    408      1.1  mrg   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
    409      1.1  mrg }
    410      1.1  mrg double
    411      1.1  mrg speed_mpn_divrem_1_tune (struct speed_params *s)
    412      1.1  mrg {
    413      1.1  mrg   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
    414      1.1  mrg }
    415  1.1.1.2  mrg double
    416  1.1.1.2  mrg speed_mpz_fac_ui_tune (struct speed_params *s)
    417  1.1.1.2  mrg {
    418  1.1.1.2  mrg   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
    419  1.1.1.2  mrg }
    420  1.1.1.3  mrg double
    421  1.1.1.3  mrg speed_mpn_div_qr_1_tune (struct speed_params *s)
    422  1.1.1.3  mrg {
    423  1.1.1.3  mrg   SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
    424  1.1.1.3  mrg }
    425      1.1  mrg 
    426      1.1  mrg double
    427      1.1  mrg tuneup_measure (speed_function_t fun,
    428      1.1  mrg                 const struct param_t *param,
    429      1.1  mrg                 struct speed_params *s)
    430      1.1  mrg {
    431      1.1  mrg   static struct param_t  dummy;
    432      1.1  mrg   double   t;
    433      1.1  mrg   TMP_DECL;
    434      1.1  mrg 
    435      1.1  mrg   if (! param)
    436      1.1  mrg     param = &dummy;
    437      1.1  mrg 
    438      1.1  mrg   s->size += param->size_extra;
    439      1.1  mrg 
    440      1.1  mrg   TMP_MARK;
    441      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
    442      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
    443      1.1  mrg 
    444      1.1  mrg   mpn_random (s->xp, s->size);
    445      1.1  mrg   mpn_random (s->yp, s->size);
    446      1.1  mrg 
    447      1.1  mrg   switch (param->data_high) {
    448      1.1  mrg   case DATA_HIGH_LT_R:
    449      1.1  mrg     s->xp[s->size-1] %= s->r;
    450      1.1  mrg     s->yp[s->size-1] %= s->r;
    451      1.1  mrg     break;
    452      1.1  mrg   case DATA_HIGH_GE_R:
    453      1.1  mrg     s->xp[s->size-1] |= s->r;
    454      1.1  mrg     s->yp[s->size-1] |= s->r;
    455      1.1  mrg     break;
    456      1.1  mrg   }
    457      1.1  mrg 
    458      1.1  mrg   t = speed_measure (fun, s);
    459      1.1  mrg 
    460      1.1  mrg   s->size -= param->size_extra;
    461      1.1  mrg 
    462      1.1  mrg   TMP_FREE;
    463      1.1  mrg   return t;
    464      1.1  mrg }
    465      1.1  mrg 
    466      1.1  mrg 
    467      1.1  mrg #define PRINT_WIDTH  31
    468      1.1  mrg 
    469      1.1  mrg void
    470      1.1  mrg print_define_start (const char *name)
    471      1.1  mrg {
    472      1.1  mrg   printf ("#define %-*s  ", PRINT_WIDTH, name);
    473      1.1  mrg   if (option_trace)
    474      1.1  mrg     printf ("...\n");
    475      1.1  mrg }
    476      1.1  mrg 
    477      1.1  mrg void
    478      1.1  mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
    479      1.1  mrg {
    480      1.1  mrg   if (option_trace)
    481      1.1  mrg     printf ("#define %-*s  ", PRINT_WIDTH, name);
    482      1.1  mrg 
    483      1.1  mrg   if (value == MP_SIZE_T_MAX)
    484      1.1  mrg     printf ("MP_SIZE_T_MAX");
    485      1.1  mrg   else
    486      1.1  mrg     printf ("%5ld", (long) value);
    487      1.1  mrg 
    488      1.1  mrg   if (remark != NULL)
    489      1.1  mrg     printf ("  /* %s */", remark);
    490      1.1  mrg   printf ("\n");
    491      1.1  mrg   fflush (stdout);
    492      1.1  mrg }
    493      1.1  mrg 
    494      1.1  mrg void
    495      1.1  mrg print_define_end (const char *name, mp_size_t value)
    496      1.1  mrg {
    497      1.1  mrg   const char  *remark;
    498      1.1  mrg   if (value == MP_SIZE_T_MAX)
    499      1.1  mrg     remark = "never";
    500      1.1  mrg   else if (value == 0)
    501      1.1  mrg     remark = "always";
    502      1.1  mrg   else
    503      1.1  mrg     remark = NULL;
    504      1.1  mrg   print_define_end_remark (name, value, remark);
    505      1.1  mrg }
    506      1.1  mrg 
    507      1.1  mrg void
    508      1.1  mrg print_define (const char *name, mp_size_t value)
    509      1.1  mrg {
    510      1.1  mrg   print_define_start (name);
    511      1.1  mrg   print_define_end (name, value);
    512      1.1  mrg }
    513      1.1  mrg 
    514      1.1  mrg void
    515      1.1  mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
    516      1.1  mrg {
    517      1.1  mrg   print_define_start (name);
    518      1.1  mrg   print_define_end_remark (name, value, remark);
    519      1.1  mrg }
    520      1.1  mrg 
    521  1.1.1.4  mrg void
    522  1.1.1.4  mrg print_define_with_speedup (const char *name, mp_size_t value,
    523  1.1.1.4  mrg 			   mp_size_t runner_up, double speedup)
    524  1.1.1.4  mrg {
    525  1.1.1.4  mrg   char buf[100];
    526  1.1.1.5  mrg #if __STDC_VERSION__ >= 199901L
    527  1.1.1.5  mrg   snprintf (buf, sizeof buf, "%.2f%% faster than %ld",
    528  1.1.1.4  mrg 	    100.0 * (speedup - 1), runner_up);
    529  1.1.1.5  mrg #else
    530  1.1.1.5  mrg   sprintf (buf, "%.2f%% faster than %ld",
    531  1.1.1.5  mrg 	    100.0 * (speedup - 1), runner_up);
    532  1.1.1.5  mrg #endif
    533  1.1.1.4  mrg   print_define_remark (name, value, buf);
    534  1.1.1.4  mrg }
    535      1.1  mrg 
    536      1.1  mrg void
    537      1.1  mrg one (mp_size_t *threshold, struct param_t *param)
    538      1.1  mrg {
    539      1.1  mrg   int  since_positive, since_thresh_change;
    540      1.1  mrg   int  thresh_idx, new_thresh_idx;
    541      1.1  mrg 
    542      1.1  mrg #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
    543      1.1  mrg 
    544      1.1  mrg   DEFAULT (param->function_fudge, 1.0);
    545      1.1  mrg   DEFAULT (param->function2, param->function);
    546      1.1  mrg   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
    547      1.1  mrg   DEFAULT (param->step, 1);            /* small steps by default */
    548      1.1  mrg   DEFAULT (param->stop_since_change, 80);
    549      1.1  mrg   DEFAULT (param->stop_factor, 1.2);
    550      1.1  mrg   DEFAULT (param->min_size, 10);
    551      1.1  mrg   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
    552      1.1  mrg 
    553      1.1  mrg   if (param->check_size != 0)
    554      1.1  mrg     {
    555      1.1  mrg       double   t1, t2;
    556      1.1  mrg       s.size = param->check_size;
    557      1.1  mrg 
    558      1.1  mrg       *threshold = s.size+1;
    559      1.1  mrg       t1 = tuneup_measure (param->function, param, &s);
    560      1.1  mrg 
    561      1.1  mrg       *threshold = s.size;
    562      1.1  mrg       t2 = tuneup_measure (param->function2, param, &s);
    563      1.1  mrg       if (t1 == -1.0 || t2 == -1.0)
    564      1.1  mrg         {
    565      1.1  mrg           printf ("Oops, can't run both functions at size %ld\n",
    566      1.1  mrg                   (long) s.size);
    567      1.1  mrg           abort ();
    568      1.1  mrg         }
    569      1.1  mrg       t1 *= param->function_fudge;
    570      1.1  mrg 
    571      1.1  mrg       /* ask that t2 is at least 4% below t1 */
    572      1.1  mrg       if (t1 < t2*1.04)
    573      1.1  mrg         {
    574      1.1  mrg           if (option_trace)
    575      1.1  mrg             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
    576      1.1  mrg           *threshold = MP_SIZE_T_MAX;
    577      1.1  mrg           if (! param->noprint)
    578      1.1  mrg             print_define (param->name, *threshold);
    579      1.1  mrg           return;
    580      1.1  mrg         }
    581      1.1  mrg 
    582      1.1  mrg       if (option_trace >= 2)
    583      1.1  mrg         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
    584      1.1  mrg                 (long) s.size, t1, t2);
    585      1.1  mrg     }
    586      1.1  mrg 
    587      1.1  mrg   if (! param->noprint || option_trace)
    588      1.1  mrg     print_define_start (param->name);
    589      1.1  mrg 
    590      1.1  mrg   ndat = 0;
    591      1.1  mrg   since_positive = 0;
    592      1.1  mrg   since_thresh_change = 0;
    593      1.1  mrg   thresh_idx = 0;
    594      1.1  mrg 
    595      1.1  mrg   if (option_trace >= 2)
    596      1.1  mrg     {
    597      1.1  mrg       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
    598      1.1  mrg       printf ("              (seconds)    (seconds)    diff    thresh\n");
    599      1.1  mrg     }
    600      1.1  mrg 
    601      1.1  mrg   for (s.size = param->min_size;
    602      1.1  mrg        s.size < param->max_size;
    603      1.1  mrg        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
    604      1.1  mrg     {
    605      1.1  mrg       double   ti, tiplus1, d;
    606      1.1  mrg 
    607      1.1  mrg       /*
    608      1.1  mrg         FIXME: check minimum size requirements are met, possibly by just
    609      1.1  mrg         checking for the -1 returns from the speed functions.
    610      1.1  mrg       */
    611      1.1  mrg 
    612      1.1  mrg       /* using method A at this size */
    613      1.1  mrg       *threshold = s.size+1;
    614      1.1  mrg       ti = tuneup_measure (param->function, param, &s);
    615      1.1  mrg       if (ti == -1.0)
    616      1.1  mrg         abort ();
    617      1.1  mrg       ti *= param->function_fudge;
    618      1.1  mrg 
    619      1.1  mrg       /* using method B at this size */
    620      1.1  mrg       *threshold = s.size;
    621      1.1  mrg       tiplus1 = tuneup_measure (param->function2, param, &s);
    622      1.1  mrg       if (tiplus1 == -1.0)
    623      1.1  mrg         abort ();
    624      1.1  mrg 
    625      1.1  mrg       /* Calculate the fraction by which the one or the other routine is
    626      1.1  mrg          slower.  */
    627      1.1  mrg       if (tiplus1 >= ti)
    628      1.1  mrg         d = (tiplus1 - ti) / tiplus1;  /* negative */
    629      1.1  mrg       else
    630      1.1  mrg         d = (tiplus1 - ti) / ti;       /* positive */
    631      1.1  mrg 
    632      1.1  mrg       add_dat (s.size, d);
    633      1.1  mrg 
    634      1.1  mrg       new_thresh_idx = analyze_dat (0);
    635      1.1  mrg 
    636      1.1  mrg       if (option_trace >= 2)
    637      1.1  mrg         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
    638      1.1  mrg                 (long) s.size, ti, tiplus1, d,
    639      1.1  mrg                 ti > tiplus1 ? '#' : ' ',
    640      1.1  mrg                 (long) dat[new_thresh_idx].size);
    641      1.1  mrg 
    642      1.1  mrg       /* Stop if the last time method i was faster was more than a
    643      1.1  mrg          certain number of measurements ago.  */
    644      1.1  mrg #define STOP_SINCE_POSITIVE  200
    645      1.1  mrg       if (d >= 0)
    646      1.1  mrg         since_positive = 0;
    647      1.1  mrg       else
    648      1.1  mrg         if (++since_positive > STOP_SINCE_POSITIVE)
    649      1.1  mrg           {
    650      1.1  mrg             if (option_trace >= 1)
    651      1.1  mrg               printf ("stopped due to since_positive (%d)\n",
    652      1.1  mrg                       STOP_SINCE_POSITIVE);
    653      1.1  mrg             break;
    654      1.1  mrg           }
    655      1.1  mrg 
    656      1.1  mrg       /* Stop if method A has become slower by a certain factor. */
    657      1.1  mrg       if (ti >= tiplus1 * param->stop_factor)
    658      1.1  mrg         {
    659      1.1  mrg           if (option_trace >= 1)
    660      1.1  mrg             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
    661      1.1  mrg                     param->stop_factor);
    662      1.1  mrg           break;
    663      1.1  mrg         }
    664      1.1  mrg 
    665      1.1  mrg       /* Stop if the threshold implied hasn't changed in a certain
    666      1.1  mrg          number of measurements.  (It's this condition that usually
    667      1.1  mrg          stops the loop.) */
    668      1.1  mrg       if (thresh_idx != new_thresh_idx)
    669      1.1  mrg         since_thresh_change = 0, thresh_idx = new_thresh_idx;
    670      1.1  mrg       else
    671      1.1  mrg         if (++since_thresh_change > param->stop_since_change)
    672      1.1  mrg           {
    673      1.1  mrg             if (option_trace >= 1)
    674      1.1  mrg               printf ("stopped due to since_thresh_change (%d)\n",
    675      1.1  mrg                       param->stop_since_change);
    676      1.1  mrg             break;
    677      1.1  mrg           }
    678      1.1  mrg 
    679      1.1  mrg       /* Stop if the threshold implied is more than a certain number of
    680      1.1  mrg          measurements ago.  */
    681      1.1  mrg #define STOP_SINCE_AFTER   500
    682      1.1  mrg       if (ndat - thresh_idx > STOP_SINCE_AFTER)
    683      1.1  mrg         {
    684      1.1  mrg           if (option_trace >= 1)
    685      1.1  mrg             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
    686      1.1  mrg                     STOP_SINCE_AFTER);
    687      1.1  mrg           break;
    688      1.1  mrg         }
    689      1.1  mrg 
    690      1.1  mrg       /* Stop when the size limit is reached before the end of the
    691      1.1  mrg          crossover, but only show this as an error for >= the default max
    692      1.1  mrg          size.  FIXME: Maybe should make it a param choice whether this is
    693      1.1  mrg          an error.  */
    694      1.1  mrg       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
    695      1.1  mrg         {
    696      1.1  mrg           fprintf (stderr, "%s\n", param->name);
    697      1.1  mrg           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
    698      1.1  mrg                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    699      1.1  mrg           fprintf (stderr, "    max size reached before end of crossover\n");
    700      1.1  mrg           break;
    701      1.1  mrg         }
    702      1.1  mrg     }
    703      1.1  mrg 
    704      1.1  mrg   if (option_trace >= 1)
    705      1.1  mrg     printf ("sizes %ld to %ld total %d measurements\n",
    706      1.1  mrg             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    707      1.1  mrg 
    708      1.1  mrg   *threshold = dat[analyze_dat (1)].size;
    709      1.1  mrg 
    710      1.1  mrg   if (param->min_is_always)
    711      1.1  mrg     {
    712      1.1  mrg       if (*threshold == param->min_size)
    713      1.1  mrg         *threshold = 0;
    714      1.1  mrg     }
    715      1.1  mrg 
    716      1.1  mrg   if (! param->noprint || option_trace)
    717      1.1  mrg     print_define_end (param->name, *threshold);
    718      1.1  mrg }
    719      1.1  mrg 
    720  1.1.1.4  mrg /* Time N different FUNCTIONS with the same parameters and size, to
    721  1.1.1.4  mrg    select the fastest. Since *_METHOD defines start numbering from
    722  1.1.1.4  mrg    one, if functions[i] is fastest, the value of the define is i+1.
    723  1.1.1.4  mrg    Also output a comment with speedup compared to the next fastest
    724  1.1.1.4  mrg    function. The NAME argument is used only for trace output.
    725  1.1.1.4  mrg 
    726  1.1.1.4  mrg    Returns the index of the fastest function.
    727  1.1.1.4  mrg */
    728  1.1.1.4  mrg int
    729  1.1.1.4  mrg one_method (int n, speed_function_t *functions,
    730  1.1.1.4  mrg 	    const char *name, const char *define,
    731  1.1.1.4  mrg 	    const struct param_t *param)
    732  1.1.1.4  mrg {
    733  1.1.1.4  mrg   double *t;
    734  1.1.1.4  mrg   int i;
    735  1.1.1.4  mrg   int method;
    736  1.1.1.4  mrg   int method_runner_up;
    737  1.1.1.4  mrg 
    738  1.1.1.4  mrg   TMP_DECL;
    739  1.1.1.4  mrg   TMP_MARK;
    740  1.1.1.4  mrg   t = (double*) TMP_ALLOC (n * sizeof (*t));
    741  1.1.1.4  mrg 
    742  1.1.1.4  mrg   for (i = 0; i < n; i++)
    743  1.1.1.4  mrg     {
    744  1.1.1.4  mrg       t[i] = tuneup_measure (functions[i], param, &s);
    745  1.1.1.4  mrg       if (option_trace >= 1)
    746  1.1.1.4  mrg 	printf ("size=%ld, %s, method %d %.9f\n",
    747  1.1.1.4  mrg 		(long) s.size, name, i + 1, t[i]);
    748  1.1.1.4  mrg       if (t[i] == -1.0)
    749  1.1.1.4  mrg 	{
    750  1.1.1.4  mrg 	  printf ("Oops, can't measure all %s methods\n", name);
    751  1.1.1.4  mrg 	  abort ();
    752  1.1.1.4  mrg 	}
    753  1.1.1.4  mrg     }
    754  1.1.1.4  mrg   method = 0;
    755  1.1.1.4  mrg   for (i = 1; i < n; i++)
    756  1.1.1.4  mrg     if (t[i] < t[method])
    757  1.1.1.4  mrg       method = i;
    758  1.1.1.4  mrg 
    759  1.1.1.4  mrg   method_runner_up = (method == 0);
    760  1.1.1.4  mrg   for (i = 0; i < n; i++)
    761  1.1.1.4  mrg     if (i != method && t[i] < t[method_runner_up])
    762  1.1.1.4  mrg       method_runner_up = i;
    763  1.1.1.4  mrg 
    764  1.1.1.4  mrg   print_define_with_speedup (define, method + 1, method_runner_up + 1,
    765  1.1.1.4  mrg 			     t[method_runner_up] / t[method]);
    766  1.1.1.4  mrg 
    767  1.1.1.4  mrg   TMP_FREE;
    768  1.1.1.4  mrg   return method;
    769  1.1.1.4  mrg }
    770  1.1.1.4  mrg 
    771      1.1  mrg 
    772      1.1  mrg /* Special probing for the fft thresholds.  The size restrictions on the
    773      1.1  mrg    FFTs mean the graph of time vs size has a step effect.  See this for
    774      1.1  mrg    example using
    775      1.1  mrg 
    776      1.1  mrg        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
    777      1.1  mrg        gnuplot foo.gnuplot
    778      1.1  mrg 
    779      1.1  mrg    The current approach is to compare routines at the midpoint of relevant
    780      1.1  mrg    steps.  Arguably a more sophisticated system of threshold data is wanted
    781      1.1  mrg    if this step effect remains. */
    782      1.1  mrg 
    783      1.1  mrg struct fft_param_t {
    784      1.1  mrg   const char        *table_name;
    785      1.1  mrg   const char        *threshold_name;
    786      1.1  mrg   const char        *modf_threshold_name;
    787      1.1  mrg   mp_size_t         *p_threshold;
    788      1.1  mrg   mp_size_t         *p_modf_threshold;
    789      1.1  mrg   mp_size_t         first_size;
    790      1.1  mrg   mp_size_t         max_size;
    791      1.1  mrg   speed_function_t  function;
    792      1.1  mrg   speed_function_t  mul_modf_function;
    793      1.1  mrg   speed_function_t  mul_function;
    794      1.1  mrg   mp_size_t         sqr;
    795      1.1  mrg };
    796      1.1  mrg 
    797      1.1  mrg 
    798      1.1  mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    799      1.1  mrg    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    800      1.1  mrg    of 2^(k-1) bits. */
    801      1.1  mrg 
    802      1.1  mrg mp_size_t
    803      1.1  mrg fft_step_size (int k)
    804      1.1  mrg {
    805      1.1  mrg   mp_size_t  step;
    806      1.1  mrg 
    807      1.1  mrg   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
    808      1.1  mrg   step *= (mp_size_t) 1 << k;
    809      1.1  mrg 
    810      1.1  mrg   if (step <= 0)
    811      1.1  mrg     {
    812      1.1  mrg       printf ("Can't handle k=%d\n", k);
    813      1.1  mrg       abort ();
    814      1.1  mrg     }
    815      1.1  mrg 
    816      1.1  mrg   return step;
    817      1.1  mrg }
    818      1.1  mrg 
    819      1.1  mrg mp_size_t
    820      1.1  mrg fft_next_size (mp_size_t pl, int k)
    821      1.1  mrg {
    822      1.1  mrg   mp_size_t  m = fft_step_size (k);
    823      1.1  mrg 
    824      1.1  mrg /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
    825      1.1  mrg 
    826      1.1  mrg   if (pl == 0 || (pl & (m-1)) != 0)
    827      1.1  mrg     pl = (pl | (m-1)) + 1;
    828      1.1  mrg 
    829      1.1  mrg /*    printf (" %ld\n", pl); */
    830      1.1  mrg   return pl;
    831      1.1  mrg }
    832      1.1  mrg 
    833      1.1  mrg #define NMAX_DEFAULT 1000000
    834      1.1  mrg #define MAX_REPS 25
    835      1.1  mrg #define MIN_REPS 5
    836      1.1  mrg 
    837      1.1  mrg static inline size_t
    838      1.1  mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
    839      1.1  mrg {
    840      1.1  mrg   unsigned int l = k;
    841      1.1  mrg 
    842      1.1  mrg   while (a % 2 == 0 && k > 0)
    843      1.1  mrg     {
    844      1.1  mrg       a >>= 1;
    845      1.1  mrg       k--;
    846      1.1  mrg     }
    847      1.1  mrg   return a << l;
    848      1.1  mrg }
    849      1.1  mrg 
    850      1.1  mrg mp_size_t
    851      1.1  mrg fftfill (mp_size_t pl, int k, int sqr)
    852      1.1  mrg {
    853      1.1  mrg   mp_size_t maxLK;
    854      1.1  mrg   mp_bitcnt_t N, Nprime, nprime, M;
    855      1.1  mrg 
    856      1.1  mrg   N = pl * GMP_NUMB_BITS;
    857      1.1  mrg   M = N >> k;
    858      1.1  mrg 
    859      1.1  mrg   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
    860      1.1  mrg 
    861      1.1  mrg   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
    862      1.1  mrg   nprime = Nprime / GMP_NUMB_BITS;
    863      1.1  mrg   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    864      1.1  mrg     {
    865      1.1  mrg       size_t K2;
    866      1.1  mrg       for (;;)
    867      1.1  mrg 	{
    868      1.1  mrg 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
    869      1.1  mrg 	  if ((nprime & (K2 - 1)) == 0)
    870      1.1  mrg 	    break;
    871      1.1  mrg 	  nprime = (nprime + K2 - 1) & -K2;
    872      1.1  mrg 	  Nprime = nprime * GMP_LIMB_BITS;
    873      1.1  mrg 	}
    874      1.1  mrg     }
    875      1.1  mrg   ASSERT_ALWAYS (nprime < pl);
    876      1.1  mrg 
    877      1.1  mrg   return Nprime;
    878      1.1  mrg }
    879      1.1  mrg 
    880      1.1  mrg static int
    881      1.1  mrg compare_double (const void *ap, const void *bp)
    882      1.1  mrg {
    883      1.1  mrg   double a = * (const double *) ap;
    884      1.1  mrg   double b = * (const double *) bp;
    885      1.1  mrg 
    886      1.1  mrg   if (a < b)
    887      1.1  mrg     return -1;
    888      1.1  mrg   else if (a > b)
    889      1.1  mrg     return 1;
    890      1.1  mrg   else
    891      1.1  mrg     return 0;
    892      1.1  mrg }
    893      1.1  mrg 
    894      1.1  mrg double
    895      1.1  mrg median (double *times, int n)
    896      1.1  mrg {
    897      1.1  mrg   qsort (times, n, sizeof (double), compare_double);
    898      1.1  mrg   return times[n/2];
    899      1.1  mrg }
    900      1.1  mrg 
    901      1.1  mrg #define FFT_CACHE_SIZE 25
    902      1.1  mrg typedef struct fft_cache
    903      1.1  mrg {
    904      1.1  mrg   mp_size_t n;
    905      1.1  mrg   double time;
    906      1.1  mrg } fft_cache_t;
    907      1.1  mrg 
    908      1.1  mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
    909      1.1  mrg 
    910      1.1  mrg double
    911      1.1  mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
    912      1.1  mrg 		int n_measurements)
    913      1.1  mrg {
    914      1.1  mrg   int i;
    915      1.1  mrg   double t, ttab[MAX_REPS];
    916      1.1  mrg 
    917      1.1  mrg   if (fft_cache[k].n == n)
    918      1.1  mrg     return fft_cache[k].time;
    919      1.1  mrg 
    920      1.1  mrg   for (i = 0; i < n_measurements; i++)
    921      1.1  mrg     {
    922      1.1  mrg       speed_starttime ();
    923      1.1  mrg       mpn_mul_fft (rp, n, ap, n, bp, n, k);
    924      1.1  mrg       ttab[i] = speed_endtime ();
    925      1.1  mrg     }
    926      1.1  mrg 
    927      1.1  mrg   t = median (ttab, n_measurements);
    928      1.1  mrg   fft_cache[k].n = n;
    929      1.1  mrg   fft_cache[k].time = t;
    930      1.1  mrg   return t;
    931      1.1  mrg }
    932      1.1  mrg 
    933      1.1  mrg #define INSERT_FFTTAB(idx, nval, kval)					\
    934      1.1  mrg   do {									\
    935      1.1  mrg     fft_tab[idx].n = nval;						\
    936      1.1  mrg     fft_tab[idx].k = kval;						\
    937  1.1.1.3  mrg     fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
    938  1.1.1.3  mrg     fft_tab[idx+1].k = (1 <<  5) - 1;					\
    939      1.1  mrg   } while (0)
    940      1.1  mrg 
    941      1.1  mrg int
    942      1.1  mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
    943      1.1  mrg {
    944      1.1  mrg   mp_size_t n, n1, prev_n1;
    945      1.1  mrg   int k, best_k, last_best_k, kmax;
    946      1.1  mrg   int eff, prev_eff;
    947      1.1  mrg   double t0, t1;
    948      1.1  mrg   int n_measurements;
    949      1.1  mrg   mp_limb_t *ap, *bp, *rp;
    950      1.1  mrg   mp_size_t alloc;
    951      1.1  mrg   struct fft_table_nk *fft_tab;
    952      1.1  mrg 
    953      1.1  mrg   fft_tab = mpn_fft_table3[p->sqr];
    954      1.1  mrg 
    955      1.1  mrg   for (k = 0; k < FFT_CACHE_SIZE; k++)
    956      1.1  mrg     fft_cache[k].n = 0;
    957      1.1  mrg 
    958      1.1  mrg   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    959      1.1  mrg     {
    960      1.1  mrg       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
    961      1.1  mrg     }
    962      1.1  mrg 
    963      1.1  mrg   if (print)
    964      1.1  mrg     printf ("#define %s%*s", p->table_name, 38, "");
    965      1.1  mrg 
    966      1.1  mrg   if (idx == 0)
    967      1.1  mrg     {
    968      1.1  mrg       INSERT_FFTTAB (0, nmin, initial_k);
    969      1.1  mrg 
    970      1.1  mrg       if (print)
    971      1.1  mrg 	{
    972      1.1  mrg 	  printf ("\\\n  { ");
    973      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
    974      1.1  mrg 	}
    975      1.1  mrg 
    976      1.1  mrg       idx = 1;
    977      1.1  mrg     }
    978      1.1  mrg 
    979  1.1.1.3  mrg   ap = (mp_ptr) malloc (sizeof (mp_limb_t));
    980      1.1  mrg   if (p->sqr)
    981      1.1  mrg     bp = ap;
    982      1.1  mrg   else
    983  1.1.1.3  mrg     bp = (mp_ptr) malloc (sizeof (mp_limb_t));
    984  1.1.1.3  mrg   rp = (mp_ptr) malloc (sizeof (mp_limb_t));
    985      1.1  mrg   alloc = 1;
    986      1.1  mrg 
    987      1.1  mrg   /* Round n to comply to initial k value */
    988      1.1  mrg   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
    989      1.1  mrg 
    990      1.1  mrg   n_measurements = (18 - initial_k) | 1;
    991      1.1  mrg   n_measurements = MAX (n_measurements, MIN_REPS);
    992      1.1  mrg   n_measurements = MIN (n_measurements, MAX_REPS);
    993      1.1  mrg 
    994      1.1  mrg   last_best_k = initial_k;
    995      1.1  mrg   best_k = initial_k;
    996      1.1  mrg 
    997      1.1  mrg   while (n < nmax)
    998      1.1  mrg     {
    999      1.1  mrg       int start_k, end_k;
   1000      1.1  mrg 
   1001      1.1  mrg       /* Assume the current best k is best until we hit its next FFT step.  */
   1002      1.1  mrg       t0 = 99999;
   1003      1.1  mrg 
   1004      1.1  mrg       prev_n1 = n + 1;
   1005      1.1  mrg 
   1006      1.1  mrg       start_k = MAX (4, best_k - 4);
   1007      1.1  mrg       end_k = MIN (24, best_k + 4);
   1008      1.1  mrg       for (k = start_k; k <= end_k; k++)
   1009      1.1  mrg 	{
   1010      1.1  mrg           n1 = mpn_fft_next_size (prev_n1, k);
   1011      1.1  mrg 
   1012      1.1  mrg 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
   1013      1.1  mrg 
   1014      1.1  mrg 	  if (eff < 70)		/* avoid measuring too slow fft:s */
   1015      1.1  mrg 	    continue;
   1016      1.1  mrg 
   1017      1.1  mrg 	  if (n1 > alloc)
   1018      1.1  mrg 	    {
   1019      1.1  mrg 	      alloc = n1;
   1020      1.1  mrg 	      if (p->sqr)
   1021      1.1  mrg 		{
   1022  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   1023  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   1024  1.1.1.3  mrg 		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   1025      1.1  mrg 		  mpn_random (ap, alloc);
   1026  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   1027      1.1  mrg 		}
   1028      1.1  mrg 	      else
   1029      1.1  mrg 		{
   1030  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   1031  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
   1032  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   1033  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   1034      1.1  mrg 		  mpn_random (ap, alloc);
   1035  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
   1036      1.1  mrg 		  mpn_random (bp, alloc);
   1037  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   1038      1.1  mrg 		}
   1039      1.1  mrg 	    }
   1040      1.1  mrg 
   1041      1.1  mrg 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
   1042      1.1  mrg 
   1043      1.1  mrg 	  if (t1 * n_measurements > 0.3)
   1044      1.1  mrg 	    n_measurements -= 2;
   1045      1.1  mrg 	  n_measurements = MAX (n_measurements, MIN_REPS);
   1046      1.1  mrg 
   1047      1.1  mrg 	  if (t1 < t0)
   1048      1.1  mrg 	    {
   1049      1.1  mrg 	      best_k = k;
   1050      1.1  mrg 	      t0 = t1;
   1051      1.1  mrg 	    }
   1052      1.1  mrg 	}
   1053      1.1  mrg 
   1054      1.1  mrg       n1 = mpn_fft_next_size (prev_n1, best_k);
   1055      1.1  mrg 
   1056      1.1  mrg       if (last_best_k != best_k)
   1057      1.1  mrg 	{
   1058      1.1  mrg 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
   1059      1.1  mrg 
   1060      1.1  mrg 	  if (idx >= FFT_TABLE3_SIZE)
   1061      1.1  mrg 	    {
   1062      1.1  mrg 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1063      1.1  mrg 	      abort ();
   1064      1.1  mrg 	    }
   1065      1.1  mrg 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
   1066      1.1  mrg 
   1067      1.1  mrg 	  if (print)
   1068      1.1  mrg 	    {
   1069      1.1  mrg 	      printf (", ");
   1070      1.1  mrg 	      if (idx % 4 == 0)
   1071      1.1  mrg 		printf ("\\\n    ");
   1072      1.1  mrg 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1073      1.1  mrg 	    }
   1074      1.1  mrg 
   1075      1.1  mrg 	  if (option_trace >= 2)
   1076      1.1  mrg 	    {
   1077      1.1  mrg 	      printf ("{%lu,%u}\n", prev_n1, best_k);
   1078      1.1  mrg 	      fflush (stdout);
   1079      1.1  mrg 	    }
   1080      1.1  mrg 
   1081      1.1  mrg 	  last_best_k = best_k;
   1082      1.1  mrg 	  idx++;
   1083      1.1  mrg 	}
   1084      1.1  mrg 
   1085      1.1  mrg       for (;;)
   1086      1.1  mrg 	{
   1087      1.1  mrg 	  prev_n1 = n1;
   1088      1.1  mrg 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
   1089      1.1  mrg 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
   1090      1.1  mrg 	  eff = fftfill (n1, best_k, p->sqr);
   1091      1.1  mrg 
   1092      1.1  mrg 	  if (eff != prev_eff)
   1093      1.1  mrg 	    break;
   1094      1.1  mrg 	}
   1095      1.1  mrg 
   1096      1.1  mrg       n = prev_n1;
   1097      1.1  mrg     }
   1098      1.1  mrg 
   1099      1.1  mrg   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
   1100      1.1  mrg   kmax = MIN (kmax, 25-1);
   1101      1.1  mrg   for (k = last_best_k + 1; k <= kmax; k++)
   1102      1.1  mrg     {
   1103      1.1  mrg       if (idx >= FFT_TABLE3_SIZE)
   1104      1.1  mrg 	{
   1105      1.1  mrg 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1106      1.1  mrg 	  abort ();
   1107      1.1  mrg 	}
   1108      1.1  mrg       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
   1109      1.1  mrg 
   1110      1.1  mrg       if (print)
   1111      1.1  mrg 	{
   1112      1.1  mrg 	  printf (", ");
   1113      1.1  mrg 	  if (idx % 4 == 0)
   1114      1.1  mrg 	    printf ("\\\n    ");
   1115      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1116      1.1  mrg 	}
   1117      1.1  mrg 
   1118      1.1  mrg       idx++;
   1119      1.1  mrg     }
   1120      1.1  mrg 
   1121      1.1  mrg   if (print)
   1122      1.1  mrg     printf (" }\n");
   1123      1.1  mrg 
   1124      1.1  mrg   free (ap);
   1125      1.1  mrg   if (! p->sqr)
   1126      1.1  mrg     free (bp);
   1127      1.1  mrg   free (rp);
   1128      1.1  mrg 
   1129      1.1  mrg   return idx;
   1130      1.1  mrg }
   1131      1.1  mrg 
   1132      1.1  mrg void
   1133      1.1  mrg fft (struct fft_param_t *p)
   1134      1.1  mrg {
   1135      1.1  mrg   mp_size_t  size;
   1136      1.1  mrg   int        k, idx, initial_k;
   1137      1.1  mrg 
   1138      1.1  mrg   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
   1139      1.1  mrg 
   1140      1.1  mrg #if 1
   1141      1.1  mrg   {
   1142      1.1  mrg     /* Use plain one() mechanism, for some reasonable initial values of k.  The
   1143      1.1  mrg        advantage is that we don't depend on mpn_fft_table3, which can therefore
   1144      1.1  mrg        leave it completely uninitialized.  */
   1145      1.1  mrg 
   1146      1.1  mrg     static struct param_t param;
   1147      1.1  mrg     mp_size_t thres, best_thres;
   1148      1.1  mrg     int best_k;
   1149      1.1  mrg     char buf[20];
   1150      1.1  mrg 
   1151      1.1  mrg     best_thres = MP_SIZE_T_MAX;
   1152      1.1  mrg     best_k = -1;
   1153      1.1  mrg 
   1154      1.1  mrg     for (k = 5; k <= 7; k++)
   1155      1.1  mrg       {
   1156      1.1  mrg 	param.name = p->modf_threshold_name;
   1157      1.1  mrg 	param.min_size = 100;
   1158      1.1  mrg 	param.max_size = 2000;
   1159      1.1  mrg 	param.function  = p->mul_function;
   1160      1.1  mrg 	param.step_factor = 0.0;
   1161      1.1  mrg 	param.step = 4;
   1162      1.1  mrg 	param.function2 = p->mul_modf_function;
   1163      1.1  mrg 	param.noprint = 1;
   1164      1.1  mrg 	s.r = k;
   1165      1.1  mrg 	one (&thres, &param);
   1166      1.1  mrg 	if (thres < best_thres)
   1167      1.1  mrg 	  {
   1168      1.1  mrg 	    best_thres = thres;
   1169      1.1  mrg 	    best_k = k;
   1170      1.1  mrg 	  }
   1171      1.1  mrg       }
   1172      1.1  mrg 
   1173      1.1  mrg     *(p->p_modf_threshold) = best_thres;
   1174      1.1  mrg     sprintf (buf, "k = %d", best_k);
   1175      1.1  mrg     print_define_remark (p->modf_threshold_name, best_thres, buf);
   1176      1.1  mrg     initial_k = best_k;
   1177      1.1  mrg   }
   1178      1.1  mrg #else
   1179      1.1  mrg   size = p->first_size;
   1180      1.1  mrg   for (;;)
   1181      1.1  mrg     {
   1182      1.1  mrg       double  tk, tm;
   1183      1.1  mrg 
   1184      1.1  mrg       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
   1185      1.1  mrg       k = mpn_fft_best_k (size, p->sqr);
   1186      1.1  mrg 
   1187      1.1  mrg       if (size >= p->max_size)
   1188      1.1  mrg         break;
   1189      1.1  mrg 
   1190      1.1  mrg       s.size = size + fft_step_size (k) / 2;
   1191      1.1  mrg       s.r = k;
   1192      1.1  mrg       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
   1193      1.1  mrg       if (tk == -1.0)
   1194      1.1  mrg         abort ();
   1195      1.1  mrg 
   1196      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1197      1.1  mrg       if (tm == -1.0)
   1198      1.1  mrg         abort ();
   1199      1.1  mrg 
   1200      1.1  mrg       if (option_trace >= 2)
   1201      1.1  mrg         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
   1202      1.1  mrg                 (long) size,
   1203      1.1  mrg                 (long) size + fft_step_size (k) / 2, k, tk,
   1204      1.1  mrg                 (long) s.size, tm);
   1205      1.1  mrg 
   1206      1.1  mrg       if (tk < tm)
   1207      1.1  mrg         {
   1208      1.1  mrg 	  *p->p_modf_threshold = s.size;
   1209      1.1  mrg 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
   1210      1.1  mrg 	  break;
   1211      1.1  mrg         }
   1212      1.1  mrg     }
   1213      1.1  mrg   initial_k = ?;
   1214      1.1  mrg #endif
   1215      1.1  mrg 
   1216      1.1  mrg   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
   1217      1.1  mrg 
   1218      1.1  mrg   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
   1219      1.1  mrg   printf ("#define %s_SIZE %d\n", p->table_name, idx);
   1220      1.1  mrg 
   1221      1.1  mrg   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
   1222      1.1  mrg 
   1223      1.1  mrg   size = 2 * *p->p_modf_threshold;	/* OK? */
   1224      1.1  mrg   for (;;)
   1225      1.1  mrg     {
   1226      1.1  mrg       double  tk, tm;
   1227      1.1  mrg       mp_size_t mulmod_size, mul_size;;
   1228      1.1  mrg 
   1229      1.1  mrg       if (size >= p->max_size)
   1230      1.1  mrg         break;
   1231      1.1  mrg 
   1232      1.1  mrg       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
   1233      1.1  mrg       mul_size = (size + mulmod_size) / 2;	/* middle of step */
   1234      1.1  mrg 
   1235      1.1  mrg       s.size = mulmod_size;
   1236      1.1  mrg       tk = tuneup_measure (p->function, NULL, &s);
   1237      1.1  mrg       if (tk == -1.0)
   1238      1.1  mrg         abort ();
   1239      1.1  mrg 
   1240      1.1  mrg       s.size = mul_size;
   1241      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1242      1.1  mrg       if (tm == -1.0)
   1243      1.1  mrg         abort ();
   1244      1.1  mrg 
   1245      1.1  mrg       if (option_trace >= 2)
   1246      1.1  mrg         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
   1247      1.1  mrg                 (long) size,
   1248      1.1  mrg                 (long) mulmod_size, tk,
   1249      1.1  mrg                 (long) mul_size, tm);
   1250      1.1  mrg 
   1251      1.1  mrg       size = mulmod_size;
   1252      1.1  mrg 
   1253      1.1  mrg       if (tk < tm)
   1254      1.1  mrg         {
   1255      1.1  mrg 	  *p->p_threshold = s.size;
   1256      1.1  mrg 	  print_define (p->threshold_name, *p->p_threshold);
   1257      1.1  mrg 	  break;
   1258      1.1  mrg         }
   1259      1.1  mrg     }
   1260      1.1  mrg }
   1261      1.1  mrg 
   1262  1.1.1.4  mrg /* Compare mpn_mul_1 to whatever fast exact single-limb division we have.  This
   1263  1.1.1.4  mrg    is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
   1264  1.1.1.4  mrg    This is used in get_str and set_str.  */
   1265  1.1.1.4  mrg void
   1266  1.1.1.4  mrg relspeed_div_1_vs_mul_1 (void)
   1267  1.1.1.4  mrg {
   1268  1.1.1.5  mrg #define max_opsize 100
   1269  1.1.1.4  mrg   mp_size_t n;
   1270  1.1.1.4  mrg   long j;
   1271  1.1.1.4  mrg   mp_limb_t rp[max_opsize];
   1272  1.1.1.4  mrg   mp_limb_t ap[max_opsize];
   1273  1.1.1.4  mrg   double multime, divtime;
   1274  1.1.1.4  mrg 
   1275  1.1.1.4  mrg   mpn_random (ap, max_opsize);
   1276  1.1.1.4  mrg 
   1277  1.1.1.4  mrg   multime = 0;
   1278  1.1.1.4  mrg   for (n = max_opsize; n > 1; n--)
   1279  1.1.1.4  mrg     {
   1280  1.1.1.4  mrg       mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1281  1.1.1.4  mrg       speed_starttime ();
   1282  1.1.1.4  mrg       for (j = speed_precision; j != 0 ; j--)
   1283  1.1.1.4  mrg 	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1284  1.1.1.4  mrg       multime += speed_endtime () / n;
   1285  1.1.1.4  mrg     }
   1286  1.1.1.4  mrg 
   1287  1.1.1.4  mrg   divtime = 0;
   1288  1.1.1.4  mrg   for (n = max_opsize; n > 1; n--)
   1289  1.1.1.4  mrg     {
   1290  1.1.1.4  mrg       /* Make input divisible for good measure.  */
   1291  1.1.1.4  mrg       ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
   1292  1.1.1.4  mrg 
   1293  1.1.1.4  mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
   1294  1.1.1.4  mrg 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
   1295  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_BINVERTED_10,
   1296  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_CTZ_10);
   1297  1.1.1.4  mrg #else
   1298  1.1.1.4  mrg 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1299  1.1.1.4  mrg #endif
   1300  1.1.1.4  mrg       speed_starttime ();
   1301  1.1.1.4  mrg       for (j = speed_precision; j != 0 ; j--)
   1302  1.1.1.4  mrg 	{
   1303  1.1.1.4  mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
   1304  1.1.1.4  mrg 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
   1305  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_BINVERTED_10,
   1306  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_CTZ_10);
   1307  1.1.1.4  mrg #else
   1308  1.1.1.4  mrg 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1309  1.1.1.4  mrg #endif
   1310  1.1.1.4  mrg 	}
   1311  1.1.1.4  mrg       divtime += speed_endtime () / n;
   1312  1.1.1.4  mrg     }
   1313  1.1.1.4  mrg 
   1314  1.1.1.4  mrg   print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
   1315  1.1.1.4  mrg }
   1316      1.1  mrg 
   1317      1.1  mrg 
   1318      1.1  mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
   1319      1.1  mrg    giving wrong results.  */
   1320      1.1  mrg void
   1321      1.1  mrg tune_mul_n (void)
   1322      1.1  mrg {
   1323      1.1  mrg   static struct param_t  param;
   1324  1.1.1.2  mrg   mp_size_t next_toom_start;
   1325  1.1.1.2  mrg   int something_changed;
   1326      1.1  mrg 
   1327      1.1  mrg   param.function = speed_mpn_mul_n;
   1328      1.1  mrg 
   1329      1.1  mrg   param.name = "MUL_TOOM22_THRESHOLD";
   1330      1.1  mrg   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
   1331      1.1  mrg   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
   1332      1.1  mrg   one (&mul_toom22_threshold, &param);
   1333      1.1  mrg 
   1334  1.1.1.2  mrg   param.noprint = 1;
   1335  1.1.1.2  mrg 
   1336  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1337  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1338  1.1.1.2  mrg   something_changed = 1;
   1339  1.1.1.2  mrg   while (something_changed)
   1340  1.1.1.2  mrg     {
   1341  1.1.1.2  mrg       something_changed = 0;
   1342  1.1.1.2  mrg 
   1343  1.1.1.2  mrg 	next_toom_start = mul_toom22_threshold;
   1344  1.1.1.2  mrg 
   1345  1.1.1.2  mrg 	if (mul_toom33_threshold != 0)
   1346  1.1.1.2  mrg 	  {
   1347  1.1.1.2  mrg 	    param.name = "MUL_TOOM33_THRESHOLD";
   1348  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
   1349  1.1.1.2  mrg 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
   1350  1.1.1.2  mrg 	    one (&mul_toom33_threshold, &param);
   1351  1.1.1.2  mrg 
   1352  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
   1353  1.1.1.2  mrg 	      {
   1354  1.1.1.2  mrg 		mul_toom33_threshold = 0;
   1355  1.1.1.2  mrg 		something_changed = 1;
   1356  1.1.1.2  mrg 	      }
   1357  1.1.1.2  mrg 	  }
   1358  1.1.1.2  mrg 
   1359  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
   1360  1.1.1.2  mrg 
   1361  1.1.1.2  mrg 	if (mul_toom44_threshold != 0)
   1362  1.1.1.2  mrg 	  {
   1363  1.1.1.2  mrg 	    param.name = "MUL_TOOM44_THRESHOLD";
   1364  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
   1365  1.1.1.2  mrg 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
   1366  1.1.1.2  mrg 	    one (&mul_toom44_threshold, &param);
   1367  1.1.1.2  mrg 
   1368  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
   1369  1.1.1.2  mrg 	      {
   1370  1.1.1.2  mrg 		mul_toom44_threshold = 0;
   1371  1.1.1.2  mrg 		something_changed = 1;
   1372  1.1.1.2  mrg 	      }
   1373  1.1.1.2  mrg 	  }
   1374  1.1.1.2  mrg 
   1375  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
   1376  1.1.1.2  mrg 
   1377  1.1.1.2  mrg 	if (mul_toom6h_threshold != 0)
   1378  1.1.1.2  mrg 	  {
   1379  1.1.1.2  mrg 	    param.name = "MUL_TOOM6H_THRESHOLD";
   1380  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
   1381  1.1.1.2  mrg 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
   1382  1.1.1.2  mrg 	    one (&mul_toom6h_threshold, &param);
   1383  1.1.1.2  mrg 
   1384  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
   1385  1.1.1.2  mrg 	      {
   1386  1.1.1.2  mrg 		mul_toom6h_threshold = 0;
   1387  1.1.1.2  mrg 		something_changed = 1;
   1388  1.1.1.2  mrg 	      }
   1389  1.1.1.2  mrg 	  }
   1390  1.1.1.2  mrg 
   1391  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
   1392  1.1.1.2  mrg 
   1393  1.1.1.2  mrg 	if (mul_toom8h_threshold != 0)
   1394  1.1.1.2  mrg 	  {
   1395  1.1.1.2  mrg 	    param.name = "MUL_TOOM8H_THRESHOLD";
   1396  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
   1397  1.1.1.2  mrg 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
   1398  1.1.1.2  mrg 	    one (&mul_toom8h_threshold, &param);
   1399  1.1.1.2  mrg 
   1400  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
   1401  1.1.1.2  mrg 	      {
   1402  1.1.1.2  mrg 		mul_toom8h_threshold = 0;
   1403  1.1.1.2  mrg 		something_changed = 1;
   1404  1.1.1.2  mrg 	      }
   1405  1.1.1.2  mrg 	  }
   1406  1.1.1.2  mrg     }
   1407  1.1.1.2  mrg 
   1408  1.1.1.2  mrg     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
   1409  1.1.1.2  mrg     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
   1410  1.1.1.2  mrg     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
   1411  1.1.1.2  mrg     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
   1412      1.1  mrg 
   1413      1.1  mrg   /* disabled until tuned */
   1414      1.1  mrg   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1415      1.1  mrg }
   1416      1.1  mrg 
   1417      1.1  mrg void
   1418      1.1  mrg tune_mul (void)
   1419      1.1  mrg {
   1420      1.1  mrg   static struct param_t  param;
   1421      1.1  mrg   mp_size_t thres;
   1422      1.1  mrg 
   1423      1.1  mrg   param.noprint = 1;
   1424      1.1  mrg 
   1425      1.1  mrg   param.function = speed_mpn_toom32_for_toom43_mul;
   1426      1.1  mrg   param.function2 = speed_mpn_toom43_for_toom32_mul;
   1427      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
   1428  1.1.1.2  mrg   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
   1429      1.1  mrg   one (&thres, &param);
   1430  1.1.1.2  mrg   mul_toom32_to_toom43_threshold = thres * 17 / 24;
   1431      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
   1432      1.1  mrg 
   1433      1.1  mrg   param.function = speed_mpn_toom32_for_toom53_mul;
   1434      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom32_mul;
   1435      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
   1436  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
   1437      1.1  mrg   one (&thres, &param);
   1438  1.1.1.2  mrg   mul_toom32_to_toom53_threshold = thres * 19 / 30;
   1439      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
   1440      1.1  mrg 
   1441      1.1  mrg   param.function = speed_mpn_toom42_for_toom53_mul;
   1442      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom42_mul;
   1443      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
   1444  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
   1445      1.1  mrg   one (&thres, &param);
   1446  1.1.1.2  mrg   mul_toom42_to_toom53_threshold = thres * 11 / 20;
   1447      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
   1448      1.1  mrg 
   1449      1.1  mrg   param.function = speed_mpn_toom42_mul;
   1450      1.1  mrg   param.function2 = speed_mpn_toom63_mul;
   1451      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
   1452  1.1.1.2  mrg   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
   1453      1.1  mrg   one (&thres, &param);
   1454  1.1.1.2  mrg   mul_toom42_to_toom63_threshold = thres / 2;
   1455      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
   1456  1.1.1.2  mrg 
   1457  1.1.1.2  mrg   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
   1458  1.1.1.2  mrg   param.function = speed_mpn_toom43_for_toom54_mul;
   1459  1.1.1.2  mrg   param.function2 = speed_mpn_toom54_for_toom43_mul;
   1460  1.1.1.2  mrg   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
   1461  1.1.1.2  mrg   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
   1462  1.1.1.2  mrg   one (&thres, &param);
   1463  1.1.1.2  mrg   mul_toom43_to_toom54_threshold = thres * 5 / 6;
   1464  1.1.1.2  mrg   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
   1465      1.1  mrg }
   1466      1.1  mrg 
   1467      1.1  mrg 
   1468      1.1  mrg void
   1469      1.1  mrg tune_mullo (void)
   1470      1.1  mrg {
   1471      1.1  mrg   static struct param_t  param;
   1472      1.1  mrg 
   1473      1.1  mrg   param.function = speed_mpn_mullo_n;
   1474      1.1  mrg 
   1475      1.1  mrg   param.name = "MULLO_BASECASE_THRESHOLD";
   1476  1.1.1.4  mrg   param.min_size = 2;
   1477      1.1  mrg   param.min_is_always = 1;
   1478      1.1  mrg   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
   1479      1.1  mrg   param.stop_factor = 1.5;
   1480      1.1  mrg   param.noprint = 1;
   1481      1.1  mrg   one (&mullo_basecase_threshold, &param);
   1482      1.1  mrg 
   1483      1.1  mrg   param.name = "MULLO_DC_THRESHOLD";
   1484      1.1  mrg   param.min_size = 8;
   1485      1.1  mrg   param.min_is_always = 0;
   1486      1.1  mrg   param.max_size = 1000;
   1487      1.1  mrg   one (&mullo_dc_threshold, &param);
   1488      1.1  mrg 
   1489      1.1  mrg   if (mullo_basecase_threshold >= mullo_dc_threshold)
   1490      1.1  mrg     {
   1491      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
   1492      1.1  mrg       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
   1493      1.1  mrg     }
   1494      1.1  mrg   else
   1495      1.1  mrg     {
   1496      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
   1497      1.1  mrg       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
   1498      1.1  mrg     }
   1499      1.1  mrg 
   1500  1.1.1.3  mrg   if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
   1501  1.1.1.3  mrg     {
   1502  1.1.1.3  mrg       param.name = "MULLO_MUL_N_THRESHOLD";
   1503  1.1.1.3  mrg       param.min_size = mullo_dc_threshold;
   1504  1.1.1.3  mrg       param.max_size = 2 * mul_fft_threshold;
   1505  1.1.1.3  mrg       param.noprint = 0;
   1506  1.1.1.3  mrg       param.step_factor = 0.03;
   1507  1.1.1.3  mrg       one (&mullo_mul_n_threshold, &param);
   1508  1.1.1.3  mrg     }
   1509  1.1.1.3  mrg   else
   1510  1.1.1.3  mrg     print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
   1511  1.1.1.3  mrg 			 "without FFT use mullo forever");
   1512  1.1.1.3  mrg }
   1513  1.1.1.3  mrg 
   1514  1.1.1.3  mrg void
   1515  1.1.1.3  mrg tune_sqrlo (void)
   1516  1.1.1.3  mrg {
   1517  1.1.1.3  mrg   static struct param_t  param;
   1518  1.1.1.3  mrg 
   1519  1.1.1.3  mrg   param.function = speed_mpn_sqrlo;
   1520  1.1.1.3  mrg 
   1521  1.1.1.3  mrg   param.name = "SQRLO_BASECASE_THRESHOLD";
   1522  1.1.1.4  mrg   param.min_size = 2;
   1523  1.1.1.3  mrg   param.min_is_always = 1;
   1524  1.1.1.3  mrg   param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
   1525  1.1.1.3  mrg   param.stop_factor = 1.5;
   1526  1.1.1.3  mrg   param.noprint = 1;
   1527  1.1.1.3  mrg   one (&sqrlo_basecase_threshold, &param);
   1528  1.1.1.3  mrg 
   1529  1.1.1.3  mrg   param.name = "SQRLO_DC_THRESHOLD";
   1530  1.1.1.3  mrg   param.min_size = 8;
   1531  1.1.1.3  mrg   param.min_is_always = 0;
   1532  1.1.1.3  mrg   param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
   1533  1.1.1.3  mrg   one (&sqrlo_dc_threshold, &param);
   1534  1.1.1.3  mrg 
   1535  1.1.1.3  mrg   if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
   1536  1.1.1.3  mrg     {
   1537  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
   1538  1.1.1.3  mrg       print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
   1539  1.1.1.3  mrg     }
   1540  1.1.1.3  mrg   else
   1541  1.1.1.3  mrg     {
   1542  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
   1543  1.1.1.3  mrg       print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
   1544  1.1.1.3  mrg     }
   1545  1.1.1.3  mrg 
   1546  1.1.1.3  mrg   if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
   1547  1.1.1.3  mrg     {
   1548  1.1.1.3  mrg       param.name = "SQRLO_SQR_THRESHOLD";
   1549  1.1.1.3  mrg       param.min_size = sqrlo_dc_threshold;
   1550  1.1.1.3  mrg       param.max_size = 2 * sqr_fft_threshold;
   1551  1.1.1.3  mrg       param.noprint = 0;
   1552  1.1.1.3  mrg       param.step_factor = 0.03;
   1553  1.1.1.3  mrg       one (&sqrlo_sqr_threshold, &param);
   1554  1.1.1.3  mrg     }
   1555  1.1.1.3  mrg   else
   1556  1.1.1.3  mrg     print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
   1557  1.1.1.3  mrg 			 "without FFT use sqrlo forever");
   1558      1.1  mrg }
   1559      1.1  mrg 
   1560      1.1  mrg void
   1561  1.1.1.2  mrg tune_mulmid (void)
   1562  1.1.1.2  mrg {
   1563  1.1.1.2  mrg   static struct param_t  param;
   1564  1.1.1.2  mrg 
   1565  1.1.1.2  mrg   param.name = "MULMID_TOOM42_THRESHOLD";
   1566  1.1.1.2  mrg   param.function = speed_mpn_mulmid_n;
   1567  1.1.1.2  mrg   param.min_size = 4;
   1568  1.1.1.2  mrg   param.max_size = 100;
   1569  1.1.1.2  mrg   one (&mulmid_toom42_threshold, &param);
   1570  1.1.1.2  mrg }
   1571  1.1.1.2  mrg 
   1572  1.1.1.2  mrg void
   1573      1.1  mrg tune_mulmod_bnm1 (void)
   1574      1.1  mrg {
   1575      1.1  mrg   static struct param_t  param;
   1576      1.1  mrg 
   1577      1.1  mrg   param.name = "MULMOD_BNM1_THRESHOLD";
   1578      1.1  mrg   param.function = speed_mpn_mulmod_bnm1;
   1579      1.1  mrg   param.min_size = 4;
   1580      1.1  mrg   param.max_size = 100;
   1581      1.1  mrg   one (&mulmod_bnm1_threshold, &param);
   1582      1.1  mrg }
   1583      1.1  mrg 
   1584      1.1  mrg void
   1585      1.1  mrg tune_sqrmod_bnm1 (void)
   1586      1.1  mrg {
   1587      1.1  mrg   static struct param_t  param;
   1588      1.1  mrg 
   1589      1.1  mrg   param.name = "SQRMOD_BNM1_THRESHOLD";
   1590      1.1  mrg   param.function = speed_mpn_sqrmod_bnm1;
   1591      1.1  mrg   param.min_size = 4;
   1592      1.1  mrg   param.max_size = 100;
   1593      1.1  mrg   one (&sqrmod_bnm1_threshold, &param);
   1594      1.1  mrg }
   1595      1.1  mrg 
   1596      1.1  mrg 
   1597      1.1  mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
   1598      1.1  mrg    is faster only at size==2 then we don't want to bother with extra code
   1599      1.1  mrg    just for that.  Start karatsuba from 4 same as MUL above.  */
   1600      1.1  mrg 
   1601      1.1  mrg void
   1602      1.1  mrg tune_sqr (void)
   1603      1.1  mrg {
   1604      1.1  mrg   /* disabled until tuned */
   1605      1.1  mrg   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1606      1.1  mrg 
   1607      1.1  mrg   if (HAVE_NATIVE_mpn_sqr_basecase)
   1608      1.1  mrg     {
   1609      1.1  mrg       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
   1610      1.1  mrg       sqr_basecase_threshold = 0;
   1611      1.1  mrg     }
   1612      1.1  mrg   else
   1613      1.1  mrg     {
   1614      1.1  mrg       static struct param_t  param;
   1615      1.1  mrg       param.name = "SQR_BASECASE_THRESHOLD";
   1616      1.1  mrg       param.function = speed_mpn_sqr;
   1617      1.1  mrg       param.min_size = 3;
   1618      1.1  mrg       param.min_is_always = 1;
   1619      1.1  mrg       param.max_size = TUNE_SQR_TOOM2_MAX;
   1620      1.1  mrg       param.noprint = 1;
   1621      1.1  mrg       one (&sqr_basecase_threshold, &param);
   1622      1.1  mrg     }
   1623      1.1  mrg 
   1624      1.1  mrg   {
   1625      1.1  mrg     static struct param_t  param;
   1626      1.1  mrg     param.name = "SQR_TOOM2_THRESHOLD";
   1627      1.1  mrg     param.function = speed_mpn_sqr;
   1628      1.1  mrg     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
   1629      1.1  mrg     param.max_size = TUNE_SQR_TOOM2_MAX;
   1630      1.1  mrg     param.noprint = 1;
   1631      1.1  mrg     one (&sqr_toom2_threshold, &param);
   1632      1.1  mrg 
   1633      1.1  mrg     if (! HAVE_NATIVE_mpn_sqr_basecase
   1634      1.1  mrg         && sqr_toom2_threshold < sqr_basecase_threshold)
   1635      1.1  mrg       {
   1636      1.1  mrg         /* Karatsuba becomes faster than mul_basecase before
   1637      1.1  mrg            sqr_basecase does.  Arrange for the expression
   1638      1.1  mrg            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
   1639      1.1  mrg            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
   1640      1.1  mrg            SQR_TOOM2_THRESHOLD to zero, making
   1641      1.1  mrg            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
   1642      1.1  mrg 
   1643      1.1  mrg         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
   1644      1.1  mrg         SQR_TOOM2_THRESHOLD = 0;
   1645      1.1  mrg 
   1646      1.1  mrg         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
   1647      1.1  mrg                              "toom2");
   1648      1.1  mrg         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
   1649      1.1  mrg                              "never sqr_basecase");
   1650      1.1  mrg       }
   1651      1.1  mrg     else
   1652      1.1  mrg       {
   1653      1.1  mrg         if (! HAVE_NATIVE_mpn_sqr_basecase)
   1654      1.1  mrg           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
   1655      1.1  mrg         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
   1656      1.1  mrg       }
   1657      1.1  mrg   }
   1658      1.1  mrg 
   1659      1.1  mrg   {
   1660      1.1  mrg     static struct param_t  param;
   1661  1.1.1.2  mrg     mp_size_t next_toom_start;
   1662  1.1.1.2  mrg     int something_changed;
   1663      1.1  mrg 
   1664      1.1  mrg     param.function = speed_mpn_sqr;
   1665  1.1.1.2  mrg     param.noprint = 1;
   1666      1.1  mrg 
   1667  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1668  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1669  1.1.1.2  mrg     something_changed = 1;
   1670  1.1.1.2  mrg     while (something_changed)
   1671  1.1.1.2  mrg       {
   1672  1.1.1.2  mrg 	something_changed = 0;
   1673  1.1.1.2  mrg 
   1674  1.1.1.2  mrg 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
   1675  1.1.1.2  mrg 
   1676  1.1.1.2  mrg 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
   1677  1.1.1.2  mrg 	param.name = "SQR_TOOM3_THRESHOLD";
   1678  1.1.1.2  mrg 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
   1679  1.1.1.2  mrg 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
   1680  1.1.1.2  mrg 	one (&sqr_toom3_threshold, &param);
   1681  1.1.1.2  mrg 
   1682  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
   1683  1.1.1.2  mrg 
   1684  1.1.1.2  mrg 	if (sqr_toom4_threshold != 0)
   1685  1.1.1.2  mrg 	  {
   1686  1.1.1.2  mrg 	    param.name = "SQR_TOOM4_THRESHOLD";
   1687  1.1.1.2  mrg 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
   1688  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
   1689  1.1.1.2  mrg 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
   1690  1.1.1.2  mrg 	    one (&sqr_toom4_threshold, &param);
   1691  1.1.1.2  mrg 
   1692  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
   1693  1.1.1.2  mrg 	      {
   1694  1.1.1.2  mrg 		sqr_toom4_threshold = 0;
   1695  1.1.1.2  mrg 		something_changed = 1;
   1696  1.1.1.2  mrg 	      }
   1697  1.1.1.2  mrg 	  }
   1698  1.1.1.2  mrg 
   1699  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
   1700  1.1.1.2  mrg 
   1701  1.1.1.2  mrg 	if (sqr_toom6_threshold != 0)
   1702  1.1.1.2  mrg 	  {
   1703  1.1.1.2  mrg 	    param.name = "SQR_TOOM6_THRESHOLD";
   1704  1.1.1.2  mrg 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
   1705  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
   1706  1.1.1.2  mrg 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
   1707  1.1.1.2  mrg 	    one (&sqr_toom6_threshold, &param);
   1708  1.1.1.2  mrg 
   1709  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
   1710  1.1.1.2  mrg 	      {
   1711  1.1.1.2  mrg 		sqr_toom6_threshold = 0;
   1712  1.1.1.2  mrg 		something_changed = 1;
   1713  1.1.1.2  mrg 	      }
   1714  1.1.1.2  mrg 	  }
   1715  1.1.1.2  mrg 
   1716  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
   1717  1.1.1.2  mrg 
   1718  1.1.1.2  mrg 	if (sqr_toom8_threshold != 0)
   1719  1.1.1.2  mrg 	  {
   1720  1.1.1.2  mrg 	    param.name = "SQR_TOOM8_THRESHOLD";
   1721  1.1.1.2  mrg 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
   1722  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
   1723  1.1.1.2  mrg 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
   1724  1.1.1.2  mrg 	    one (&sqr_toom8_threshold, &param);
   1725  1.1.1.2  mrg 
   1726  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
   1727  1.1.1.2  mrg 	      {
   1728  1.1.1.2  mrg 		sqr_toom8_threshold = 0;
   1729  1.1.1.2  mrg 		something_changed = 1;
   1730  1.1.1.2  mrg 	      }
   1731  1.1.1.2  mrg 	  }
   1732  1.1.1.2  mrg       }
   1733  1.1.1.2  mrg 
   1734  1.1.1.2  mrg     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
   1735  1.1.1.2  mrg     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
   1736  1.1.1.2  mrg     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
   1737  1.1.1.2  mrg     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
   1738      1.1  mrg   }
   1739      1.1  mrg }
   1740      1.1  mrg 
   1741      1.1  mrg 
   1742      1.1  mrg void
   1743      1.1  mrg tune_dc_div (void)
   1744      1.1  mrg {
   1745      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1746      1.1  mrg   {
   1747      1.1  mrg     static struct param_t  param;
   1748      1.1  mrg     param.name = "DC_DIV_QR_THRESHOLD";
   1749      1.1  mrg     param.function = speed_mpn_sbpi1_div_qr;
   1750      1.1  mrg     param.function2 = speed_mpn_dcpi1_div_qr;
   1751      1.1  mrg     param.min_size = 6;
   1752      1.1  mrg     one (&dc_div_qr_threshold, &param);
   1753      1.1  mrg   }
   1754      1.1  mrg   {
   1755      1.1  mrg     static struct param_t  param;
   1756      1.1  mrg     param.name = "DC_DIVAPPR_Q_THRESHOLD";
   1757      1.1  mrg     param.function = speed_mpn_sbpi1_divappr_q;
   1758      1.1  mrg     param.function2 = speed_mpn_dcpi1_divappr_q;
   1759      1.1  mrg     param.min_size = 6;
   1760      1.1  mrg     one (&dc_divappr_q_threshold, &param);
   1761      1.1  mrg   }
   1762      1.1  mrg }
   1763      1.1  mrg 
   1764      1.1  mrg static double
   1765      1.1  mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
   1766      1.1  mrg {
   1767      1.1  mrg   if (s->size < DC_DIV_QR_THRESHOLD)
   1768      1.1  mrg     return speed_mpn_sbpi1_div_qr (s);
   1769      1.1  mrg   else
   1770      1.1  mrg     return speed_mpn_dcpi1_div_qr (s);
   1771      1.1  mrg }
   1772      1.1  mrg 
   1773      1.1  mrg void
   1774      1.1  mrg tune_mu_div (void)
   1775      1.1  mrg {
   1776      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1777      1.1  mrg   {
   1778      1.1  mrg     static struct param_t  param;
   1779      1.1  mrg     param.name = "MU_DIV_QR_THRESHOLD";
   1780      1.1  mrg     param.function = speed_mpn_dcpi1_div_qr;
   1781      1.1  mrg     param.function2 = speed_mpn_mu_div_qr;
   1782  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1783      1.1  mrg     param.max_size = 5000;
   1784      1.1  mrg     param.step_factor = 0.02;
   1785      1.1  mrg     one (&mu_div_qr_threshold, &param);
   1786      1.1  mrg   }
   1787      1.1  mrg   {
   1788      1.1  mrg     static struct param_t  param;
   1789      1.1  mrg     param.name = "MU_DIVAPPR_Q_THRESHOLD";
   1790      1.1  mrg     param.function = speed_mpn_dcpi1_divappr_q;
   1791      1.1  mrg     param.function2 = speed_mpn_mu_divappr_q;
   1792  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1793      1.1  mrg     param.max_size = 5000;
   1794      1.1  mrg     param.step_factor = 0.02;
   1795      1.1  mrg     one (&mu_divappr_q_threshold, &param);
   1796      1.1  mrg   }
   1797      1.1  mrg   {
   1798      1.1  mrg     static struct param_t  param;
   1799      1.1  mrg     param.name = "MUPI_DIV_QR_THRESHOLD";
   1800      1.1  mrg     param.function = speed_mpn_sbordcpi1_div_qr;
   1801      1.1  mrg     param.function2 = speed_mpn_mupi_div_qr;
   1802      1.1  mrg     param.min_size = 6;
   1803      1.1  mrg     param.min_is_always = 1;
   1804      1.1  mrg     param.max_size = 1000;
   1805      1.1  mrg     param.step_factor = 0.02;
   1806      1.1  mrg     one (&mupi_div_qr_threshold, &param);
   1807      1.1  mrg   }
   1808      1.1  mrg }
   1809      1.1  mrg 
   1810      1.1  mrg void
   1811      1.1  mrg tune_dc_bdiv (void)
   1812      1.1  mrg {
   1813      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1814      1.1  mrg   {
   1815      1.1  mrg     static struct param_t  param;
   1816      1.1  mrg     param.name = "DC_BDIV_QR_THRESHOLD";
   1817      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_qr;
   1818      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_qr;
   1819      1.1  mrg     param.min_size = 4;
   1820      1.1  mrg     one (&dc_bdiv_qr_threshold, &param);
   1821      1.1  mrg   }
   1822      1.1  mrg   {
   1823      1.1  mrg     static struct param_t  param;
   1824      1.1  mrg     param.name = "DC_BDIV_Q_THRESHOLD";
   1825      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_q;
   1826      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_q;
   1827      1.1  mrg     param.min_size = 4;
   1828      1.1  mrg     one (&dc_bdiv_q_threshold, &param);
   1829      1.1  mrg   }
   1830      1.1  mrg }
   1831      1.1  mrg 
   1832      1.1  mrg void
   1833      1.1  mrg tune_mu_bdiv (void)
   1834      1.1  mrg {
   1835      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1836      1.1  mrg   {
   1837      1.1  mrg     static struct param_t  param;
   1838      1.1  mrg     param.name = "MU_BDIV_QR_THRESHOLD";
   1839      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_qr;
   1840      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_qr;
   1841  1.1.1.3  mrg     param.min_size = dc_bdiv_qr_threshold;
   1842      1.1  mrg     param.max_size = 5000;
   1843      1.1  mrg     param.step_factor = 0.02;
   1844      1.1  mrg     one (&mu_bdiv_qr_threshold, &param);
   1845      1.1  mrg   }
   1846      1.1  mrg   {
   1847      1.1  mrg     static struct param_t  param;
   1848      1.1  mrg     param.name = "MU_BDIV_Q_THRESHOLD";
   1849      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_q;
   1850      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_q;
   1851  1.1.1.3  mrg     param.min_size = dc_bdiv_q_threshold;
   1852      1.1  mrg     param.max_size = 5000;
   1853      1.1  mrg     param.step_factor = 0.02;
   1854      1.1  mrg     one (&mu_bdiv_q_threshold, &param);
   1855      1.1  mrg   }
   1856      1.1  mrg }
   1857      1.1  mrg 
   1858      1.1  mrg void
   1859      1.1  mrg tune_invertappr (void)
   1860      1.1  mrg {
   1861      1.1  mrg   static struct param_t  param;
   1862      1.1  mrg 
   1863      1.1  mrg   param.function = speed_mpn_ni_invertappr;
   1864      1.1  mrg   param.name = "INV_MULMOD_BNM1_THRESHOLD";
   1865  1.1.1.3  mrg   param.min_size = 5;
   1866      1.1  mrg   one (&inv_mulmod_bnm1_threshold, &param);
   1867      1.1  mrg 
   1868      1.1  mrg   param.function = speed_mpn_invertappr;
   1869      1.1  mrg   param.name = "INV_NEWTON_THRESHOLD";
   1870  1.1.1.3  mrg   param.min_size = 5;
   1871      1.1  mrg   one (&inv_newton_threshold, &param);
   1872      1.1  mrg }
   1873      1.1  mrg 
   1874      1.1  mrg void
   1875      1.1  mrg tune_invert (void)
   1876      1.1  mrg {
   1877      1.1  mrg   static struct param_t  param;
   1878      1.1  mrg 
   1879      1.1  mrg   param.function = speed_mpn_invert;
   1880      1.1  mrg   param.name = "INV_APPR_THRESHOLD";
   1881  1.1.1.3  mrg   param.min_size = 5;
   1882      1.1  mrg   one (&inv_appr_threshold, &param);
   1883      1.1  mrg }
   1884      1.1  mrg 
   1885      1.1  mrg void
   1886      1.1  mrg tune_binvert (void)
   1887      1.1  mrg {
   1888      1.1  mrg   static struct param_t  param;
   1889      1.1  mrg 
   1890      1.1  mrg   param.function = speed_mpn_binvert;
   1891      1.1  mrg   param.name = "BINV_NEWTON_THRESHOLD";
   1892      1.1  mrg   param.min_size = 8;		/* pointless with smaller operands */
   1893      1.1  mrg   one (&binv_newton_threshold, &param);
   1894      1.1  mrg }
   1895      1.1  mrg 
   1896      1.1  mrg void
   1897      1.1  mrg tune_redc (void)
   1898      1.1  mrg {
   1899      1.1  mrg #define TUNE_REDC_2_MAX 100
   1900      1.1  mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
   1901      1.1  mrg #define WANT_REDC_2 1
   1902      1.1  mrg #endif
   1903      1.1  mrg 
   1904      1.1  mrg #if WANT_REDC_2
   1905      1.1  mrg   {
   1906      1.1  mrg     static struct param_t  param;
   1907      1.1  mrg     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1908      1.1  mrg     param.function = speed_mpn_redc_1;
   1909      1.1  mrg     param.function2 = speed_mpn_redc_2;
   1910      1.1  mrg     param.min_size = 1;
   1911      1.1  mrg     param.min_is_always = 1;
   1912      1.1  mrg     param.max_size = TUNE_REDC_2_MAX;
   1913      1.1  mrg     param.noprint = 1;
   1914  1.1.1.2  mrg     param.stop_factor = 1.5;
   1915      1.1  mrg     one (&redc_1_to_redc_2_threshold, &param);
   1916      1.1  mrg   }
   1917      1.1  mrg   {
   1918      1.1  mrg     static struct param_t  param;
   1919      1.1  mrg     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
   1920      1.1  mrg     param.function = speed_mpn_redc_2;
   1921      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1922      1.1  mrg     param.min_size = 16;
   1923      1.1  mrg     param.noprint = 1;
   1924      1.1  mrg     one (&redc_2_to_redc_n_threshold, &param);
   1925      1.1  mrg   }
   1926  1.1.1.2  mrg   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
   1927      1.1  mrg     {
   1928  1.1.1.2  mrg       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
   1929  1.1.1.2  mrg 
   1930  1.1.1.2  mrg       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
   1931  1.1.1.2  mrg 	 REDC_1_TO_REDC_2_THRESHOLD.  */
   1932  1.1.1.2  mrg       {
   1933  1.1.1.2  mrg 	static struct param_t  param;
   1934  1.1.1.2  mrg 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1935  1.1.1.2  mrg 	param.function = speed_mpn_redc_1;
   1936  1.1.1.2  mrg 	param.function2 = speed_mpn_redc_n;
   1937  1.1.1.2  mrg 	param.min_size = 16;
   1938  1.1.1.2  mrg 	param.noprint = 1;
   1939  1.1.1.2  mrg 	one (&redc_1_to_redc_2_threshold, &param);
   1940  1.1.1.2  mrg       }
   1941      1.1  mrg     }
   1942  1.1.1.2  mrg   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
   1943  1.1.1.2  mrg   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
   1944      1.1  mrg #else
   1945      1.1  mrg   {
   1946      1.1  mrg     static struct param_t  param;
   1947      1.1  mrg     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
   1948      1.1  mrg     param.function = speed_mpn_redc_1;
   1949      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1950      1.1  mrg     param.min_size = 16;
   1951      1.1  mrg     one (&redc_1_to_redc_n_threshold, &param);
   1952      1.1  mrg   }
   1953      1.1  mrg #endif
   1954      1.1  mrg }
   1955      1.1  mrg 
   1956      1.1  mrg void
   1957      1.1  mrg tune_matrix22_mul (void)
   1958      1.1  mrg {
   1959      1.1  mrg   static struct param_t  param;
   1960      1.1  mrg   param.name = "MATRIX22_STRASSEN_THRESHOLD";
   1961      1.1  mrg   param.function = speed_mpn_matrix22_mul;
   1962      1.1  mrg   param.min_size = 2;
   1963      1.1  mrg   one (&matrix22_strassen_threshold, &param);
   1964      1.1  mrg }
   1965      1.1  mrg 
   1966      1.1  mrg void
   1967  1.1.1.4  mrg tune_hgcd2 (void)
   1968  1.1.1.4  mrg {
   1969  1.1.1.4  mrg   static struct param_t  param;
   1970  1.1.1.4  mrg   hgcd2_func_t *f[5] =
   1971  1.1.1.4  mrg     { mpn_hgcd2_1,
   1972  1.1.1.4  mrg       mpn_hgcd2_2,
   1973  1.1.1.4  mrg       mpn_hgcd2_3,
   1974  1.1.1.4  mrg       mpn_hgcd2_4,
   1975  1.1.1.4  mrg       mpn_hgcd2_5 };
   1976  1.1.1.4  mrg   speed_function_t speed_f[5] =
   1977  1.1.1.4  mrg     { speed_mpn_hgcd2_1,
   1978  1.1.1.4  mrg       speed_mpn_hgcd2_2,
   1979  1.1.1.4  mrg       speed_mpn_hgcd2_3,
   1980  1.1.1.4  mrg       speed_mpn_hgcd2_4,
   1981  1.1.1.4  mrg       speed_mpn_hgcd2_5 };
   1982  1.1.1.4  mrg   int best;
   1983  1.1.1.4  mrg 
   1984  1.1.1.4  mrg   s.size = 1;
   1985  1.1.1.4  mrg   best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
   1986  1.1.1.4  mrg 
   1987  1.1.1.4  mrg   /* Use selected function when tuning hgcd and gcd */
   1988  1.1.1.4  mrg   hgcd2_func = f[best];
   1989  1.1.1.4  mrg }
   1990  1.1.1.4  mrg 
   1991  1.1.1.4  mrg void
   1992      1.1  mrg tune_hgcd (void)
   1993      1.1  mrg {
   1994      1.1  mrg   static struct param_t  param;
   1995      1.1  mrg   param.name = "HGCD_THRESHOLD";
   1996      1.1  mrg   param.function = speed_mpn_hgcd;
   1997      1.1  mrg   /* We seem to get strange results for small sizes */
   1998      1.1  mrg   param.min_size = 30;
   1999      1.1  mrg   one (&hgcd_threshold, &param);
   2000      1.1  mrg }
   2001      1.1  mrg 
   2002      1.1  mrg void
   2003  1.1.1.2  mrg tune_hgcd_appr (void)
   2004  1.1.1.2  mrg {
   2005  1.1.1.2  mrg   static struct param_t  param;
   2006  1.1.1.2  mrg   param.name = "HGCD_APPR_THRESHOLD";
   2007  1.1.1.2  mrg   param.function = speed_mpn_hgcd_appr;
   2008  1.1.1.2  mrg   /* We seem to get strange results for small sizes */
   2009  1.1.1.2  mrg   param.min_size = 50;
   2010  1.1.1.2  mrg   param.stop_since_change = 150;
   2011  1.1.1.2  mrg   one (&hgcd_appr_threshold, &param);
   2012  1.1.1.2  mrg }
   2013  1.1.1.2  mrg 
   2014  1.1.1.2  mrg void
   2015  1.1.1.2  mrg tune_hgcd_reduce (void)
   2016  1.1.1.2  mrg {
   2017  1.1.1.2  mrg   static struct param_t  param;
   2018  1.1.1.2  mrg   param.name = "HGCD_REDUCE_THRESHOLD";
   2019  1.1.1.2  mrg   param.function = speed_mpn_hgcd_reduce;
   2020  1.1.1.2  mrg   param.min_size = 30;
   2021  1.1.1.2  mrg   param.max_size = 7000;
   2022  1.1.1.2  mrg   param.step_factor = 0.04;
   2023  1.1.1.2  mrg   one (&hgcd_reduce_threshold, &param);
   2024  1.1.1.2  mrg }
   2025  1.1.1.2  mrg 
   2026  1.1.1.2  mrg void
   2027      1.1  mrg tune_gcd_dc (void)
   2028      1.1  mrg {
   2029      1.1  mrg   static struct param_t  param;
   2030      1.1  mrg   param.name = "GCD_DC_THRESHOLD";
   2031      1.1  mrg   param.function = speed_mpn_gcd;
   2032      1.1  mrg   param.min_size = hgcd_threshold;
   2033      1.1  mrg   param.max_size = 3000;
   2034      1.1  mrg   param.step_factor = 0.02;
   2035      1.1  mrg   one (&gcd_dc_threshold, &param);
   2036      1.1  mrg }
   2037      1.1  mrg 
   2038      1.1  mrg void
   2039      1.1  mrg tune_gcdext_dc (void)
   2040      1.1  mrg {
   2041      1.1  mrg   static struct param_t  param;
   2042      1.1  mrg   param.name = "GCDEXT_DC_THRESHOLD";
   2043      1.1  mrg   param.function = speed_mpn_gcdext;
   2044      1.1  mrg   param.min_size = hgcd_threshold;
   2045      1.1  mrg   param.max_size = 3000;
   2046      1.1  mrg   param.step_factor = 0.02;
   2047      1.1  mrg   one (&gcdext_dc_threshold, &param);
   2048      1.1  mrg }
   2049      1.1  mrg 
   2050  1.1.1.2  mrg /* In tune_powm_sec we compute the table used by the win_size function.  The
   2051  1.1.1.2  mrg    cutoff points are in exponent bits, disregarding other operand sizes.  It is
   2052  1.1.1.3  mrg    not possible to use the one framework since it currently uses a granularity
   2053  1.1.1.2  mrg    of full limbs.
   2054  1.1.1.2  mrg */
   2055  1.1.1.2  mrg 
   2056  1.1.1.2  mrg /* This win_size replaces the variant in the powm code, allowing us to
   2057  1.1.1.2  mrg    control k in the k-ary algorithms.  */
   2058  1.1.1.2  mrg int winsize;
   2059  1.1.1.2  mrg int
   2060  1.1.1.2  mrg win_size (mp_bitcnt_t eb)
   2061  1.1.1.2  mrg {
   2062  1.1.1.2  mrg   return winsize;
   2063  1.1.1.2  mrg }
   2064  1.1.1.2  mrg 
   2065  1.1.1.2  mrg void
   2066  1.1.1.2  mrg tune_powm_sec (void)
   2067  1.1.1.2  mrg {
   2068  1.1.1.2  mrg   mp_size_t n;
   2069  1.1.1.2  mrg   int k, i;
   2070  1.1.1.2  mrg   mp_size_t itch;
   2071  1.1.1.2  mrg   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
   2072  1.1.1.2  mrg   const int n_max = 3000 / GMP_NUMB_BITS;
   2073  1.1.1.5  mrg #define n_measurements  5
   2074  1.1.1.2  mrg   mp_ptr rp, bp, ep, mp, tp;
   2075  1.1.1.2  mrg   double ttab[n_measurements], tk, tkp1;
   2076  1.1.1.2  mrg   TMP_DECL;
   2077  1.1.1.2  mrg   TMP_MARK;
   2078  1.1.1.2  mrg 
   2079  1.1.1.2  mrg   possible_nbits_cutoff = 0;
   2080  1.1.1.2  mrg 
   2081  1.1.1.2  mrg   k = 1;
   2082  1.1.1.2  mrg 
   2083  1.1.1.2  mrg   winsize = 10;			/* the itch function needs this */
   2084  1.1.1.3  mrg   itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
   2085  1.1.1.2  mrg 
   2086  1.1.1.2  mrg   rp = TMP_ALLOC_LIMBS (n_max);
   2087  1.1.1.2  mrg   bp = TMP_ALLOC_LIMBS (n_max);
   2088  1.1.1.2  mrg   ep = TMP_ALLOC_LIMBS (n_max);
   2089  1.1.1.2  mrg   mp = TMP_ALLOC_LIMBS (n_max);
   2090  1.1.1.2  mrg   tp = TMP_ALLOC_LIMBS (itch);
   2091  1.1.1.2  mrg 
   2092  1.1.1.2  mrg   mpn_random (bp, n_max);
   2093  1.1.1.2  mrg   mpn_random (mp, n_max);
   2094  1.1.1.2  mrg   mp[0] |= 1;
   2095  1.1.1.2  mrg 
   2096  1.1.1.2  mrg /* How about taking the M operand size into account?
   2097  1.1.1.2  mrg 
   2098  1.1.1.2  mrg    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
   2099  1.1.1.2  mrg    B = O(M)).
   2100  1.1.1.2  mrg 
   2101  1.1.1.2  mrg    Using k-ary and no sliding window, the precomputation will need time
   2102  1.1.1.2  mrg    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
   2103  1.1.1.2  mrg    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
   2104  1.1.1.2  mrg 
   2105  1.1.1.2  mrg    An operation R=powm_sec(B,E,N) will take time like powm.
   2106  1.1.1.2  mrg 
   2107  1.1.1.2  mrg    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
   2108  1.1.1.2  mrg    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
   2109  1.1.1.2  mrg    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
   2110  1.1.1.2  mrg    table reads, respectively.  */
   2111  1.1.1.2  mrg 
   2112  1.1.1.2  mrg   printf ("#define POWM_SEC_TABLE  ");
   2113  1.1.1.2  mrg 
   2114  1.1.1.3  mrg   /* For nbits == 1, we should always use k == 1, so no need to tune
   2115  1.1.1.3  mrg      that. Starting with nbits == 2 also ensure that nbits always is
   2116  1.1.1.3  mrg      larger than the windowsize k+1. */
   2117  1.1.1.3  mrg   for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
   2118  1.1.1.2  mrg     {
   2119  1.1.1.2  mrg       n = (nbits - 1) / GMP_NUMB_BITS + 1;
   2120  1.1.1.2  mrg 
   2121  1.1.1.2  mrg       /* Generate E such that sliding-window for k and k+1 works equally
   2122  1.1.1.2  mrg 	 well/poorly (but sliding is not used in powm_sec, of course). */
   2123  1.1.1.2  mrg       for (i = 0; i < n; i++)
   2124  1.1.1.2  mrg 	ep[i] = ~CNST_LIMB(0);
   2125  1.1.1.2  mrg 
   2126  1.1.1.2  mrg       winsize = k;
   2127  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   2128  1.1.1.2  mrg 	{
   2129  1.1.1.2  mrg 	  speed_starttime ();
   2130  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   2131  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   2132  1.1.1.2  mrg 	}
   2133  1.1.1.2  mrg       tk = median (ttab, n_measurements);
   2134  1.1.1.2  mrg 
   2135  1.1.1.2  mrg       winsize = k + 1;
   2136  1.1.1.2  mrg       speed_starttime ();
   2137  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   2138  1.1.1.2  mrg 	{
   2139  1.1.1.2  mrg 	  speed_starttime ();
   2140  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   2141  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   2142  1.1.1.2  mrg 	}
   2143  1.1.1.2  mrg       tkp1 = median (ttab, n_measurements);
   2144  1.1.1.2  mrg /*
   2145  1.1.1.2  mrg       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
   2146  1.1.1.2  mrg       printf ("   %10.5f  %10.5f\n", tk, tkp1);
   2147  1.1.1.2  mrg */
   2148  1.1.1.2  mrg       if (tkp1 < tk)
   2149  1.1.1.2  mrg 	{
   2150  1.1.1.2  mrg 	  if (possible_nbits_cutoff)
   2151  1.1.1.2  mrg 	    {
   2152  1.1.1.2  mrg 	      /* Two consecutive sizes indicate k increase, obey.  */
   2153  1.1.1.3  mrg 
   2154  1.1.1.3  mrg 	      /* Must always have x[k] >= k */
   2155  1.1.1.3  mrg 	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
   2156  1.1.1.3  mrg 
   2157  1.1.1.2  mrg 	      if (k > 1)
   2158  1.1.1.2  mrg 		printf (",");
   2159  1.1.1.2  mrg 	      printf ("%ld", (long) possible_nbits_cutoff);
   2160  1.1.1.2  mrg 	      k++;
   2161  1.1.1.2  mrg 	      possible_nbits_cutoff = 0;
   2162  1.1.1.2  mrg 	    }
   2163  1.1.1.2  mrg 	  else
   2164  1.1.1.2  mrg 	    {
   2165  1.1.1.2  mrg 	      /* One measurement indicate k increase, save nbits for further
   2166  1.1.1.2  mrg 		 consideration.  */
   2167  1.1.1.3  mrg 	      /* The new larger k gets used for sizes > the cutoff
   2168  1.1.1.3  mrg 		 value, hence the cutoff should be one less than the
   2169  1.1.1.3  mrg 		 smallest size where it gives a speedup. */
   2170  1.1.1.3  mrg 	      possible_nbits_cutoff = nbits - 1;
   2171  1.1.1.2  mrg 	    }
   2172  1.1.1.2  mrg 	}
   2173  1.1.1.2  mrg       else
   2174  1.1.1.2  mrg 	possible_nbits_cutoff = 0;
   2175  1.1.1.2  mrg 
   2176  1.1.1.2  mrg       nbits_next = nbits * 65 / 64;
   2177  1.1.1.2  mrg       nbits = nbits_next + (nbits_next == nbits);
   2178  1.1.1.2  mrg     }
   2179  1.1.1.2  mrg   printf ("\n");
   2180  1.1.1.2  mrg   TMP_FREE;
   2181  1.1.1.2  mrg }
   2182  1.1.1.2  mrg 
   2183      1.1  mrg 
   2184      1.1  mrg /* size_extra==1 reflects the fact that with high<divisor one division is
   2185      1.1  mrg    always skipped.  Forcing high<divisor while testing ensures consistency
   2186      1.1  mrg    while stepping through sizes, ie. that size-1 divides will be done each
   2187      1.1  mrg    time.
   2188      1.1  mrg 
   2189      1.1  mrg    min_size==2 and min_is_always are used so that if plain division is only
   2190      1.1  mrg    better at size==1 then don't bother including that code just for that
   2191      1.1  mrg    case, instead go with preinv always and get a size saving.  */
   2192      1.1  mrg 
   2193      1.1  mrg #define DIV_1_PARAMS                    \
   2194      1.1  mrg   param.check_size = 256;               \
   2195      1.1  mrg   param.min_size = 2;                   \
   2196      1.1  mrg   param.min_is_always = 1;              \
   2197      1.1  mrg   param.data_high = DATA_HIGH_LT_R;     \
   2198      1.1  mrg   param.size_extra = 1;                 \
   2199      1.1  mrg   param.stop_factor = 2.0;
   2200      1.1  mrg 
   2201      1.1  mrg 
   2202  1.1.1.2  mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
   2203      1.1  mrg 
   2204      1.1  mrg void
   2205      1.1  mrg tune_divrem_1 (void)
   2206      1.1  mrg {
   2207      1.1  mrg   /* plain version by default */
   2208      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
   2209      1.1  mrg 
   2210      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2211      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2212      1.1  mrg      appear in gmp-mparam.h.  */
   2213      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2214      1.1  mrg     return;
   2215      1.1  mrg 
   2216      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2217      1.1  mrg     {
   2218      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2219      1.1  mrg                            "no preinv with nails");
   2220      1.1  mrg       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2221      1.1  mrg                            "no preinv with nails");
   2222      1.1  mrg       return;
   2223      1.1  mrg     }
   2224      1.1  mrg 
   2225      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2226      1.1  mrg     {
   2227      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
   2228      1.1  mrg       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
   2229      1.1  mrg       return;
   2230      1.1  mrg     }
   2231      1.1  mrg 
   2232      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
   2233      1.1  mrg 
   2234      1.1  mrg   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
   2235      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2236      1.1  mrg      is more important. */
   2237      1.1  mrg   {
   2238      1.1  mrg     static struct param_t  param;
   2239      1.1  mrg     param.name = "DIVREM_1_NORM_THRESHOLD";
   2240      1.1  mrg     DIV_1_PARAMS;
   2241      1.1  mrg     s.r = randlimb_norm ();
   2242      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2243      1.1  mrg     one (&divrem_1_norm_threshold, &param);
   2244      1.1  mrg   }
   2245      1.1  mrg   {
   2246      1.1  mrg     static struct param_t  param;
   2247      1.1  mrg     param.name = "DIVREM_1_UNNORM_THRESHOLD";
   2248      1.1  mrg     DIV_1_PARAMS;
   2249      1.1  mrg     s.r = randlimb_half ();
   2250      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2251      1.1  mrg     one (&divrem_1_unnorm_threshold, &param);
   2252      1.1  mrg   }
   2253      1.1  mrg }
   2254      1.1  mrg 
   2255  1.1.1.3  mrg void
   2256  1.1.1.3  mrg tune_div_qr_1 (void)
   2257  1.1.1.3  mrg {
   2258  1.1.1.3  mrg   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
   2259  1.1.1.3  mrg     {
   2260  1.1.1.3  mrg       static struct param_t  param;
   2261  1.1.1.4  mrg       speed_function_t f[2] =
   2262  1.1.1.4  mrg 	{
   2263  1.1.1.4  mrg 	 speed_mpn_div_qr_1n_pi1_1,
   2264  1.1.1.4  mrg 	 speed_mpn_div_qr_1n_pi1_2,
   2265  1.1.1.4  mrg 	};
   2266  1.1.1.3  mrg 
   2267  1.1.1.3  mrg       s.size = 10;
   2268  1.1.1.3  mrg       s.r = randlimb_norm ();
   2269  1.1.1.3  mrg 
   2270  1.1.1.4  mrg       one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
   2271  1.1.1.3  mrg     }
   2272  1.1.1.3  mrg 
   2273  1.1.1.3  mrg   {
   2274  1.1.1.3  mrg     static struct param_t  param;
   2275  1.1.1.3  mrg     param.name = "DIV_QR_1_NORM_THRESHOLD";
   2276  1.1.1.3  mrg     DIV_1_PARAMS;
   2277  1.1.1.3  mrg     param.min_size = 1;
   2278  1.1.1.3  mrg     param.min_is_always = 0;
   2279  1.1.1.3  mrg     s.r = randlimb_norm ();
   2280  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2281  1.1.1.3  mrg     one (&div_qr_1_norm_threshold, &param);
   2282  1.1.1.3  mrg   }
   2283  1.1.1.3  mrg   {
   2284  1.1.1.3  mrg     static struct param_t  param;
   2285  1.1.1.3  mrg     param.name = "DIV_QR_1_UNNORM_THRESHOLD";
   2286  1.1.1.3  mrg     DIV_1_PARAMS;
   2287  1.1.1.3  mrg     param.min_size = 1;
   2288  1.1.1.3  mrg     param.min_is_always = 0;
   2289  1.1.1.3  mrg     s.r = randlimb_half();
   2290  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2291  1.1.1.3  mrg     one (&div_qr_1_unnorm_threshold, &param);
   2292  1.1.1.3  mrg   }
   2293  1.1.1.3  mrg }
   2294  1.1.1.3  mrg 
   2295      1.1  mrg 
   2296      1.1  mrg void
   2297      1.1  mrg tune_mod_1 (void)
   2298      1.1  mrg {
   2299      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2300      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2301      1.1  mrg      appear in gmp-mparam.h.  */
   2302      1.1  mrg   if (HAVE_NATIVE_mpn_mod_1)
   2303      1.1  mrg     return;
   2304      1.1  mrg 
   2305      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2306      1.1  mrg     {
   2307      1.1  mrg       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2308      1.1  mrg                            "no preinv with nails");
   2309      1.1  mrg       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2310      1.1  mrg                            "no preinv with nails");
   2311      1.1  mrg       return;
   2312      1.1  mrg     }
   2313      1.1  mrg 
   2314  1.1.1.2  mrg   if (!HAVE_NATIVE_mpn_mod_1_1p)
   2315  1.1.1.2  mrg     {
   2316  1.1.1.2  mrg       static struct param_t  param;
   2317  1.1.1.4  mrg       speed_function_t f[2] =
   2318  1.1.1.4  mrg 	{
   2319  1.1.1.4  mrg 	 speed_mpn_mod_1_1_1,
   2320  1.1.1.4  mrg 	 speed_mpn_mod_1_1_2,
   2321  1.1.1.4  mrg 	};
   2322  1.1.1.2  mrg 
   2323  1.1.1.2  mrg       s.size = 10;
   2324  1.1.1.2  mrg       s.r = randlimb_half ();
   2325  1.1.1.4  mrg       one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
   2326  1.1.1.2  mrg     }
   2327  1.1.1.2  mrg 
   2328      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2329      1.1  mrg     {
   2330      1.1  mrg       print_define ("MOD_1_NORM_THRESHOLD", 0L);
   2331      1.1  mrg       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
   2332      1.1  mrg     }
   2333      1.1  mrg   else
   2334      1.1  mrg     {
   2335      1.1  mrg       {
   2336      1.1  mrg 	static struct param_t  param;
   2337      1.1  mrg 	param.name = "MOD_1_NORM_THRESHOLD";
   2338      1.1  mrg 	DIV_1_PARAMS;
   2339      1.1  mrg 	s.r = randlimb_norm ();
   2340      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2341      1.1  mrg 	one (&mod_1_norm_threshold, &param);
   2342      1.1  mrg       }
   2343      1.1  mrg       {
   2344      1.1  mrg 	static struct param_t  param;
   2345      1.1  mrg 	param.name = "MOD_1_UNNORM_THRESHOLD";
   2346      1.1  mrg 	DIV_1_PARAMS;
   2347      1.1  mrg 	s.r = randlimb_half ();
   2348      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2349      1.1  mrg 	one (&mod_1_unnorm_threshold, &param);
   2350      1.1  mrg       }
   2351      1.1  mrg     }
   2352      1.1  mrg   {
   2353      1.1  mrg     static struct param_t  param;
   2354      1.1  mrg 
   2355      1.1  mrg     param.check_size = 256;
   2356      1.1  mrg 
   2357      1.1  mrg     s.r = randlimb_norm ();
   2358      1.1  mrg     param.function = speed_mpn_mod_1_tune;
   2359      1.1  mrg 
   2360      1.1  mrg     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
   2361      1.1  mrg     param.min_size = 2;
   2362      1.1  mrg     one (&mod_1n_to_mod_1_1_threshold, &param);
   2363      1.1  mrg   }
   2364  1.1.1.2  mrg 
   2365      1.1  mrg   {
   2366      1.1  mrg     static struct param_t  param;
   2367      1.1  mrg 
   2368      1.1  mrg     param.check_size = 256;
   2369  1.1.1.2  mrg     s.r = randlimb_half ();
   2370      1.1  mrg     param.noprint = 1;
   2371      1.1  mrg 
   2372  1.1.1.2  mrg     param.function = speed_mpn_mod_1_1;
   2373  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_2;
   2374  1.1.1.2  mrg     param.min_is_always = 1;
   2375      1.1  mrg     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
   2376  1.1.1.2  mrg     param.min_size = 2;
   2377      1.1  mrg     one (&mod_1_1_to_mod_1_2_threshold, &param);
   2378      1.1  mrg 
   2379  1.1.1.2  mrg     param.function = speed_mpn_mod_1_2;
   2380  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_4;
   2381  1.1.1.2  mrg     param.min_is_always = 1;
   2382      1.1  mrg     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
   2383  1.1.1.2  mrg     param.min_size = 1;
   2384      1.1  mrg     one (&mod_1_2_to_mod_1_4_threshold, &param);
   2385      1.1  mrg 
   2386  1.1.1.2  mrg     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
   2387      1.1  mrg       {
   2388  1.1.1.2  mrg 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
   2389      1.1  mrg 	mod_1_2_to_mod_1_4_threshold = 0;
   2390      1.1  mrg 
   2391  1.1.1.2  mrg 	param.function = speed_mpn_mod_1_1;
   2392  1.1.1.2  mrg 	param.function2 = speed_mpn_mod_1_4;
   2393  1.1.1.2  mrg 	param.min_is_always = 1;
   2394  1.1.1.2  mrg 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
   2395  1.1.1.2  mrg 	param.min_size = 2;
   2396      1.1  mrg 	one (&mod_1_1_to_mod_1_2_threshold, &param);
   2397      1.1  mrg       }
   2398  1.1.1.2  mrg 
   2399  1.1.1.2  mrg     param.function = speed_mpn_mod_1_tune;
   2400  1.1.1.2  mrg     param.function2 = NULL;
   2401  1.1.1.2  mrg     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
   2402  1.1.1.2  mrg     param.min_size = 2;
   2403  1.1.1.2  mrg     param.min_is_always = 0;
   2404  1.1.1.2  mrg     one (&mod_1u_to_mod_1_1_threshold, &param);
   2405  1.1.1.2  mrg 
   2406  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
   2407  1.1.1.2  mrg       mod_1_1_to_mod_1_2_threshold = 0;
   2408  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
   2409  1.1.1.2  mrg       mod_1_2_to_mod_1_4_threshold = 0;
   2410  1.1.1.2  mrg 
   2411  1.1.1.2  mrg     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
   2412  1.1.1.2  mrg     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
   2413  1.1.1.2  mrg 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
   2414  1.1.1.2  mrg     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
   2415  1.1.1.2  mrg 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
   2416      1.1  mrg   }
   2417      1.1  mrg 
   2418      1.1  mrg   {
   2419      1.1  mrg     static struct param_t  param;
   2420      1.1  mrg 
   2421      1.1  mrg     param.check_size = 256;
   2422      1.1  mrg 
   2423      1.1  mrg     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
   2424      1.1  mrg     s.r = randlimb_norm ();
   2425      1.1  mrg     param.function = speed_mpn_preinv_mod_1;
   2426      1.1  mrg     param.function2 = speed_mpn_mod_1_tune;
   2427      1.1  mrg     param.min_size = 1;
   2428      1.1  mrg     one (&preinv_mod_1_to_mod_1_threshold, &param);
   2429      1.1  mrg   }
   2430      1.1  mrg }
   2431      1.1  mrg 
   2432      1.1  mrg 
   2433      1.1  mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
   2434      1.1  mrg    imply that udiv_qrnnd_preinv is worth using, but it seems most
   2435      1.1  mrg    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
   2436      1.1  mrg    directly.  */
   2437      1.1  mrg 
   2438      1.1  mrg void
   2439      1.1  mrg tune_preinv_divrem_1 (void)
   2440      1.1  mrg {
   2441      1.1  mrg   static struct param_t  param;
   2442      1.1  mrg   speed_function_t  divrem_1;
   2443      1.1  mrg   const char        *divrem_1_name;
   2444      1.1  mrg   double            t1, t2;
   2445      1.1  mrg 
   2446      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2447      1.1  mrg     {
   2448      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
   2449      1.1  mrg       return;
   2450      1.1  mrg     }
   2451      1.1  mrg 
   2452      1.1  mrg   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
   2453      1.1  mrg      it's faster than mpn_divrem_1.  */
   2454      1.1  mrg   if (HAVE_NATIVE_mpn_preinv_divrem_1)
   2455      1.1  mrg     {
   2456      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
   2457      1.1  mrg       return;
   2458      1.1  mrg     }
   2459      1.1  mrg 
   2460      1.1  mrg   /* If udiv_qrnnd_preinv is the only division method then of course
   2461      1.1  mrg      mpn_preinv_divrem_1 should be used.  */
   2462      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2463      1.1  mrg     {
   2464      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
   2465      1.1  mrg       return;
   2466      1.1  mrg     }
   2467      1.1  mrg 
   2468      1.1  mrg   /* If we've got an assembler version of mpn_divrem_1, then compare against
   2469      1.1  mrg      that, not the mpn_divrem_1_div generic C.  */
   2470      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2471      1.1  mrg     {
   2472      1.1  mrg       divrem_1 = speed_mpn_divrem_1;
   2473      1.1  mrg       divrem_1_name = "mpn_divrem_1";
   2474      1.1  mrg     }
   2475      1.1  mrg   else
   2476      1.1  mrg     {
   2477      1.1  mrg       divrem_1 = speed_mpn_divrem_1_div;
   2478      1.1  mrg       divrem_1_name = "mpn_divrem_1_div";
   2479      1.1  mrg     }
   2480      1.1  mrg 
   2481      1.1  mrg   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
   2482      1.1  mrg   s.size = 200;                     /* generous but not too big */
   2483      1.1  mrg   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
   2484      1.1  mrg      since in general that's probably most common, though in fact for a
   2485      1.1  mrg      64-bit limb mp_bases[10].big_base is normalized.  */
   2486      1.1  mrg   s.r = urandom() & (GMP_NUMB_MASK >> 4);
   2487      1.1  mrg   if (s.r == 0) s.r = 123;
   2488      1.1  mrg 
   2489      1.1  mrg   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
   2490      1.1  mrg   t2 = tuneup_measure (divrem_1, &param, &s);
   2491      1.1  mrg   if (t1 == -1.0 || t2 == -1.0)
   2492      1.1  mrg     {
   2493      1.1  mrg       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
   2494      1.1  mrg               divrem_1_name, (long) s.size);
   2495      1.1  mrg       abort ();
   2496      1.1  mrg     }
   2497      1.1  mrg   if (option_trace >= 1)
   2498      1.1  mrg     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
   2499      1.1  mrg             (long) s.size, t1, divrem_1_name, t2);
   2500      1.1  mrg 
   2501      1.1  mrg   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
   2502      1.1  mrg }
   2503      1.1  mrg 
   2504      1.1  mrg 
   2505      1.1  mrg 
   2506      1.1  mrg void
   2507      1.1  mrg tune_divrem_2 (void)
   2508      1.1  mrg {
   2509      1.1  mrg   static struct param_t  param;
   2510      1.1  mrg 
   2511      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2512      1.1  mrg      the results in the .asm file, and there's no need for such thresholds
   2513      1.1  mrg      to appear in gmp-mparam.h.  */
   2514      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_2)
   2515      1.1  mrg     return;
   2516      1.1  mrg 
   2517      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2518      1.1  mrg     {
   2519      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
   2520      1.1  mrg                            "no preinv with nails");
   2521      1.1  mrg       return;
   2522      1.1  mrg     }
   2523      1.1  mrg 
   2524      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2525      1.1  mrg     {
   2526      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
   2527      1.1  mrg       return;
   2528      1.1  mrg     }
   2529      1.1  mrg 
   2530      1.1  mrg   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
   2531      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2532      1.1  mrg      is more important.
   2533      1.1  mrg 
   2534      1.1  mrg      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
   2535      1.1  mrg      code space if plain division is better only at size==2 or size==3. */
   2536      1.1  mrg   param.name = "DIVREM_2_THRESHOLD";
   2537      1.1  mrg   param.check_size = 256;
   2538      1.1  mrg   param.min_size = 4;
   2539      1.1  mrg   param.min_is_always = 1;
   2540      1.1  mrg   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
   2541      1.1  mrg   param.stop_factor = 2.0;
   2542      1.1  mrg 
   2543      1.1  mrg   s.r = randlimb_norm ();
   2544      1.1  mrg   param.function = speed_mpn_divrem_2;
   2545      1.1  mrg   one (&divrem_2_threshold, &param);
   2546      1.1  mrg }
   2547      1.1  mrg 
   2548  1.1.1.2  mrg void
   2549  1.1.1.2  mrg tune_div_qr_2 (void)
   2550  1.1.1.2  mrg {
   2551  1.1.1.2  mrg   static struct param_t  param;
   2552  1.1.1.2  mrg   param.name = "DIV_QR_2_PI2_THRESHOLD";
   2553  1.1.1.2  mrg   param.function = speed_mpn_div_qr_2n;
   2554  1.1.1.2  mrg   param.check_size = 500;
   2555  1.1.1.2  mrg   param.min_size = 4;
   2556  1.1.1.2  mrg   one (&div_qr_2_pi2_threshold, &param);
   2557  1.1.1.2  mrg }
   2558      1.1  mrg 
   2559      1.1  mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
   2560      1.1  mrg    tune for that.  Its speed can differ on odd or even divisor, so take an
   2561      1.1  mrg    average threshold for the two.
   2562      1.1  mrg 
   2563      1.1  mrg    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
   2564      1.1  mrg    might not vary that way, but don't test this since high<divisor isn't
   2565      1.1  mrg    expected to occur often with small divisors.  */
   2566      1.1  mrg 
   2567      1.1  mrg void
   2568      1.1  mrg tune_divexact_1 (void)
   2569      1.1  mrg {
   2570      1.1  mrg   static struct param_t  param;
   2571      1.1  mrg   mp_size_t  thresh[2], average;
   2572      1.1  mrg   int        low, i;
   2573      1.1  mrg 
   2574      1.1  mrg   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
   2575      1.1  mrg      full mpn_divrem_1.  */
   2576      1.1  mrg   if (HAVE_NATIVE_mpn_divexact_1)
   2577      1.1  mrg     {
   2578      1.1  mrg       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
   2579      1.1  mrg       return;
   2580      1.1  mrg     }
   2581      1.1  mrg 
   2582      1.1  mrg   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
   2583      1.1  mrg 
   2584      1.1  mrg   param.name = "DIVEXACT_1_THRESHOLD";
   2585      1.1  mrg   param.data_high = DATA_HIGH_GE_R;
   2586      1.1  mrg   param.check_size = 256;
   2587      1.1  mrg   param.min_size = 2;
   2588      1.1  mrg   param.stop_factor = 1.5;
   2589      1.1  mrg   param.function  = tuned_speed_mpn_divrem_1;
   2590      1.1  mrg   param.function2 = speed_mpn_divexact_1;
   2591      1.1  mrg   param.noprint = 1;
   2592      1.1  mrg 
   2593      1.1  mrg   print_define_start (param.name);
   2594      1.1  mrg 
   2595      1.1  mrg   for (low = 0; low <= 1; low++)
   2596      1.1  mrg     {
   2597      1.1  mrg       s.r = randlimb_half();
   2598      1.1  mrg       if (low == 0)
   2599      1.1  mrg         s.r |= 1;
   2600      1.1  mrg       else
   2601      1.1  mrg         s.r &= ~CNST_LIMB(7);
   2602      1.1  mrg 
   2603      1.1  mrg       one (&thresh[low], &param);
   2604      1.1  mrg       if (option_trace)
   2605      1.1  mrg         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
   2606      1.1  mrg 
   2607      1.1  mrg       if (thresh[low] == MP_SIZE_T_MAX)
   2608      1.1  mrg         {
   2609      1.1  mrg           average = MP_SIZE_T_MAX;
   2610      1.1  mrg           goto divexact_1_done;
   2611      1.1  mrg         }
   2612      1.1  mrg     }
   2613      1.1  mrg 
   2614      1.1  mrg   if (option_trace)
   2615      1.1  mrg     {
   2616      1.1  mrg       printf ("average of:");
   2617      1.1  mrg       for (i = 0; i < numberof(thresh); i++)
   2618      1.1  mrg         printf (" %ld", (long) thresh[i]);
   2619      1.1  mrg       printf ("\n");
   2620      1.1  mrg     }
   2621      1.1  mrg 
   2622      1.1  mrg   average = 0;
   2623      1.1  mrg   for (i = 0; i < numberof(thresh); i++)
   2624      1.1  mrg     average += thresh[i];
   2625      1.1  mrg   average /= numberof(thresh);
   2626      1.1  mrg 
   2627      1.1  mrg   /* If divexact turns out to be better as early as 3 limbs, then use it
   2628      1.1  mrg      always, so as to reduce code size and conditional jumps.  */
   2629      1.1  mrg   if (average <= 3)
   2630      1.1  mrg     average = 0;
   2631      1.1  mrg 
   2632      1.1  mrg  divexact_1_done:
   2633      1.1  mrg   print_define_end (param.name, average);
   2634      1.1  mrg }
   2635      1.1  mrg 
   2636      1.1  mrg 
   2637      1.1  mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
   2638      1.1  mrg    same as mpn_mod_1, but this might not be true of an assembler
   2639      1.1  mrg    implementation.  The threshold used is an average based on data where a
   2640      1.1  mrg    divide can be skipped and where it can't.
   2641      1.1  mrg 
   2642      1.1  mrg    If modexact turns out to be better as early as 3 limbs, then use it
   2643      1.1  mrg    always, so as to reduce code size and conditional jumps.  */
   2644      1.1  mrg 
   2645      1.1  mrg void
   2646      1.1  mrg tune_modexact_1_odd (void)
   2647      1.1  mrg {
   2648      1.1  mrg   static struct param_t  param;
   2649      1.1  mrg   mp_size_t  thresh_lt, thresh_ge, average;
   2650      1.1  mrg 
   2651      1.1  mrg #if 0
   2652      1.1  mrg   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
   2653      1.1  mrg      of a full mpn_mod_1.  */
   2654      1.1  mrg   if (HAVE_NATIVE_mpn_modexact_1_odd)
   2655      1.1  mrg     {
   2656      1.1  mrg       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
   2657      1.1  mrg       return;
   2658      1.1  mrg     }
   2659      1.1  mrg #endif
   2660      1.1  mrg 
   2661      1.1  mrg   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
   2662      1.1  mrg   param.check_size = 256;
   2663      1.1  mrg   param.min_size = 2;
   2664      1.1  mrg   param.stop_factor = 1.5;
   2665      1.1  mrg   param.function  = speed_mpn_modexact_1c_odd;
   2666      1.1  mrg   param.function2 = speed_mpn_mod_1_tune;
   2667      1.1  mrg   param.noprint = 1;
   2668      1.1  mrg   s.r = randlimb_half () | 1;
   2669      1.1  mrg 
   2670      1.1  mrg   print_define_start (param.name);
   2671      1.1  mrg 
   2672      1.1  mrg   param.data_high = DATA_HIGH_LT_R;
   2673      1.1  mrg   one (&thresh_lt, &param);
   2674      1.1  mrg   if (option_trace)
   2675      1.1  mrg     printf ("lt thresh %ld\n", (long) thresh_lt);
   2676      1.1  mrg 
   2677      1.1  mrg   average = thresh_lt;
   2678      1.1  mrg   if (thresh_lt != MP_SIZE_T_MAX)
   2679      1.1  mrg     {
   2680      1.1  mrg       param.data_high = DATA_HIGH_GE_R;
   2681      1.1  mrg       one (&thresh_ge, &param);
   2682      1.1  mrg       if (option_trace)
   2683      1.1  mrg         printf ("ge thresh %ld\n", (long) thresh_ge);
   2684      1.1  mrg 
   2685      1.1  mrg       if (thresh_ge != MP_SIZE_T_MAX)
   2686      1.1  mrg         {
   2687      1.1  mrg           average = (thresh_ge + thresh_lt) / 2;
   2688      1.1  mrg           if (thresh_ge <= 3)
   2689      1.1  mrg             average = 0;
   2690      1.1  mrg         }
   2691      1.1  mrg     }
   2692      1.1  mrg 
   2693      1.1  mrg   print_define_end (param.name, average);
   2694      1.1  mrg }
   2695      1.1  mrg 
   2696      1.1  mrg 
   2697      1.1  mrg void
   2698      1.1  mrg tune_jacobi_base (void)
   2699      1.1  mrg {
   2700      1.1  mrg   static struct param_t  param;
   2701  1.1.1.4  mrg   speed_function_t f[4] =
   2702      1.1  mrg     {
   2703  1.1.1.4  mrg      speed_mpn_jacobi_base_1,
   2704  1.1.1.4  mrg      speed_mpn_jacobi_base_2,
   2705  1.1.1.4  mrg      speed_mpn_jacobi_base_3,
   2706  1.1.1.4  mrg      speed_mpn_jacobi_base_4,
   2707  1.1.1.4  mrg     };
   2708      1.1  mrg 
   2709  1.1.1.4  mrg   s.size = GMP_LIMB_BITS * 3 / 4;
   2710      1.1  mrg 
   2711  1.1.1.4  mrg   one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
   2712      1.1  mrg }
   2713      1.1  mrg 
   2714      1.1  mrg 
   2715      1.1  mrg void
   2716      1.1  mrg tune_get_str (void)
   2717      1.1  mrg {
   2718      1.1  mrg   /* Tune for decimal, it being most common.  Some rough testing suggests
   2719      1.1  mrg      other bases are different, but not by very much.  */
   2720      1.1  mrg   s.r = 10;
   2721      1.1  mrg   {
   2722      1.1  mrg     static struct param_t  param;
   2723      1.1  mrg     GET_STR_PRECOMPUTE_THRESHOLD = 0;
   2724      1.1  mrg     param.name = "GET_STR_DC_THRESHOLD";
   2725      1.1  mrg     param.function = speed_mpn_get_str;
   2726      1.1  mrg     param.min_size = 4;
   2727      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2728      1.1  mrg     one (&get_str_dc_threshold, &param);
   2729      1.1  mrg   }
   2730      1.1  mrg   {
   2731      1.1  mrg     static struct param_t  param;
   2732      1.1  mrg     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
   2733      1.1  mrg     param.function = speed_mpn_get_str;
   2734      1.1  mrg     param.min_size = GET_STR_DC_THRESHOLD;
   2735      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2736      1.1  mrg     one (&get_str_precompute_threshold, &param);
   2737      1.1  mrg   }
   2738      1.1  mrg }
   2739      1.1  mrg 
   2740      1.1  mrg 
   2741      1.1  mrg double
   2742      1.1  mrg speed_mpn_pre_set_str (struct speed_params *s)
   2743      1.1  mrg {
   2744      1.1  mrg   unsigned char *str;
   2745      1.1  mrg   mp_ptr     wp;
   2746      1.1  mrg   mp_size_t  wn;
   2747      1.1  mrg   unsigned   i;
   2748      1.1  mrg   int        base;
   2749      1.1  mrg   double     t;
   2750      1.1  mrg   mp_ptr powtab_mem, tp;
   2751      1.1  mrg   powers_t powtab[GMP_LIMB_BITS];
   2752      1.1  mrg   mp_size_t un;
   2753      1.1  mrg   int chars_per_limb;
   2754  1.1.1.5  mrg   size_t n_pows;
   2755  1.1.1.5  mrg   powers_t *pt;
   2756      1.1  mrg   TMP_DECL;
   2757      1.1  mrg 
   2758      1.1  mrg   SPEED_RESTRICT_COND (s->size >= 1);
   2759      1.1  mrg 
   2760      1.1  mrg   base = s->r == 0 ? 10 : s->r;
   2761      1.1  mrg   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
   2762      1.1  mrg 
   2763      1.1  mrg   TMP_MARK;
   2764      1.1  mrg 
   2765  1.1.1.3  mrg   str = (unsigned char *) TMP_ALLOC (s->size);
   2766      1.1  mrg   for (i = 0; i < s->size; i++)
   2767      1.1  mrg     str[i] = s->xp[i] % base;
   2768      1.1  mrg 
   2769  1.1.1.2  mrg   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
   2770      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
   2771      1.1  mrg 
   2772      1.1  mrg   /* use this during development to check wn is big enough */
   2773      1.1  mrg   /*
   2774      1.1  mrg   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
   2775      1.1  mrg   */
   2776      1.1  mrg 
   2777  1.1.1.3  mrg   speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
   2778      1.1  mrg   speed_operand_dst (s, wp, wn);
   2779      1.1  mrg   speed_cache_fill (s);
   2780      1.1  mrg 
   2781      1.1  mrg   chars_per_limb = mp_bases[base].chars_per_limb;
   2782      1.1  mrg   un = s->size / chars_per_limb + 1;
   2783  1.1.1.4  mrg   powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
   2784  1.1.1.5  mrg   n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
   2785  1.1.1.5  mrg   pt = powtab + n_pows;
   2786      1.1  mrg   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
   2787      1.1  mrg 
   2788      1.1  mrg   speed_starttime ();
   2789      1.1  mrg   i = s->reps;
   2790      1.1  mrg   do
   2791      1.1  mrg     {
   2792  1.1.1.4  mrg       mpn_pre_set_str (wp, str, s->size, pt, tp);
   2793      1.1  mrg     }
   2794      1.1  mrg   while (--i != 0);
   2795      1.1  mrg   t = speed_endtime ();
   2796      1.1  mrg 
   2797      1.1  mrg   TMP_FREE;
   2798      1.1  mrg   return t;
   2799      1.1  mrg }
   2800      1.1  mrg 
   2801      1.1  mrg void
   2802      1.1  mrg tune_set_str (void)
   2803      1.1  mrg {
   2804      1.1  mrg   s.r = 10;  /* decimal */
   2805      1.1  mrg   {
   2806      1.1  mrg     static struct param_t  param;
   2807      1.1  mrg     SET_STR_PRECOMPUTE_THRESHOLD = 0;
   2808      1.1  mrg     param.step_factor = 0.01;
   2809      1.1  mrg     param.name = "SET_STR_DC_THRESHOLD";
   2810      1.1  mrg     param.function = speed_mpn_pre_set_str;
   2811      1.1  mrg     param.min_size = 100;
   2812      1.1  mrg     param.max_size = 50000;
   2813      1.1  mrg     one (&set_str_dc_threshold, &param);
   2814      1.1  mrg   }
   2815      1.1  mrg   {
   2816      1.1  mrg     static struct param_t  param;
   2817      1.1  mrg     param.step_factor = 0.02;
   2818      1.1  mrg     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
   2819      1.1  mrg     param.function = speed_mpn_set_str;
   2820      1.1  mrg     param.min_size = SET_STR_DC_THRESHOLD;
   2821      1.1  mrg     param.max_size = 100000;
   2822      1.1  mrg     one (&set_str_precompute_threshold, &param);
   2823      1.1  mrg   }
   2824      1.1  mrg }
   2825      1.1  mrg 
   2826      1.1  mrg 
   2827      1.1  mrg void
   2828      1.1  mrg tune_fft_mul (void)
   2829      1.1  mrg {
   2830      1.1  mrg   static struct fft_param_t  param;
   2831      1.1  mrg 
   2832      1.1  mrg   if (option_fft_max_size == 0)
   2833      1.1  mrg     return;
   2834      1.1  mrg 
   2835      1.1  mrg   param.table_name          = "MUL_FFT_TABLE3";
   2836      1.1  mrg   param.threshold_name      = "MUL_FFT_THRESHOLD";
   2837      1.1  mrg   param.p_threshold         = &mul_fft_threshold;
   2838      1.1  mrg   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
   2839      1.1  mrg   param.p_modf_threshold    = &mul_fft_modf_threshold;
   2840      1.1  mrg   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
   2841      1.1  mrg   param.max_size            = option_fft_max_size;
   2842      1.1  mrg   param.function            = speed_mpn_fft_mul;
   2843      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft;
   2844      1.1  mrg   param.mul_function        = speed_mpn_mul_n;
   2845      1.1  mrg   param.sqr = 0;
   2846      1.1  mrg   fft (&param);
   2847      1.1  mrg }
   2848      1.1  mrg 
   2849      1.1  mrg 
   2850      1.1  mrg void
   2851      1.1  mrg tune_fft_sqr (void)
   2852      1.1  mrg {
   2853      1.1  mrg   static struct fft_param_t  param;
   2854      1.1  mrg 
   2855      1.1  mrg   if (option_fft_max_size == 0)
   2856      1.1  mrg     return;
   2857      1.1  mrg 
   2858      1.1  mrg   param.table_name          = "SQR_FFT_TABLE3";
   2859      1.1  mrg   param.threshold_name      = "SQR_FFT_THRESHOLD";
   2860      1.1  mrg   param.p_threshold         = &sqr_fft_threshold;
   2861      1.1  mrg   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
   2862      1.1  mrg   param.p_modf_threshold    = &sqr_fft_modf_threshold;
   2863      1.1  mrg   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
   2864      1.1  mrg   param.max_size            = option_fft_max_size;
   2865      1.1  mrg   param.function            = speed_mpn_fft_sqr;
   2866      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
   2867      1.1  mrg   param.mul_function        = speed_mpn_sqr;
   2868      1.1  mrg   param.sqr = 1;
   2869      1.1  mrg   fft (&param);
   2870      1.1  mrg }
   2871      1.1  mrg 
   2872      1.1  mrg void
   2873  1.1.1.2  mrg tune_fac_ui (void)
   2874  1.1.1.2  mrg {
   2875  1.1.1.2  mrg   static struct param_t  param;
   2876  1.1.1.2  mrg 
   2877  1.1.1.2  mrg   param.function = speed_mpz_fac_ui_tune;
   2878  1.1.1.2  mrg 
   2879  1.1.1.2  mrg   param.name = "FAC_DSC_THRESHOLD";
   2880  1.1.1.2  mrg   param.min_size = 70;
   2881  1.1.1.2  mrg   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
   2882  1.1.1.2  mrg   one (&fac_dsc_threshold, &param);
   2883  1.1.1.2  mrg 
   2884  1.1.1.2  mrg   param.name = "FAC_ODD_THRESHOLD";
   2885  1.1.1.2  mrg   param.min_size = 22;
   2886  1.1.1.2  mrg   param.stop_factor = 1.7;
   2887  1.1.1.2  mrg   param.min_is_always = 1;
   2888  1.1.1.2  mrg   one (&fac_odd_threshold, &param);
   2889  1.1.1.2  mrg }
   2890  1.1.1.2  mrg 
   2891  1.1.1.2  mrg void
   2892      1.1  mrg all (void)
   2893      1.1  mrg {
   2894      1.1  mrg   time_t  start_time, end_time;
   2895      1.1  mrg   TMP_DECL;
   2896      1.1  mrg 
   2897      1.1  mrg   TMP_MARK;
   2898      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
   2899      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
   2900      1.1  mrg 
   2901      1.1  mrg   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
   2902      1.1  mrg   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
   2903      1.1  mrg 
   2904      1.1  mrg   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
   2905      1.1  mrg 
   2906      1.1  mrg   speed_time_init ();
   2907      1.1  mrg   fprintf (stderr, "Using: %s\n", speed_time_string);
   2908      1.1  mrg 
   2909      1.1  mrg   fprintf (stderr, "speed_precision %d", speed_precision);
   2910      1.1  mrg   if (speed_unittime == 1.0)
   2911      1.1  mrg     fprintf (stderr, ", speed_unittime 1 cycle");
   2912      1.1  mrg   else
   2913      1.1  mrg     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
   2914      1.1  mrg   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
   2915      1.1  mrg     fprintf (stderr, ", CPU freq unknown\n");
   2916      1.1  mrg   else
   2917      1.1  mrg     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
   2918      1.1  mrg 
   2919      1.1  mrg   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
   2920      1.1  mrg            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
   2921      1.1  mrg   fprintf (stderr, "\n");
   2922      1.1  mrg 
   2923      1.1  mrg   time (&start_time);
   2924      1.1  mrg   {
   2925      1.1  mrg     struct tm  *tp;
   2926      1.1  mrg     tp = localtime (&start_time);
   2927      1.1  mrg     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
   2928      1.1  mrg             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
   2929      1.1  mrg 
   2930      1.1  mrg #ifdef __GNUC__
   2931      1.1  mrg     /* gcc sub-minor version doesn't seem to come through as a define */
   2932      1.1  mrg     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
   2933      1.1  mrg #define PRINTED_COMPILER
   2934      1.1  mrg #endif
   2935      1.1  mrg #if defined (__SUNPRO_C)
   2936      1.1  mrg     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
   2937      1.1  mrg #define PRINTED_COMPILER
   2938      1.1  mrg #endif
   2939      1.1  mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
   2940      1.1  mrg     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
   2941      1.1  mrg     printf ("MIPSpro C %d.%d.%d */\n",
   2942      1.1  mrg 	    _COMPILER_VERSION / 100,
   2943      1.1  mrg 	    _COMPILER_VERSION / 10 % 10,
   2944      1.1  mrg 	    _COMPILER_VERSION % 10);
   2945      1.1  mrg #define PRINTED_COMPILER
   2946      1.1  mrg #endif
   2947      1.1  mrg #if defined (__DECC) && defined (__DECC_VER)
   2948      1.1  mrg     printf ("DEC C %d */\n", __DECC_VER);
   2949      1.1  mrg #define PRINTED_COMPILER
   2950      1.1  mrg #endif
   2951      1.1  mrg #if ! defined (PRINTED_COMPILER)
   2952      1.1  mrg     printf ("system compiler */\n");
   2953      1.1  mrg #endif
   2954      1.1  mrg   }
   2955      1.1  mrg   printf ("\n");
   2956      1.1  mrg 
   2957      1.1  mrg   tune_divrem_1 ();
   2958      1.1  mrg   tune_mod_1 ();
   2959      1.1  mrg   tune_preinv_divrem_1 ();
   2960  1.1.1.3  mrg   tune_div_qr_1 ();
   2961  1.1.1.2  mrg #if 0
   2962      1.1  mrg   tune_divrem_2 ();
   2963  1.1.1.2  mrg #endif
   2964  1.1.1.2  mrg   tune_div_qr_2 ();
   2965      1.1  mrg   tune_divexact_1 ();
   2966      1.1  mrg   tune_modexact_1_odd ();
   2967      1.1  mrg   printf("\n");
   2968      1.1  mrg 
   2969  1.1.1.4  mrg   relspeed_div_1_vs_mul_1 ();
   2970  1.1.1.4  mrg   printf("\n");
   2971  1.1.1.4  mrg 
   2972      1.1  mrg   tune_mul_n ();
   2973      1.1  mrg   printf("\n");
   2974      1.1  mrg 
   2975      1.1  mrg   tune_mul ();
   2976      1.1  mrg   printf("\n");
   2977      1.1  mrg 
   2978      1.1  mrg   tune_sqr ();
   2979      1.1  mrg   printf("\n");
   2980      1.1  mrg 
   2981  1.1.1.2  mrg   tune_mulmid ();
   2982  1.1.1.2  mrg   printf("\n");
   2983  1.1.1.2  mrg 
   2984      1.1  mrg   tune_mulmod_bnm1 ();
   2985      1.1  mrg   tune_sqrmod_bnm1 ();
   2986      1.1  mrg   printf("\n");
   2987      1.1  mrg 
   2988      1.1  mrg   tune_fft_mul ();
   2989      1.1  mrg   printf("\n");
   2990      1.1  mrg 
   2991      1.1  mrg   tune_fft_sqr ();
   2992      1.1  mrg   printf ("\n");
   2993      1.1  mrg 
   2994      1.1  mrg   tune_mullo ();
   2995  1.1.1.3  mrg   tune_sqrlo ();
   2996      1.1  mrg   printf("\n");
   2997      1.1  mrg 
   2998      1.1  mrg   tune_dc_div ();
   2999      1.1  mrg   tune_dc_bdiv ();
   3000      1.1  mrg 
   3001      1.1  mrg   printf("\n");
   3002      1.1  mrg   tune_invertappr ();
   3003      1.1  mrg   tune_invert ();
   3004      1.1  mrg   printf("\n");
   3005      1.1  mrg 
   3006      1.1  mrg   tune_binvert ();
   3007      1.1  mrg   tune_redc ();
   3008      1.1  mrg   printf("\n");
   3009      1.1  mrg 
   3010      1.1  mrg   tune_mu_div ();
   3011      1.1  mrg   tune_mu_bdiv ();
   3012      1.1  mrg   printf("\n");
   3013      1.1  mrg 
   3014  1.1.1.2  mrg   tune_powm_sec ();
   3015  1.1.1.2  mrg   printf("\n");
   3016  1.1.1.2  mrg 
   3017  1.1.1.3  mrg   tune_get_str ();
   3018  1.1.1.3  mrg   tune_set_str ();
   3019  1.1.1.3  mrg   printf("\n");
   3020  1.1.1.3  mrg 
   3021  1.1.1.3  mrg   tune_fac_ui ();
   3022  1.1.1.3  mrg   printf("\n");
   3023  1.1.1.3  mrg 
   3024      1.1  mrg   tune_matrix22_mul ();
   3025  1.1.1.4  mrg   tune_hgcd2 ();
   3026      1.1  mrg   tune_hgcd ();
   3027  1.1.1.2  mrg   tune_hgcd_appr ();
   3028  1.1.1.2  mrg   tune_hgcd_reduce();
   3029      1.1  mrg   tune_gcd_dc ();
   3030      1.1  mrg   tune_gcdext_dc ();
   3031      1.1  mrg   tune_jacobi_base ();
   3032      1.1  mrg   printf("\n");
   3033      1.1  mrg 
   3034      1.1  mrg   time (&end_time);
   3035      1.1  mrg   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
   3036      1.1  mrg           (long) (end_time - start_time));
   3037      1.1  mrg 
   3038      1.1  mrg   TMP_FREE;
   3039      1.1  mrg }
   3040      1.1  mrg 
   3041      1.1  mrg 
   3042      1.1  mrg int
   3043      1.1  mrg main (int argc, char *argv[])
   3044      1.1  mrg {
   3045      1.1  mrg   int  opt;
   3046      1.1  mrg 
   3047      1.1  mrg   /* Unbuffered so if output is redirected to a file it isn't lost if the
   3048      1.1  mrg      program is killed part way through.  */
   3049      1.1  mrg   setbuf (stdout, NULL);
   3050      1.1  mrg   setbuf (stderr, NULL);
   3051      1.1  mrg 
   3052      1.1  mrg   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
   3053      1.1  mrg     {
   3054      1.1  mrg       switch (opt) {
   3055      1.1  mrg       case 'f':
   3056      1.1  mrg         if (optarg[0] == 't')
   3057      1.1  mrg           option_fft_trace = 2;
   3058      1.1  mrg         else
   3059      1.1  mrg           option_fft_max_size = atol (optarg);
   3060      1.1  mrg         break;
   3061      1.1  mrg       case 'o':
   3062      1.1  mrg         speed_option_set (optarg);
   3063      1.1  mrg         break;
   3064      1.1  mrg       case 'p':
   3065      1.1  mrg         speed_precision = atoi (optarg);
   3066      1.1  mrg         break;
   3067      1.1  mrg       case 't':
   3068      1.1  mrg         option_trace++;
   3069      1.1  mrg         break;
   3070      1.1  mrg       case '?':
   3071      1.1  mrg         exit(1);
   3072      1.1  mrg       }
   3073      1.1  mrg     }
   3074      1.1  mrg 
   3075      1.1  mrg   all ();
   3076      1.1  mrg   exit (0);
   3077      1.1  mrg }
   3078