Home | History | Annotate | Line # | Download | only in tune
tuneup.c revision 1.1.1.4
      1      1.1  mrg /* Create tuned thresholds for various algorithms.
      2      1.1  mrg 
      3  1.1.1.4  mrg Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU MP Library.
      6      1.1  mrg 
      7      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg it under the terms of either:
      9  1.1.1.3  mrg 
     10  1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg     option) any later version.
     13  1.1.1.3  mrg 
     14  1.1.1.3  mrg or
     15  1.1.1.3  mrg 
     16  1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg     later version.
     19  1.1.1.3  mrg 
     20  1.1.1.3  mrg or both in parallel, as here.
     21      1.1  mrg 
     22      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg for more details.
     26      1.1  mrg 
     27  1.1.1.3  mrg You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg /* Usage: tuneup [-t] [-t] [-p precision]
     33      1.1  mrg 
     34      1.1  mrg    -t turns on some diagnostic traces, a second -t turns on more traces.
     35      1.1  mrg 
     36      1.1  mrg    Notes:
     37      1.1  mrg 
     38      1.1  mrg    The code here isn't a vision of loveliness, mainly because it's subject
     39      1.1  mrg    to ongoing changes according to new things wanting to be tuned, and
     40      1.1  mrg    practical requirements of systems tested.
     41      1.1  mrg 
     42      1.1  mrg    Sometimes running the program twice produces slightly different results.
     43      1.1  mrg    This is probably because there's so little separating algorithms near
     44      1.1  mrg    their crossover, and on that basis it should make little or no difference
     45      1.1  mrg    to the final speed of the relevant routines, but nothing has been done to
     46      1.1  mrg    check that carefully.
     47      1.1  mrg 
     48      1.1  mrg    Algorithm:
     49      1.1  mrg 
     50      1.1  mrg    The thresholds are determined as follows.  A crossover may not be a
     51      1.1  mrg    single size but rather a range where it oscillates between method A or
     52      1.1  mrg    method B faster.  If the threshold is set making B used where A is faster
     53      1.1  mrg    (or vice versa) that's bad.  Badness is the percentage time lost and
     54      1.1  mrg    total badness is the sum of this over all sizes measured.  The threshold
     55      1.1  mrg    is set to minimize total badness.
     56      1.1  mrg 
     57      1.1  mrg    Suppose, as sizes increase, method B becomes faster than method A.  The
     58      1.1  mrg    effect of the rule is that, as you look at increasing sizes, isolated
     59      1.1  mrg    points where B is faster are ignored, but when it's consistently faster,
     60      1.1  mrg    or faster on balance, then the threshold is set there.  The same result
     61      1.1  mrg    is obtained thinking in the other direction of A becoming faster at
     62      1.1  mrg    smaller sizes.
     63      1.1  mrg 
     64      1.1  mrg    In practice the thresholds tend to be chosen to bring on the next
     65      1.1  mrg    algorithm fairly quickly.
     66      1.1  mrg 
     67      1.1  mrg    This rule is attractive because it's got a basis in reason and is fairly
     68      1.1  mrg    easy to implement, but no work has been done to actually compare it in
     69      1.1  mrg    absolute terms to other possibilities.
     70      1.1  mrg 
     71      1.1  mrg    Implementation:
     72      1.1  mrg 
     73      1.1  mrg    In a normal library build the thresholds are constants.  To tune them
     74      1.1  mrg    selected objects are recompiled with the thresholds as global variables
     75      1.1  mrg    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
     76      1.1  mrg    the end of gmp-impl.h, and rules in tune/Makefile.am.
     77      1.1  mrg 
     78      1.1  mrg    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
     79      1.1  mrg    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
     80      1.1  mrg    level, but recurse into the basecase.
     81      1.1  mrg 
     82      1.1  mrg    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
     83      1.1  mrg    Other routines in turn will make use of both of those.  Naturally the
     84      1.1  mrg    dependants must be tuned first.
     85      1.1  mrg 
     86      1.1  mrg    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
     87      1.1  mrg    just a threshold based on comparing two routines (mpn_divrem_1 and
     88      1.1  mrg    mpn_divexact_1), and no further use of the value determined.
     89      1.1  mrg 
     90      1.1  mrg    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
     91      1.1  mrg    just comparisons between certain routines on representative data.
     92      1.1  mrg 
     93      1.1  mrg    Shortcuts are applied when native (assembler) versions of routines exist.
     94      1.1  mrg    For instance a native mpn_sqr_basecase is assumed to be always faster
     95      1.1  mrg    than mpn_mul_basecase, with no measuring.
     96      1.1  mrg 
     97      1.1  mrg    No attempt is made to tune within assembler routines, for instance
     98      1.1  mrg    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
     99      1.1  mrg    written and tuned all by hand.  Assembler routines that might have hard
    100      1.1  mrg    limits are recompiled though, to make them accept a bigger range of sizes
    101      1.1  mrg    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
    102      1.1  mrg 
    103      1.1  mrg    Limitations:
    104      1.1  mrg 
    105      1.1  mrg    The FFTs aren't subject to the same badness rule as the other thresholds,
    106      1.1  mrg    so each k is probably being brought on a touch early.  This isn't likely
    107      1.1  mrg    to make a difference, and the simpler probing means fewer tests.
    108      1.1  mrg 
    109      1.1  mrg */
    110      1.1  mrg 
    111      1.1  mrg #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
    112      1.1  mrg 
    113      1.1  mrg #include "config.h"
    114      1.1  mrg 
    115      1.1  mrg #include <math.h>
    116      1.1  mrg #include <stdio.h>
    117      1.1  mrg #include <stdlib.h>
    118      1.1  mrg #include <time.h>
    119      1.1  mrg #if HAVE_UNISTD_H
    120      1.1  mrg #include <unistd.h>
    121      1.1  mrg #endif
    122      1.1  mrg 
    123      1.1  mrg #include "gmp-impl.h"
    124      1.1  mrg #include "longlong.h"
    125      1.1  mrg 
    126      1.1  mrg #include "tests.h"
    127      1.1  mrg #include "speed.h"
    128      1.1  mrg 
    129      1.1  mrg #if !HAVE_DECL_OPTARG
    130      1.1  mrg extern char *optarg;
    131      1.1  mrg extern int optind, opterr;
    132      1.1  mrg #endif
    133      1.1  mrg 
    134      1.1  mrg 
    135      1.1  mrg #define DEFAULT_MAX_SIZE   1000  /* limbs */
    136      1.1  mrg 
    137      1.1  mrg #if WANT_FFT
    138      1.1  mrg mp_size_t  option_fft_max_size = 50000;  /* limbs */
    139      1.1  mrg #else
    140      1.1  mrg mp_size_t  option_fft_max_size = 0;
    141      1.1  mrg #endif
    142      1.1  mrg int        option_trace = 0;
    143      1.1  mrg int        option_fft_trace = 0;
    144      1.1  mrg struct speed_params  s;
    145      1.1  mrg 
    146      1.1  mrg struct dat_t {
    147      1.1  mrg   mp_size_t  size;
    148      1.1  mrg   double     d;
    149      1.1  mrg } *dat = NULL;
    150      1.1  mrg int  ndat = 0;
    151      1.1  mrg int  allocdat = 0;
    152      1.1  mrg 
    153      1.1  mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
    154      1.1  mrg    case use zero here, which for params.max_size means no limit.  */
    155      1.1  mrg #ifndef TUNE_SQR_TOOM2_MAX
    156      1.1  mrg #define TUNE_SQR_TOOM2_MAX  0
    157      1.1  mrg #endif
    158      1.1  mrg 
    159      1.1  mrg mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
    160      1.1  mrg mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
    161      1.1  mrg mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
    162      1.1  mrg mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
    163      1.1  mrg mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
    164      1.1  mrg mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
    165      1.1  mrg mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
    166      1.1  mrg mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
    167      1.1  mrg mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
    168  1.1.1.2  mrg mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
    169      1.1  mrg mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
    170      1.1  mrg mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
    171      1.1  mrg mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
    172      1.1  mrg mp_size_t  sqr_toom2_threshold
    173      1.1  mrg   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
    174      1.1  mrg mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
    175      1.1  mrg mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
    176      1.1  mrg mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
    177      1.1  mrg mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
    178      1.1  mrg mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
    179      1.1  mrg mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
    180      1.1  mrg mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
    181      1.1  mrg mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
    182      1.1  mrg mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
    183  1.1.1.3  mrg mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
    184  1.1.1.3  mrg mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
    185  1.1.1.3  mrg mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
    186  1.1.1.2  mrg mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
    187      1.1  mrg mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
    188      1.1  mrg mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
    189  1.1.1.2  mrg mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
    190      1.1  mrg mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
    191      1.1  mrg mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
    192      1.1  mrg mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
    193      1.1  mrg mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
    194      1.1  mrg mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
    195      1.1  mrg mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
    196      1.1  mrg mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    197      1.1  mrg mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
    198      1.1  mrg mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
    199      1.1  mrg mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
    200      1.1  mrg mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
    201      1.1  mrg mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
    202      1.1  mrg mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
    203      1.1  mrg mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
    204      1.1  mrg mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
    205      1.1  mrg mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
    206      1.1  mrg mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
    207      1.1  mrg mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
    208      1.1  mrg mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
    209  1.1.1.2  mrg mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
    210  1.1.1.2  mrg mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
    211      1.1  mrg mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
    212      1.1  mrg mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
    213  1.1.1.3  mrg int	   div_qr_1n_pi1_method		= 0;
    214  1.1.1.3  mrg mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
    215  1.1.1.3  mrg mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
    216      1.1  mrg mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
    217      1.1  mrg mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
    218      1.1  mrg mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
    219      1.1  mrg mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
    220  1.1.1.2  mrg int	   mod_1_1p_method		= 0;
    221      1.1  mrg mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    222      1.1  mrg mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
    223      1.1  mrg mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
    224      1.1  mrg mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
    225      1.1  mrg mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
    226      1.1  mrg mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
    227      1.1  mrg mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
    228      1.1  mrg mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
    229      1.1  mrg mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
    230      1.1  mrg mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
    231  1.1.1.2  mrg mp_size_t  fac_odd_threshold            = 0;
    232  1.1.1.2  mrg mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
    233      1.1  mrg 
    234      1.1  mrg mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
    235      1.1  mrg mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
    236      1.1  mrg 
    237      1.1  mrg struct param_t {
    238      1.1  mrg   const char        *name;
    239      1.1  mrg   speed_function_t  function;
    240      1.1  mrg   speed_function_t  function2;
    241      1.1  mrg   double            step_factor;    /* how much to step relatively */
    242      1.1  mrg   int               step;           /* how much to step absolutely */
    243      1.1  mrg   double            function_fudge; /* multiplier for "function" speeds */
    244      1.1  mrg   int               stop_since_change;
    245      1.1  mrg   double            stop_factor;
    246      1.1  mrg   mp_size_t         min_size;
    247      1.1  mrg   int               min_is_always;
    248      1.1  mrg   mp_size_t         max_size;
    249      1.1  mrg   mp_size_t         check_size;
    250      1.1  mrg   mp_size_t         size_extra;
    251      1.1  mrg 
    252      1.1  mrg #define DATA_HIGH_LT_R  1
    253      1.1  mrg #define DATA_HIGH_GE_R  2
    254      1.1  mrg   int               data_high;
    255      1.1  mrg 
    256      1.1  mrg   int               noprint;
    257      1.1  mrg };
    258      1.1  mrg 
    259      1.1  mrg 
    260      1.1  mrg /* These are normally undefined when false, which suits "#if" fine.
    261      1.1  mrg    But give them zero values so they can be used in plain C "if"s.  */
    262      1.1  mrg #ifndef UDIV_PREINV_ALWAYS
    263      1.1  mrg #define UDIV_PREINV_ALWAYS 0
    264      1.1  mrg #endif
    265      1.1  mrg #ifndef HAVE_NATIVE_mpn_divexact_1
    266      1.1  mrg #define HAVE_NATIVE_mpn_divexact_1 0
    267      1.1  mrg #endif
    268  1.1.1.3  mrg #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
    269  1.1.1.3  mrg #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
    270  1.1.1.3  mrg #endif
    271      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_1
    272      1.1  mrg #define HAVE_NATIVE_mpn_divrem_1 0
    273      1.1  mrg #endif
    274      1.1  mrg #ifndef HAVE_NATIVE_mpn_divrem_2
    275      1.1  mrg #define HAVE_NATIVE_mpn_divrem_2 0
    276      1.1  mrg #endif
    277      1.1  mrg #ifndef HAVE_NATIVE_mpn_mod_1
    278      1.1  mrg #define HAVE_NATIVE_mpn_mod_1 0
    279      1.1  mrg #endif
    280  1.1.1.2  mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p
    281  1.1.1.2  mrg #define HAVE_NATIVE_mpn_mod_1_1p 0
    282  1.1.1.2  mrg #endif
    283      1.1  mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
    284      1.1  mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
    285      1.1  mrg #endif
    286      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
    287      1.1  mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
    288      1.1  mrg #endif
    289      1.1  mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
    290      1.1  mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
    291      1.1  mrg #endif
    292      1.1  mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
    293      1.1  mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
    294      1.1  mrg #endif
    295      1.1  mrg 
    296      1.1  mrg 
    297      1.1  mrg #define MAX3(a,b,c)  MAX (MAX (a, b), c)
    298      1.1  mrg 
    299      1.1  mrg mp_limb_t
    300      1.1  mrg randlimb_norm (void)
    301      1.1  mrg {
    302      1.1  mrg   mp_limb_t  n;
    303      1.1  mrg   mpn_random (&n, 1);
    304      1.1  mrg   n |= GMP_NUMB_HIGHBIT;
    305      1.1  mrg   return n;
    306      1.1  mrg }
    307      1.1  mrg 
    308      1.1  mrg #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
    309      1.1  mrg 
    310      1.1  mrg mp_limb_t
    311      1.1  mrg randlimb_half (void)
    312      1.1  mrg {
    313      1.1  mrg   mp_limb_t  n;
    314      1.1  mrg   mpn_random (&n, 1);
    315      1.1  mrg   n &= GMP_NUMB_HALFMASK;
    316      1.1  mrg   n += (n==0);
    317      1.1  mrg   return n;
    318      1.1  mrg }
    319      1.1  mrg 
    320      1.1  mrg 
    321      1.1  mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    322      1.1  mrg    if necessary.  */
    323      1.1  mrg void
    324      1.1  mrg add_dat (mp_size_t size, double d)
    325      1.1  mrg {
    326      1.1  mrg #define ALLOCDAT_STEP  500
    327      1.1  mrg 
    328      1.1  mrg   ASSERT_ALWAYS (ndat <= allocdat);
    329      1.1  mrg 
    330      1.1  mrg   if (ndat == allocdat)
    331      1.1  mrg     {
    332      1.1  mrg       dat = (struct dat_t *) __gmp_allocate_or_reallocate
    333      1.1  mrg         (dat, allocdat * sizeof(dat[0]),
    334      1.1  mrg          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
    335      1.1  mrg       allocdat += ALLOCDAT_STEP;
    336      1.1  mrg     }
    337      1.1  mrg 
    338      1.1  mrg   dat[ndat].size = size;
    339      1.1  mrg   dat[ndat].d = d;
    340      1.1  mrg   ndat++;
    341      1.1  mrg }
    342      1.1  mrg 
    343      1.1  mrg 
    344      1.1  mrg /* Return the threshold size based on the data accumulated. */
    345      1.1  mrg mp_size_t
    346      1.1  mrg analyze_dat (int final)
    347      1.1  mrg {
    348      1.1  mrg   double  x, min_x;
    349      1.1  mrg   int     j, min_j;
    350      1.1  mrg 
    351      1.1  mrg   /* If the threshold is set at dat[0].size, any positive values are bad. */
    352      1.1  mrg   x = 0.0;
    353      1.1  mrg   for (j = 0; j < ndat; j++)
    354      1.1  mrg     if (dat[j].d > 0.0)
    355      1.1  mrg       x += dat[j].d;
    356      1.1  mrg 
    357      1.1  mrg   if (option_trace >= 2 && final)
    358      1.1  mrg     {
    359      1.1  mrg       printf ("\n");
    360      1.1  mrg       printf ("x is the sum of the badness from setting thresh at given size\n");
    361      1.1  mrg       printf ("  (minimum x is sought)\n");
    362      1.1  mrg       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
    363      1.1  mrg     }
    364      1.1  mrg 
    365      1.1  mrg   min_x = x;
    366      1.1  mrg   min_j = 0;
    367      1.1  mrg 
    368      1.1  mrg 
    369      1.1  mrg   /* When stepping to the next dat[j].size, positive values are no longer
    370      1.1  mrg      bad (so subtracted), negative values become bad (so add the absolute
    371      1.1  mrg      value, meaning subtract). */
    372      1.1  mrg   for (j = 0; j < ndat; x -= dat[j].d, j++)
    373      1.1  mrg     {
    374      1.1  mrg       if (option_trace >= 2 && final)
    375      1.1  mrg         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
    376      1.1  mrg 
    377      1.1  mrg       if (x < min_x)
    378      1.1  mrg         {
    379      1.1  mrg           min_x = x;
    380      1.1  mrg           min_j = j;
    381      1.1  mrg         }
    382      1.1  mrg     }
    383      1.1  mrg 
    384      1.1  mrg   return min_j;
    385      1.1  mrg }
    386      1.1  mrg 
    387      1.1  mrg 
    388  1.1.1.3  mrg /* Measuring for recompiled mpn/generic/div_qr_1.c,
    389  1.1.1.3  mrg  * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
    390  1.1.1.3  mrg 
    391  1.1.1.3  mrg mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
    392  1.1.1.3  mrg 
    393  1.1.1.3  mrg #if defined (__cplusplus)
    394  1.1.1.3  mrg extern "C" {
    395  1.1.1.3  mrg #endif
    396      1.1  mrg 
    397  1.1.1.2  mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
    398  1.1.1.2  mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
    399  1.1.1.2  mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long);
    400      1.1  mrg 
    401  1.1.1.3  mrg #if defined (__cplusplus)
    402  1.1.1.3  mrg }
    403  1.1.1.3  mrg #endif
    404  1.1.1.3  mrg 
    405      1.1  mrg double
    406      1.1  mrg speed_mpn_mod_1_tune (struct speed_params *s)
    407      1.1  mrg {
    408      1.1  mrg   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
    409      1.1  mrg }
    410      1.1  mrg double
    411      1.1  mrg speed_mpn_divrem_1_tune (struct speed_params *s)
    412      1.1  mrg {
    413      1.1  mrg   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
    414      1.1  mrg }
    415  1.1.1.2  mrg double
    416  1.1.1.2  mrg speed_mpz_fac_ui_tune (struct speed_params *s)
    417  1.1.1.2  mrg {
    418  1.1.1.2  mrg   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
    419  1.1.1.2  mrg }
    420  1.1.1.3  mrg double
    421  1.1.1.3  mrg speed_mpn_div_qr_1_tune (struct speed_params *s)
    422  1.1.1.3  mrg {
    423  1.1.1.3  mrg   SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
    424  1.1.1.3  mrg }
    425      1.1  mrg 
    426      1.1  mrg double
    427      1.1  mrg tuneup_measure (speed_function_t fun,
    428      1.1  mrg                 const struct param_t *param,
    429      1.1  mrg                 struct speed_params *s)
    430      1.1  mrg {
    431      1.1  mrg   static struct param_t  dummy;
    432      1.1  mrg   double   t;
    433      1.1  mrg   TMP_DECL;
    434      1.1  mrg 
    435      1.1  mrg   if (! param)
    436      1.1  mrg     param = &dummy;
    437      1.1  mrg 
    438      1.1  mrg   s->size += param->size_extra;
    439      1.1  mrg 
    440      1.1  mrg   TMP_MARK;
    441      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
    442      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
    443      1.1  mrg 
    444      1.1  mrg   mpn_random (s->xp, s->size);
    445      1.1  mrg   mpn_random (s->yp, s->size);
    446      1.1  mrg 
    447      1.1  mrg   switch (param->data_high) {
    448      1.1  mrg   case DATA_HIGH_LT_R:
    449      1.1  mrg     s->xp[s->size-1] %= s->r;
    450      1.1  mrg     s->yp[s->size-1] %= s->r;
    451      1.1  mrg     break;
    452      1.1  mrg   case DATA_HIGH_GE_R:
    453      1.1  mrg     s->xp[s->size-1] |= s->r;
    454      1.1  mrg     s->yp[s->size-1] |= s->r;
    455      1.1  mrg     break;
    456      1.1  mrg   }
    457      1.1  mrg 
    458      1.1  mrg   t = speed_measure (fun, s);
    459      1.1  mrg 
    460      1.1  mrg   s->size -= param->size_extra;
    461      1.1  mrg 
    462      1.1  mrg   TMP_FREE;
    463      1.1  mrg   return t;
    464      1.1  mrg }
    465      1.1  mrg 
    466      1.1  mrg 
    467      1.1  mrg #define PRINT_WIDTH  31
    468      1.1  mrg 
    469      1.1  mrg void
    470      1.1  mrg print_define_start (const char *name)
    471      1.1  mrg {
    472      1.1  mrg   printf ("#define %-*s  ", PRINT_WIDTH, name);
    473      1.1  mrg   if (option_trace)
    474      1.1  mrg     printf ("...\n");
    475      1.1  mrg }
    476      1.1  mrg 
    477      1.1  mrg void
    478      1.1  mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
    479      1.1  mrg {
    480      1.1  mrg   if (option_trace)
    481      1.1  mrg     printf ("#define %-*s  ", PRINT_WIDTH, name);
    482      1.1  mrg 
    483      1.1  mrg   if (value == MP_SIZE_T_MAX)
    484      1.1  mrg     printf ("MP_SIZE_T_MAX");
    485      1.1  mrg   else
    486      1.1  mrg     printf ("%5ld", (long) value);
    487      1.1  mrg 
    488      1.1  mrg   if (remark != NULL)
    489      1.1  mrg     printf ("  /* %s */", remark);
    490      1.1  mrg   printf ("\n");
    491      1.1  mrg   fflush (stdout);
    492      1.1  mrg }
    493      1.1  mrg 
    494      1.1  mrg void
    495      1.1  mrg print_define_end (const char *name, mp_size_t value)
    496      1.1  mrg {
    497      1.1  mrg   const char  *remark;
    498      1.1  mrg   if (value == MP_SIZE_T_MAX)
    499      1.1  mrg     remark = "never";
    500      1.1  mrg   else if (value == 0)
    501      1.1  mrg     remark = "always";
    502      1.1  mrg   else
    503      1.1  mrg     remark = NULL;
    504      1.1  mrg   print_define_end_remark (name, value, remark);
    505      1.1  mrg }
    506      1.1  mrg 
    507      1.1  mrg void
    508      1.1  mrg print_define (const char *name, mp_size_t value)
    509      1.1  mrg {
    510      1.1  mrg   print_define_start (name);
    511      1.1  mrg   print_define_end (name, value);
    512      1.1  mrg }
    513      1.1  mrg 
    514      1.1  mrg void
    515      1.1  mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
    516      1.1  mrg {
    517      1.1  mrg   print_define_start (name);
    518      1.1  mrg   print_define_end_remark (name, value, remark);
    519      1.1  mrg }
    520      1.1  mrg 
    521  1.1.1.4  mrg void
    522  1.1.1.4  mrg print_define_with_speedup (const char *name, mp_size_t value,
    523  1.1.1.4  mrg 			   mp_size_t runner_up, double speedup)
    524  1.1.1.4  mrg {
    525  1.1.1.4  mrg   char buf[100];
    526  1.1.1.4  mrg   snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
    527  1.1.1.4  mrg 	    100.0 * (speedup - 1), runner_up);
    528  1.1.1.4  mrg   print_define_remark (name, value, buf);
    529  1.1.1.4  mrg }
    530      1.1  mrg 
    531      1.1  mrg void
    532      1.1  mrg one (mp_size_t *threshold, struct param_t *param)
    533      1.1  mrg {
    534      1.1  mrg   int  since_positive, since_thresh_change;
    535      1.1  mrg   int  thresh_idx, new_thresh_idx;
    536      1.1  mrg 
    537      1.1  mrg #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
    538      1.1  mrg 
    539      1.1  mrg   DEFAULT (param->function_fudge, 1.0);
    540      1.1  mrg   DEFAULT (param->function2, param->function);
    541      1.1  mrg   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
    542      1.1  mrg   DEFAULT (param->step, 1);            /* small steps by default */
    543      1.1  mrg   DEFAULT (param->stop_since_change, 80);
    544      1.1  mrg   DEFAULT (param->stop_factor, 1.2);
    545      1.1  mrg   DEFAULT (param->min_size, 10);
    546      1.1  mrg   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
    547      1.1  mrg 
    548      1.1  mrg   if (param->check_size != 0)
    549      1.1  mrg     {
    550      1.1  mrg       double   t1, t2;
    551      1.1  mrg       s.size = param->check_size;
    552      1.1  mrg 
    553      1.1  mrg       *threshold = s.size+1;
    554      1.1  mrg       t1 = tuneup_measure (param->function, param, &s);
    555      1.1  mrg 
    556      1.1  mrg       *threshold = s.size;
    557      1.1  mrg       t2 = tuneup_measure (param->function2, param, &s);
    558      1.1  mrg       if (t1 == -1.0 || t2 == -1.0)
    559      1.1  mrg         {
    560      1.1  mrg           printf ("Oops, can't run both functions at size %ld\n",
    561      1.1  mrg                   (long) s.size);
    562      1.1  mrg           abort ();
    563      1.1  mrg         }
    564      1.1  mrg       t1 *= param->function_fudge;
    565      1.1  mrg 
    566      1.1  mrg       /* ask that t2 is at least 4% below t1 */
    567      1.1  mrg       if (t1 < t2*1.04)
    568      1.1  mrg         {
    569      1.1  mrg           if (option_trace)
    570      1.1  mrg             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
    571      1.1  mrg           *threshold = MP_SIZE_T_MAX;
    572      1.1  mrg           if (! param->noprint)
    573      1.1  mrg             print_define (param->name, *threshold);
    574      1.1  mrg           return;
    575      1.1  mrg         }
    576      1.1  mrg 
    577      1.1  mrg       if (option_trace >= 2)
    578      1.1  mrg         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
    579      1.1  mrg                 (long) s.size, t1, t2);
    580      1.1  mrg     }
    581      1.1  mrg 
    582      1.1  mrg   if (! param->noprint || option_trace)
    583      1.1  mrg     print_define_start (param->name);
    584      1.1  mrg 
    585      1.1  mrg   ndat = 0;
    586      1.1  mrg   since_positive = 0;
    587      1.1  mrg   since_thresh_change = 0;
    588      1.1  mrg   thresh_idx = 0;
    589      1.1  mrg 
    590      1.1  mrg   if (option_trace >= 2)
    591      1.1  mrg     {
    592      1.1  mrg       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
    593      1.1  mrg       printf ("              (seconds)    (seconds)    diff    thresh\n");
    594      1.1  mrg     }
    595      1.1  mrg 
    596      1.1  mrg   for (s.size = param->min_size;
    597      1.1  mrg        s.size < param->max_size;
    598      1.1  mrg        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
    599      1.1  mrg     {
    600      1.1  mrg       double   ti, tiplus1, d;
    601      1.1  mrg 
    602      1.1  mrg       /*
    603      1.1  mrg         FIXME: check minimum size requirements are met, possibly by just
    604      1.1  mrg         checking for the -1 returns from the speed functions.
    605      1.1  mrg       */
    606      1.1  mrg 
    607      1.1  mrg       /* using method A at this size */
    608      1.1  mrg       *threshold = s.size+1;
    609      1.1  mrg       ti = tuneup_measure (param->function, param, &s);
    610      1.1  mrg       if (ti == -1.0)
    611      1.1  mrg         abort ();
    612      1.1  mrg       ti *= param->function_fudge;
    613      1.1  mrg 
    614      1.1  mrg       /* using method B at this size */
    615      1.1  mrg       *threshold = s.size;
    616      1.1  mrg       tiplus1 = tuneup_measure (param->function2, param, &s);
    617      1.1  mrg       if (tiplus1 == -1.0)
    618      1.1  mrg         abort ();
    619      1.1  mrg 
    620      1.1  mrg       /* Calculate the fraction by which the one or the other routine is
    621      1.1  mrg          slower.  */
    622      1.1  mrg       if (tiplus1 >= ti)
    623      1.1  mrg         d = (tiplus1 - ti) / tiplus1;  /* negative */
    624      1.1  mrg       else
    625      1.1  mrg         d = (tiplus1 - ti) / ti;       /* positive */
    626      1.1  mrg 
    627      1.1  mrg       add_dat (s.size, d);
    628      1.1  mrg 
    629      1.1  mrg       new_thresh_idx = analyze_dat (0);
    630      1.1  mrg 
    631      1.1  mrg       if (option_trace >= 2)
    632      1.1  mrg         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
    633      1.1  mrg                 (long) s.size, ti, tiplus1, d,
    634      1.1  mrg                 ti > tiplus1 ? '#' : ' ',
    635      1.1  mrg                 (long) dat[new_thresh_idx].size);
    636      1.1  mrg 
    637      1.1  mrg       /* Stop if the last time method i was faster was more than a
    638      1.1  mrg          certain number of measurements ago.  */
    639      1.1  mrg #define STOP_SINCE_POSITIVE  200
    640      1.1  mrg       if (d >= 0)
    641      1.1  mrg         since_positive = 0;
    642      1.1  mrg       else
    643      1.1  mrg         if (++since_positive > STOP_SINCE_POSITIVE)
    644      1.1  mrg           {
    645      1.1  mrg             if (option_trace >= 1)
    646      1.1  mrg               printf ("stopped due to since_positive (%d)\n",
    647      1.1  mrg                       STOP_SINCE_POSITIVE);
    648      1.1  mrg             break;
    649      1.1  mrg           }
    650      1.1  mrg 
    651      1.1  mrg       /* Stop if method A has become slower by a certain factor. */
    652      1.1  mrg       if (ti >= tiplus1 * param->stop_factor)
    653      1.1  mrg         {
    654      1.1  mrg           if (option_trace >= 1)
    655      1.1  mrg             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
    656      1.1  mrg                     param->stop_factor);
    657      1.1  mrg           break;
    658      1.1  mrg         }
    659      1.1  mrg 
    660      1.1  mrg       /* Stop if the threshold implied hasn't changed in a certain
    661      1.1  mrg          number of measurements.  (It's this condition that usually
    662      1.1  mrg          stops the loop.) */
    663      1.1  mrg       if (thresh_idx != new_thresh_idx)
    664      1.1  mrg         since_thresh_change = 0, thresh_idx = new_thresh_idx;
    665      1.1  mrg       else
    666      1.1  mrg         if (++since_thresh_change > param->stop_since_change)
    667      1.1  mrg           {
    668      1.1  mrg             if (option_trace >= 1)
    669      1.1  mrg               printf ("stopped due to since_thresh_change (%d)\n",
    670      1.1  mrg                       param->stop_since_change);
    671      1.1  mrg             break;
    672      1.1  mrg           }
    673      1.1  mrg 
    674      1.1  mrg       /* Stop if the threshold implied is more than a certain number of
    675      1.1  mrg          measurements ago.  */
    676      1.1  mrg #define STOP_SINCE_AFTER   500
    677      1.1  mrg       if (ndat - thresh_idx > STOP_SINCE_AFTER)
    678      1.1  mrg         {
    679      1.1  mrg           if (option_trace >= 1)
    680      1.1  mrg             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
    681      1.1  mrg                     STOP_SINCE_AFTER);
    682      1.1  mrg           break;
    683      1.1  mrg         }
    684      1.1  mrg 
    685      1.1  mrg       /* Stop when the size limit is reached before the end of the
    686      1.1  mrg          crossover, but only show this as an error for >= the default max
    687      1.1  mrg          size.  FIXME: Maybe should make it a param choice whether this is
    688      1.1  mrg          an error.  */
    689      1.1  mrg       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
    690      1.1  mrg         {
    691      1.1  mrg           fprintf (stderr, "%s\n", param->name);
    692      1.1  mrg           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
    693      1.1  mrg                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    694      1.1  mrg           fprintf (stderr, "    max size reached before end of crossover\n");
    695      1.1  mrg           break;
    696      1.1  mrg         }
    697      1.1  mrg     }
    698      1.1  mrg 
    699      1.1  mrg   if (option_trace >= 1)
    700      1.1  mrg     printf ("sizes %ld to %ld total %d measurements\n",
    701      1.1  mrg             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
    702      1.1  mrg 
    703      1.1  mrg   *threshold = dat[analyze_dat (1)].size;
    704      1.1  mrg 
    705      1.1  mrg   if (param->min_is_always)
    706      1.1  mrg     {
    707      1.1  mrg       if (*threshold == param->min_size)
    708      1.1  mrg         *threshold = 0;
    709      1.1  mrg     }
    710      1.1  mrg 
    711      1.1  mrg   if (! param->noprint || option_trace)
    712      1.1  mrg     print_define_end (param->name, *threshold);
    713      1.1  mrg }
    714      1.1  mrg 
    715  1.1.1.4  mrg /* Time N different FUNCTIONS with the same parameters and size, to
    716  1.1.1.4  mrg    select the fastest. Since *_METHOD defines start numbering from
    717  1.1.1.4  mrg    one, if functions[i] is fastest, the value of the define is i+1.
    718  1.1.1.4  mrg    Also output a comment with speedup compared to the next fastest
    719  1.1.1.4  mrg    function. The NAME argument is used only for trace output.
    720  1.1.1.4  mrg 
    721  1.1.1.4  mrg    Returns the index of the fastest function.
    722  1.1.1.4  mrg */
    723  1.1.1.4  mrg int
    724  1.1.1.4  mrg one_method (int n, speed_function_t *functions,
    725  1.1.1.4  mrg 	    const char *name, const char *define,
    726  1.1.1.4  mrg 	    const struct param_t *param)
    727  1.1.1.4  mrg {
    728  1.1.1.4  mrg   double *t;
    729  1.1.1.4  mrg   int i;
    730  1.1.1.4  mrg   int method;
    731  1.1.1.4  mrg   int method_runner_up;
    732  1.1.1.4  mrg 
    733  1.1.1.4  mrg   TMP_DECL;
    734  1.1.1.4  mrg   TMP_MARK;
    735  1.1.1.4  mrg   t = (double*) TMP_ALLOC (n * sizeof (*t));
    736  1.1.1.4  mrg 
    737  1.1.1.4  mrg   for (i = 0; i < n; i++)
    738  1.1.1.4  mrg     {
    739  1.1.1.4  mrg       t[i] = tuneup_measure (functions[i], param, &s);
    740  1.1.1.4  mrg       if (option_trace >= 1)
    741  1.1.1.4  mrg 	printf ("size=%ld, %s, method %d %.9f\n",
    742  1.1.1.4  mrg 		(long) s.size, name, i + 1, t[i]);
    743  1.1.1.4  mrg       if (t[i] == -1.0)
    744  1.1.1.4  mrg 	{
    745  1.1.1.4  mrg 	  printf ("Oops, can't measure all %s methods\n", name);
    746  1.1.1.4  mrg 	  abort ();
    747  1.1.1.4  mrg 	}
    748  1.1.1.4  mrg     }
    749  1.1.1.4  mrg   method = 0;
    750  1.1.1.4  mrg   for (i = 1; i < n; i++)
    751  1.1.1.4  mrg     if (t[i] < t[method])
    752  1.1.1.4  mrg       method = i;
    753  1.1.1.4  mrg 
    754  1.1.1.4  mrg   method_runner_up = (method == 0);
    755  1.1.1.4  mrg   for (i = 0; i < n; i++)
    756  1.1.1.4  mrg     if (i != method && t[i] < t[method_runner_up])
    757  1.1.1.4  mrg       method_runner_up = i;
    758  1.1.1.4  mrg 
    759  1.1.1.4  mrg   print_define_with_speedup (define, method + 1, method_runner_up + 1,
    760  1.1.1.4  mrg 			     t[method_runner_up] / t[method]);
    761  1.1.1.4  mrg 
    762  1.1.1.4  mrg   TMP_FREE;
    763  1.1.1.4  mrg   return method;
    764  1.1.1.4  mrg }
    765  1.1.1.4  mrg 
    766      1.1  mrg 
    767      1.1  mrg /* Special probing for the fft thresholds.  The size restrictions on the
    768      1.1  mrg    FFTs mean the graph of time vs size has a step effect.  See this for
    769      1.1  mrg    example using
    770      1.1  mrg 
    771      1.1  mrg        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
    772      1.1  mrg        gnuplot foo.gnuplot
    773      1.1  mrg 
    774      1.1  mrg    The current approach is to compare routines at the midpoint of relevant
    775      1.1  mrg    steps.  Arguably a more sophisticated system of threshold data is wanted
    776      1.1  mrg    if this step effect remains. */
    777      1.1  mrg 
    778      1.1  mrg struct fft_param_t {
    779      1.1  mrg   const char        *table_name;
    780      1.1  mrg   const char        *threshold_name;
    781      1.1  mrg   const char        *modf_threshold_name;
    782      1.1  mrg   mp_size_t         *p_threshold;
    783      1.1  mrg   mp_size_t         *p_modf_threshold;
    784      1.1  mrg   mp_size_t         first_size;
    785      1.1  mrg   mp_size_t         max_size;
    786      1.1  mrg   speed_function_t  function;
    787      1.1  mrg   speed_function_t  mul_modf_function;
    788      1.1  mrg   speed_function_t  mul_function;
    789      1.1  mrg   mp_size_t         sqr;
    790      1.1  mrg };
    791      1.1  mrg 
    792      1.1  mrg 
    793      1.1  mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    794      1.1  mrg    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    795      1.1  mrg    of 2^(k-1) bits. */
    796      1.1  mrg 
    797      1.1  mrg mp_size_t
    798      1.1  mrg fft_step_size (int k)
    799      1.1  mrg {
    800      1.1  mrg   mp_size_t  step;
    801      1.1  mrg 
    802      1.1  mrg   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
    803      1.1  mrg   step *= (mp_size_t) 1 << k;
    804      1.1  mrg 
    805      1.1  mrg   if (step <= 0)
    806      1.1  mrg     {
    807      1.1  mrg       printf ("Can't handle k=%d\n", k);
    808      1.1  mrg       abort ();
    809      1.1  mrg     }
    810      1.1  mrg 
    811      1.1  mrg   return step;
    812      1.1  mrg }
    813      1.1  mrg 
    814      1.1  mrg mp_size_t
    815      1.1  mrg fft_next_size (mp_size_t pl, int k)
    816      1.1  mrg {
    817      1.1  mrg   mp_size_t  m = fft_step_size (k);
    818      1.1  mrg 
    819      1.1  mrg /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
    820      1.1  mrg 
    821      1.1  mrg   if (pl == 0 || (pl & (m-1)) != 0)
    822      1.1  mrg     pl = (pl | (m-1)) + 1;
    823      1.1  mrg 
    824      1.1  mrg /*    printf (" %ld\n", pl); */
    825      1.1  mrg   return pl;
    826      1.1  mrg }
    827      1.1  mrg 
    828      1.1  mrg #define NMAX_DEFAULT 1000000
    829      1.1  mrg #define MAX_REPS 25
    830      1.1  mrg #define MIN_REPS 5
    831      1.1  mrg 
    832      1.1  mrg static inline size_t
    833      1.1  mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
    834      1.1  mrg {
    835      1.1  mrg   unsigned int l = k;
    836      1.1  mrg 
    837      1.1  mrg   while (a % 2 == 0 && k > 0)
    838      1.1  mrg     {
    839      1.1  mrg       a >>= 1;
    840      1.1  mrg       k--;
    841      1.1  mrg     }
    842      1.1  mrg   return a << l;
    843      1.1  mrg }
    844      1.1  mrg 
    845      1.1  mrg mp_size_t
    846      1.1  mrg fftfill (mp_size_t pl, int k, int sqr)
    847      1.1  mrg {
    848      1.1  mrg   mp_size_t maxLK;
    849      1.1  mrg   mp_bitcnt_t N, Nprime, nprime, M;
    850      1.1  mrg 
    851      1.1  mrg   N = pl * GMP_NUMB_BITS;
    852      1.1  mrg   M = N >> k;
    853      1.1  mrg 
    854      1.1  mrg   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
    855      1.1  mrg 
    856      1.1  mrg   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
    857      1.1  mrg   nprime = Nprime / GMP_NUMB_BITS;
    858      1.1  mrg   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    859      1.1  mrg     {
    860      1.1  mrg       size_t K2;
    861      1.1  mrg       for (;;)
    862      1.1  mrg 	{
    863      1.1  mrg 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
    864      1.1  mrg 	  if ((nprime & (K2 - 1)) == 0)
    865      1.1  mrg 	    break;
    866      1.1  mrg 	  nprime = (nprime + K2 - 1) & -K2;
    867      1.1  mrg 	  Nprime = nprime * GMP_LIMB_BITS;
    868      1.1  mrg 	}
    869      1.1  mrg     }
    870      1.1  mrg   ASSERT_ALWAYS (nprime < pl);
    871      1.1  mrg 
    872      1.1  mrg   return Nprime;
    873      1.1  mrg }
    874      1.1  mrg 
    875      1.1  mrg static int
    876      1.1  mrg compare_double (const void *ap, const void *bp)
    877      1.1  mrg {
    878      1.1  mrg   double a = * (const double *) ap;
    879      1.1  mrg   double b = * (const double *) bp;
    880      1.1  mrg 
    881      1.1  mrg   if (a < b)
    882      1.1  mrg     return -1;
    883      1.1  mrg   else if (a > b)
    884      1.1  mrg     return 1;
    885      1.1  mrg   else
    886      1.1  mrg     return 0;
    887      1.1  mrg }
    888      1.1  mrg 
    889      1.1  mrg double
    890      1.1  mrg median (double *times, int n)
    891      1.1  mrg {
    892      1.1  mrg   qsort (times, n, sizeof (double), compare_double);
    893      1.1  mrg   return times[n/2];
    894      1.1  mrg }
    895      1.1  mrg 
    896      1.1  mrg #define FFT_CACHE_SIZE 25
    897      1.1  mrg typedef struct fft_cache
    898      1.1  mrg {
    899      1.1  mrg   mp_size_t n;
    900      1.1  mrg   double time;
    901      1.1  mrg } fft_cache_t;
    902      1.1  mrg 
    903      1.1  mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
    904      1.1  mrg 
    905      1.1  mrg double
    906      1.1  mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
    907      1.1  mrg 		int n_measurements)
    908      1.1  mrg {
    909      1.1  mrg   int i;
    910      1.1  mrg   double t, ttab[MAX_REPS];
    911      1.1  mrg 
    912      1.1  mrg   if (fft_cache[k].n == n)
    913      1.1  mrg     return fft_cache[k].time;
    914      1.1  mrg 
    915      1.1  mrg   for (i = 0; i < n_measurements; i++)
    916      1.1  mrg     {
    917      1.1  mrg       speed_starttime ();
    918      1.1  mrg       mpn_mul_fft (rp, n, ap, n, bp, n, k);
    919      1.1  mrg       ttab[i] = speed_endtime ();
    920      1.1  mrg     }
    921      1.1  mrg 
    922      1.1  mrg   t = median (ttab, n_measurements);
    923      1.1  mrg   fft_cache[k].n = n;
    924      1.1  mrg   fft_cache[k].time = t;
    925      1.1  mrg   return t;
    926      1.1  mrg }
    927      1.1  mrg 
    928      1.1  mrg #define INSERT_FFTTAB(idx, nval, kval)					\
    929      1.1  mrg   do {									\
    930      1.1  mrg     fft_tab[idx].n = nval;						\
    931      1.1  mrg     fft_tab[idx].k = kval;						\
    932  1.1.1.3  mrg     fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
    933  1.1.1.3  mrg     fft_tab[idx+1].k = (1 <<  5) - 1;					\
    934      1.1  mrg   } while (0)
    935      1.1  mrg 
    936      1.1  mrg int
    937      1.1  mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
    938      1.1  mrg {
    939      1.1  mrg   mp_size_t n, n1, prev_n1;
    940      1.1  mrg   int k, best_k, last_best_k, kmax;
    941      1.1  mrg   int eff, prev_eff;
    942      1.1  mrg   double t0, t1;
    943      1.1  mrg   int n_measurements;
    944      1.1  mrg   mp_limb_t *ap, *bp, *rp;
    945      1.1  mrg   mp_size_t alloc;
    946      1.1  mrg   struct fft_table_nk *fft_tab;
    947      1.1  mrg 
    948      1.1  mrg   fft_tab = mpn_fft_table3[p->sqr];
    949      1.1  mrg 
    950      1.1  mrg   for (k = 0; k < FFT_CACHE_SIZE; k++)
    951      1.1  mrg     fft_cache[k].n = 0;
    952      1.1  mrg 
    953      1.1  mrg   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    954      1.1  mrg     {
    955      1.1  mrg       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
    956      1.1  mrg     }
    957      1.1  mrg 
    958      1.1  mrg   if (print)
    959      1.1  mrg     printf ("#define %s%*s", p->table_name, 38, "");
    960      1.1  mrg 
    961      1.1  mrg   if (idx == 0)
    962      1.1  mrg     {
    963      1.1  mrg       INSERT_FFTTAB (0, nmin, initial_k);
    964      1.1  mrg 
    965      1.1  mrg       if (print)
    966      1.1  mrg 	{
    967      1.1  mrg 	  printf ("\\\n  { ");
    968      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
    969      1.1  mrg 	}
    970      1.1  mrg 
    971      1.1  mrg       idx = 1;
    972      1.1  mrg     }
    973      1.1  mrg 
    974  1.1.1.3  mrg   ap = (mp_ptr) malloc (sizeof (mp_limb_t));
    975      1.1  mrg   if (p->sqr)
    976      1.1  mrg     bp = ap;
    977      1.1  mrg   else
    978  1.1.1.3  mrg     bp = (mp_ptr) malloc (sizeof (mp_limb_t));
    979  1.1.1.3  mrg   rp = (mp_ptr) malloc (sizeof (mp_limb_t));
    980      1.1  mrg   alloc = 1;
    981      1.1  mrg 
    982      1.1  mrg   /* Round n to comply to initial k value */
    983      1.1  mrg   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
    984      1.1  mrg 
    985      1.1  mrg   n_measurements = (18 - initial_k) | 1;
    986      1.1  mrg   n_measurements = MAX (n_measurements, MIN_REPS);
    987      1.1  mrg   n_measurements = MIN (n_measurements, MAX_REPS);
    988      1.1  mrg 
    989      1.1  mrg   last_best_k = initial_k;
    990      1.1  mrg   best_k = initial_k;
    991      1.1  mrg 
    992      1.1  mrg   while (n < nmax)
    993      1.1  mrg     {
    994      1.1  mrg       int start_k, end_k;
    995      1.1  mrg 
    996      1.1  mrg       /* Assume the current best k is best until we hit its next FFT step.  */
    997      1.1  mrg       t0 = 99999;
    998      1.1  mrg 
    999      1.1  mrg       prev_n1 = n + 1;
   1000      1.1  mrg 
   1001      1.1  mrg       start_k = MAX (4, best_k - 4);
   1002      1.1  mrg       end_k = MIN (24, best_k + 4);
   1003      1.1  mrg       for (k = start_k; k <= end_k; k++)
   1004      1.1  mrg 	{
   1005      1.1  mrg           n1 = mpn_fft_next_size (prev_n1, k);
   1006      1.1  mrg 
   1007      1.1  mrg 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
   1008      1.1  mrg 
   1009      1.1  mrg 	  if (eff < 70)		/* avoid measuring too slow fft:s */
   1010      1.1  mrg 	    continue;
   1011      1.1  mrg 
   1012      1.1  mrg 	  if (n1 > alloc)
   1013      1.1  mrg 	    {
   1014      1.1  mrg 	      alloc = n1;
   1015      1.1  mrg 	      if (p->sqr)
   1016      1.1  mrg 		{
   1017  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   1018  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   1019  1.1.1.3  mrg 		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   1020      1.1  mrg 		  mpn_random (ap, alloc);
   1021  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   1022      1.1  mrg 		}
   1023      1.1  mrg 	      else
   1024      1.1  mrg 		{
   1025  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
   1026  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
   1027  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
   1028  1.1.1.3  mrg 		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
   1029      1.1  mrg 		  mpn_random (ap, alloc);
   1030  1.1.1.3  mrg 		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
   1031      1.1  mrg 		  mpn_random (bp, alloc);
   1032  1.1.1.3  mrg 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
   1033      1.1  mrg 		}
   1034      1.1  mrg 	    }
   1035      1.1  mrg 
   1036      1.1  mrg 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
   1037      1.1  mrg 
   1038      1.1  mrg 	  if (t1 * n_measurements > 0.3)
   1039      1.1  mrg 	    n_measurements -= 2;
   1040      1.1  mrg 	  n_measurements = MAX (n_measurements, MIN_REPS);
   1041      1.1  mrg 
   1042      1.1  mrg 	  if (t1 < t0)
   1043      1.1  mrg 	    {
   1044      1.1  mrg 	      best_k = k;
   1045      1.1  mrg 	      t0 = t1;
   1046      1.1  mrg 	    }
   1047      1.1  mrg 	}
   1048      1.1  mrg 
   1049      1.1  mrg       n1 = mpn_fft_next_size (prev_n1, best_k);
   1050      1.1  mrg 
   1051      1.1  mrg       if (last_best_k != best_k)
   1052      1.1  mrg 	{
   1053      1.1  mrg 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
   1054      1.1  mrg 
   1055      1.1  mrg 	  if (idx >= FFT_TABLE3_SIZE)
   1056      1.1  mrg 	    {
   1057      1.1  mrg 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1058      1.1  mrg 	      abort ();
   1059      1.1  mrg 	    }
   1060      1.1  mrg 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
   1061      1.1  mrg 
   1062      1.1  mrg 	  if (print)
   1063      1.1  mrg 	    {
   1064      1.1  mrg 	      printf (", ");
   1065      1.1  mrg 	      if (idx % 4 == 0)
   1066      1.1  mrg 		printf ("\\\n    ");
   1067      1.1  mrg 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1068      1.1  mrg 	    }
   1069      1.1  mrg 
   1070      1.1  mrg 	  if (option_trace >= 2)
   1071      1.1  mrg 	    {
   1072      1.1  mrg 	      printf ("{%lu,%u}\n", prev_n1, best_k);
   1073      1.1  mrg 	      fflush (stdout);
   1074      1.1  mrg 	    }
   1075      1.1  mrg 
   1076      1.1  mrg 	  last_best_k = best_k;
   1077      1.1  mrg 	  idx++;
   1078      1.1  mrg 	}
   1079      1.1  mrg 
   1080      1.1  mrg       for (;;)
   1081      1.1  mrg 	{
   1082      1.1  mrg 	  prev_n1 = n1;
   1083      1.1  mrg 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
   1084      1.1  mrg 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
   1085      1.1  mrg 	  eff = fftfill (n1, best_k, p->sqr);
   1086      1.1  mrg 
   1087      1.1  mrg 	  if (eff != prev_eff)
   1088      1.1  mrg 	    break;
   1089      1.1  mrg 	}
   1090      1.1  mrg 
   1091      1.1  mrg       n = prev_n1;
   1092      1.1  mrg     }
   1093      1.1  mrg 
   1094      1.1  mrg   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
   1095      1.1  mrg   kmax = MIN (kmax, 25-1);
   1096      1.1  mrg   for (k = last_best_k + 1; k <= kmax; k++)
   1097      1.1  mrg     {
   1098      1.1  mrg       if (idx >= FFT_TABLE3_SIZE)
   1099      1.1  mrg 	{
   1100      1.1  mrg 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
   1101      1.1  mrg 	  abort ();
   1102      1.1  mrg 	}
   1103      1.1  mrg       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
   1104      1.1  mrg 
   1105      1.1  mrg       if (print)
   1106      1.1  mrg 	{
   1107      1.1  mrg 	  printf (", ");
   1108      1.1  mrg 	  if (idx % 4 == 0)
   1109      1.1  mrg 	    printf ("\\\n    ");
   1110      1.1  mrg 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
   1111      1.1  mrg 	}
   1112      1.1  mrg 
   1113      1.1  mrg       idx++;
   1114      1.1  mrg     }
   1115      1.1  mrg 
   1116      1.1  mrg   if (print)
   1117      1.1  mrg     printf (" }\n");
   1118      1.1  mrg 
   1119      1.1  mrg   free (ap);
   1120      1.1  mrg   if (! p->sqr)
   1121      1.1  mrg     free (bp);
   1122      1.1  mrg   free (rp);
   1123      1.1  mrg 
   1124      1.1  mrg   return idx;
   1125      1.1  mrg }
   1126      1.1  mrg 
   1127      1.1  mrg void
   1128      1.1  mrg fft (struct fft_param_t *p)
   1129      1.1  mrg {
   1130      1.1  mrg   mp_size_t  size;
   1131      1.1  mrg   int        k, idx, initial_k;
   1132      1.1  mrg 
   1133      1.1  mrg   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
   1134      1.1  mrg 
   1135      1.1  mrg #if 1
   1136      1.1  mrg   {
   1137      1.1  mrg     /* Use plain one() mechanism, for some reasonable initial values of k.  The
   1138      1.1  mrg        advantage is that we don't depend on mpn_fft_table3, which can therefore
   1139      1.1  mrg        leave it completely uninitialized.  */
   1140      1.1  mrg 
   1141      1.1  mrg     static struct param_t param;
   1142      1.1  mrg     mp_size_t thres, best_thres;
   1143      1.1  mrg     int best_k;
   1144      1.1  mrg     char buf[20];
   1145      1.1  mrg 
   1146      1.1  mrg     best_thres = MP_SIZE_T_MAX;
   1147      1.1  mrg     best_k = -1;
   1148      1.1  mrg 
   1149      1.1  mrg     for (k = 5; k <= 7; k++)
   1150      1.1  mrg       {
   1151      1.1  mrg 	param.name = p->modf_threshold_name;
   1152      1.1  mrg 	param.min_size = 100;
   1153      1.1  mrg 	param.max_size = 2000;
   1154      1.1  mrg 	param.function  = p->mul_function;
   1155      1.1  mrg 	param.step_factor = 0.0;
   1156      1.1  mrg 	param.step = 4;
   1157      1.1  mrg 	param.function2 = p->mul_modf_function;
   1158      1.1  mrg 	param.noprint = 1;
   1159      1.1  mrg 	s.r = k;
   1160      1.1  mrg 	one (&thres, &param);
   1161      1.1  mrg 	if (thres < best_thres)
   1162      1.1  mrg 	  {
   1163      1.1  mrg 	    best_thres = thres;
   1164      1.1  mrg 	    best_k = k;
   1165      1.1  mrg 	  }
   1166      1.1  mrg       }
   1167      1.1  mrg 
   1168      1.1  mrg     *(p->p_modf_threshold) = best_thres;
   1169      1.1  mrg     sprintf (buf, "k = %d", best_k);
   1170      1.1  mrg     print_define_remark (p->modf_threshold_name, best_thres, buf);
   1171      1.1  mrg     initial_k = best_k;
   1172      1.1  mrg   }
   1173      1.1  mrg #else
   1174      1.1  mrg   size = p->first_size;
   1175      1.1  mrg   for (;;)
   1176      1.1  mrg     {
   1177      1.1  mrg       double  tk, tm;
   1178      1.1  mrg 
   1179      1.1  mrg       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
   1180      1.1  mrg       k = mpn_fft_best_k (size, p->sqr);
   1181      1.1  mrg 
   1182      1.1  mrg       if (size >= p->max_size)
   1183      1.1  mrg         break;
   1184      1.1  mrg 
   1185      1.1  mrg       s.size = size + fft_step_size (k) / 2;
   1186      1.1  mrg       s.r = k;
   1187      1.1  mrg       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
   1188      1.1  mrg       if (tk == -1.0)
   1189      1.1  mrg         abort ();
   1190      1.1  mrg 
   1191      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1192      1.1  mrg       if (tm == -1.0)
   1193      1.1  mrg         abort ();
   1194      1.1  mrg 
   1195      1.1  mrg       if (option_trace >= 2)
   1196      1.1  mrg         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
   1197      1.1  mrg                 (long) size,
   1198      1.1  mrg                 (long) size + fft_step_size (k) / 2, k, tk,
   1199      1.1  mrg                 (long) s.size, tm);
   1200      1.1  mrg 
   1201      1.1  mrg       if (tk < tm)
   1202      1.1  mrg         {
   1203      1.1  mrg 	  *p->p_modf_threshold = s.size;
   1204      1.1  mrg 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
   1205      1.1  mrg 	  break;
   1206      1.1  mrg         }
   1207      1.1  mrg     }
   1208      1.1  mrg   initial_k = ?;
   1209      1.1  mrg #endif
   1210      1.1  mrg 
   1211      1.1  mrg   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
   1212      1.1  mrg 
   1213      1.1  mrg   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
   1214      1.1  mrg   printf ("#define %s_SIZE %d\n", p->table_name, idx);
   1215      1.1  mrg 
   1216      1.1  mrg   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
   1217      1.1  mrg 
   1218      1.1  mrg   size = 2 * *p->p_modf_threshold;	/* OK? */
   1219      1.1  mrg   for (;;)
   1220      1.1  mrg     {
   1221      1.1  mrg       double  tk, tm;
   1222      1.1  mrg       mp_size_t mulmod_size, mul_size;;
   1223      1.1  mrg 
   1224      1.1  mrg       if (size >= p->max_size)
   1225      1.1  mrg         break;
   1226      1.1  mrg 
   1227      1.1  mrg       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
   1228      1.1  mrg       mul_size = (size + mulmod_size) / 2;	/* middle of step */
   1229      1.1  mrg 
   1230      1.1  mrg       s.size = mulmod_size;
   1231      1.1  mrg       tk = tuneup_measure (p->function, NULL, &s);
   1232      1.1  mrg       if (tk == -1.0)
   1233      1.1  mrg         abort ();
   1234      1.1  mrg 
   1235      1.1  mrg       s.size = mul_size;
   1236      1.1  mrg       tm = tuneup_measure (p->mul_function, NULL, &s);
   1237      1.1  mrg       if (tm == -1.0)
   1238      1.1  mrg         abort ();
   1239      1.1  mrg 
   1240      1.1  mrg       if (option_trace >= 2)
   1241      1.1  mrg         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
   1242      1.1  mrg                 (long) size,
   1243      1.1  mrg                 (long) mulmod_size, tk,
   1244      1.1  mrg                 (long) mul_size, tm);
   1245      1.1  mrg 
   1246      1.1  mrg       size = mulmod_size;
   1247      1.1  mrg 
   1248      1.1  mrg       if (tk < tm)
   1249      1.1  mrg         {
   1250      1.1  mrg 	  *p->p_threshold = s.size;
   1251      1.1  mrg 	  print_define (p->threshold_name, *p->p_threshold);
   1252      1.1  mrg 	  break;
   1253      1.1  mrg         }
   1254      1.1  mrg     }
   1255      1.1  mrg }
   1256      1.1  mrg 
   1257  1.1.1.4  mrg /* Compare mpn_mul_1 to whatever fast exact single-limb division we have.  This
   1258  1.1.1.4  mrg    is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
   1259  1.1.1.4  mrg    This is used in get_str and set_str.  */
   1260  1.1.1.4  mrg void
   1261  1.1.1.4  mrg relspeed_div_1_vs_mul_1 (void)
   1262  1.1.1.4  mrg {
   1263  1.1.1.4  mrg   const size_t max_opsize = 100;
   1264  1.1.1.4  mrg   mp_size_t n;
   1265  1.1.1.4  mrg   long j;
   1266  1.1.1.4  mrg   mp_limb_t rp[max_opsize];
   1267  1.1.1.4  mrg   mp_limb_t ap[max_opsize];
   1268  1.1.1.4  mrg   double multime, divtime;
   1269  1.1.1.4  mrg 
   1270  1.1.1.4  mrg   mpn_random (ap, max_opsize);
   1271  1.1.1.4  mrg 
   1272  1.1.1.4  mrg   multime = 0;
   1273  1.1.1.4  mrg   for (n = max_opsize; n > 1; n--)
   1274  1.1.1.4  mrg     {
   1275  1.1.1.4  mrg       mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1276  1.1.1.4  mrg       speed_starttime ();
   1277  1.1.1.4  mrg       for (j = speed_precision; j != 0 ; j--)
   1278  1.1.1.4  mrg 	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1279  1.1.1.4  mrg       multime += speed_endtime () / n;
   1280  1.1.1.4  mrg     }
   1281  1.1.1.4  mrg 
   1282  1.1.1.4  mrg   divtime = 0;
   1283  1.1.1.4  mrg   for (n = max_opsize; n > 1; n--)
   1284  1.1.1.4  mrg     {
   1285  1.1.1.4  mrg       /* Make input divisible for good measure.  */
   1286  1.1.1.4  mrg       ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
   1287  1.1.1.4  mrg 
   1288  1.1.1.4  mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
   1289  1.1.1.4  mrg 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
   1290  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_BINVERTED_10,
   1291  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_CTZ_10);
   1292  1.1.1.4  mrg #else
   1293  1.1.1.4  mrg 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1294  1.1.1.4  mrg #endif
   1295  1.1.1.4  mrg       speed_starttime ();
   1296  1.1.1.4  mrg       for (j = speed_precision; j != 0 ; j--)
   1297  1.1.1.4  mrg 	{
   1298  1.1.1.4  mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
   1299  1.1.1.4  mrg 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
   1300  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_BINVERTED_10,
   1301  1.1.1.4  mrg 			    MP_BASES_BIG_BASE_CTZ_10);
   1302  1.1.1.4  mrg #else
   1303  1.1.1.4  mrg 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
   1304  1.1.1.4  mrg #endif
   1305  1.1.1.4  mrg 	}
   1306  1.1.1.4  mrg       divtime += speed_endtime () / n;
   1307  1.1.1.4  mrg     }
   1308  1.1.1.4  mrg 
   1309  1.1.1.4  mrg   print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
   1310  1.1.1.4  mrg }
   1311      1.1  mrg 
   1312      1.1  mrg 
   1313      1.1  mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
   1314      1.1  mrg    giving wrong results.  */
   1315      1.1  mrg void
   1316      1.1  mrg tune_mul_n (void)
   1317      1.1  mrg {
   1318      1.1  mrg   static struct param_t  param;
   1319  1.1.1.2  mrg   mp_size_t next_toom_start;
   1320  1.1.1.2  mrg   int something_changed;
   1321      1.1  mrg 
   1322      1.1  mrg   param.function = speed_mpn_mul_n;
   1323      1.1  mrg 
   1324      1.1  mrg   param.name = "MUL_TOOM22_THRESHOLD";
   1325      1.1  mrg   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
   1326      1.1  mrg   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
   1327      1.1  mrg   one (&mul_toom22_threshold, &param);
   1328      1.1  mrg 
   1329  1.1.1.2  mrg   param.noprint = 1;
   1330  1.1.1.2  mrg 
   1331  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1332  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1333  1.1.1.2  mrg   something_changed = 1;
   1334  1.1.1.2  mrg   while (something_changed)
   1335  1.1.1.2  mrg     {
   1336  1.1.1.2  mrg       something_changed = 0;
   1337  1.1.1.2  mrg 
   1338  1.1.1.2  mrg 	next_toom_start = mul_toom22_threshold;
   1339  1.1.1.2  mrg 
   1340  1.1.1.2  mrg 	if (mul_toom33_threshold != 0)
   1341  1.1.1.2  mrg 	  {
   1342  1.1.1.2  mrg 	    param.name = "MUL_TOOM33_THRESHOLD";
   1343  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
   1344  1.1.1.2  mrg 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
   1345  1.1.1.2  mrg 	    one (&mul_toom33_threshold, &param);
   1346  1.1.1.2  mrg 
   1347  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
   1348  1.1.1.2  mrg 	      {
   1349  1.1.1.2  mrg 		mul_toom33_threshold = 0;
   1350  1.1.1.2  mrg 		something_changed = 1;
   1351  1.1.1.2  mrg 	      }
   1352  1.1.1.2  mrg 	  }
   1353  1.1.1.2  mrg 
   1354  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
   1355  1.1.1.2  mrg 
   1356  1.1.1.2  mrg 	if (mul_toom44_threshold != 0)
   1357  1.1.1.2  mrg 	  {
   1358  1.1.1.2  mrg 	    param.name = "MUL_TOOM44_THRESHOLD";
   1359  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
   1360  1.1.1.2  mrg 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
   1361  1.1.1.2  mrg 	    one (&mul_toom44_threshold, &param);
   1362  1.1.1.2  mrg 
   1363  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
   1364  1.1.1.2  mrg 	      {
   1365  1.1.1.2  mrg 		mul_toom44_threshold = 0;
   1366  1.1.1.2  mrg 		something_changed = 1;
   1367  1.1.1.2  mrg 	      }
   1368  1.1.1.2  mrg 	  }
   1369  1.1.1.2  mrg 
   1370  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
   1371  1.1.1.2  mrg 
   1372  1.1.1.2  mrg 	if (mul_toom6h_threshold != 0)
   1373  1.1.1.2  mrg 	  {
   1374  1.1.1.2  mrg 	    param.name = "MUL_TOOM6H_THRESHOLD";
   1375  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
   1376  1.1.1.2  mrg 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
   1377  1.1.1.2  mrg 	    one (&mul_toom6h_threshold, &param);
   1378  1.1.1.2  mrg 
   1379  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
   1380  1.1.1.2  mrg 	      {
   1381  1.1.1.2  mrg 		mul_toom6h_threshold = 0;
   1382  1.1.1.2  mrg 		something_changed = 1;
   1383  1.1.1.2  mrg 	      }
   1384  1.1.1.2  mrg 	  }
   1385  1.1.1.2  mrg 
   1386  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
   1387  1.1.1.2  mrg 
   1388  1.1.1.2  mrg 	if (mul_toom8h_threshold != 0)
   1389  1.1.1.2  mrg 	  {
   1390  1.1.1.2  mrg 	    param.name = "MUL_TOOM8H_THRESHOLD";
   1391  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
   1392  1.1.1.2  mrg 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
   1393  1.1.1.2  mrg 	    one (&mul_toom8h_threshold, &param);
   1394  1.1.1.2  mrg 
   1395  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
   1396  1.1.1.2  mrg 	      {
   1397  1.1.1.2  mrg 		mul_toom8h_threshold = 0;
   1398  1.1.1.2  mrg 		something_changed = 1;
   1399  1.1.1.2  mrg 	      }
   1400  1.1.1.2  mrg 	  }
   1401  1.1.1.2  mrg     }
   1402  1.1.1.2  mrg 
   1403  1.1.1.2  mrg     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
   1404  1.1.1.2  mrg     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
   1405  1.1.1.2  mrg     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
   1406  1.1.1.2  mrg     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
   1407      1.1  mrg 
   1408      1.1  mrg   /* disabled until tuned */
   1409      1.1  mrg   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1410      1.1  mrg }
   1411      1.1  mrg 
   1412      1.1  mrg void
   1413      1.1  mrg tune_mul (void)
   1414      1.1  mrg {
   1415      1.1  mrg   static struct param_t  param;
   1416      1.1  mrg   mp_size_t thres;
   1417      1.1  mrg 
   1418      1.1  mrg   param.noprint = 1;
   1419      1.1  mrg 
   1420      1.1  mrg   param.function = speed_mpn_toom32_for_toom43_mul;
   1421      1.1  mrg   param.function2 = speed_mpn_toom43_for_toom32_mul;
   1422      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
   1423  1.1.1.2  mrg   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
   1424      1.1  mrg   one (&thres, &param);
   1425  1.1.1.2  mrg   mul_toom32_to_toom43_threshold = thres * 17 / 24;
   1426      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
   1427      1.1  mrg 
   1428      1.1  mrg   param.function = speed_mpn_toom32_for_toom53_mul;
   1429      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom32_mul;
   1430      1.1  mrg   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
   1431  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
   1432      1.1  mrg   one (&thres, &param);
   1433  1.1.1.2  mrg   mul_toom32_to_toom53_threshold = thres * 19 / 30;
   1434      1.1  mrg   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
   1435      1.1  mrg 
   1436      1.1  mrg   param.function = speed_mpn_toom42_for_toom53_mul;
   1437      1.1  mrg   param.function2 = speed_mpn_toom53_for_toom42_mul;
   1438      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
   1439  1.1.1.2  mrg   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
   1440      1.1  mrg   one (&thres, &param);
   1441  1.1.1.2  mrg   mul_toom42_to_toom53_threshold = thres * 11 / 20;
   1442      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
   1443      1.1  mrg 
   1444      1.1  mrg   param.function = speed_mpn_toom42_mul;
   1445      1.1  mrg   param.function2 = speed_mpn_toom63_mul;
   1446      1.1  mrg   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
   1447  1.1.1.2  mrg   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
   1448      1.1  mrg   one (&thres, &param);
   1449  1.1.1.2  mrg   mul_toom42_to_toom63_threshold = thres / 2;
   1450      1.1  mrg   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
   1451  1.1.1.2  mrg 
   1452  1.1.1.2  mrg   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
   1453  1.1.1.2  mrg   param.function = speed_mpn_toom43_for_toom54_mul;
   1454  1.1.1.2  mrg   param.function2 = speed_mpn_toom54_for_toom43_mul;
   1455  1.1.1.2  mrg   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
   1456  1.1.1.2  mrg   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
   1457  1.1.1.2  mrg   one (&thres, &param);
   1458  1.1.1.2  mrg   mul_toom43_to_toom54_threshold = thres * 5 / 6;
   1459  1.1.1.2  mrg   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
   1460      1.1  mrg }
   1461      1.1  mrg 
   1462      1.1  mrg 
   1463      1.1  mrg void
   1464      1.1  mrg tune_mullo (void)
   1465      1.1  mrg {
   1466      1.1  mrg   static struct param_t  param;
   1467      1.1  mrg 
   1468      1.1  mrg   param.function = speed_mpn_mullo_n;
   1469      1.1  mrg 
   1470      1.1  mrg   param.name = "MULLO_BASECASE_THRESHOLD";
   1471  1.1.1.4  mrg   param.min_size = 2;
   1472      1.1  mrg   param.min_is_always = 1;
   1473      1.1  mrg   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
   1474      1.1  mrg   param.stop_factor = 1.5;
   1475      1.1  mrg   param.noprint = 1;
   1476      1.1  mrg   one (&mullo_basecase_threshold, &param);
   1477      1.1  mrg 
   1478      1.1  mrg   param.name = "MULLO_DC_THRESHOLD";
   1479      1.1  mrg   param.min_size = 8;
   1480      1.1  mrg   param.min_is_always = 0;
   1481      1.1  mrg   param.max_size = 1000;
   1482      1.1  mrg   one (&mullo_dc_threshold, &param);
   1483      1.1  mrg 
   1484      1.1  mrg   if (mullo_basecase_threshold >= mullo_dc_threshold)
   1485      1.1  mrg     {
   1486      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
   1487      1.1  mrg       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
   1488      1.1  mrg     }
   1489      1.1  mrg   else
   1490      1.1  mrg     {
   1491      1.1  mrg       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
   1492      1.1  mrg       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
   1493      1.1  mrg     }
   1494      1.1  mrg 
   1495  1.1.1.3  mrg   if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
   1496  1.1.1.3  mrg     {
   1497  1.1.1.3  mrg       param.name = "MULLO_MUL_N_THRESHOLD";
   1498  1.1.1.3  mrg       param.min_size = mullo_dc_threshold;
   1499  1.1.1.3  mrg       param.max_size = 2 * mul_fft_threshold;
   1500  1.1.1.3  mrg       param.noprint = 0;
   1501  1.1.1.3  mrg       param.step_factor = 0.03;
   1502  1.1.1.3  mrg       one (&mullo_mul_n_threshold, &param);
   1503  1.1.1.3  mrg     }
   1504  1.1.1.3  mrg   else
   1505  1.1.1.3  mrg     print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
   1506  1.1.1.3  mrg 			 "without FFT use mullo forever");
   1507  1.1.1.3  mrg }
   1508  1.1.1.3  mrg 
   1509  1.1.1.3  mrg void
   1510  1.1.1.3  mrg tune_sqrlo (void)
   1511  1.1.1.3  mrg {
   1512  1.1.1.3  mrg   static struct param_t  param;
   1513  1.1.1.3  mrg 
   1514  1.1.1.3  mrg   param.function = speed_mpn_sqrlo;
   1515  1.1.1.3  mrg 
   1516  1.1.1.3  mrg   param.name = "SQRLO_BASECASE_THRESHOLD";
   1517  1.1.1.4  mrg   param.min_size = 2;
   1518  1.1.1.3  mrg   param.min_is_always = 1;
   1519  1.1.1.3  mrg   param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
   1520  1.1.1.3  mrg   param.stop_factor = 1.5;
   1521  1.1.1.3  mrg   param.noprint = 1;
   1522  1.1.1.3  mrg   one (&sqrlo_basecase_threshold, &param);
   1523  1.1.1.3  mrg 
   1524  1.1.1.3  mrg   param.name = "SQRLO_DC_THRESHOLD";
   1525  1.1.1.3  mrg   param.min_size = 8;
   1526  1.1.1.3  mrg   param.min_is_always = 0;
   1527  1.1.1.3  mrg   param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
   1528  1.1.1.3  mrg   one (&sqrlo_dc_threshold, &param);
   1529  1.1.1.3  mrg 
   1530  1.1.1.3  mrg   if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
   1531  1.1.1.3  mrg     {
   1532  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
   1533  1.1.1.3  mrg       print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
   1534  1.1.1.3  mrg     }
   1535  1.1.1.3  mrg   else
   1536  1.1.1.3  mrg     {
   1537  1.1.1.3  mrg       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
   1538  1.1.1.3  mrg       print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
   1539  1.1.1.3  mrg     }
   1540  1.1.1.3  mrg 
   1541  1.1.1.3  mrg   if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
   1542  1.1.1.3  mrg     {
   1543  1.1.1.3  mrg       param.name = "SQRLO_SQR_THRESHOLD";
   1544  1.1.1.3  mrg       param.min_size = sqrlo_dc_threshold;
   1545  1.1.1.3  mrg       param.max_size = 2 * sqr_fft_threshold;
   1546  1.1.1.3  mrg       param.noprint = 0;
   1547  1.1.1.3  mrg       param.step_factor = 0.03;
   1548  1.1.1.3  mrg       one (&sqrlo_sqr_threshold, &param);
   1549  1.1.1.3  mrg     }
   1550  1.1.1.3  mrg   else
   1551  1.1.1.3  mrg     print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
   1552  1.1.1.3  mrg 			 "without FFT use sqrlo forever");
   1553      1.1  mrg }
   1554      1.1  mrg 
   1555      1.1  mrg void
   1556  1.1.1.2  mrg tune_mulmid (void)
   1557  1.1.1.2  mrg {
   1558  1.1.1.2  mrg   static struct param_t  param;
   1559  1.1.1.2  mrg 
   1560  1.1.1.2  mrg   param.name = "MULMID_TOOM42_THRESHOLD";
   1561  1.1.1.2  mrg   param.function = speed_mpn_mulmid_n;
   1562  1.1.1.2  mrg   param.min_size = 4;
   1563  1.1.1.2  mrg   param.max_size = 100;
   1564  1.1.1.2  mrg   one (&mulmid_toom42_threshold, &param);
   1565  1.1.1.2  mrg }
   1566  1.1.1.2  mrg 
   1567  1.1.1.2  mrg void
   1568      1.1  mrg tune_mulmod_bnm1 (void)
   1569      1.1  mrg {
   1570      1.1  mrg   static struct param_t  param;
   1571      1.1  mrg 
   1572      1.1  mrg   param.name = "MULMOD_BNM1_THRESHOLD";
   1573      1.1  mrg   param.function = speed_mpn_mulmod_bnm1;
   1574      1.1  mrg   param.min_size = 4;
   1575      1.1  mrg   param.max_size = 100;
   1576      1.1  mrg   one (&mulmod_bnm1_threshold, &param);
   1577      1.1  mrg }
   1578      1.1  mrg 
   1579      1.1  mrg void
   1580      1.1  mrg tune_sqrmod_bnm1 (void)
   1581      1.1  mrg {
   1582      1.1  mrg   static struct param_t  param;
   1583      1.1  mrg 
   1584      1.1  mrg   param.name = "SQRMOD_BNM1_THRESHOLD";
   1585      1.1  mrg   param.function = speed_mpn_sqrmod_bnm1;
   1586      1.1  mrg   param.min_size = 4;
   1587      1.1  mrg   param.max_size = 100;
   1588      1.1  mrg   one (&sqrmod_bnm1_threshold, &param);
   1589      1.1  mrg }
   1590      1.1  mrg 
   1591      1.1  mrg 
   1592      1.1  mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
   1593      1.1  mrg    is faster only at size==2 then we don't want to bother with extra code
   1594      1.1  mrg    just for that.  Start karatsuba from 4 same as MUL above.  */
   1595      1.1  mrg 
   1596      1.1  mrg void
   1597      1.1  mrg tune_sqr (void)
   1598      1.1  mrg {
   1599      1.1  mrg   /* disabled until tuned */
   1600      1.1  mrg   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
   1601      1.1  mrg 
   1602      1.1  mrg   if (HAVE_NATIVE_mpn_sqr_basecase)
   1603      1.1  mrg     {
   1604      1.1  mrg       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
   1605      1.1  mrg       sqr_basecase_threshold = 0;
   1606      1.1  mrg     }
   1607      1.1  mrg   else
   1608      1.1  mrg     {
   1609      1.1  mrg       static struct param_t  param;
   1610      1.1  mrg       param.name = "SQR_BASECASE_THRESHOLD";
   1611      1.1  mrg       param.function = speed_mpn_sqr;
   1612      1.1  mrg       param.min_size = 3;
   1613      1.1  mrg       param.min_is_always = 1;
   1614      1.1  mrg       param.max_size = TUNE_SQR_TOOM2_MAX;
   1615      1.1  mrg       param.noprint = 1;
   1616      1.1  mrg       one (&sqr_basecase_threshold, &param);
   1617      1.1  mrg     }
   1618      1.1  mrg 
   1619      1.1  mrg   {
   1620      1.1  mrg     static struct param_t  param;
   1621      1.1  mrg     param.name = "SQR_TOOM2_THRESHOLD";
   1622      1.1  mrg     param.function = speed_mpn_sqr;
   1623      1.1  mrg     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
   1624      1.1  mrg     param.max_size = TUNE_SQR_TOOM2_MAX;
   1625      1.1  mrg     param.noprint = 1;
   1626      1.1  mrg     one (&sqr_toom2_threshold, &param);
   1627      1.1  mrg 
   1628      1.1  mrg     if (! HAVE_NATIVE_mpn_sqr_basecase
   1629      1.1  mrg         && sqr_toom2_threshold < sqr_basecase_threshold)
   1630      1.1  mrg       {
   1631      1.1  mrg         /* Karatsuba becomes faster than mul_basecase before
   1632      1.1  mrg            sqr_basecase does.  Arrange for the expression
   1633      1.1  mrg            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
   1634      1.1  mrg            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
   1635      1.1  mrg            SQR_TOOM2_THRESHOLD to zero, making
   1636      1.1  mrg            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
   1637      1.1  mrg 
   1638      1.1  mrg         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
   1639      1.1  mrg         SQR_TOOM2_THRESHOLD = 0;
   1640      1.1  mrg 
   1641      1.1  mrg         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
   1642      1.1  mrg                              "toom2");
   1643      1.1  mrg         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
   1644      1.1  mrg                              "never sqr_basecase");
   1645      1.1  mrg       }
   1646      1.1  mrg     else
   1647      1.1  mrg       {
   1648      1.1  mrg         if (! HAVE_NATIVE_mpn_sqr_basecase)
   1649      1.1  mrg           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
   1650      1.1  mrg         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
   1651      1.1  mrg       }
   1652      1.1  mrg   }
   1653      1.1  mrg 
   1654      1.1  mrg   {
   1655      1.1  mrg     static struct param_t  param;
   1656  1.1.1.2  mrg     mp_size_t next_toom_start;
   1657  1.1.1.2  mrg     int something_changed;
   1658      1.1  mrg 
   1659      1.1  mrg     param.function = speed_mpn_sqr;
   1660  1.1.1.2  mrg     param.noprint = 1;
   1661      1.1  mrg 
   1662  1.1.1.2  mrg   /* Threshold sequence loop.  Disable functions that would be used in a very
   1663  1.1.1.2  mrg      narrow range, re-measuring things when that happens.  */
   1664  1.1.1.2  mrg     something_changed = 1;
   1665  1.1.1.2  mrg     while (something_changed)
   1666  1.1.1.2  mrg       {
   1667  1.1.1.2  mrg 	something_changed = 0;
   1668  1.1.1.2  mrg 
   1669  1.1.1.2  mrg 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
   1670  1.1.1.2  mrg 
   1671  1.1.1.2  mrg 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
   1672  1.1.1.2  mrg 	param.name = "SQR_TOOM3_THRESHOLD";
   1673  1.1.1.2  mrg 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
   1674  1.1.1.2  mrg 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
   1675  1.1.1.2  mrg 	one (&sqr_toom3_threshold, &param);
   1676  1.1.1.2  mrg 
   1677  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
   1678  1.1.1.2  mrg 
   1679  1.1.1.2  mrg 	if (sqr_toom4_threshold != 0)
   1680  1.1.1.2  mrg 	  {
   1681  1.1.1.2  mrg 	    param.name = "SQR_TOOM4_THRESHOLD";
   1682  1.1.1.2  mrg 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
   1683  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
   1684  1.1.1.2  mrg 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
   1685  1.1.1.2  mrg 	    one (&sqr_toom4_threshold, &param);
   1686  1.1.1.2  mrg 
   1687  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
   1688  1.1.1.2  mrg 	      {
   1689  1.1.1.2  mrg 		sqr_toom4_threshold = 0;
   1690  1.1.1.2  mrg 		something_changed = 1;
   1691  1.1.1.2  mrg 	      }
   1692  1.1.1.2  mrg 	  }
   1693  1.1.1.2  mrg 
   1694  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
   1695  1.1.1.2  mrg 
   1696  1.1.1.2  mrg 	if (sqr_toom6_threshold != 0)
   1697  1.1.1.2  mrg 	  {
   1698  1.1.1.2  mrg 	    param.name = "SQR_TOOM6_THRESHOLD";
   1699  1.1.1.2  mrg 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
   1700  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
   1701  1.1.1.2  mrg 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
   1702  1.1.1.2  mrg 	    one (&sqr_toom6_threshold, &param);
   1703  1.1.1.2  mrg 
   1704  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
   1705  1.1.1.2  mrg 	      {
   1706  1.1.1.2  mrg 		sqr_toom6_threshold = 0;
   1707  1.1.1.2  mrg 		something_changed = 1;
   1708  1.1.1.2  mrg 	      }
   1709  1.1.1.2  mrg 	  }
   1710  1.1.1.2  mrg 
   1711  1.1.1.2  mrg 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
   1712  1.1.1.2  mrg 
   1713  1.1.1.2  mrg 	if (sqr_toom8_threshold != 0)
   1714  1.1.1.2  mrg 	  {
   1715  1.1.1.2  mrg 	    param.name = "SQR_TOOM8_THRESHOLD";
   1716  1.1.1.2  mrg 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
   1717  1.1.1.2  mrg 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
   1718  1.1.1.2  mrg 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
   1719  1.1.1.2  mrg 	    one (&sqr_toom8_threshold, &param);
   1720  1.1.1.2  mrg 
   1721  1.1.1.2  mrg 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
   1722  1.1.1.2  mrg 	      {
   1723  1.1.1.2  mrg 		sqr_toom8_threshold = 0;
   1724  1.1.1.2  mrg 		something_changed = 1;
   1725  1.1.1.2  mrg 	      }
   1726  1.1.1.2  mrg 	  }
   1727  1.1.1.2  mrg       }
   1728  1.1.1.2  mrg 
   1729  1.1.1.2  mrg     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
   1730  1.1.1.2  mrg     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
   1731  1.1.1.2  mrg     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
   1732  1.1.1.2  mrg     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
   1733      1.1  mrg   }
   1734      1.1  mrg }
   1735      1.1  mrg 
   1736      1.1  mrg 
   1737      1.1  mrg void
   1738      1.1  mrg tune_dc_div (void)
   1739      1.1  mrg {
   1740      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1741      1.1  mrg   {
   1742      1.1  mrg     static struct param_t  param;
   1743      1.1  mrg     param.name = "DC_DIV_QR_THRESHOLD";
   1744      1.1  mrg     param.function = speed_mpn_sbpi1_div_qr;
   1745      1.1  mrg     param.function2 = speed_mpn_dcpi1_div_qr;
   1746      1.1  mrg     param.min_size = 6;
   1747      1.1  mrg     one (&dc_div_qr_threshold, &param);
   1748      1.1  mrg   }
   1749      1.1  mrg   {
   1750      1.1  mrg     static struct param_t  param;
   1751      1.1  mrg     param.name = "DC_DIVAPPR_Q_THRESHOLD";
   1752      1.1  mrg     param.function = speed_mpn_sbpi1_divappr_q;
   1753      1.1  mrg     param.function2 = speed_mpn_dcpi1_divappr_q;
   1754      1.1  mrg     param.min_size = 6;
   1755      1.1  mrg     one (&dc_divappr_q_threshold, &param);
   1756      1.1  mrg   }
   1757      1.1  mrg }
   1758      1.1  mrg 
   1759      1.1  mrg static double
   1760      1.1  mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
   1761      1.1  mrg {
   1762      1.1  mrg   if (s->size < DC_DIV_QR_THRESHOLD)
   1763      1.1  mrg     return speed_mpn_sbpi1_div_qr (s);
   1764      1.1  mrg   else
   1765      1.1  mrg     return speed_mpn_dcpi1_div_qr (s);
   1766      1.1  mrg }
   1767      1.1  mrg 
   1768      1.1  mrg void
   1769      1.1  mrg tune_mu_div (void)
   1770      1.1  mrg {
   1771      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n */
   1772      1.1  mrg   {
   1773      1.1  mrg     static struct param_t  param;
   1774      1.1  mrg     param.name = "MU_DIV_QR_THRESHOLD";
   1775      1.1  mrg     param.function = speed_mpn_dcpi1_div_qr;
   1776      1.1  mrg     param.function2 = speed_mpn_mu_div_qr;
   1777  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1778      1.1  mrg     param.max_size = 5000;
   1779      1.1  mrg     param.step_factor = 0.02;
   1780      1.1  mrg     one (&mu_div_qr_threshold, &param);
   1781      1.1  mrg   }
   1782      1.1  mrg   {
   1783      1.1  mrg     static struct param_t  param;
   1784      1.1  mrg     param.name = "MU_DIVAPPR_Q_THRESHOLD";
   1785      1.1  mrg     param.function = speed_mpn_dcpi1_divappr_q;
   1786      1.1  mrg     param.function2 = speed_mpn_mu_divappr_q;
   1787  1.1.1.2  mrg     param.min_size = mul_toom22_threshold;
   1788      1.1  mrg     param.max_size = 5000;
   1789      1.1  mrg     param.step_factor = 0.02;
   1790      1.1  mrg     one (&mu_divappr_q_threshold, &param);
   1791      1.1  mrg   }
   1792      1.1  mrg   {
   1793      1.1  mrg     static struct param_t  param;
   1794      1.1  mrg     param.name = "MUPI_DIV_QR_THRESHOLD";
   1795      1.1  mrg     param.function = speed_mpn_sbordcpi1_div_qr;
   1796      1.1  mrg     param.function2 = speed_mpn_mupi_div_qr;
   1797      1.1  mrg     param.min_size = 6;
   1798      1.1  mrg     param.min_is_always = 1;
   1799      1.1  mrg     param.max_size = 1000;
   1800      1.1  mrg     param.step_factor = 0.02;
   1801      1.1  mrg     one (&mupi_div_qr_threshold, &param);
   1802      1.1  mrg   }
   1803      1.1  mrg }
   1804      1.1  mrg 
   1805      1.1  mrg void
   1806      1.1  mrg tune_dc_bdiv (void)
   1807      1.1  mrg {
   1808      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1809      1.1  mrg   {
   1810      1.1  mrg     static struct param_t  param;
   1811      1.1  mrg     param.name = "DC_BDIV_QR_THRESHOLD";
   1812      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_qr;
   1813      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_qr;
   1814      1.1  mrg     param.min_size = 4;
   1815      1.1  mrg     one (&dc_bdiv_qr_threshold, &param);
   1816      1.1  mrg   }
   1817      1.1  mrg   {
   1818      1.1  mrg     static struct param_t  param;
   1819      1.1  mrg     param.name = "DC_BDIV_Q_THRESHOLD";
   1820      1.1  mrg     param.function = speed_mpn_sbpi1_bdiv_q;
   1821      1.1  mrg     param.function2 = speed_mpn_dcpi1_bdiv_q;
   1822      1.1  mrg     param.min_size = 4;
   1823      1.1  mrg     one (&dc_bdiv_q_threshold, &param);
   1824      1.1  mrg   }
   1825      1.1  mrg }
   1826      1.1  mrg 
   1827      1.1  mrg void
   1828      1.1  mrg tune_mu_bdiv (void)
   1829      1.1  mrg {
   1830      1.1  mrg   s.r = 0;		/* clear to make speed function do 2n/n*/
   1831      1.1  mrg   {
   1832      1.1  mrg     static struct param_t  param;
   1833      1.1  mrg     param.name = "MU_BDIV_QR_THRESHOLD";
   1834      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_qr;
   1835      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_qr;
   1836  1.1.1.3  mrg     param.min_size = dc_bdiv_qr_threshold;
   1837      1.1  mrg     param.max_size = 5000;
   1838      1.1  mrg     param.step_factor = 0.02;
   1839      1.1  mrg     one (&mu_bdiv_qr_threshold, &param);
   1840      1.1  mrg   }
   1841      1.1  mrg   {
   1842      1.1  mrg     static struct param_t  param;
   1843      1.1  mrg     param.name = "MU_BDIV_Q_THRESHOLD";
   1844      1.1  mrg     param.function = speed_mpn_dcpi1_bdiv_q;
   1845      1.1  mrg     param.function2 = speed_mpn_mu_bdiv_q;
   1846  1.1.1.3  mrg     param.min_size = dc_bdiv_q_threshold;
   1847      1.1  mrg     param.max_size = 5000;
   1848      1.1  mrg     param.step_factor = 0.02;
   1849      1.1  mrg     one (&mu_bdiv_q_threshold, &param);
   1850      1.1  mrg   }
   1851      1.1  mrg }
   1852      1.1  mrg 
   1853      1.1  mrg void
   1854      1.1  mrg tune_invertappr (void)
   1855      1.1  mrg {
   1856      1.1  mrg   static struct param_t  param;
   1857      1.1  mrg 
   1858      1.1  mrg   param.function = speed_mpn_ni_invertappr;
   1859      1.1  mrg   param.name = "INV_MULMOD_BNM1_THRESHOLD";
   1860  1.1.1.3  mrg   param.min_size = 5;
   1861      1.1  mrg   one (&inv_mulmod_bnm1_threshold, &param);
   1862      1.1  mrg 
   1863      1.1  mrg   param.function = speed_mpn_invertappr;
   1864      1.1  mrg   param.name = "INV_NEWTON_THRESHOLD";
   1865  1.1.1.3  mrg   param.min_size = 5;
   1866      1.1  mrg   one (&inv_newton_threshold, &param);
   1867      1.1  mrg }
   1868      1.1  mrg 
   1869      1.1  mrg void
   1870      1.1  mrg tune_invert (void)
   1871      1.1  mrg {
   1872      1.1  mrg   static struct param_t  param;
   1873      1.1  mrg 
   1874      1.1  mrg   param.function = speed_mpn_invert;
   1875      1.1  mrg   param.name = "INV_APPR_THRESHOLD";
   1876  1.1.1.3  mrg   param.min_size = 5;
   1877      1.1  mrg   one (&inv_appr_threshold, &param);
   1878      1.1  mrg }
   1879      1.1  mrg 
   1880      1.1  mrg void
   1881      1.1  mrg tune_binvert (void)
   1882      1.1  mrg {
   1883      1.1  mrg   static struct param_t  param;
   1884      1.1  mrg 
   1885      1.1  mrg   param.function = speed_mpn_binvert;
   1886      1.1  mrg   param.name = "BINV_NEWTON_THRESHOLD";
   1887      1.1  mrg   param.min_size = 8;		/* pointless with smaller operands */
   1888      1.1  mrg   one (&binv_newton_threshold, &param);
   1889      1.1  mrg }
   1890      1.1  mrg 
   1891      1.1  mrg void
   1892      1.1  mrg tune_redc (void)
   1893      1.1  mrg {
   1894      1.1  mrg #define TUNE_REDC_2_MAX 100
   1895      1.1  mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
   1896      1.1  mrg #define WANT_REDC_2 1
   1897      1.1  mrg #endif
   1898      1.1  mrg 
   1899      1.1  mrg #if WANT_REDC_2
   1900      1.1  mrg   {
   1901      1.1  mrg     static struct param_t  param;
   1902      1.1  mrg     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1903      1.1  mrg     param.function = speed_mpn_redc_1;
   1904      1.1  mrg     param.function2 = speed_mpn_redc_2;
   1905      1.1  mrg     param.min_size = 1;
   1906      1.1  mrg     param.min_is_always = 1;
   1907      1.1  mrg     param.max_size = TUNE_REDC_2_MAX;
   1908      1.1  mrg     param.noprint = 1;
   1909  1.1.1.2  mrg     param.stop_factor = 1.5;
   1910      1.1  mrg     one (&redc_1_to_redc_2_threshold, &param);
   1911      1.1  mrg   }
   1912      1.1  mrg   {
   1913      1.1  mrg     static struct param_t  param;
   1914      1.1  mrg     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
   1915      1.1  mrg     param.function = speed_mpn_redc_2;
   1916      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1917      1.1  mrg     param.min_size = 16;
   1918      1.1  mrg     param.noprint = 1;
   1919      1.1  mrg     one (&redc_2_to_redc_n_threshold, &param);
   1920      1.1  mrg   }
   1921  1.1.1.2  mrg   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
   1922      1.1  mrg     {
   1923  1.1.1.2  mrg       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
   1924  1.1.1.2  mrg 
   1925  1.1.1.2  mrg       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
   1926  1.1.1.2  mrg 	 REDC_1_TO_REDC_2_THRESHOLD.  */
   1927  1.1.1.2  mrg       {
   1928  1.1.1.2  mrg 	static struct param_t  param;
   1929  1.1.1.2  mrg 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
   1930  1.1.1.2  mrg 	param.function = speed_mpn_redc_1;
   1931  1.1.1.2  mrg 	param.function2 = speed_mpn_redc_n;
   1932  1.1.1.2  mrg 	param.min_size = 16;
   1933  1.1.1.2  mrg 	param.noprint = 1;
   1934  1.1.1.2  mrg 	one (&redc_1_to_redc_2_threshold, &param);
   1935  1.1.1.2  mrg       }
   1936      1.1  mrg     }
   1937  1.1.1.2  mrg   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
   1938  1.1.1.2  mrg   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
   1939      1.1  mrg #else
   1940      1.1  mrg   {
   1941      1.1  mrg     static struct param_t  param;
   1942      1.1  mrg     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
   1943      1.1  mrg     param.function = speed_mpn_redc_1;
   1944      1.1  mrg     param.function2 = speed_mpn_redc_n;
   1945      1.1  mrg     param.min_size = 16;
   1946      1.1  mrg     one (&redc_1_to_redc_n_threshold, &param);
   1947      1.1  mrg   }
   1948      1.1  mrg #endif
   1949      1.1  mrg }
   1950      1.1  mrg 
   1951      1.1  mrg void
   1952      1.1  mrg tune_matrix22_mul (void)
   1953      1.1  mrg {
   1954      1.1  mrg   static struct param_t  param;
   1955      1.1  mrg   param.name = "MATRIX22_STRASSEN_THRESHOLD";
   1956      1.1  mrg   param.function = speed_mpn_matrix22_mul;
   1957      1.1  mrg   param.min_size = 2;
   1958      1.1  mrg   one (&matrix22_strassen_threshold, &param);
   1959      1.1  mrg }
   1960      1.1  mrg 
   1961      1.1  mrg void
   1962  1.1.1.4  mrg tune_hgcd2 (void)
   1963  1.1.1.4  mrg {
   1964  1.1.1.4  mrg   static struct param_t  param;
   1965  1.1.1.4  mrg   hgcd2_func_t *f[5] =
   1966  1.1.1.4  mrg     { mpn_hgcd2_1,
   1967  1.1.1.4  mrg       mpn_hgcd2_2,
   1968  1.1.1.4  mrg       mpn_hgcd2_3,
   1969  1.1.1.4  mrg       mpn_hgcd2_4,
   1970  1.1.1.4  mrg       mpn_hgcd2_5 };
   1971  1.1.1.4  mrg   speed_function_t speed_f[5] =
   1972  1.1.1.4  mrg     { speed_mpn_hgcd2_1,
   1973  1.1.1.4  mrg       speed_mpn_hgcd2_2,
   1974  1.1.1.4  mrg       speed_mpn_hgcd2_3,
   1975  1.1.1.4  mrg       speed_mpn_hgcd2_4,
   1976  1.1.1.4  mrg       speed_mpn_hgcd2_5 };
   1977  1.1.1.4  mrg   int best;
   1978  1.1.1.4  mrg 
   1979  1.1.1.4  mrg   s.size = 1;
   1980  1.1.1.4  mrg   best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
   1981  1.1.1.4  mrg 
   1982  1.1.1.4  mrg   /* Use selected function when tuning hgcd and gcd */
   1983  1.1.1.4  mrg   hgcd2_func = f[best];
   1984  1.1.1.4  mrg }
   1985  1.1.1.4  mrg 
   1986  1.1.1.4  mrg void
   1987      1.1  mrg tune_hgcd (void)
   1988      1.1  mrg {
   1989      1.1  mrg   static struct param_t  param;
   1990      1.1  mrg   param.name = "HGCD_THRESHOLD";
   1991      1.1  mrg   param.function = speed_mpn_hgcd;
   1992      1.1  mrg   /* We seem to get strange results for small sizes */
   1993      1.1  mrg   param.min_size = 30;
   1994      1.1  mrg   one (&hgcd_threshold, &param);
   1995      1.1  mrg }
   1996      1.1  mrg 
   1997      1.1  mrg void
   1998  1.1.1.2  mrg tune_hgcd_appr (void)
   1999  1.1.1.2  mrg {
   2000  1.1.1.2  mrg   static struct param_t  param;
   2001  1.1.1.2  mrg   param.name = "HGCD_APPR_THRESHOLD";
   2002  1.1.1.2  mrg   param.function = speed_mpn_hgcd_appr;
   2003  1.1.1.2  mrg   /* We seem to get strange results for small sizes */
   2004  1.1.1.2  mrg   param.min_size = 50;
   2005  1.1.1.2  mrg   param.stop_since_change = 150;
   2006  1.1.1.2  mrg   one (&hgcd_appr_threshold, &param);
   2007  1.1.1.2  mrg }
   2008  1.1.1.2  mrg 
   2009  1.1.1.2  mrg void
   2010  1.1.1.2  mrg tune_hgcd_reduce (void)
   2011  1.1.1.2  mrg {
   2012  1.1.1.2  mrg   static struct param_t  param;
   2013  1.1.1.2  mrg   param.name = "HGCD_REDUCE_THRESHOLD";
   2014  1.1.1.2  mrg   param.function = speed_mpn_hgcd_reduce;
   2015  1.1.1.2  mrg   param.min_size = 30;
   2016  1.1.1.2  mrg   param.max_size = 7000;
   2017  1.1.1.2  mrg   param.step_factor = 0.04;
   2018  1.1.1.2  mrg   one (&hgcd_reduce_threshold, &param);
   2019  1.1.1.2  mrg }
   2020  1.1.1.2  mrg 
   2021  1.1.1.2  mrg void
   2022      1.1  mrg tune_gcd_dc (void)
   2023      1.1  mrg {
   2024      1.1  mrg   static struct param_t  param;
   2025      1.1  mrg   param.name = "GCD_DC_THRESHOLD";
   2026      1.1  mrg   param.function = speed_mpn_gcd;
   2027      1.1  mrg   param.min_size = hgcd_threshold;
   2028      1.1  mrg   param.max_size = 3000;
   2029      1.1  mrg   param.step_factor = 0.02;
   2030      1.1  mrg   one (&gcd_dc_threshold, &param);
   2031      1.1  mrg }
   2032      1.1  mrg 
   2033      1.1  mrg void
   2034      1.1  mrg tune_gcdext_dc (void)
   2035      1.1  mrg {
   2036      1.1  mrg   static struct param_t  param;
   2037      1.1  mrg   param.name = "GCDEXT_DC_THRESHOLD";
   2038      1.1  mrg   param.function = speed_mpn_gcdext;
   2039      1.1  mrg   param.min_size = hgcd_threshold;
   2040      1.1  mrg   param.max_size = 3000;
   2041      1.1  mrg   param.step_factor = 0.02;
   2042      1.1  mrg   one (&gcdext_dc_threshold, &param);
   2043      1.1  mrg }
   2044      1.1  mrg 
   2045  1.1.1.2  mrg /* In tune_powm_sec we compute the table used by the win_size function.  The
   2046  1.1.1.2  mrg    cutoff points are in exponent bits, disregarding other operand sizes.  It is
   2047  1.1.1.3  mrg    not possible to use the one framework since it currently uses a granularity
   2048  1.1.1.2  mrg    of full limbs.
   2049  1.1.1.2  mrg */
   2050  1.1.1.2  mrg 
   2051  1.1.1.2  mrg /* This win_size replaces the variant in the powm code, allowing us to
   2052  1.1.1.2  mrg    control k in the k-ary algorithms.  */
   2053  1.1.1.2  mrg int winsize;
   2054  1.1.1.2  mrg int
   2055  1.1.1.2  mrg win_size (mp_bitcnt_t eb)
   2056  1.1.1.2  mrg {
   2057  1.1.1.2  mrg   return winsize;
   2058  1.1.1.2  mrg }
   2059  1.1.1.2  mrg 
   2060  1.1.1.2  mrg void
   2061  1.1.1.2  mrg tune_powm_sec (void)
   2062  1.1.1.2  mrg {
   2063  1.1.1.2  mrg   mp_size_t n;
   2064  1.1.1.2  mrg   int k, i;
   2065  1.1.1.2  mrg   mp_size_t itch;
   2066  1.1.1.2  mrg   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
   2067  1.1.1.2  mrg   const int n_max = 3000 / GMP_NUMB_BITS;
   2068  1.1.1.2  mrg   const int n_measurements = 5;
   2069  1.1.1.2  mrg   mp_ptr rp, bp, ep, mp, tp;
   2070  1.1.1.2  mrg   double ttab[n_measurements], tk, tkp1;
   2071  1.1.1.2  mrg   TMP_DECL;
   2072  1.1.1.2  mrg   TMP_MARK;
   2073  1.1.1.2  mrg 
   2074  1.1.1.2  mrg   possible_nbits_cutoff = 0;
   2075  1.1.1.2  mrg 
   2076  1.1.1.2  mrg   k = 1;
   2077  1.1.1.2  mrg 
   2078  1.1.1.2  mrg   winsize = 10;			/* the itch function needs this */
   2079  1.1.1.3  mrg   itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
   2080  1.1.1.2  mrg 
   2081  1.1.1.2  mrg   rp = TMP_ALLOC_LIMBS (n_max);
   2082  1.1.1.2  mrg   bp = TMP_ALLOC_LIMBS (n_max);
   2083  1.1.1.2  mrg   ep = TMP_ALLOC_LIMBS (n_max);
   2084  1.1.1.2  mrg   mp = TMP_ALLOC_LIMBS (n_max);
   2085  1.1.1.2  mrg   tp = TMP_ALLOC_LIMBS (itch);
   2086  1.1.1.2  mrg 
   2087  1.1.1.2  mrg   mpn_random (bp, n_max);
   2088  1.1.1.2  mrg   mpn_random (mp, n_max);
   2089  1.1.1.2  mrg   mp[0] |= 1;
   2090  1.1.1.2  mrg 
   2091  1.1.1.2  mrg /* How about taking the M operand size into account?
   2092  1.1.1.2  mrg 
   2093  1.1.1.2  mrg    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
   2094  1.1.1.2  mrg    B = O(M)).
   2095  1.1.1.2  mrg 
   2096  1.1.1.2  mrg    Using k-ary and no sliding window, the precomputation will need time
   2097  1.1.1.2  mrg    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
   2098  1.1.1.2  mrg    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
   2099  1.1.1.2  mrg 
   2100  1.1.1.2  mrg    An operation R=powm_sec(B,E,N) will take time like powm.
   2101  1.1.1.2  mrg 
   2102  1.1.1.2  mrg    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
   2103  1.1.1.2  mrg    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
   2104  1.1.1.2  mrg    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
   2105  1.1.1.2  mrg    table reads, respectively.  */
   2106  1.1.1.2  mrg 
   2107  1.1.1.2  mrg   printf ("#define POWM_SEC_TABLE  ");
   2108  1.1.1.2  mrg 
   2109  1.1.1.3  mrg   /* For nbits == 1, we should always use k == 1, so no need to tune
   2110  1.1.1.3  mrg      that. Starting with nbits == 2 also ensure that nbits always is
   2111  1.1.1.3  mrg      larger than the windowsize k+1. */
   2112  1.1.1.3  mrg   for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
   2113  1.1.1.2  mrg     {
   2114  1.1.1.2  mrg       n = (nbits - 1) / GMP_NUMB_BITS + 1;
   2115  1.1.1.2  mrg 
   2116  1.1.1.2  mrg       /* Generate E such that sliding-window for k and k+1 works equally
   2117  1.1.1.2  mrg 	 well/poorly (but sliding is not used in powm_sec, of course). */
   2118  1.1.1.2  mrg       for (i = 0; i < n; i++)
   2119  1.1.1.2  mrg 	ep[i] = ~CNST_LIMB(0);
   2120  1.1.1.2  mrg 
   2121  1.1.1.2  mrg       winsize = k;
   2122  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   2123  1.1.1.2  mrg 	{
   2124  1.1.1.2  mrg 	  speed_starttime ();
   2125  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   2126  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   2127  1.1.1.2  mrg 	}
   2128  1.1.1.2  mrg       tk = median (ttab, n_measurements);
   2129  1.1.1.2  mrg 
   2130  1.1.1.2  mrg       winsize = k + 1;
   2131  1.1.1.2  mrg       speed_starttime ();
   2132  1.1.1.2  mrg       for (i = 0; i < n_measurements; i++)
   2133  1.1.1.2  mrg 	{
   2134  1.1.1.2  mrg 	  speed_starttime ();
   2135  1.1.1.3  mrg 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
   2136  1.1.1.2  mrg 	  ttab[i] = speed_endtime ();
   2137  1.1.1.2  mrg 	}
   2138  1.1.1.2  mrg       tkp1 = median (ttab, n_measurements);
   2139  1.1.1.2  mrg /*
   2140  1.1.1.2  mrg       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
   2141  1.1.1.2  mrg       printf ("   %10.5f  %10.5f\n", tk, tkp1);
   2142  1.1.1.2  mrg */
   2143  1.1.1.2  mrg       if (tkp1 < tk)
   2144  1.1.1.2  mrg 	{
   2145  1.1.1.2  mrg 	  if (possible_nbits_cutoff)
   2146  1.1.1.2  mrg 	    {
   2147  1.1.1.2  mrg 	      /* Two consecutive sizes indicate k increase, obey.  */
   2148  1.1.1.3  mrg 
   2149  1.1.1.3  mrg 	      /* Must always have x[k] >= k */
   2150  1.1.1.3  mrg 	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
   2151  1.1.1.3  mrg 
   2152  1.1.1.2  mrg 	      if (k > 1)
   2153  1.1.1.2  mrg 		printf (",");
   2154  1.1.1.2  mrg 	      printf ("%ld", (long) possible_nbits_cutoff);
   2155  1.1.1.2  mrg 	      k++;
   2156  1.1.1.2  mrg 	      possible_nbits_cutoff = 0;
   2157  1.1.1.2  mrg 	    }
   2158  1.1.1.2  mrg 	  else
   2159  1.1.1.2  mrg 	    {
   2160  1.1.1.2  mrg 	      /* One measurement indicate k increase, save nbits for further
   2161  1.1.1.2  mrg 		 consideration.  */
   2162  1.1.1.3  mrg 	      /* The new larger k gets used for sizes > the cutoff
   2163  1.1.1.3  mrg 		 value, hence the cutoff should be one less than the
   2164  1.1.1.3  mrg 		 smallest size where it gives a speedup. */
   2165  1.1.1.3  mrg 	      possible_nbits_cutoff = nbits - 1;
   2166  1.1.1.2  mrg 	    }
   2167  1.1.1.2  mrg 	}
   2168  1.1.1.2  mrg       else
   2169  1.1.1.2  mrg 	possible_nbits_cutoff = 0;
   2170  1.1.1.2  mrg 
   2171  1.1.1.2  mrg       nbits_next = nbits * 65 / 64;
   2172  1.1.1.2  mrg       nbits = nbits_next + (nbits_next == nbits);
   2173  1.1.1.2  mrg     }
   2174  1.1.1.2  mrg   printf ("\n");
   2175  1.1.1.2  mrg   TMP_FREE;
   2176  1.1.1.2  mrg }
   2177  1.1.1.2  mrg 
   2178      1.1  mrg 
   2179      1.1  mrg /* size_extra==1 reflects the fact that with high<divisor one division is
   2180      1.1  mrg    always skipped.  Forcing high<divisor while testing ensures consistency
   2181      1.1  mrg    while stepping through sizes, ie. that size-1 divides will be done each
   2182      1.1  mrg    time.
   2183      1.1  mrg 
   2184      1.1  mrg    min_size==2 and min_is_always are used so that if plain division is only
   2185      1.1  mrg    better at size==1 then don't bother including that code just for that
   2186      1.1  mrg    case, instead go with preinv always and get a size saving.  */
   2187      1.1  mrg 
   2188      1.1  mrg #define DIV_1_PARAMS                    \
   2189      1.1  mrg   param.check_size = 256;               \
   2190      1.1  mrg   param.min_size = 2;                   \
   2191      1.1  mrg   param.min_is_always = 1;              \
   2192      1.1  mrg   param.data_high = DATA_HIGH_LT_R;     \
   2193      1.1  mrg   param.size_extra = 1;                 \
   2194      1.1  mrg   param.stop_factor = 2.0;
   2195      1.1  mrg 
   2196      1.1  mrg 
   2197  1.1.1.2  mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
   2198      1.1  mrg 
   2199      1.1  mrg void
   2200      1.1  mrg tune_divrem_1 (void)
   2201      1.1  mrg {
   2202      1.1  mrg   /* plain version by default */
   2203      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
   2204      1.1  mrg 
   2205      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2206      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2207      1.1  mrg      appear in gmp-mparam.h.  */
   2208      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2209      1.1  mrg     return;
   2210      1.1  mrg 
   2211      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2212      1.1  mrg     {
   2213      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2214      1.1  mrg                            "no preinv with nails");
   2215      1.1  mrg       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2216      1.1  mrg                            "no preinv with nails");
   2217      1.1  mrg       return;
   2218      1.1  mrg     }
   2219      1.1  mrg 
   2220      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2221      1.1  mrg     {
   2222      1.1  mrg       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
   2223      1.1  mrg       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
   2224      1.1  mrg       return;
   2225      1.1  mrg     }
   2226      1.1  mrg 
   2227      1.1  mrg   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
   2228      1.1  mrg 
   2229      1.1  mrg   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
   2230      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2231      1.1  mrg      is more important. */
   2232      1.1  mrg   {
   2233      1.1  mrg     static struct param_t  param;
   2234      1.1  mrg     param.name = "DIVREM_1_NORM_THRESHOLD";
   2235      1.1  mrg     DIV_1_PARAMS;
   2236      1.1  mrg     s.r = randlimb_norm ();
   2237      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2238      1.1  mrg     one (&divrem_1_norm_threshold, &param);
   2239      1.1  mrg   }
   2240      1.1  mrg   {
   2241      1.1  mrg     static struct param_t  param;
   2242      1.1  mrg     param.name = "DIVREM_1_UNNORM_THRESHOLD";
   2243      1.1  mrg     DIV_1_PARAMS;
   2244      1.1  mrg     s.r = randlimb_half ();
   2245      1.1  mrg     param.function = speed_mpn_divrem_1_tune;
   2246      1.1  mrg     one (&divrem_1_unnorm_threshold, &param);
   2247      1.1  mrg   }
   2248      1.1  mrg }
   2249      1.1  mrg 
   2250  1.1.1.3  mrg void
   2251  1.1.1.3  mrg tune_div_qr_1 (void)
   2252  1.1.1.3  mrg {
   2253  1.1.1.3  mrg   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
   2254  1.1.1.3  mrg     {
   2255  1.1.1.3  mrg       static struct param_t  param;
   2256  1.1.1.4  mrg       speed_function_t f[2] =
   2257  1.1.1.4  mrg 	{
   2258  1.1.1.4  mrg 	 speed_mpn_div_qr_1n_pi1_1,
   2259  1.1.1.4  mrg 	 speed_mpn_div_qr_1n_pi1_2,
   2260  1.1.1.4  mrg 	};
   2261  1.1.1.3  mrg 
   2262  1.1.1.3  mrg       s.size = 10;
   2263  1.1.1.3  mrg       s.r = randlimb_norm ();
   2264  1.1.1.3  mrg 
   2265  1.1.1.4  mrg       one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
   2266  1.1.1.3  mrg     }
   2267  1.1.1.3  mrg 
   2268  1.1.1.3  mrg   {
   2269  1.1.1.3  mrg     static struct param_t  param;
   2270  1.1.1.3  mrg     param.name = "DIV_QR_1_NORM_THRESHOLD";
   2271  1.1.1.3  mrg     DIV_1_PARAMS;
   2272  1.1.1.3  mrg     param.min_size = 1;
   2273  1.1.1.3  mrg     param.min_is_always = 0;
   2274  1.1.1.3  mrg     s.r = randlimb_norm ();
   2275  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2276  1.1.1.3  mrg     one (&div_qr_1_norm_threshold, &param);
   2277  1.1.1.3  mrg   }
   2278  1.1.1.3  mrg   {
   2279  1.1.1.3  mrg     static struct param_t  param;
   2280  1.1.1.3  mrg     param.name = "DIV_QR_1_UNNORM_THRESHOLD";
   2281  1.1.1.3  mrg     DIV_1_PARAMS;
   2282  1.1.1.3  mrg     param.min_size = 1;
   2283  1.1.1.3  mrg     param.min_is_always = 0;
   2284  1.1.1.3  mrg     s.r = randlimb_half();
   2285  1.1.1.3  mrg     param.function = speed_mpn_div_qr_1_tune;
   2286  1.1.1.3  mrg     one (&div_qr_1_unnorm_threshold, &param);
   2287  1.1.1.3  mrg   }
   2288  1.1.1.3  mrg }
   2289  1.1.1.3  mrg 
   2290      1.1  mrg 
   2291      1.1  mrg void
   2292      1.1  mrg tune_mod_1 (void)
   2293      1.1  mrg {
   2294      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2295      1.1  mrg      the results in the .asm file, there's no need for such thresholds to
   2296      1.1  mrg      appear in gmp-mparam.h.  */
   2297      1.1  mrg   if (HAVE_NATIVE_mpn_mod_1)
   2298      1.1  mrg     return;
   2299      1.1  mrg 
   2300      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2301      1.1  mrg     {
   2302      1.1  mrg       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
   2303      1.1  mrg                            "no preinv with nails");
   2304      1.1  mrg       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
   2305      1.1  mrg                            "no preinv with nails");
   2306      1.1  mrg       return;
   2307      1.1  mrg     }
   2308      1.1  mrg 
   2309  1.1.1.2  mrg   if (!HAVE_NATIVE_mpn_mod_1_1p)
   2310  1.1.1.2  mrg     {
   2311  1.1.1.2  mrg       static struct param_t  param;
   2312  1.1.1.4  mrg       speed_function_t f[2] =
   2313  1.1.1.4  mrg 	{
   2314  1.1.1.4  mrg 	 speed_mpn_mod_1_1_1,
   2315  1.1.1.4  mrg 	 speed_mpn_mod_1_1_2,
   2316  1.1.1.4  mrg 	};
   2317  1.1.1.2  mrg 
   2318  1.1.1.2  mrg       s.size = 10;
   2319  1.1.1.2  mrg       s.r = randlimb_half ();
   2320  1.1.1.4  mrg       one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
   2321  1.1.1.2  mrg     }
   2322  1.1.1.2  mrg 
   2323      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2324      1.1  mrg     {
   2325      1.1  mrg       print_define ("MOD_1_NORM_THRESHOLD", 0L);
   2326      1.1  mrg       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
   2327      1.1  mrg     }
   2328      1.1  mrg   else
   2329      1.1  mrg     {
   2330      1.1  mrg       {
   2331      1.1  mrg 	static struct param_t  param;
   2332      1.1  mrg 	param.name = "MOD_1_NORM_THRESHOLD";
   2333      1.1  mrg 	DIV_1_PARAMS;
   2334      1.1  mrg 	s.r = randlimb_norm ();
   2335      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2336      1.1  mrg 	one (&mod_1_norm_threshold, &param);
   2337      1.1  mrg       }
   2338      1.1  mrg       {
   2339      1.1  mrg 	static struct param_t  param;
   2340      1.1  mrg 	param.name = "MOD_1_UNNORM_THRESHOLD";
   2341      1.1  mrg 	DIV_1_PARAMS;
   2342      1.1  mrg 	s.r = randlimb_half ();
   2343      1.1  mrg 	param.function = speed_mpn_mod_1_tune;
   2344      1.1  mrg 	one (&mod_1_unnorm_threshold, &param);
   2345      1.1  mrg       }
   2346      1.1  mrg     }
   2347      1.1  mrg   {
   2348      1.1  mrg     static struct param_t  param;
   2349      1.1  mrg 
   2350      1.1  mrg     param.check_size = 256;
   2351      1.1  mrg 
   2352      1.1  mrg     s.r = randlimb_norm ();
   2353      1.1  mrg     param.function = speed_mpn_mod_1_tune;
   2354      1.1  mrg 
   2355      1.1  mrg     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
   2356      1.1  mrg     param.min_size = 2;
   2357      1.1  mrg     one (&mod_1n_to_mod_1_1_threshold, &param);
   2358      1.1  mrg   }
   2359  1.1.1.2  mrg 
   2360      1.1  mrg   {
   2361      1.1  mrg     static struct param_t  param;
   2362      1.1  mrg 
   2363      1.1  mrg     param.check_size = 256;
   2364  1.1.1.2  mrg     s.r = randlimb_half ();
   2365      1.1  mrg     param.noprint = 1;
   2366      1.1  mrg 
   2367  1.1.1.2  mrg     param.function = speed_mpn_mod_1_1;
   2368  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_2;
   2369  1.1.1.2  mrg     param.min_is_always = 1;
   2370      1.1  mrg     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
   2371  1.1.1.2  mrg     param.min_size = 2;
   2372      1.1  mrg     one (&mod_1_1_to_mod_1_2_threshold, &param);
   2373      1.1  mrg 
   2374  1.1.1.2  mrg     param.function = speed_mpn_mod_1_2;
   2375  1.1.1.2  mrg     param.function2 = speed_mpn_mod_1_4;
   2376  1.1.1.2  mrg     param.min_is_always = 1;
   2377      1.1  mrg     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
   2378  1.1.1.2  mrg     param.min_size = 1;
   2379      1.1  mrg     one (&mod_1_2_to_mod_1_4_threshold, &param);
   2380      1.1  mrg 
   2381  1.1.1.2  mrg     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
   2382      1.1  mrg       {
   2383  1.1.1.2  mrg 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
   2384      1.1  mrg 	mod_1_2_to_mod_1_4_threshold = 0;
   2385      1.1  mrg 
   2386  1.1.1.2  mrg 	param.function = speed_mpn_mod_1_1;
   2387  1.1.1.2  mrg 	param.function2 = speed_mpn_mod_1_4;
   2388  1.1.1.2  mrg 	param.min_is_always = 1;
   2389  1.1.1.2  mrg 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
   2390  1.1.1.2  mrg 	param.min_size = 2;
   2391      1.1  mrg 	one (&mod_1_1_to_mod_1_2_threshold, &param);
   2392      1.1  mrg       }
   2393  1.1.1.2  mrg 
   2394  1.1.1.2  mrg     param.function = speed_mpn_mod_1_tune;
   2395  1.1.1.2  mrg     param.function2 = NULL;
   2396  1.1.1.2  mrg     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
   2397  1.1.1.2  mrg     param.min_size = 2;
   2398  1.1.1.2  mrg     param.min_is_always = 0;
   2399  1.1.1.2  mrg     one (&mod_1u_to_mod_1_1_threshold, &param);
   2400  1.1.1.2  mrg 
   2401  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
   2402  1.1.1.2  mrg       mod_1_1_to_mod_1_2_threshold = 0;
   2403  1.1.1.2  mrg     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
   2404  1.1.1.2  mrg       mod_1_2_to_mod_1_4_threshold = 0;
   2405  1.1.1.2  mrg 
   2406  1.1.1.2  mrg     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
   2407  1.1.1.2  mrg     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
   2408  1.1.1.2  mrg 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
   2409  1.1.1.2  mrg     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
   2410  1.1.1.2  mrg 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
   2411      1.1  mrg   }
   2412      1.1  mrg 
   2413      1.1  mrg   {
   2414      1.1  mrg     static struct param_t  param;
   2415      1.1  mrg 
   2416      1.1  mrg     param.check_size = 256;
   2417      1.1  mrg 
   2418      1.1  mrg     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
   2419      1.1  mrg     s.r = randlimb_norm ();
   2420      1.1  mrg     param.function = speed_mpn_preinv_mod_1;
   2421      1.1  mrg     param.function2 = speed_mpn_mod_1_tune;
   2422      1.1  mrg     param.min_size = 1;
   2423      1.1  mrg     one (&preinv_mod_1_to_mod_1_threshold, &param);
   2424      1.1  mrg   }
   2425      1.1  mrg }
   2426      1.1  mrg 
   2427      1.1  mrg 
   2428      1.1  mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
   2429      1.1  mrg    imply that udiv_qrnnd_preinv is worth using, but it seems most
   2430      1.1  mrg    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
   2431      1.1  mrg    directly.  */
   2432      1.1  mrg 
   2433      1.1  mrg void
   2434      1.1  mrg tune_preinv_divrem_1 (void)
   2435      1.1  mrg {
   2436      1.1  mrg   static struct param_t  param;
   2437      1.1  mrg   speed_function_t  divrem_1;
   2438      1.1  mrg   const char        *divrem_1_name;
   2439      1.1  mrg   double            t1, t2;
   2440      1.1  mrg 
   2441      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2442      1.1  mrg     {
   2443      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
   2444      1.1  mrg       return;
   2445      1.1  mrg     }
   2446      1.1  mrg 
   2447      1.1  mrg   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
   2448      1.1  mrg      it's faster than mpn_divrem_1.  */
   2449      1.1  mrg   if (HAVE_NATIVE_mpn_preinv_divrem_1)
   2450      1.1  mrg     {
   2451      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
   2452      1.1  mrg       return;
   2453      1.1  mrg     }
   2454      1.1  mrg 
   2455      1.1  mrg   /* If udiv_qrnnd_preinv is the only division method then of course
   2456      1.1  mrg      mpn_preinv_divrem_1 should be used.  */
   2457      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2458      1.1  mrg     {
   2459      1.1  mrg       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
   2460      1.1  mrg       return;
   2461      1.1  mrg     }
   2462      1.1  mrg 
   2463      1.1  mrg   /* If we've got an assembler version of mpn_divrem_1, then compare against
   2464      1.1  mrg      that, not the mpn_divrem_1_div generic C.  */
   2465      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_1)
   2466      1.1  mrg     {
   2467      1.1  mrg       divrem_1 = speed_mpn_divrem_1;
   2468      1.1  mrg       divrem_1_name = "mpn_divrem_1";
   2469      1.1  mrg     }
   2470      1.1  mrg   else
   2471      1.1  mrg     {
   2472      1.1  mrg       divrem_1 = speed_mpn_divrem_1_div;
   2473      1.1  mrg       divrem_1_name = "mpn_divrem_1_div";
   2474      1.1  mrg     }
   2475      1.1  mrg 
   2476      1.1  mrg   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
   2477      1.1  mrg   s.size = 200;                     /* generous but not too big */
   2478      1.1  mrg   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
   2479      1.1  mrg      since in general that's probably most common, though in fact for a
   2480      1.1  mrg      64-bit limb mp_bases[10].big_base is normalized.  */
   2481      1.1  mrg   s.r = urandom() & (GMP_NUMB_MASK >> 4);
   2482      1.1  mrg   if (s.r == 0) s.r = 123;
   2483      1.1  mrg 
   2484      1.1  mrg   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
   2485      1.1  mrg   t2 = tuneup_measure (divrem_1, &param, &s);
   2486      1.1  mrg   if (t1 == -1.0 || t2 == -1.0)
   2487      1.1  mrg     {
   2488      1.1  mrg       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
   2489      1.1  mrg               divrem_1_name, (long) s.size);
   2490      1.1  mrg       abort ();
   2491      1.1  mrg     }
   2492      1.1  mrg   if (option_trace >= 1)
   2493      1.1  mrg     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
   2494      1.1  mrg             (long) s.size, t1, divrem_1_name, t2);
   2495      1.1  mrg 
   2496      1.1  mrg   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
   2497      1.1  mrg }
   2498      1.1  mrg 
   2499      1.1  mrg 
   2500      1.1  mrg 
   2501      1.1  mrg void
   2502      1.1  mrg tune_divrem_2 (void)
   2503      1.1  mrg {
   2504      1.1  mrg   static struct param_t  param;
   2505      1.1  mrg 
   2506      1.1  mrg   /* No support for tuning native assembler code, do that by hand and put
   2507      1.1  mrg      the results in the .asm file, and there's no need for such thresholds
   2508      1.1  mrg      to appear in gmp-mparam.h.  */
   2509      1.1  mrg   if (HAVE_NATIVE_mpn_divrem_2)
   2510      1.1  mrg     return;
   2511      1.1  mrg 
   2512      1.1  mrg   if (GMP_NAIL_BITS != 0)
   2513      1.1  mrg     {
   2514      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
   2515      1.1  mrg                            "no preinv with nails");
   2516      1.1  mrg       return;
   2517      1.1  mrg     }
   2518      1.1  mrg 
   2519      1.1  mrg   if (UDIV_PREINV_ALWAYS)
   2520      1.1  mrg     {
   2521      1.1  mrg       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
   2522      1.1  mrg       return;
   2523      1.1  mrg     }
   2524      1.1  mrg 
   2525      1.1  mrg   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
   2526      1.1  mrg      a bit out for the fractional part, but that's too bad, the integer part
   2527      1.1  mrg      is more important.
   2528      1.1  mrg 
   2529      1.1  mrg      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
   2530      1.1  mrg      code space if plain division is better only at size==2 or size==3. */
   2531      1.1  mrg   param.name = "DIVREM_2_THRESHOLD";
   2532      1.1  mrg   param.check_size = 256;
   2533      1.1  mrg   param.min_size = 4;
   2534      1.1  mrg   param.min_is_always = 1;
   2535      1.1  mrg   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
   2536      1.1  mrg   param.stop_factor = 2.0;
   2537      1.1  mrg 
   2538      1.1  mrg   s.r = randlimb_norm ();
   2539      1.1  mrg   param.function = speed_mpn_divrem_2;
   2540      1.1  mrg   one (&divrem_2_threshold, &param);
   2541      1.1  mrg }
   2542      1.1  mrg 
   2543  1.1.1.2  mrg void
   2544  1.1.1.2  mrg tune_div_qr_2 (void)
   2545  1.1.1.2  mrg {
   2546  1.1.1.2  mrg   static struct param_t  param;
   2547  1.1.1.2  mrg   param.name = "DIV_QR_2_PI2_THRESHOLD";
   2548  1.1.1.2  mrg   param.function = speed_mpn_div_qr_2n;
   2549  1.1.1.2  mrg   param.check_size = 500;
   2550  1.1.1.2  mrg   param.min_size = 4;
   2551  1.1.1.2  mrg   one (&div_qr_2_pi2_threshold, &param);
   2552  1.1.1.2  mrg }
   2553      1.1  mrg 
   2554      1.1  mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
   2555      1.1  mrg    tune for that.  Its speed can differ on odd or even divisor, so take an
   2556      1.1  mrg    average threshold for the two.
   2557      1.1  mrg 
   2558      1.1  mrg    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
   2559      1.1  mrg    might not vary that way, but don't test this since high<divisor isn't
   2560      1.1  mrg    expected to occur often with small divisors.  */
   2561      1.1  mrg 
   2562      1.1  mrg void
   2563      1.1  mrg tune_divexact_1 (void)
   2564      1.1  mrg {
   2565      1.1  mrg   static struct param_t  param;
   2566      1.1  mrg   mp_size_t  thresh[2], average;
   2567      1.1  mrg   int        low, i;
   2568      1.1  mrg 
   2569      1.1  mrg   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
   2570      1.1  mrg      full mpn_divrem_1.  */
   2571      1.1  mrg   if (HAVE_NATIVE_mpn_divexact_1)
   2572      1.1  mrg     {
   2573      1.1  mrg       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
   2574      1.1  mrg       return;
   2575      1.1  mrg     }
   2576      1.1  mrg 
   2577      1.1  mrg   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
   2578      1.1  mrg 
   2579      1.1  mrg   param.name = "DIVEXACT_1_THRESHOLD";
   2580      1.1  mrg   param.data_high = DATA_HIGH_GE_R;
   2581      1.1  mrg   param.check_size = 256;
   2582      1.1  mrg   param.min_size = 2;
   2583      1.1  mrg   param.stop_factor = 1.5;
   2584      1.1  mrg   param.function  = tuned_speed_mpn_divrem_1;
   2585      1.1  mrg   param.function2 = speed_mpn_divexact_1;
   2586      1.1  mrg   param.noprint = 1;
   2587      1.1  mrg 
   2588      1.1  mrg   print_define_start (param.name);
   2589      1.1  mrg 
   2590      1.1  mrg   for (low = 0; low <= 1; low++)
   2591      1.1  mrg     {
   2592      1.1  mrg       s.r = randlimb_half();
   2593      1.1  mrg       if (low == 0)
   2594      1.1  mrg         s.r |= 1;
   2595      1.1  mrg       else
   2596      1.1  mrg         s.r &= ~CNST_LIMB(7);
   2597      1.1  mrg 
   2598      1.1  mrg       one (&thresh[low], &param);
   2599      1.1  mrg       if (option_trace)
   2600      1.1  mrg         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
   2601      1.1  mrg 
   2602      1.1  mrg       if (thresh[low] == MP_SIZE_T_MAX)
   2603      1.1  mrg         {
   2604      1.1  mrg           average = MP_SIZE_T_MAX;
   2605      1.1  mrg           goto divexact_1_done;
   2606      1.1  mrg         }
   2607      1.1  mrg     }
   2608      1.1  mrg 
   2609      1.1  mrg   if (option_trace)
   2610      1.1  mrg     {
   2611      1.1  mrg       printf ("average of:");
   2612      1.1  mrg       for (i = 0; i < numberof(thresh); i++)
   2613      1.1  mrg         printf (" %ld", (long) thresh[i]);
   2614      1.1  mrg       printf ("\n");
   2615      1.1  mrg     }
   2616      1.1  mrg 
   2617      1.1  mrg   average = 0;
   2618      1.1  mrg   for (i = 0; i < numberof(thresh); i++)
   2619      1.1  mrg     average += thresh[i];
   2620      1.1  mrg   average /= numberof(thresh);
   2621      1.1  mrg 
   2622      1.1  mrg   /* If divexact turns out to be better as early as 3 limbs, then use it
   2623      1.1  mrg      always, so as to reduce code size and conditional jumps.  */
   2624      1.1  mrg   if (average <= 3)
   2625      1.1  mrg     average = 0;
   2626      1.1  mrg 
   2627      1.1  mrg  divexact_1_done:
   2628      1.1  mrg   print_define_end (param.name, average);
   2629      1.1  mrg }
   2630      1.1  mrg 
   2631      1.1  mrg 
   2632      1.1  mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
   2633      1.1  mrg    same as mpn_mod_1, but this might not be true of an assembler
   2634      1.1  mrg    implementation.  The threshold used is an average based on data where a
   2635      1.1  mrg    divide can be skipped and where it can't.
   2636      1.1  mrg 
   2637      1.1  mrg    If modexact turns out to be better as early as 3 limbs, then use it
   2638      1.1  mrg    always, so as to reduce code size and conditional jumps.  */
   2639      1.1  mrg 
   2640      1.1  mrg void
   2641      1.1  mrg tune_modexact_1_odd (void)
   2642      1.1  mrg {
   2643      1.1  mrg   static struct param_t  param;
   2644      1.1  mrg   mp_size_t  thresh_lt, thresh_ge, average;
   2645      1.1  mrg 
   2646      1.1  mrg #if 0
   2647      1.1  mrg   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
   2648      1.1  mrg      of a full mpn_mod_1.  */
   2649      1.1  mrg   if (HAVE_NATIVE_mpn_modexact_1_odd)
   2650      1.1  mrg     {
   2651      1.1  mrg       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
   2652      1.1  mrg       return;
   2653      1.1  mrg     }
   2654      1.1  mrg #endif
   2655      1.1  mrg 
   2656      1.1  mrg   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
   2657      1.1  mrg   param.check_size = 256;
   2658      1.1  mrg   param.min_size = 2;
   2659      1.1  mrg   param.stop_factor = 1.5;
   2660      1.1  mrg   param.function  = speed_mpn_modexact_1c_odd;
   2661      1.1  mrg   param.function2 = speed_mpn_mod_1_tune;
   2662      1.1  mrg   param.noprint = 1;
   2663      1.1  mrg   s.r = randlimb_half () | 1;
   2664      1.1  mrg 
   2665      1.1  mrg   print_define_start (param.name);
   2666      1.1  mrg 
   2667      1.1  mrg   param.data_high = DATA_HIGH_LT_R;
   2668      1.1  mrg   one (&thresh_lt, &param);
   2669      1.1  mrg   if (option_trace)
   2670      1.1  mrg     printf ("lt thresh %ld\n", (long) thresh_lt);
   2671      1.1  mrg 
   2672      1.1  mrg   average = thresh_lt;
   2673      1.1  mrg   if (thresh_lt != MP_SIZE_T_MAX)
   2674      1.1  mrg     {
   2675      1.1  mrg       param.data_high = DATA_HIGH_GE_R;
   2676      1.1  mrg       one (&thresh_ge, &param);
   2677      1.1  mrg       if (option_trace)
   2678      1.1  mrg         printf ("ge thresh %ld\n", (long) thresh_ge);
   2679      1.1  mrg 
   2680      1.1  mrg       if (thresh_ge != MP_SIZE_T_MAX)
   2681      1.1  mrg         {
   2682      1.1  mrg           average = (thresh_ge + thresh_lt) / 2;
   2683      1.1  mrg           if (thresh_ge <= 3)
   2684      1.1  mrg             average = 0;
   2685      1.1  mrg         }
   2686      1.1  mrg     }
   2687      1.1  mrg 
   2688      1.1  mrg   print_define_end (param.name, average);
   2689      1.1  mrg }
   2690      1.1  mrg 
   2691      1.1  mrg 
   2692      1.1  mrg void
   2693      1.1  mrg tune_jacobi_base (void)
   2694      1.1  mrg {
   2695      1.1  mrg   static struct param_t  param;
   2696  1.1.1.4  mrg   speed_function_t f[4] =
   2697      1.1  mrg     {
   2698  1.1.1.4  mrg      speed_mpn_jacobi_base_1,
   2699  1.1.1.4  mrg      speed_mpn_jacobi_base_2,
   2700  1.1.1.4  mrg      speed_mpn_jacobi_base_3,
   2701  1.1.1.4  mrg      speed_mpn_jacobi_base_4,
   2702  1.1.1.4  mrg     };
   2703      1.1  mrg 
   2704  1.1.1.4  mrg   s.size = GMP_LIMB_BITS * 3 / 4;
   2705      1.1  mrg 
   2706  1.1.1.4  mrg   one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
   2707      1.1  mrg }
   2708      1.1  mrg 
   2709      1.1  mrg 
   2710      1.1  mrg void
   2711      1.1  mrg tune_get_str (void)
   2712      1.1  mrg {
   2713      1.1  mrg   /* Tune for decimal, it being most common.  Some rough testing suggests
   2714      1.1  mrg      other bases are different, but not by very much.  */
   2715      1.1  mrg   s.r = 10;
   2716      1.1  mrg   {
   2717      1.1  mrg     static struct param_t  param;
   2718      1.1  mrg     GET_STR_PRECOMPUTE_THRESHOLD = 0;
   2719      1.1  mrg     param.name = "GET_STR_DC_THRESHOLD";
   2720      1.1  mrg     param.function = speed_mpn_get_str;
   2721      1.1  mrg     param.min_size = 4;
   2722      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2723      1.1  mrg     one (&get_str_dc_threshold, &param);
   2724      1.1  mrg   }
   2725      1.1  mrg   {
   2726      1.1  mrg     static struct param_t  param;
   2727      1.1  mrg     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
   2728      1.1  mrg     param.function = speed_mpn_get_str;
   2729      1.1  mrg     param.min_size = GET_STR_DC_THRESHOLD;
   2730      1.1  mrg     param.max_size = GET_STR_THRESHOLD_LIMIT;
   2731      1.1  mrg     one (&get_str_precompute_threshold, &param);
   2732      1.1  mrg   }
   2733      1.1  mrg }
   2734      1.1  mrg 
   2735      1.1  mrg 
   2736      1.1  mrg double
   2737      1.1  mrg speed_mpn_pre_set_str (struct speed_params *s)
   2738      1.1  mrg {
   2739      1.1  mrg   unsigned char *str;
   2740      1.1  mrg   mp_ptr     wp;
   2741      1.1  mrg   mp_size_t  wn;
   2742      1.1  mrg   unsigned   i;
   2743      1.1  mrg   int        base;
   2744      1.1  mrg   double     t;
   2745      1.1  mrg   mp_ptr powtab_mem, tp;
   2746      1.1  mrg   powers_t powtab[GMP_LIMB_BITS];
   2747      1.1  mrg   mp_size_t un;
   2748      1.1  mrg   int chars_per_limb;
   2749      1.1  mrg   TMP_DECL;
   2750      1.1  mrg 
   2751      1.1  mrg   SPEED_RESTRICT_COND (s->size >= 1);
   2752      1.1  mrg 
   2753      1.1  mrg   base = s->r == 0 ? 10 : s->r;
   2754      1.1  mrg   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
   2755      1.1  mrg 
   2756      1.1  mrg   TMP_MARK;
   2757      1.1  mrg 
   2758  1.1.1.3  mrg   str = (unsigned char *) TMP_ALLOC (s->size);
   2759      1.1  mrg   for (i = 0; i < s->size; i++)
   2760      1.1  mrg     str[i] = s->xp[i] % base;
   2761      1.1  mrg 
   2762  1.1.1.2  mrg   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
   2763      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
   2764      1.1  mrg 
   2765      1.1  mrg   /* use this during development to check wn is big enough */
   2766      1.1  mrg   /*
   2767      1.1  mrg   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
   2768      1.1  mrg   */
   2769      1.1  mrg 
   2770  1.1.1.3  mrg   speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
   2771      1.1  mrg   speed_operand_dst (s, wp, wn);
   2772      1.1  mrg   speed_cache_fill (s);
   2773      1.1  mrg 
   2774      1.1  mrg   chars_per_limb = mp_bases[base].chars_per_limb;
   2775      1.1  mrg   un = s->size / chars_per_limb + 1;
   2776  1.1.1.4  mrg   powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
   2777  1.1.1.4  mrg   size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
   2778  1.1.1.4  mrg   powers_t *pt = powtab + n_pows;
   2779      1.1  mrg   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
   2780      1.1  mrg 
   2781      1.1  mrg   speed_starttime ();
   2782      1.1  mrg   i = s->reps;
   2783      1.1  mrg   do
   2784      1.1  mrg     {
   2785  1.1.1.4  mrg       mpn_pre_set_str (wp, str, s->size, pt, tp);
   2786      1.1  mrg     }
   2787      1.1  mrg   while (--i != 0);
   2788      1.1  mrg   t = speed_endtime ();
   2789      1.1  mrg 
   2790      1.1  mrg   TMP_FREE;
   2791      1.1  mrg   return t;
   2792      1.1  mrg }
   2793      1.1  mrg 
   2794      1.1  mrg void
   2795      1.1  mrg tune_set_str (void)
   2796      1.1  mrg {
   2797      1.1  mrg   s.r = 10;  /* decimal */
   2798      1.1  mrg   {
   2799      1.1  mrg     static struct param_t  param;
   2800      1.1  mrg     SET_STR_PRECOMPUTE_THRESHOLD = 0;
   2801      1.1  mrg     param.step_factor = 0.01;
   2802      1.1  mrg     param.name = "SET_STR_DC_THRESHOLD";
   2803      1.1  mrg     param.function = speed_mpn_pre_set_str;
   2804      1.1  mrg     param.min_size = 100;
   2805      1.1  mrg     param.max_size = 50000;
   2806      1.1  mrg     one (&set_str_dc_threshold, &param);
   2807      1.1  mrg   }
   2808      1.1  mrg   {
   2809      1.1  mrg     static struct param_t  param;
   2810      1.1  mrg     param.step_factor = 0.02;
   2811      1.1  mrg     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
   2812      1.1  mrg     param.function = speed_mpn_set_str;
   2813      1.1  mrg     param.min_size = SET_STR_DC_THRESHOLD;
   2814      1.1  mrg     param.max_size = 100000;
   2815      1.1  mrg     one (&set_str_precompute_threshold, &param);
   2816      1.1  mrg   }
   2817      1.1  mrg }
   2818      1.1  mrg 
   2819      1.1  mrg 
   2820      1.1  mrg void
   2821      1.1  mrg tune_fft_mul (void)
   2822      1.1  mrg {
   2823      1.1  mrg   static struct fft_param_t  param;
   2824      1.1  mrg 
   2825      1.1  mrg   if (option_fft_max_size == 0)
   2826      1.1  mrg     return;
   2827      1.1  mrg 
   2828      1.1  mrg   param.table_name          = "MUL_FFT_TABLE3";
   2829      1.1  mrg   param.threshold_name      = "MUL_FFT_THRESHOLD";
   2830      1.1  mrg   param.p_threshold         = &mul_fft_threshold;
   2831      1.1  mrg   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
   2832      1.1  mrg   param.p_modf_threshold    = &mul_fft_modf_threshold;
   2833      1.1  mrg   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
   2834      1.1  mrg   param.max_size            = option_fft_max_size;
   2835      1.1  mrg   param.function            = speed_mpn_fft_mul;
   2836      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft;
   2837      1.1  mrg   param.mul_function        = speed_mpn_mul_n;
   2838      1.1  mrg   param.sqr = 0;
   2839      1.1  mrg   fft (&param);
   2840      1.1  mrg }
   2841      1.1  mrg 
   2842      1.1  mrg 
   2843      1.1  mrg void
   2844      1.1  mrg tune_fft_sqr (void)
   2845      1.1  mrg {
   2846      1.1  mrg   static struct fft_param_t  param;
   2847      1.1  mrg 
   2848      1.1  mrg   if (option_fft_max_size == 0)
   2849      1.1  mrg     return;
   2850      1.1  mrg 
   2851      1.1  mrg   param.table_name          = "SQR_FFT_TABLE3";
   2852      1.1  mrg   param.threshold_name      = "SQR_FFT_THRESHOLD";
   2853      1.1  mrg   param.p_threshold         = &sqr_fft_threshold;
   2854      1.1  mrg   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
   2855      1.1  mrg   param.p_modf_threshold    = &sqr_fft_modf_threshold;
   2856      1.1  mrg   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
   2857      1.1  mrg   param.max_size            = option_fft_max_size;
   2858      1.1  mrg   param.function            = speed_mpn_fft_sqr;
   2859      1.1  mrg   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
   2860      1.1  mrg   param.mul_function        = speed_mpn_sqr;
   2861      1.1  mrg   param.sqr = 1;
   2862      1.1  mrg   fft (&param);
   2863      1.1  mrg }
   2864      1.1  mrg 
   2865      1.1  mrg void
   2866  1.1.1.2  mrg tune_fac_ui (void)
   2867  1.1.1.2  mrg {
   2868  1.1.1.2  mrg   static struct param_t  param;
   2869  1.1.1.2  mrg 
   2870  1.1.1.2  mrg   param.function = speed_mpz_fac_ui_tune;
   2871  1.1.1.2  mrg 
   2872  1.1.1.2  mrg   param.name = "FAC_DSC_THRESHOLD";
   2873  1.1.1.2  mrg   param.min_size = 70;
   2874  1.1.1.2  mrg   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
   2875  1.1.1.2  mrg   one (&fac_dsc_threshold, &param);
   2876  1.1.1.2  mrg 
   2877  1.1.1.2  mrg   param.name = "FAC_ODD_THRESHOLD";
   2878  1.1.1.2  mrg   param.min_size = 22;
   2879  1.1.1.2  mrg   param.stop_factor = 1.7;
   2880  1.1.1.2  mrg   param.min_is_always = 1;
   2881  1.1.1.2  mrg   one (&fac_odd_threshold, &param);
   2882  1.1.1.2  mrg }
   2883  1.1.1.2  mrg 
   2884  1.1.1.2  mrg void
   2885      1.1  mrg all (void)
   2886      1.1  mrg {
   2887      1.1  mrg   time_t  start_time, end_time;
   2888      1.1  mrg   TMP_DECL;
   2889      1.1  mrg 
   2890      1.1  mrg   TMP_MARK;
   2891      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
   2892      1.1  mrg   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
   2893      1.1  mrg 
   2894      1.1  mrg   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
   2895      1.1  mrg   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
   2896      1.1  mrg 
   2897      1.1  mrg   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
   2898      1.1  mrg 
   2899      1.1  mrg   speed_time_init ();
   2900      1.1  mrg   fprintf (stderr, "Using: %s\n", speed_time_string);
   2901      1.1  mrg 
   2902      1.1  mrg   fprintf (stderr, "speed_precision %d", speed_precision);
   2903      1.1  mrg   if (speed_unittime == 1.0)
   2904      1.1  mrg     fprintf (stderr, ", speed_unittime 1 cycle");
   2905      1.1  mrg   else
   2906      1.1  mrg     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
   2907      1.1  mrg   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
   2908      1.1  mrg     fprintf (stderr, ", CPU freq unknown\n");
   2909      1.1  mrg   else
   2910      1.1  mrg     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
   2911      1.1  mrg 
   2912      1.1  mrg   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
   2913      1.1  mrg            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
   2914      1.1  mrg   fprintf (stderr, "\n");
   2915      1.1  mrg 
   2916      1.1  mrg   time (&start_time);
   2917      1.1  mrg   {
   2918      1.1  mrg     struct tm  *tp;
   2919      1.1  mrg     tp = localtime (&start_time);
   2920      1.1  mrg     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
   2921      1.1  mrg             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
   2922      1.1  mrg 
   2923      1.1  mrg #ifdef __GNUC__
   2924      1.1  mrg     /* gcc sub-minor version doesn't seem to come through as a define */
   2925      1.1  mrg     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
   2926      1.1  mrg #define PRINTED_COMPILER
   2927      1.1  mrg #endif
   2928      1.1  mrg #if defined (__SUNPRO_C)
   2929      1.1  mrg     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
   2930      1.1  mrg #define PRINTED_COMPILER
   2931      1.1  mrg #endif
   2932      1.1  mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
   2933      1.1  mrg     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
   2934      1.1  mrg     printf ("MIPSpro C %d.%d.%d */\n",
   2935      1.1  mrg 	    _COMPILER_VERSION / 100,
   2936      1.1  mrg 	    _COMPILER_VERSION / 10 % 10,
   2937      1.1  mrg 	    _COMPILER_VERSION % 10);
   2938      1.1  mrg #define PRINTED_COMPILER
   2939      1.1  mrg #endif
   2940      1.1  mrg #if defined (__DECC) && defined (__DECC_VER)
   2941      1.1  mrg     printf ("DEC C %d */\n", __DECC_VER);
   2942      1.1  mrg #define PRINTED_COMPILER
   2943      1.1  mrg #endif
   2944      1.1  mrg #if ! defined (PRINTED_COMPILER)
   2945      1.1  mrg     printf ("system compiler */\n");
   2946      1.1  mrg #endif
   2947      1.1  mrg   }
   2948      1.1  mrg   printf ("\n");
   2949      1.1  mrg 
   2950      1.1  mrg   tune_divrem_1 ();
   2951      1.1  mrg   tune_mod_1 ();
   2952      1.1  mrg   tune_preinv_divrem_1 ();
   2953  1.1.1.3  mrg   tune_div_qr_1 ();
   2954  1.1.1.2  mrg #if 0
   2955      1.1  mrg   tune_divrem_2 ();
   2956  1.1.1.2  mrg #endif
   2957  1.1.1.2  mrg   tune_div_qr_2 ();
   2958      1.1  mrg   tune_divexact_1 ();
   2959      1.1  mrg   tune_modexact_1_odd ();
   2960      1.1  mrg   printf("\n");
   2961      1.1  mrg 
   2962  1.1.1.4  mrg   relspeed_div_1_vs_mul_1 ();
   2963  1.1.1.4  mrg   printf("\n");
   2964  1.1.1.4  mrg 
   2965      1.1  mrg   tune_mul_n ();
   2966      1.1  mrg   printf("\n");
   2967      1.1  mrg 
   2968      1.1  mrg   tune_mul ();
   2969      1.1  mrg   printf("\n");
   2970      1.1  mrg 
   2971      1.1  mrg   tune_sqr ();
   2972      1.1  mrg   printf("\n");
   2973      1.1  mrg 
   2974  1.1.1.2  mrg   tune_mulmid ();
   2975  1.1.1.2  mrg   printf("\n");
   2976  1.1.1.2  mrg 
   2977      1.1  mrg   tune_mulmod_bnm1 ();
   2978      1.1  mrg   tune_sqrmod_bnm1 ();
   2979      1.1  mrg   printf("\n");
   2980      1.1  mrg 
   2981      1.1  mrg   tune_fft_mul ();
   2982      1.1  mrg   printf("\n");
   2983      1.1  mrg 
   2984      1.1  mrg   tune_fft_sqr ();
   2985      1.1  mrg   printf ("\n");
   2986      1.1  mrg 
   2987      1.1  mrg   tune_mullo ();
   2988  1.1.1.3  mrg   tune_sqrlo ();
   2989      1.1  mrg   printf("\n");
   2990      1.1  mrg 
   2991      1.1  mrg   tune_dc_div ();
   2992      1.1  mrg   tune_dc_bdiv ();
   2993      1.1  mrg 
   2994      1.1  mrg   printf("\n");
   2995      1.1  mrg   tune_invertappr ();
   2996      1.1  mrg   tune_invert ();
   2997      1.1  mrg   printf("\n");
   2998      1.1  mrg 
   2999      1.1  mrg   tune_binvert ();
   3000      1.1  mrg   tune_redc ();
   3001      1.1  mrg   printf("\n");
   3002      1.1  mrg 
   3003      1.1  mrg   tune_mu_div ();
   3004      1.1  mrg   tune_mu_bdiv ();
   3005      1.1  mrg   printf("\n");
   3006      1.1  mrg 
   3007  1.1.1.2  mrg   tune_powm_sec ();
   3008  1.1.1.2  mrg   printf("\n");
   3009  1.1.1.2  mrg 
   3010  1.1.1.3  mrg   tune_get_str ();
   3011  1.1.1.3  mrg   tune_set_str ();
   3012  1.1.1.3  mrg   printf("\n");
   3013  1.1.1.3  mrg 
   3014  1.1.1.3  mrg   tune_fac_ui ();
   3015  1.1.1.3  mrg   printf("\n");
   3016  1.1.1.3  mrg 
   3017      1.1  mrg   tune_matrix22_mul ();
   3018  1.1.1.4  mrg   tune_hgcd2 ();
   3019      1.1  mrg   tune_hgcd ();
   3020  1.1.1.2  mrg   tune_hgcd_appr ();
   3021  1.1.1.2  mrg   tune_hgcd_reduce();
   3022      1.1  mrg   tune_gcd_dc ();
   3023      1.1  mrg   tune_gcdext_dc ();
   3024      1.1  mrg   tune_jacobi_base ();
   3025      1.1  mrg   printf("\n");
   3026      1.1  mrg 
   3027      1.1  mrg   time (&end_time);
   3028      1.1  mrg   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
   3029      1.1  mrg           (long) (end_time - start_time));
   3030      1.1  mrg 
   3031      1.1  mrg   TMP_FREE;
   3032      1.1  mrg }
   3033      1.1  mrg 
   3034      1.1  mrg 
   3035      1.1  mrg int
   3036      1.1  mrg main (int argc, char *argv[])
   3037      1.1  mrg {
   3038      1.1  mrg   int  opt;
   3039      1.1  mrg 
   3040      1.1  mrg   /* Unbuffered so if output is redirected to a file it isn't lost if the
   3041      1.1  mrg      program is killed part way through.  */
   3042      1.1  mrg   setbuf (stdout, NULL);
   3043      1.1  mrg   setbuf (stderr, NULL);
   3044      1.1  mrg 
   3045      1.1  mrg   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
   3046      1.1  mrg     {
   3047      1.1  mrg       switch (opt) {
   3048      1.1  mrg       case 'f':
   3049      1.1  mrg         if (optarg[0] == 't')
   3050      1.1  mrg           option_fft_trace = 2;
   3051      1.1  mrg         else
   3052      1.1  mrg           option_fft_max_size = atol (optarg);
   3053      1.1  mrg         break;
   3054      1.1  mrg       case 'o':
   3055      1.1  mrg         speed_option_set (optarg);
   3056      1.1  mrg         break;
   3057      1.1  mrg       case 'p':
   3058      1.1  mrg         speed_precision = atoi (optarg);
   3059      1.1  mrg         break;
   3060      1.1  mrg       case 't':
   3061      1.1  mrg         option_trace++;
   3062      1.1  mrg         break;
   3063      1.1  mrg       case '?':
   3064      1.1  mrg         exit(1);
   3065      1.1  mrg       }
   3066      1.1  mrg     }
   3067      1.1  mrg 
   3068      1.1  mrg   all ();
   3069      1.1  mrg   exit (0);
   3070      1.1  mrg }
   3071