1 1.1 mrg /* Create tuned thresholds for various algorithms. 2 1.1 mrg 3 1.1.1.4 mrg Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc. 4 1.1 mrg 5 1.1 mrg This file is part of the GNU MP Library. 6 1.1 mrg 7 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.3 mrg it under the terms of either: 9 1.1.1.3 mrg 10 1.1.1.3 mrg * the GNU Lesser General Public License as published by the Free 11 1.1.1.3 mrg Software Foundation; either version 3 of the License, or (at your 12 1.1.1.3 mrg option) any later version. 13 1.1.1.3 mrg 14 1.1.1.3 mrg or 15 1.1.1.3 mrg 16 1.1.1.3 mrg * the GNU General Public License as published by the Free Software 17 1.1.1.3 mrg Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.3 mrg later version. 19 1.1.1.3 mrg 20 1.1.1.3 mrg or both in parallel, as here. 21 1.1 mrg 22 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.3 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.3 mrg for more details. 26 1.1 mrg 27 1.1.1.3 mrg You should have received copies of the GNU General Public License and the 28 1.1.1.3 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.3 mrg see https://www.gnu.org/licenses/. */ 30 1.1 mrg 31 1.1 mrg 32 1.1 mrg /* Usage: tuneup [-t] [-t] [-p precision] 33 1.1 mrg 34 1.1 mrg -t turns on some diagnostic traces, a second -t turns on more traces. 35 1.1 mrg 36 1.1 mrg Notes: 37 1.1 mrg 38 1.1 mrg The code here isn't a vision of loveliness, mainly because it's subject 39 1.1 mrg to ongoing changes according to new things wanting to be tuned, and 40 1.1 mrg practical requirements of systems tested. 41 1.1 mrg 42 1.1 mrg Sometimes running the program twice produces slightly different results. 43 1.1 mrg This is probably because there's so little separating algorithms near 44 1.1 mrg their crossover, and on that basis it should make little or no difference 45 1.1 mrg to the final speed of the relevant routines, but nothing has been done to 46 1.1 mrg check that carefully. 47 1.1 mrg 48 1.1 mrg Algorithm: 49 1.1 mrg 50 1.1 mrg The thresholds are determined as follows. A crossover may not be a 51 1.1 mrg single size but rather a range where it oscillates between method A or 52 1.1 mrg method B faster. If the threshold is set making B used where A is faster 53 1.1 mrg (or vice versa) that's bad. Badness is the percentage time lost and 54 1.1 mrg total badness is the sum of this over all sizes measured. The threshold 55 1.1 mrg is set to minimize total badness. 56 1.1 mrg 57 1.1 mrg Suppose, as sizes increase, method B becomes faster than method A. The 58 1.1 mrg effect of the rule is that, as you look at increasing sizes, isolated 59 1.1 mrg points where B is faster are ignored, but when it's consistently faster, 60 1.1 mrg or faster on balance, then the threshold is set there. The same result 61 1.1 mrg is obtained thinking in the other direction of A becoming faster at 62 1.1 mrg smaller sizes. 63 1.1 mrg 64 1.1 mrg In practice the thresholds tend to be chosen to bring on the next 65 1.1 mrg algorithm fairly quickly. 66 1.1 mrg 67 1.1 mrg This rule is attractive because it's got a basis in reason and is fairly 68 1.1 mrg easy to implement, but no work has been done to actually compare it in 69 1.1 mrg absolute terms to other possibilities. 70 1.1 mrg 71 1.1 mrg Implementation: 72 1.1 mrg 73 1.1 mrg In a normal library build the thresholds are constants. To tune them 74 1.1 mrg selected objects are recompiled with the thresholds as global variables 75 1.1 mrg instead. #define TUNE_PROGRAM_BUILD does this, with help from code at 76 1.1 mrg the end of gmp-impl.h, and rules in tune/Makefile.am. 77 1.1 mrg 78 1.1 mrg MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The 79 1.1 mrg threshold is set to "size+1" to avoid karatsuba, or to "size" to use one 80 1.1 mrg level, but recurse into the basecase. 81 1.1 mrg 82 1.1 mrg MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. 83 1.1 mrg Other routines in turn will make use of both of those. Naturally the 84 1.1 mrg dependants must be tuned first. 85 1.1 mrg 86 1.1 mrg In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, 87 1.1 mrg just a threshold based on comparing two routines (mpn_divrem_1 and 88 1.1 mrg mpn_divexact_1), and no further use of the value determined. 89 1.1 mrg 90 1.1 mrg Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being 91 1.1 mrg just comparisons between certain routines on representative data. 92 1.1 mrg 93 1.1 mrg Shortcuts are applied when native (assembler) versions of routines exist. 94 1.1 mrg For instance a native mpn_sqr_basecase is assumed to be always faster 95 1.1 mrg than mpn_mul_basecase, with no measuring. 96 1.1 mrg 97 1.1 mrg No attempt is made to tune within assembler routines, for instance 98 1.1 mrg DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be 99 1.1 mrg written and tuned all by hand. Assembler routines that might have hard 100 1.1 mrg limits are recompiled though, to make them accept a bigger range of sizes 101 1.1 mrg than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. 102 1.1 mrg 103 1.1 mrg Limitations: 104 1.1 mrg 105 1.1 mrg The FFTs aren't subject to the same badness rule as the other thresholds, 106 1.1 mrg so each k is probably being brought on a touch early. This isn't likely 107 1.1 mrg to make a difference, and the simpler probing means fewer tests. 108 1.1 mrg 109 1.1 mrg */ 110 1.1 mrg 111 1.1 mrg #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ 112 1.1 mrg 113 1.1 mrg #include "config.h" 114 1.1 mrg 115 1.1 mrg #include <math.h> 116 1.1 mrg #include <stdio.h> 117 1.1 mrg #include <stdlib.h> 118 1.1 mrg #include <time.h> 119 1.1 mrg #if HAVE_UNISTD_H 120 1.1 mrg #include <unistd.h> 121 1.1 mrg #endif 122 1.1 mrg 123 1.1 mrg #include "gmp-impl.h" 124 1.1 mrg #include "longlong.h" 125 1.1 mrg 126 1.1 mrg #include "tests.h" 127 1.1 mrg #include "speed.h" 128 1.1 mrg 129 1.1 mrg #if !HAVE_DECL_OPTARG 130 1.1 mrg extern char *optarg; 131 1.1 mrg extern int optind, opterr; 132 1.1 mrg #endif 133 1.1 mrg 134 1.1 mrg 135 1.1 mrg #define DEFAULT_MAX_SIZE 1000 /* limbs */ 136 1.1 mrg 137 1.1 mrg #if WANT_FFT 138 1.1 mrg mp_size_t option_fft_max_size = 50000; /* limbs */ 139 1.1 mrg #else 140 1.1 mrg mp_size_t option_fft_max_size = 0; 141 1.1 mrg #endif 142 1.1 mrg int option_trace = 0; 143 1.1 mrg int option_fft_trace = 0; 144 1.1 mrg struct speed_params s; 145 1.1 mrg 146 1.1 mrg struct dat_t { 147 1.1 mrg mp_size_t size; 148 1.1 mrg double d; 149 1.1 mrg } *dat = NULL; 150 1.1 mrg int ndat = 0; 151 1.1 mrg int allocdat = 0; 152 1.1 mrg 153 1.1 mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that 154 1.1 mrg case use zero here, which for params.max_size means no limit. */ 155 1.1 mrg #ifndef TUNE_SQR_TOOM2_MAX 156 1.1 mrg #define TUNE_SQR_TOOM2_MAX 0 157 1.1 mrg #endif 158 1.1 mrg 159 1.1 mrg mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; 160 1.1 mrg mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; 161 1.1 mrg mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; 162 1.1 mrg mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; 163 1.1 mrg mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; 164 1.1 mrg mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; 165 1.1 mrg mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; 166 1.1 mrg mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; 167 1.1 mrg mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; 168 1.1.1.2 mrg mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX; 169 1.1 mrg mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; 170 1.1 mrg mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; 171 1.1 mrg mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; 172 1.1 mrg mp_size_t sqr_toom2_threshold 173 1.1 mrg = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); 174 1.1 mrg mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 175 1.1 mrg mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 176 1.1 mrg mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 177 1.1 mrg mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 178 1.1 mrg mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; 179 1.1 mrg mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; 180 1.1 mrg mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; 181 1.1 mrg mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; 182 1.1 mrg mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; 183 1.1.1.3 mrg mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX; 184 1.1.1.3 mrg mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX; 185 1.1.1.3 mrg mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX; 186 1.1.1.2 mrg mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; 187 1.1 mrg mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; 188 1.1 mrg mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; 189 1.1.1.2 mrg mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX; 190 1.1 mrg mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; 191 1.1 mrg mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; 192 1.1 mrg mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; 193 1.1 mrg mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; 194 1.1 mrg mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 195 1.1 mrg mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; 196 1.1 mrg mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; 197 1.1 mrg mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; 198 1.1 mrg mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; 199 1.1 mrg mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; 200 1.1 mrg mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; 201 1.1 mrg mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; 202 1.1 mrg mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; 203 1.1 mrg mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; 204 1.1 mrg mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; 205 1.1 mrg mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; 206 1.1 mrg mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; 207 1.1 mrg mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; 208 1.1 mrg mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 209 1.1.1.2 mrg mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; 210 1.1.1.2 mrg mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; 211 1.1 mrg mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; 212 1.1 mrg mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; 213 1.1.1.3 mrg int div_qr_1n_pi1_method = 0; 214 1.1.1.3 mrg mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX; 215 1.1.1.3 mrg mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX; 216 1.1 mrg mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; 217 1.1 mrg mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; 218 1.1 mrg mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; 219 1.1 mrg mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; 220 1.1.1.2 mrg int mod_1_1p_method = 0; 221 1.1 mrg mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; 222 1.1 mrg mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; 223 1.1 mrg mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; 224 1.1 mrg mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; 225 1.1 mrg mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; 226 1.1 mrg mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; 227 1.1 mrg mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; 228 1.1 mrg mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; 229 1.1 mrg mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; 230 1.1 mrg mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; 231 1.1.1.2 mrg mp_size_t fac_odd_threshold = 0; 232 1.1.1.2 mrg mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT; 233 1.1 mrg 234 1.1 mrg mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; 235 1.1 mrg mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; 236 1.1 mrg 237 1.1 mrg struct param_t { 238 1.1 mrg const char *name; 239 1.1 mrg speed_function_t function; 240 1.1 mrg speed_function_t function2; 241 1.1 mrg double step_factor; /* how much to step relatively */ 242 1.1 mrg int step; /* how much to step absolutely */ 243 1.1 mrg double function_fudge; /* multiplier for "function" speeds */ 244 1.1 mrg int stop_since_change; 245 1.1 mrg double stop_factor; 246 1.1 mrg mp_size_t min_size; 247 1.1 mrg int min_is_always; 248 1.1 mrg mp_size_t max_size; 249 1.1 mrg mp_size_t check_size; 250 1.1 mrg mp_size_t size_extra; 251 1.1 mrg 252 1.1 mrg #define DATA_HIGH_LT_R 1 253 1.1 mrg #define DATA_HIGH_GE_R 2 254 1.1 mrg int data_high; 255 1.1 mrg 256 1.1 mrg int noprint; 257 1.1 mrg }; 258 1.1 mrg 259 1.1 mrg 260 1.1 mrg /* These are normally undefined when false, which suits "#if" fine. 261 1.1 mrg But give them zero values so they can be used in plain C "if"s. */ 262 1.1 mrg #ifndef UDIV_PREINV_ALWAYS 263 1.1 mrg #define UDIV_PREINV_ALWAYS 0 264 1.1 mrg #endif 265 1.1 mrg #ifndef HAVE_NATIVE_mpn_divexact_1 266 1.1 mrg #define HAVE_NATIVE_mpn_divexact_1 0 267 1.1 mrg #endif 268 1.1.1.3 mrg #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1 269 1.1.1.3 mrg #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0 270 1.1.1.3 mrg #endif 271 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_1 272 1.1 mrg #define HAVE_NATIVE_mpn_divrem_1 0 273 1.1 mrg #endif 274 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_2 275 1.1 mrg #define HAVE_NATIVE_mpn_divrem_2 0 276 1.1 mrg #endif 277 1.1 mrg #ifndef HAVE_NATIVE_mpn_mod_1 278 1.1 mrg #define HAVE_NATIVE_mpn_mod_1 0 279 1.1 mrg #endif 280 1.1.1.2 mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p 281 1.1.1.2 mrg #define HAVE_NATIVE_mpn_mod_1_1p 0 282 1.1.1.2 mrg #endif 283 1.1 mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd 284 1.1 mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0 285 1.1 mrg #endif 286 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 287 1.1 mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0 288 1.1 mrg #endif 289 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1 290 1.1 mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0 291 1.1 mrg #endif 292 1.1 mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase 293 1.1 mrg #define HAVE_NATIVE_mpn_sqr_basecase 0 294 1.1 mrg #endif 295 1.1 mrg 296 1.1 mrg 297 1.1 mrg #define MAX3(a,b,c) MAX (MAX (a, b), c) 298 1.1 mrg 299 1.1 mrg mp_limb_t 300 1.1 mrg randlimb_norm (void) 301 1.1 mrg { 302 1.1 mrg mp_limb_t n; 303 1.1 mrg mpn_random (&n, 1); 304 1.1 mrg n |= GMP_NUMB_HIGHBIT; 305 1.1 mrg return n; 306 1.1 mrg } 307 1.1 mrg 308 1.1 mrg #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) 309 1.1 mrg 310 1.1 mrg mp_limb_t 311 1.1 mrg randlimb_half (void) 312 1.1 mrg { 313 1.1 mrg mp_limb_t n; 314 1.1 mrg mpn_random (&n, 1); 315 1.1 mrg n &= GMP_NUMB_HALFMASK; 316 1.1 mrg n += (n==0); 317 1.1 mrg return n; 318 1.1 mrg } 319 1.1 mrg 320 1.1 mrg 321 1.1 mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger 322 1.1 mrg if necessary. */ 323 1.1 mrg void 324 1.1 mrg add_dat (mp_size_t size, double d) 325 1.1 mrg { 326 1.1 mrg #define ALLOCDAT_STEP 500 327 1.1 mrg 328 1.1 mrg ASSERT_ALWAYS (ndat <= allocdat); 329 1.1 mrg 330 1.1 mrg if (ndat == allocdat) 331 1.1 mrg { 332 1.1 mrg dat = (struct dat_t *) __gmp_allocate_or_reallocate 333 1.1 mrg (dat, allocdat * sizeof(dat[0]), 334 1.1 mrg (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); 335 1.1 mrg allocdat += ALLOCDAT_STEP; 336 1.1 mrg } 337 1.1 mrg 338 1.1 mrg dat[ndat].size = size; 339 1.1 mrg dat[ndat].d = d; 340 1.1 mrg ndat++; 341 1.1 mrg } 342 1.1 mrg 343 1.1 mrg 344 1.1 mrg /* Return the threshold size based on the data accumulated. */ 345 1.1 mrg mp_size_t 346 1.1 mrg analyze_dat (int final) 347 1.1 mrg { 348 1.1 mrg double x, min_x; 349 1.1 mrg int j, min_j; 350 1.1 mrg 351 1.1 mrg /* If the threshold is set at dat[0].size, any positive values are bad. */ 352 1.1 mrg x = 0.0; 353 1.1 mrg for (j = 0; j < ndat; j++) 354 1.1 mrg if (dat[j].d > 0.0) 355 1.1 mrg x += dat[j].d; 356 1.1 mrg 357 1.1 mrg if (option_trace >= 2 && final) 358 1.1 mrg { 359 1.1 mrg printf ("\n"); 360 1.1 mrg printf ("x is the sum of the badness from setting thresh at given size\n"); 361 1.1 mrg printf (" (minimum x is sought)\n"); 362 1.1 mrg printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); 363 1.1 mrg } 364 1.1 mrg 365 1.1 mrg min_x = x; 366 1.1 mrg min_j = 0; 367 1.1 mrg 368 1.1 mrg 369 1.1 mrg /* When stepping to the next dat[j].size, positive values are no longer 370 1.1 mrg bad (so subtracted), negative values become bad (so add the absolute 371 1.1 mrg value, meaning subtract). */ 372 1.1 mrg for (j = 0; j < ndat; x -= dat[j].d, j++) 373 1.1 mrg { 374 1.1 mrg if (option_trace >= 2 && final) 375 1.1 mrg printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); 376 1.1 mrg 377 1.1 mrg if (x < min_x) 378 1.1 mrg { 379 1.1 mrg min_x = x; 380 1.1 mrg min_j = j; 381 1.1 mrg } 382 1.1 mrg } 383 1.1 mrg 384 1.1 mrg return min_j; 385 1.1 mrg } 386 1.1 mrg 387 1.1 mrg 388 1.1.1.3 mrg /* Measuring for recompiled mpn/generic/div_qr_1.c, 389 1.1.1.3 mrg * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */ 390 1.1.1.3 mrg 391 1.1.1.3 mrg mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t); 392 1.1.1.3 mrg 393 1.1.1.3 mrg #if defined (__cplusplus) 394 1.1.1.3 mrg extern "C" { 395 1.1.1.3 mrg #endif 396 1.1 mrg 397 1.1.1.2 mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 398 1.1.1.2 mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t); 399 1.1.1.2 mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long); 400 1.1 mrg 401 1.1.1.3 mrg #if defined (__cplusplus) 402 1.1.1.3 mrg } 403 1.1.1.3 mrg #endif 404 1.1.1.3 mrg 405 1.1 mrg double 406 1.1 mrg speed_mpn_mod_1_tune (struct speed_params *s) 407 1.1 mrg { 408 1.1 mrg SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); 409 1.1 mrg } 410 1.1 mrg double 411 1.1 mrg speed_mpn_divrem_1_tune (struct speed_params *s) 412 1.1 mrg { 413 1.1 mrg SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); 414 1.1 mrg } 415 1.1.1.2 mrg double 416 1.1.1.2 mrg speed_mpz_fac_ui_tune (struct speed_params *s) 417 1.1.1.2 mrg { 418 1.1.1.2 mrg SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune); 419 1.1.1.2 mrg } 420 1.1.1.3 mrg double 421 1.1.1.3 mrg speed_mpn_div_qr_1_tune (struct speed_params *s) 422 1.1.1.3 mrg { 423 1.1.1.3 mrg SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune); 424 1.1.1.3 mrg } 425 1.1 mrg 426 1.1 mrg double 427 1.1 mrg tuneup_measure (speed_function_t fun, 428 1.1 mrg const struct param_t *param, 429 1.1 mrg struct speed_params *s) 430 1.1 mrg { 431 1.1 mrg static struct param_t dummy; 432 1.1 mrg double t; 433 1.1 mrg TMP_DECL; 434 1.1 mrg 435 1.1 mrg if (! param) 436 1.1 mrg param = &dummy; 437 1.1 mrg 438 1.1 mrg s->size += param->size_extra; 439 1.1 mrg 440 1.1 mrg TMP_MARK; 441 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); 442 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); 443 1.1 mrg 444 1.1 mrg mpn_random (s->xp, s->size); 445 1.1 mrg mpn_random (s->yp, s->size); 446 1.1 mrg 447 1.1 mrg switch (param->data_high) { 448 1.1 mrg case DATA_HIGH_LT_R: 449 1.1 mrg s->xp[s->size-1] %= s->r; 450 1.1 mrg s->yp[s->size-1] %= s->r; 451 1.1 mrg break; 452 1.1 mrg case DATA_HIGH_GE_R: 453 1.1 mrg s->xp[s->size-1] |= s->r; 454 1.1 mrg s->yp[s->size-1] |= s->r; 455 1.1 mrg break; 456 1.1 mrg } 457 1.1 mrg 458 1.1 mrg t = speed_measure (fun, s); 459 1.1 mrg 460 1.1 mrg s->size -= param->size_extra; 461 1.1 mrg 462 1.1 mrg TMP_FREE; 463 1.1 mrg return t; 464 1.1 mrg } 465 1.1 mrg 466 1.1 mrg 467 1.1 mrg #define PRINT_WIDTH 31 468 1.1 mrg 469 1.1 mrg void 470 1.1 mrg print_define_start (const char *name) 471 1.1 mrg { 472 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name); 473 1.1 mrg if (option_trace) 474 1.1 mrg printf ("...\n"); 475 1.1 mrg } 476 1.1 mrg 477 1.1 mrg void 478 1.1 mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark) 479 1.1 mrg { 480 1.1 mrg if (option_trace) 481 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name); 482 1.1 mrg 483 1.1 mrg if (value == MP_SIZE_T_MAX) 484 1.1 mrg printf ("MP_SIZE_T_MAX"); 485 1.1 mrg else 486 1.1 mrg printf ("%5ld", (long) value); 487 1.1 mrg 488 1.1 mrg if (remark != NULL) 489 1.1 mrg printf (" /* %s */", remark); 490 1.1 mrg printf ("\n"); 491 1.1 mrg fflush (stdout); 492 1.1 mrg } 493 1.1 mrg 494 1.1 mrg void 495 1.1 mrg print_define_end (const char *name, mp_size_t value) 496 1.1 mrg { 497 1.1 mrg const char *remark; 498 1.1 mrg if (value == MP_SIZE_T_MAX) 499 1.1 mrg remark = "never"; 500 1.1 mrg else if (value == 0) 501 1.1 mrg remark = "always"; 502 1.1 mrg else 503 1.1 mrg remark = NULL; 504 1.1 mrg print_define_end_remark (name, value, remark); 505 1.1 mrg } 506 1.1 mrg 507 1.1 mrg void 508 1.1 mrg print_define (const char *name, mp_size_t value) 509 1.1 mrg { 510 1.1 mrg print_define_start (name); 511 1.1 mrg print_define_end (name, value); 512 1.1 mrg } 513 1.1 mrg 514 1.1 mrg void 515 1.1 mrg print_define_remark (const char *name, mp_size_t value, const char *remark) 516 1.1 mrg { 517 1.1 mrg print_define_start (name); 518 1.1 mrg print_define_end_remark (name, value, remark); 519 1.1 mrg } 520 1.1 mrg 521 1.1.1.4 mrg void 522 1.1.1.4 mrg print_define_with_speedup (const char *name, mp_size_t value, 523 1.1.1.4 mrg mp_size_t runner_up, double speedup) 524 1.1.1.4 mrg { 525 1.1.1.4 mrg char buf[100]; 526 1.1.1.5 mrg #if __STDC_VERSION__ >= 199901L 527 1.1.1.5 mrg snprintf (buf, sizeof buf, "%.2f%% faster than %ld", 528 1.1.1.4 mrg 100.0 * (speedup - 1), runner_up); 529 1.1.1.5 mrg #else 530 1.1.1.5 mrg sprintf (buf, "%.2f%% faster than %ld", 531 1.1.1.5 mrg 100.0 * (speedup - 1), runner_up); 532 1.1.1.5 mrg #endif 533 1.1.1.4 mrg print_define_remark (name, value, buf); 534 1.1.1.4 mrg } 535 1.1 mrg 536 1.1 mrg void 537 1.1 mrg one (mp_size_t *threshold, struct param_t *param) 538 1.1 mrg { 539 1.1 mrg int since_positive, since_thresh_change; 540 1.1 mrg int thresh_idx, new_thresh_idx; 541 1.1 mrg 542 1.1 mrg #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) 543 1.1 mrg 544 1.1 mrg DEFAULT (param->function_fudge, 1.0); 545 1.1 mrg DEFAULT (param->function2, param->function); 546 1.1 mrg DEFAULT (param->step_factor, 0.01); /* small steps by default */ 547 1.1 mrg DEFAULT (param->step, 1); /* small steps by default */ 548 1.1 mrg DEFAULT (param->stop_since_change, 80); 549 1.1 mrg DEFAULT (param->stop_factor, 1.2); 550 1.1 mrg DEFAULT (param->min_size, 10); 551 1.1 mrg DEFAULT (param->max_size, DEFAULT_MAX_SIZE); 552 1.1 mrg 553 1.1 mrg if (param->check_size != 0) 554 1.1 mrg { 555 1.1 mrg double t1, t2; 556 1.1 mrg s.size = param->check_size; 557 1.1 mrg 558 1.1 mrg *threshold = s.size+1; 559 1.1 mrg t1 = tuneup_measure (param->function, param, &s); 560 1.1 mrg 561 1.1 mrg *threshold = s.size; 562 1.1 mrg t2 = tuneup_measure (param->function2, param, &s); 563 1.1 mrg if (t1 == -1.0 || t2 == -1.0) 564 1.1 mrg { 565 1.1 mrg printf ("Oops, can't run both functions at size %ld\n", 566 1.1 mrg (long) s.size); 567 1.1 mrg abort (); 568 1.1 mrg } 569 1.1 mrg t1 *= param->function_fudge; 570 1.1 mrg 571 1.1 mrg /* ask that t2 is at least 4% below t1 */ 572 1.1 mrg if (t1 < t2*1.04) 573 1.1 mrg { 574 1.1 mrg if (option_trace) 575 1.1 mrg printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); 576 1.1 mrg *threshold = MP_SIZE_T_MAX; 577 1.1 mrg if (! param->noprint) 578 1.1 mrg print_define (param->name, *threshold); 579 1.1 mrg return; 580 1.1 mrg } 581 1.1 mrg 582 1.1 mrg if (option_trace >= 2) 583 1.1 mrg printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", 584 1.1 mrg (long) s.size, t1, t2); 585 1.1 mrg } 586 1.1 mrg 587 1.1 mrg if (! param->noprint || option_trace) 588 1.1 mrg print_define_start (param->name); 589 1.1 mrg 590 1.1 mrg ndat = 0; 591 1.1 mrg since_positive = 0; 592 1.1 mrg since_thresh_change = 0; 593 1.1 mrg thresh_idx = 0; 594 1.1 mrg 595 1.1 mrg if (option_trace >= 2) 596 1.1 mrg { 597 1.1 mrg printf (" algorithm-A algorithm-B ratio possible\n"); 598 1.1 mrg printf (" (seconds) (seconds) diff thresh\n"); 599 1.1 mrg } 600 1.1 mrg 601 1.1 mrg for (s.size = param->min_size; 602 1.1 mrg s.size < param->max_size; 603 1.1 mrg s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) 604 1.1 mrg { 605 1.1 mrg double ti, tiplus1, d; 606 1.1 mrg 607 1.1 mrg /* 608 1.1 mrg FIXME: check minimum size requirements are met, possibly by just 609 1.1 mrg checking for the -1 returns from the speed functions. 610 1.1 mrg */ 611 1.1 mrg 612 1.1 mrg /* using method A at this size */ 613 1.1 mrg *threshold = s.size+1; 614 1.1 mrg ti = tuneup_measure (param->function, param, &s); 615 1.1 mrg if (ti == -1.0) 616 1.1 mrg abort (); 617 1.1 mrg ti *= param->function_fudge; 618 1.1 mrg 619 1.1 mrg /* using method B at this size */ 620 1.1 mrg *threshold = s.size; 621 1.1 mrg tiplus1 = tuneup_measure (param->function2, param, &s); 622 1.1 mrg if (tiplus1 == -1.0) 623 1.1 mrg abort (); 624 1.1 mrg 625 1.1 mrg /* Calculate the fraction by which the one or the other routine is 626 1.1 mrg slower. */ 627 1.1 mrg if (tiplus1 >= ti) 628 1.1 mrg d = (tiplus1 - ti) / tiplus1; /* negative */ 629 1.1 mrg else 630 1.1 mrg d = (tiplus1 - ti) / ti; /* positive */ 631 1.1 mrg 632 1.1 mrg add_dat (s.size, d); 633 1.1 mrg 634 1.1 mrg new_thresh_idx = analyze_dat (0); 635 1.1 mrg 636 1.1 mrg if (option_trace >= 2) 637 1.1 mrg printf ("size=%ld %.9f %.9f % .4f %c %ld\n", 638 1.1 mrg (long) s.size, ti, tiplus1, d, 639 1.1 mrg ti > tiplus1 ? '#' : ' ', 640 1.1 mrg (long) dat[new_thresh_idx].size); 641 1.1 mrg 642 1.1 mrg /* Stop if the last time method i was faster was more than a 643 1.1 mrg certain number of measurements ago. */ 644 1.1 mrg #define STOP_SINCE_POSITIVE 200 645 1.1 mrg if (d >= 0) 646 1.1 mrg since_positive = 0; 647 1.1 mrg else 648 1.1 mrg if (++since_positive > STOP_SINCE_POSITIVE) 649 1.1 mrg { 650 1.1 mrg if (option_trace >= 1) 651 1.1 mrg printf ("stopped due to since_positive (%d)\n", 652 1.1 mrg STOP_SINCE_POSITIVE); 653 1.1 mrg break; 654 1.1 mrg } 655 1.1 mrg 656 1.1 mrg /* Stop if method A has become slower by a certain factor. */ 657 1.1 mrg if (ti >= tiplus1 * param->stop_factor) 658 1.1 mrg { 659 1.1 mrg if (option_trace >= 1) 660 1.1 mrg printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", 661 1.1 mrg param->stop_factor); 662 1.1 mrg break; 663 1.1 mrg } 664 1.1 mrg 665 1.1 mrg /* Stop if the threshold implied hasn't changed in a certain 666 1.1 mrg number of measurements. (It's this condition that usually 667 1.1 mrg stops the loop.) */ 668 1.1 mrg if (thresh_idx != new_thresh_idx) 669 1.1 mrg since_thresh_change = 0, thresh_idx = new_thresh_idx; 670 1.1 mrg else 671 1.1 mrg if (++since_thresh_change > param->stop_since_change) 672 1.1 mrg { 673 1.1 mrg if (option_trace >= 1) 674 1.1 mrg printf ("stopped due to since_thresh_change (%d)\n", 675 1.1 mrg param->stop_since_change); 676 1.1 mrg break; 677 1.1 mrg } 678 1.1 mrg 679 1.1 mrg /* Stop if the threshold implied is more than a certain number of 680 1.1 mrg measurements ago. */ 681 1.1 mrg #define STOP_SINCE_AFTER 500 682 1.1 mrg if (ndat - thresh_idx > STOP_SINCE_AFTER) 683 1.1 mrg { 684 1.1 mrg if (option_trace >= 1) 685 1.1 mrg printf ("stopped due to ndat - thresh_idx > amount (%d)\n", 686 1.1 mrg STOP_SINCE_AFTER); 687 1.1 mrg break; 688 1.1 mrg } 689 1.1 mrg 690 1.1 mrg /* Stop when the size limit is reached before the end of the 691 1.1 mrg crossover, but only show this as an error for >= the default max 692 1.1 mrg size. FIXME: Maybe should make it a param choice whether this is 693 1.1 mrg an error. */ 694 1.1 mrg if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) 695 1.1 mrg { 696 1.1 mrg fprintf (stderr, "%s\n", param->name); 697 1.1 mrg fprintf (stderr, "sizes %ld to %ld total %d measurements\n", 698 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat); 699 1.1 mrg fprintf (stderr, " max size reached before end of crossover\n"); 700 1.1 mrg break; 701 1.1 mrg } 702 1.1 mrg } 703 1.1 mrg 704 1.1 mrg if (option_trace >= 1) 705 1.1 mrg printf ("sizes %ld to %ld total %d measurements\n", 706 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat); 707 1.1 mrg 708 1.1 mrg *threshold = dat[analyze_dat (1)].size; 709 1.1 mrg 710 1.1 mrg if (param->min_is_always) 711 1.1 mrg { 712 1.1 mrg if (*threshold == param->min_size) 713 1.1 mrg *threshold = 0; 714 1.1 mrg } 715 1.1 mrg 716 1.1 mrg if (! param->noprint || option_trace) 717 1.1 mrg print_define_end (param->name, *threshold); 718 1.1 mrg } 719 1.1 mrg 720 1.1.1.4 mrg /* Time N different FUNCTIONS with the same parameters and size, to 721 1.1.1.4 mrg select the fastest. Since *_METHOD defines start numbering from 722 1.1.1.4 mrg one, if functions[i] is fastest, the value of the define is i+1. 723 1.1.1.4 mrg Also output a comment with speedup compared to the next fastest 724 1.1.1.4 mrg function. The NAME argument is used only for trace output. 725 1.1.1.4 mrg 726 1.1.1.4 mrg Returns the index of the fastest function. 727 1.1.1.4 mrg */ 728 1.1.1.4 mrg int 729 1.1.1.4 mrg one_method (int n, speed_function_t *functions, 730 1.1.1.4 mrg const char *name, const char *define, 731 1.1.1.4 mrg const struct param_t *param) 732 1.1.1.4 mrg { 733 1.1.1.4 mrg double *t; 734 1.1.1.4 mrg int i; 735 1.1.1.4 mrg int method; 736 1.1.1.4 mrg int method_runner_up; 737 1.1.1.4 mrg 738 1.1.1.4 mrg TMP_DECL; 739 1.1.1.4 mrg TMP_MARK; 740 1.1.1.4 mrg t = (double*) TMP_ALLOC (n * sizeof (*t)); 741 1.1.1.4 mrg 742 1.1.1.4 mrg for (i = 0; i < n; i++) 743 1.1.1.4 mrg { 744 1.1.1.4 mrg t[i] = tuneup_measure (functions[i], param, &s); 745 1.1.1.4 mrg if (option_trace >= 1) 746 1.1.1.4 mrg printf ("size=%ld, %s, method %d %.9f\n", 747 1.1.1.4 mrg (long) s.size, name, i + 1, t[i]); 748 1.1.1.4 mrg if (t[i] == -1.0) 749 1.1.1.4 mrg { 750 1.1.1.4 mrg printf ("Oops, can't measure all %s methods\n", name); 751 1.1.1.4 mrg abort (); 752 1.1.1.4 mrg } 753 1.1.1.4 mrg } 754 1.1.1.4 mrg method = 0; 755 1.1.1.4 mrg for (i = 1; i < n; i++) 756 1.1.1.4 mrg if (t[i] < t[method]) 757 1.1.1.4 mrg method = i; 758 1.1.1.4 mrg 759 1.1.1.4 mrg method_runner_up = (method == 0); 760 1.1.1.4 mrg for (i = 0; i < n; i++) 761 1.1.1.4 mrg if (i != method && t[i] < t[method_runner_up]) 762 1.1.1.4 mrg method_runner_up = i; 763 1.1.1.4 mrg 764 1.1.1.4 mrg print_define_with_speedup (define, method + 1, method_runner_up + 1, 765 1.1.1.4 mrg t[method_runner_up] / t[method]); 766 1.1.1.4 mrg 767 1.1.1.4 mrg TMP_FREE; 768 1.1.1.4 mrg return method; 769 1.1.1.4 mrg } 770 1.1.1.4 mrg 771 1.1 mrg 772 1.1 mrg /* Special probing for the fft thresholds. The size restrictions on the 773 1.1 mrg FFTs mean the graph of time vs size has a step effect. See this for 774 1.1 mrg example using 775 1.1 mrg 776 1.1 mrg ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 777 1.1 mrg gnuplot foo.gnuplot 778 1.1 mrg 779 1.1 mrg The current approach is to compare routines at the midpoint of relevant 780 1.1 mrg steps. Arguably a more sophisticated system of threshold data is wanted 781 1.1 mrg if this step effect remains. */ 782 1.1 mrg 783 1.1 mrg struct fft_param_t { 784 1.1 mrg const char *table_name; 785 1.1 mrg const char *threshold_name; 786 1.1 mrg const char *modf_threshold_name; 787 1.1 mrg mp_size_t *p_threshold; 788 1.1 mrg mp_size_t *p_modf_threshold; 789 1.1 mrg mp_size_t first_size; 790 1.1 mrg mp_size_t max_size; 791 1.1 mrg speed_function_t function; 792 1.1 mrg speed_function_t mul_modf_function; 793 1.1 mrg speed_function_t mul_function; 794 1.1 mrg mp_size_t sqr; 795 1.1 mrg }; 796 1.1 mrg 797 1.1 mrg 798 1.1 mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with 799 1.1 mrg N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple 800 1.1 mrg of 2^(k-1) bits. */ 801 1.1 mrg 802 1.1 mrg mp_size_t 803 1.1 mrg fft_step_size (int k) 804 1.1 mrg { 805 1.1 mrg mp_size_t step; 806 1.1 mrg 807 1.1 mrg step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; 808 1.1 mrg step *= (mp_size_t) 1 << k; 809 1.1 mrg 810 1.1 mrg if (step <= 0) 811 1.1 mrg { 812 1.1 mrg printf ("Can't handle k=%d\n", k); 813 1.1 mrg abort (); 814 1.1 mrg } 815 1.1 mrg 816 1.1 mrg return step; 817 1.1 mrg } 818 1.1 mrg 819 1.1 mrg mp_size_t 820 1.1 mrg fft_next_size (mp_size_t pl, int k) 821 1.1 mrg { 822 1.1 mrg mp_size_t m = fft_step_size (k); 823 1.1 mrg 824 1.1 mrg /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ 825 1.1 mrg 826 1.1 mrg if (pl == 0 || (pl & (m-1)) != 0) 827 1.1 mrg pl = (pl | (m-1)) + 1; 828 1.1 mrg 829 1.1 mrg /* printf (" %ld\n", pl); */ 830 1.1 mrg return pl; 831 1.1 mrg } 832 1.1 mrg 833 1.1 mrg #define NMAX_DEFAULT 1000000 834 1.1 mrg #define MAX_REPS 25 835 1.1 mrg #define MIN_REPS 5 836 1.1 mrg 837 1.1 mrg static inline size_t 838 1.1 mrg mpn_mul_fft_lcm (size_t a, unsigned int k) 839 1.1 mrg { 840 1.1 mrg unsigned int l = k; 841 1.1 mrg 842 1.1 mrg while (a % 2 == 0 && k > 0) 843 1.1 mrg { 844 1.1 mrg a >>= 1; 845 1.1 mrg k--; 846 1.1 mrg } 847 1.1 mrg return a << l; 848 1.1 mrg } 849 1.1 mrg 850 1.1 mrg mp_size_t 851 1.1 mrg fftfill (mp_size_t pl, int k, int sqr) 852 1.1 mrg { 853 1.1 mrg mp_size_t maxLK; 854 1.1 mrg mp_bitcnt_t N, Nprime, nprime, M; 855 1.1 mrg 856 1.1 mrg N = pl * GMP_NUMB_BITS; 857 1.1 mrg M = N >> k; 858 1.1 mrg 859 1.1 mrg maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); 860 1.1 mrg 861 1.1 mrg Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; 862 1.1 mrg nprime = Nprime / GMP_NUMB_BITS; 863 1.1 mrg if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 864 1.1 mrg { 865 1.1 mrg size_t K2; 866 1.1 mrg for (;;) 867 1.1 mrg { 868 1.1 mrg K2 = 1L << mpn_fft_best_k (nprime, sqr); 869 1.1 mrg if ((nprime & (K2 - 1)) == 0) 870 1.1 mrg break; 871 1.1 mrg nprime = (nprime + K2 - 1) & -K2; 872 1.1 mrg Nprime = nprime * GMP_LIMB_BITS; 873 1.1 mrg } 874 1.1 mrg } 875 1.1 mrg ASSERT_ALWAYS (nprime < pl); 876 1.1 mrg 877 1.1 mrg return Nprime; 878 1.1 mrg } 879 1.1 mrg 880 1.1 mrg static int 881 1.1 mrg compare_double (const void *ap, const void *bp) 882 1.1 mrg { 883 1.1 mrg double a = * (const double *) ap; 884 1.1 mrg double b = * (const double *) bp; 885 1.1 mrg 886 1.1 mrg if (a < b) 887 1.1 mrg return -1; 888 1.1 mrg else if (a > b) 889 1.1 mrg return 1; 890 1.1 mrg else 891 1.1 mrg return 0; 892 1.1 mrg } 893 1.1 mrg 894 1.1 mrg double 895 1.1 mrg median (double *times, int n) 896 1.1 mrg { 897 1.1 mrg qsort (times, n, sizeof (double), compare_double); 898 1.1 mrg return times[n/2]; 899 1.1 mrg } 900 1.1 mrg 901 1.1 mrg #define FFT_CACHE_SIZE 25 902 1.1 mrg typedef struct fft_cache 903 1.1 mrg { 904 1.1 mrg mp_size_t n; 905 1.1 mrg double time; 906 1.1 mrg } fft_cache_t; 907 1.1 mrg 908 1.1 mrg fft_cache_t fft_cache[FFT_CACHE_SIZE]; 909 1.1 mrg 910 1.1 mrg double 911 1.1 mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, 912 1.1 mrg int n_measurements) 913 1.1 mrg { 914 1.1 mrg int i; 915 1.1 mrg double t, ttab[MAX_REPS]; 916 1.1 mrg 917 1.1 mrg if (fft_cache[k].n == n) 918 1.1 mrg return fft_cache[k].time; 919 1.1 mrg 920 1.1 mrg for (i = 0; i < n_measurements; i++) 921 1.1 mrg { 922 1.1 mrg speed_starttime (); 923 1.1 mrg mpn_mul_fft (rp, n, ap, n, bp, n, k); 924 1.1 mrg ttab[i] = speed_endtime (); 925 1.1 mrg } 926 1.1 mrg 927 1.1 mrg t = median (ttab, n_measurements); 928 1.1 mrg fft_cache[k].n = n; 929 1.1 mrg fft_cache[k].time = t; 930 1.1 mrg return t; 931 1.1 mrg } 932 1.1 mrg 933 1.1 mrg #define INSERT_FFTTAB(idx, nval, kval) \ 934 1.1 mrg do { \ 935 1.1 mrg fft_tab[idx].n = nval; \ 936 1.1 mrg fft_tab[idx].k = kval; \ 937 1.1.1.3 mrg fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \ 938 1.1.1.3 mrg fft_tab[idx+1].k = (1 << 5) - 1; \ 939 1.1 mrg } while (0) 940 1.1 mrg 941 1.1 mrg int 942 1.1 mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) 943 1.1 mrg { 944 1.1 mrg mp_size_t n, n1, prev_n1; 945 1.1 mrg int k, best_k, last_best_k, kmax; 946 1.1 mrg int eff, prev_eff; 947 1.1 mrg double t0, t1; 948 1.1 mrg int n_measurements; 949 1.1 mrg mp_limb_t *ap, *bp, *rp; 950 1.1 mrg mp_size_t alloc; 951 1.1 mrg struct fft_table_nk *fft_tab; 952 1.1 mrg 953 1.1 mrg fft_tab = mpn_fft_table3[p->sqr]; 954 1.1 mrg 955 1.1 mrg for (k = 0; k < FFT_CACHE_SIZE; k++) 956 1.1 mrg fft_cache[k].n = 0; 957 1.1 mrg 958 1.1 mrg if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 959 1.1 mrg { 960 1.1 mrg nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); 961 1.1 mrg } 962 1.1 mrg 963 1.1 mrg if (print) 964 1.1 mrg printf ("#define %s%*s", p->table_name, 38, ""); 965 1.1 mrg 966 1.1 mrg if (idx == 0) 967 1.1 mrg { 968 1.1 mrg INSERT_FFTTAB (0, nmin, initial_k); 969 1.1 mrg 970 1.1 mrg if (print) 971 1.1 mrg { 972 1.1 mrg printf ("\\\n { "); 973 1.1 mrg printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); 974 1.1 mrg } 975 1.1 mrg 976 1.1 mrg idx = 1; 977 1.1 mrg } 978 1.1 mrg 979 1.1.1.3 mrg ap = (mp_ptr) malloc (sizeof (mp_limb_t)); 980 1.1 mrg if (p->sqr) 981 1.1 mrg bp = ap; 982 1.1 mrg else 983 1.1.1.3 mrg bp = (mp_ptr) malloc (sizeof (mp_limb_t)); 984 1.1.1.3 mrg rp = (mp_ptr) malloc (sizeof (mp_limb_t)); 985 1.1 mrg alloc = 1; 986 1.1 mrg 987 1.1 mrg /* Round n to comply to initial k value */ 988 1.1 mrg n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); 989 1.1 mrg 990 1.1 mrg n_measurements = (18 - initial_k) | 1; 991 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS); 992 1.1 mrg n_measurements = MIN (n_measurements, MAX_REPS); 993 1.1 mrg 994 1.1 mrg last_best_k = initial_k; 995 1.1 mrg best_k = initial_k; 996 1.1 mrg 997 1.1 mrg while (n < nmax) 998 1.1 mrg { 999 1.1 mrg int start_k, end_k; 1000 1.1 mrg 1001 1.1 mrg /* Assume the current best k is best until we hit its next FFT step. */ 1002 1.1 mrg t0 = 99999; 1003 1.1 mrg 1004 1.1 mrg prev_n1 = n + 1; 1005 1.1 mrg 1006 1.1 mrg start_k = MAX (4, best_k - 4); 1007 1.1 mrg end_k = MIN (24, best_k + 4); 1008 1.1 mrg for (k = start_k; k <= end_k; k++) 1009 1.1 mrg { 1010 1.1 mrg n1 = mpn_fft_next_size (prev_n1, k); 1011 1.1 mrg 1012 1.1 mrg eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); 1013 1.1 mrg 1014 1.1 mrg if (eff < 70) /* avoid measuring too slow fft:s */ 1015 1.1 mrg continue; 1016 1.1 mrg 1017 1.1 mrg if (n1 > alloc) 1018 1.1 mrg { 1019 1.1 mrg alloc = n1; 1020 1.1 mrg if (p->sqr) 1021 1.1 mrg { 1022 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 1023 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 1024 1.1.1.3 mrg ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 1025 1.1 mrg mpn_random (ap, alloc); 1026 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 1027 1.1 mrg } 1028 1.1 mrg else 1029 1.1 mrg { 1030 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 1031 1.1.1.3 mrg bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t)); 1032 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 1033 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 1034 1.1 mrg mpn_random (ap, alloc); 1035 1.1.1.3 mrg bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t)); 1036 1.1 mrg mpn_random (bp, alloc); 1037 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 1038 1.1 mrg } 1039 1.1 mrg } 1040 1.1 mrg 1041 1.1 mrg t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); 1042 1.1 mrg 1043 1.1 mrg if (t1 * n_measurements > 0.3) 1044 1.1 mrg n_measurements -= 2; 1045 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS); 1046 1.1 mrg 1047 1.1 mrg if (t1 < t0) 1048 1.1 mrg { 1049 1.1 mrg best_k = k; 1050 1.1 mrg t0 = t1; 1051 1.1 mrg } 1052 1.1 mrg } 1053 1.1 mrg 1054 1.1 mrg n1 = mpn_fft_next_size (prev_n1, best_k); 1055 1.1 mrg 1056 1.1 mrg if (last_best_k != best_k) 1057 1.1 mrg { 1058 1.1 mrg ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); 1059 1.1 mrg 1060 1.1 mrg if (idx >= FFT_TABLE3_SIZE) 1061 1.1 mrg { 1062 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1063 1.1 mrg abort (); 1064 1.1 mrg } 1065 1.1 mrg INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); 1066 1.1 mrg 1067 1.1 mrg if (print) 1068 1.1 mrg { 1069 1.1 mrg printf (", "); 1070 1.1 mrg if (idx % 4 == 0) 1071 1.1 mrg printf ("\\\n "); 1072 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1073 1.1 mrg } 1074 1.1 mrg 1075 1.1 mrg if (option_trace >= 2) 1076 1.1 mrg { 1077 1.1 mrg printf ("{%lu,%u}\n", prev_n1, best_k); 1078 1.1 mrg fflush (stdout); 1079 1.1 mrg } 1080 1.1 mrg 1081 1.1 mrg last_best_k = best_k; 1082 1.1 mrg idx++; 1083 1.1 mrg } 1084 1.1 mrg 1085 1.1 mrg for (;;) 1086 1.1 mrg { 1087 1.1 mrg prev_n1 = n1; 1088 1.1 mrg prev_eff = fftfill (prev_n1, best_k, p->sqr); 1089 1.1 mrg n1 = mpn_fft_next_size (prev_n1 + 1, best_k); 1090 1.1 mrg eff = fftfill (n1, best_k, p->sqr); 1091 1.1 mrg 1092 1.1 mrg if (eff != prev_eff) 1093 1.1 mrg break; 1094 1.1 mrg } 1095 1.1 mrg 1096 1.1 mrg n = prev_n1; 1097 1.1 mrg } 1098 1.1 mrg 1099 1.1 mrg kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ 1100 1.1 mrg kmax = MIN (kmax, 25-1); 1101 1.1 mrg for (k = last_best_k + 1; k <= kmax; k++) 1102 1.1 mrg { 1103 1.1 mrg if (idx >= FFT_TABLE3_SIZE) 1104 1.1 mrg { 1105 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1106 1.1 mrg abort (); 1107 1.1 mrg } 1108 1.1 mrg INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); 1109 1.1 mrg 1110 1.1 mrg if (print) 1111 1.1 mrg { 1112 1.1 mrg printf (", "); 1113 1.1 mrg if (idx % 4 == 0) 1114 1.1 mrg printf ("\\\n "); 1115 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1116 1.1 mrg } 1117 1.1 mrg 1118 1.1 mrg idx++; 1119 1.1 mrg } 1120 1.1 mrg 1121 1.1 mrg if (print) 1122 1.1 mrg printf (" }\n"); 1123 1.1 mrg 1124 1.1 mrg free (ap); 1125 1.1 mrg if (! p->sqr) 1126 1.1 mrg free (bp); 1127 1.1 mrg free (rp); 1128 1.1 mrg 1129 1.1 mrg return idx; 1130 1.1 mrg } 1131 1.1 mrg 1132 1.1 mrg void 1133 1.1 mrg fft (struct fft_param_t *p) 1134 1.1 mrg { 1135 1.1 mrg mp_size_t size; 1136 1.1 mrg int k, idx, initial_k; 1137 1.1 mrg 1138 1.1 mrg /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ 1139 1.1 mrg 1140 1.1 mrg #if 1 1141 1.1 mrg { 1142 1.1 mrg /* Use plain one() mechanism, for some reasonable initial values of k. The 1143 1.1 mrg advantage is that we don't depend on mpn_fft_table3, which can therefore 1144 1.1 mrg leave it completely uninitialized. */ 1145 1.1 mrg 1146 1.1 mrg static struct param_t param; 1147 1.1 mrg mp_size_t thres, best_thres; 1148 1.1 mrg int best_k; 1149 1.1 mrg char buf[20]; 1150 1.1 mrg 1151 1.1 mrg best_thres = MP_SIZE_T_MAX; 1152 1.1 mrg best_k = -1; 1153 1.1 mrg 1154 1.1 mrg for (k = 5; k <= 7; k++) 1155 1.1 mrg { 1156 1.1 mrg param.name = p->modf_threshold_name; 1157 1.1 mrg param.min_size = 100; 1158 1.1 mrg param.max_size = 2000; 1159 1.1 mrg param.function = p->mul_function; 1160 1.1 mrg param.step_factor = 0.0; 1161 1.1 mrg param.step = 4; 1162 1.1 mrg param.function2 = p->mul_modf_function; 1163 1.1 mrg param.noprint = 1; 1164 1.1 mrg s.r = k; 1165 1.1 mrg one (&thres, ¶m); 1166 1.1 mrg if (thres < best_thres) 1167 1.1 mrg { 1168 1.1 mrg best_thres = thres; 1169 1.1 mrg best_k = k; 1170 1.1 mrg } 1171 1.1 mrg } 1172 1.1 mrg 1173 1.1 mrg *(p->p_modf_threshold) = best_thres; 1174 1.1 mrg sprintf (buf, "k = %d", best_k); 1175 1.1 mrg print_define_remark (p->modf_threshold_name, best_thres, buf); 1176 1.1 mrg initial_k = best_k; 1177 1.1 mrg } 1178 1.1 mrg #else 1179 1.1 mrg size = p->first_size; 1180 1.1 mrg for (;;) 1181 1.1 mrg { 1182 1.1 mrg double tk, tm; 1183 1.1 mrg 1184 1.1 mrg size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); 1185 1.1 mrg k = mpn_fft_best_k (size, p->sqr); 1186 1.1 mrg 1187 1.1 mrg if (size >= p->max_size) 1188 1.1 mrg break; 1189 1.1 mrg 1190 1.1 mrg s.size = size + fft_step_size (k) / 2; 1191 1.1 mrg s.r = k; 1192 1.1 mrg tk = tuneup_measure (p->mul_modf_function, NULL, &s); 1193 1.1 mrg if (tk == -1.0) 1194 1.1 mrg abort (); 1195 1.1 mrg 1196 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s); 1197 1.1 mrg if (tm == -1.0) 1198 1.1 mrg abort (); 1199 1.1 mrg 1200 1.1 mrg if (option_trace >= 2) 1201 1.1 mrg printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", 1202 1.1 mrg (long) size, 1203 1.1 mrg (long) size + fft_step_size (k) / 2, k, tk, 1204 1.1 mrg (long) s.size, tm); 1205 1.1 mrg 1206 1.1 mrg if (tk < tm) 1207 1.1 mrg { 1208 1.1 mrg *p->p_modf_threshold = s.size; 1209 1.1 mrg print_define (p->modf_threshold_name, *p->p_modf_threshold); 1210 1.1 mrg break; 1211 1.1 mrg } 1212 1.1 mrg } 1213 1.1 mrg initial_k = ?; 1214 1.1 mrg #endif 1215 1.1 mrg 1216 1.1 mrg /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ 1217 1.1 mrg 1218 1.1 mrg idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); 1219 1.1 mrg printf ("#define %s_SIZE %d\n", p->table_name, idx); 1220 1.1 mrg 1221 1.1 mrg /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ 1222 1.1 mrg 1223 1.1 mrg size = 2 * *p->p_modf_threshold; /* OK? */ 1224 1.1 mrg for (;;) 1225 1.1 mrg { 1226 1.1 mrg double tk, tm; 1227 1.1 mrg mp_size_t mulmod_size, mul_size;; 1228 1.1 mrg 1229 1.1 mrg if (size >= p->max_size) 1230 1.1 mrg break; 1231 1.1 mrg 1232 1.1 mrg mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; 1233 1.1 mrg mul_size = (size + mulmod_size) / 2; /* middle of step */ 1234 1.1 mrg 1235 1.1 mrg s.size = mulmod_size; 1236 1.1 mrg tk = tuneup_measure (p->function, NULL, &s); 1237 1.1 mrg if (tk == -1.0) 1238 1.1 mrg abort (); 1239 1.1 mrg 1240 1.1 mrg s.size = mul_size; 1241 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s); 1242 1.1 mrg if (tm == -1.0) 1243 1.1 mrg abort (); 1244 1.1 mrg 1245 1.1 mrg if (option_trace >= 2) 1246 1.1 mrg printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", 1247 1.1 mrg (long) size, 1248 1.1 mrg (long) mulmod_size, tk, 1249 1.1 mrg (long) mul_size, tm); 1250 1.1 mrg 1251 1.1 mrg size = mulmod_size; 1252 1.1 mrg 1253 1.1 mrg if (tk < tm) 1254 1.1 mrg { 1255 1.1 mrg *p->p_threshold = s.size; 1256 1.1 mrg print_define (p->threshold_name, *p->p_threshold); 1257 1.1 mrg break; 1258 1.1 mrg } 1259 1.1 mrg } 1260 1.1 mrg } 1261 1.1 mrg 1262 1.1.1.4 mrg /* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This 1263 1.1.1.4 mrg is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch. 1264 1.1.1.4 mrg This is used in get_str and set_str. */ 1265 1.1.1.4 mrg void 1266 1.1.1.4 mrg relspeed_div_1_vs_mul_1 (void) 1267 1.1.1.4 mrg { 1268 1.1.1.5 mrg #define max_opsize 100 1269 1.1.1.4 mrg mp_size_t n; 1270 1.1.1.4 mrg long j; 1271 1.1.1.4 mrg mp_limb_t rp[max_opsize]; 1272 1.1.1.4 mrg mp_limb_t ap[max_opsize]; 1273 1.1.1.4 mrg double multime, divtime; 1274 1.1.1.4 mrg 1275 1.1.1.4 mrg mpn_random (ap, max_opsize); 1276 1.1.1.4 mrg 1277 1.1.1.4 mrg multime = 0; 1278 1.1.1.4 mrg for (n = max_opsize; n > 1; n--) 1279 1.1.1.4 mrg { 1280 1.1.1.4 mrg mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1281 1.1.1.4 mrg speed_starttime (); 1282 1.1.1.4 mrg for (j = speed_precision; j != 0 ; j--) 1283 1.1.1.4 mrg mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1284 1.1.1.4 mrg multime += speed_endtime () / n; 1285 1.1.1.4 mrg } 1286 1.1.1.4 mrg 1287 1.1.1.4 mrg divtime = 0; 1288 1.1.1.4 mrg for (n = max_opsize; n > 1; n--) 1289 1.1.1.4 mrg { 1290 1.1.1.4 mrg /* Make input divisible for good measure. */ 1291 1.1.1.4 mrg ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10); 1292 1.1.1.4 mrg 1293 1.1.1.4 mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 1294 1.1.1.4 mrg mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, 1295 1.1.1.4 mrg MP_BASES_BIG_BASE_BINVERTED_10, 1296 1.1.1.4 mrg MP_BASES_BIG_BASE_CTZ_10); 1297 1.1.1.4 mrg #else 1298 1.1.1.4 mrg mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1299 1.1.1.4 mrg #endif 1300 1.1.1.4 mrg speed_starttime (); 1301 1.1.1.4 mrg for (j = speed_precision; j != 0 ; j--) 1302 1.1.1.4 mrg { 1303 1.1.1.4 mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 1304 1.1.1.4 mrg mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, 1305 1.1.1.4 mrg MP_BASES_BIG_BASE_BINVERTED_10, 1306 1.1.1.4 mrg MP_BASES_BIG_BASE_CTZ_10); 1307 1.1.1.4 mrg #else 1308 1.1.1.4 mrg mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1309 1.1.1.4 mrg #endif 1310 1.1.1.4 mrg } 1311 1.1.1.4 mrg divtime += speed_endtime () / n; 1312 1.1.1.4 mrg } 1313 1.1.1.4 mrg 1314 1.1.1.4 mrg print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime)); 1315 1.1.1.4 mrg } 1316 1.1 mrg 1317 1.1 mrg 1318 1.1 mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, 1319 1.1 mrg giving wrong results. */ 1320 1.1 mrg void 1321 1.1 mrg tune_mul_n (void) 1322 1.1 mrg { 1323 1.1 mrg static struct param_t param; 1324 1.1.1.2 mrg mp_size_t next_toom_start; 1325 1.1.1.2 mrg int something_changed; 1326 1.1 mrg 1327 1.1 mrg param.function = speed_mpn_mul_n; 1328 1.1 mrg 1329 1.1 mrg param.name = "MUL_TOOM22_THRESHOLD"; 1330 1.1 mrg param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); 1331 1.1 mrg param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; 1332 1.1 mrg one (&mul_toom22_threshold, ¶m); 1333 1.1 mrg 1334 1.1.1.2 mrg param.noprint = 1; 1335 1.1.1.2 mrg 1336 1.1.1.2 mrg /* Threshold sequence loop. Disable functions that would be used in a very 1337 1.1.1.2 mrg narrow range, re-measuring things when that happens. */ 1338 1.1.1.2 mrg something_changed = 1; 1339 1.1.1.2 mrg while (something_changed) 1340 1.1.1.2 mrg { 1341 1.1.1.2 mrg something_changed = 0; 1342 1.1.1.2 mrg 1343 1.1.1.2 mrg next_toom_start = mul_toom22_threshold; 1344 1.1.1.2 mrg 1345 1.1.1.2 mrg if (mul_toom33_threshold != 0) 1346 1.1.1.2 mrg { 1347 1.1.1.2 mrg param.name = "MUL_TOOM33_THRESHOLD"; 1348 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE); 1349 1.1.1.2 mrg param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; 1350 1.1.1.2 mrg one (&mul_toom33_threshold, ¶m); 1351 1.1.1.2 mrg 1352 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom33_threshold) 1353 1.1.1.2 mrg { 1354 1.1.1.2 mrg mul_toom33_threshold = 0; 1355 1.1.1.2 mrg something_changed = 1; 1356 1.1.1.2 mrg } 1357 1.1.1.2 mrg } 1358 1.1.1.2 mrg 1359 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom33_threshold); 1360 1.1.1.2 mrg 1361 1.1.1.2 mrg if (mul_toom44_threshold != 0) 1362 1.1.1.2 mrg { 1363 1.1.1.2 mrg param.name = "MUL_TOOM44_THRESHOLD"; 1364 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE); 1365 1.1.1.2 mrg param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; 1366 1.1.1.2 mrg one (&mul_toom44_threshold, ¶m); 1367 1.1.1.2 mrg 1368 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom44_threshold) 1369 1.1.1.2 mrg { 1370 1.1.1.2 mrg mul_toom44_threshold = 0; 1371 1.1.1.2 mrg something_changed = 1; 1372 1.1.1.2 mrg } 1373 1.1.1.2 mrg } 1374 1.1.1.2 mrg 1375 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom44_threshold); 1376 1.1.1.2 mrg 1377 1.1.1.2 mrg if (mul_toom6h_threshold != 0) 1378 1.1.1.2 mrg { 1379 1.1.1.2 mrg param.name = "MUL_TOOM6H_THRESHOLD"; 1380 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE); 1381 1.1.1.2 mrg param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; 1382 1.1.1.2 mrg one (&mul_toom6h_threshold, ¶m); 1383 1.1.1.2 mrg 1384 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom6h_threshold) 1385 1.1.1.2 mrg { 1386 1.1.1.2 mrg mul_toom6h_threshold = 0; 1387 1.1.1.2 mrg something_changed = 1; 1388 1.1.1.2 mrg } 1389 1.1.1.2 mrg } 1390 1.1.1.2 mrg 1391 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom6h_threshold); 1392 1.1.1.2 mrg 1393 1.1.1.2 mrg if (mul_toom8h_threshold != 0) 1394 1.1.1.2 mrg { 1395 1.1.1.2 mrg param.name = "MUL_TOOM8H_THRESHOLD"; 1396 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE); 1397 1.1.1.2 mrg param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; 1398 1.1.1.2 mrg one (&mul_toom8h_threshold, ¶m); 1399 1.1.1.2 mrg 1400 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom8h_threshold) 1401 1.1.1.2 mrg { 1402 1.1.1.2 mrg mul_toom8h_threshold = 0; 1403 1.1.1.2 mrg something_changed = 1; 1404 1.1.1.2 mrg } 1405 1.1.1.2 mrg } 1406 1.1.1.2 mrg } 1407 1.1.1.2 mrg 1408 1.1.1.2 mrg print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD); 1409 1.1.1.2 mrg print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD); 1410 1.1.1.2 mrg print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD); 1411 1.1.1.2 mrg print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD); 1412 1.1 mrg 1413 1.1 mrg /* disabled until tuned */ 1414 1.1 mrg MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; 1415 1.1 mrg } 1416 1.1 mrg 1417 1.1 mrg void 1418 1.1 mrg tune_mul (void) 1419 1.1 mrg { 1420 1.1 mrg static struct param_t param; 1421 1.1 mrg mp_size_t thres; 1422 1.1 mrg 1423 1.1 mrg param.noprint = 1; 1424 1.1 mrg 1425 1.1 mrg param.function = speed_mpn_toom32_for_toom43_mul; 1426 1.1 mrg param.function2 = speed_mpn_toom43_for_toom32_mul; 1427 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; 1428 1.1.1.2 mrg param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; 1429 1.1 mrg one (&thres, ¶m); 1430 1.1.1.2 mrg mul_toom32_to_toom43_threshold = thres * 17 / 24; 1431 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); 1432 1.1 mrg 1433 1.1 mrg param.function = speed_mpn_toom32_for_toom53_mul; 1434 1.1 mrg param.function2 = speed_mpn_toom53_for_toom32_mul; 1435 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; 1436 1.1.1.2 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; 1437 1.1 mrg one (&thres, ¶m); 1438 1.1.1.2 mrg mul_toom32_to_toom53_threshold = thres * 19 / 30; 1439 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); 1440 1.1 mrg 1441 1.1 mrg param.function = speed_mpn_toom42_for_toom53_mul; 1442 1.1 mrg param.function2 = speed_mpn_toom53_for_toom42_mul; 1443 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; 1444 1.1.1.2 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11; 1445 1.1 mrg one (&thres, ¶m); 1446 1.1.1.2 mrg mul_toom42_to_toom53_threshold = thres * 11 / 20; 1447 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); 1448 1.1 mrg 1449 1.1 mrg param.function = speed_mpn_toom42_mul; 1450 1.1 mrg param.function2 = speed_mpn_toom63_mul; 1451 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; 1452 1.1.1.2 mrg param.min_size = MPN_TOOM63_MUL_MINSIZE * 2; 1453 1.1 mrg one (&thres, ¶m); 1454 1.1.1.2 mrg mul_toom42_to_toom63_threshold = thres / 2; 1455 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); 1456 1.1.1.2 mrg 1457 1.1.1.2 mrg /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ 1458 1.1.1.2 mrg param.function = speed_mpn_toom43_for_toom54_mul; 1459 1.1.1.2 mrg param.function2 = speed_mpn_toom54_for_toom43_mul; 1460 1.1.1.2 mrg param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; 1461 1.1.1.2 mrg param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; 1462 1.1.1.2 mrg one (&thres, ¶m); 1463 1.1.1.2 mrg mul_toom43_to_toom54_threshold = thres * 5 / 6; 1464 1.1.1.2 mrg print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); 1465 1.1 mrg } 1466 1.1 mrg 1467 1.1 mrg 1468 1.1 mrg void 1469 1.1 mrg tune_mullo (void) 1470 1.1 mrg { 1471 1.1 mrg static struct param_t param; 1472 1.1 mrg 1473 1.1 mrg param.function = speed_mpn_mullo_n; 1474 1.1 mrg 1475 1.1 mrg param.name = "MULLO_BASECASE_THRESHOLD"; 1476 1.1.1.4 mrg param.min_size = 2; 1477 1.1 mrg param.min_is_always = 1; 1478 1.1 mrg param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; 1479 1.1 mrg param.stop_factor = 1.5; 1480 1.1 mrg param.noprint = 1; 1481 1.1 mrg one (&mullo_basecase_threshold, ¶m); 1482 1.1 mrg 1483 1.1 mrg param.name = "MULLO_DC_THRESHOLD"; 1484 1.1 mrg param.min_size = 8; 1485 1.1 mrg param.min_is_always = 0; 1486 1.1 mrg param.max_size = 1000; 1487 1.1 mrg one (&mullo_dc_threshold, ¶m); 1488 1.1 mrg 1489 1.1 mrg if (mullo_basecase_threshold >= mullo_dc_threshold) 1490 1.1 mrg { 1491 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); 1492 1.1 mrg print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); 1493 1.1 mrg } 1494 1.1 mrg else 1495 1.1 mrg { 1496 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); 1497 1.1 mrg print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); 1498 1.1 mrg } 1499 1.1 mrg 1500 1.1.1.3 mrg if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2) 1501 1.1.1.3 mrg { 1502 1.1.1.3 mrg param.name = "MULLO_MUL_N_THRESHOLD"; 1503 1.1.1.3 mrg param.min_size = mullo_dc_threshold; 1504 1.1.1.3 mrg param.max_size = 2 * mul_fft_threshold; 1505 1.1.1.3 mrg param.noprint = 0; 1506 1.1.1.3 mrg param.step_factor = 0.03; 1507 1.1.1.3 mrg one (&mullo_mul_n_threshold, ¶m); 1508 1.1.1.3 mrg } 1509 1.1.1.3 mrg else 1510 1.1.1.3 mrg print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, 1511 1.1.1.3 mrg "without FFT use mullo forever"); 1512 1.1.1.3 mrg } 1513 1.1.1.3 mrg 1514 1.1.1.3 mrg void 1515 1.1.1.3 mrg tune_sqrlo (void) 1516 1.1.1.3 mrg { 1517 1.1.1.3 mrg static struct param_t param; 1518 1.1.1.3 mrg 1519 1.1.1.3 mrg param.function = speed_mpn_sqrlo; 1520 1.1.1.3 mrg 1521 1.1.1.3 mrg param.name = "SQRLO_BASECASE_THRESHOLD"; 1522 1.1.1.4 mrg param.min_size = 2; 1523 1.1.1.3 mrg param.min_is_always = 1; 1524 1.1.1.3 mrg param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1; 1525 1.1.1.3 mrg param.stop_factor = 1.5; 1526 1.1.1.3 mrg param.noprint = 1; 1527 1.1.1.3 mrg one (&sqrlo_basecase_threshold, ¶m); 1528 1.1.1.3 mrg 1529 1.1.1.3 mrg param.name = "SQRLO_DC_THRESHOLD"; 1530 1.1.1.3 mrg param.min_size = 8; 1531 1.1.1.3 mrg param.min_is_always = 0; 1532 1.1.1.3 mrg param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1; 1533 1.1.1.3 mrg one (&sqrlo_dc_threshold, ¶m); 1534 1.1.1.3 mrg 1535 1.1.1.3 mrg if (sqrlo_basecase_threshold >= sqrlo_dc_threshold) 1536 1.1.1.3 mrg { 1537 1.1.1.3 mrg print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold); 1538 1.1.1.3 mrg print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase"); 1539 1.1.1.3 mrg } 1540 1.1.1.3 mrg else 1541 1.1.1.3 mrg { 1542 1.1.1.3 mrg print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold); 1543 1.1.1.3 mrg print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold); 1544 1.1.1.3 mrg } 1545 1.1.1.3 mrg 1546 1.1.1.3 mrg if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2) 1547 1.1.1.3 mrg { 1548 1.1.1.3 mrg param.name = "SQRLO_SQR_THRESHOLD"; 1549 1.1.1.3 mrg param.min_size = sqrlo_dc_threshold; 1550 1.1.1.3 mrg param.max_size = 2 * sqr_fft_threshold; 1551 1.1.1.3 mrg param.noprint = 0; 1552 1.1.1.3 mrg param.step_factor = 0.03; 1553 1.1.1.3 mrg one (&sqrlo_sqr_threshold, ¶m); 1554 1.1.1.3 mrg } 1555 1.1.1.3 mrg else 1556 1.1.1.3 mrg print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX, 1557 1.1.1.3 mrg "without FFT use sqrlo forever"); 1558 1.1 mrg } 1559 1.1 mrg 1560 1.1 mrg void 1561 1.1.1.2 mrg tune_mulmid (void) 1562 1.1.1.2 mrg { 1563 1.1.1.2 mrg static struct param_t param; 1564 1.1.1.2 mrg 1565 1.1.1.2 mrg param.name = "MULMID_TOOM42_THRESHOLD"; 1566 1.1.1.2 mrg param.function = speed_mpn_mulmid_n; 1567 1.1.1.2 mrg param.min_size = 4; 1568 1.1.1.2 mrg param.max_size = 100; 1569 1.1.1.2 mrg one (&mulmid_toom42_threshold, ¶m); 1570 1.1.1.2 mrg } 1571 1.1.1.2 mrg 1572 1.1.1.2 mrg void 1573 1.1 mrg tune_mulmod_bnm1 (void) 1574 1.1 mrg { 1575 1.1 mrg static struct param_t param; 1576 1.1 mrg 1577 1.1 mrg param.name = "MULMOD_BNM1_THRESHOLD"; 1578 1.1 mrg param.function = speed_mpn_mulmod_bnm1; 1579 1.1 mrg param.min_size = 4; 1580 1.1 mrg param.max_size = 100; 1581 1.1 mrg one (&mulmod_bnm1_threshold, ¶m); 1582 1.1 mrg } 1583 1.1 mrg 1584 1.1 mrg void 1585 1.1 mrg tune_sqrmod_bnm1 (void) 1586 1.1 mrg { 1587 1.1 mrg static struct param_t param; 1588 1.1 mrg 1589 1.1 mrg param.name = "SQRMOD_BNM1_THRESHOLD"; 1590 1.1 mrg param.function = speed_mpn_sqrmod_bnm1; 1591 1.1 mrg param.min_size = 4; 1592 1.1 mrg param.max_size = 100; 1593 1.1 mrg one (&sqrmod_bnm1_threshold, ¶m); 1594 1.1 mrg } 1595 1.1 mrg 1596 1.1 mrg 1597 1.1 mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase 1598 1.1 mrg is faster only at size==2 then we don't want to bother with extra code 1599 1.1 mrg just for that. Start karatsuba from 4 same as MUL above. */ 1600 1.1 mrg 1601 1.1 mrg void 1602 1.1 mrg tune_sqr (void) 1603 1.1 mrg { 1604 1.1 mrg /* disabled until tuned */ 1605 1.1 mrg SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; 1606 1.1 mrg 1607 1.1 mrg if (HAVE_NATIVE_mpn_sqr_basecase) 1608 1.1 mrg { 1609 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); 1610 1.1 mrg sqr_basecase_threshold = 0; 1611 1.1 mrg } 1612 1.1 mrg else 1613 1.1 mrg { 1614 1.1 mrg static struct param_t param; 1615 1.1 mrg param.name = "SQR_BASECASE_THRESHOLD"; 1616 1.1 mrg param.function = speed_mpn_sqr; 1617 1.1 mrg param.min_size = 3; 1618 1.1 mrg param.min_is_always = 1; 1619 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX; 1620 1.1 mrg param.noprint = 1; 1621 1.1 mrg one (&sqr_basecase_threshold, ¶m); 1622 1.1 mrg } 1623 1.1 mrg 1624 1.1 mrg { 1625 1.1 mrg static struct param_t param; 1626 1.1 mrg param.name = "SQR_TOOM2_THRESHOLD"; 1627 1.1 mrg param.function = speed_mpn_sqr; 1628 1.1 mrg param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); 1629 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX; 1630 1.1 mrg param.noprint = 1; 1631 1.1 mrg one (&sqr_toom2_threshold, ¶m); 1632 1.1 mrg 1633 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase 1634 1.1 mrg && sqr_toom2_threshold < sqr_basecase_threshold) 1635 1.1 mrg { 1636 1.1 mrg /* Karatsuba becomes faster than mul_basecase before 1637 1.1 mrg sqr_basecase does. Arrange for the expression 1638 1.1 mrg "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which 1639 1.1 mrg selects mpn_sqr_basecase in mpn_sqr to be false, by setting 1640 1.1 mrg SQR_TOOM2_THRESHOLD to zero, making 1641 1.1 mrg SQR_BASECASE_THRESHOLD the toom2 threshold. */ 1642 1.1 mrg 1643 1.1 mrg sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; 1644 1.1 mrg SQR_TOOM2_THRESHOLD = 0; 1645 1.1 mrg 1646 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, 1647 1.1 mrg "toom2"); 1648 1.1 mrg print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, 1649 1.1 mrg "never sqr_basecase"); 1650 1.1 mrg } 1651 1.1 mrg else 1652 1.1 mrg { 1653 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase) 1654 1.1 mrg print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); 1655 1.1 mrg print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); 1656 1.1 mrg } 1657 1.1 mrg } 1658 1.1 mrg 1659 1.1 mrg { 1660 1.1 mrg static struct param_t param; 1661 1.1.1.2 mrg mp_size_t next_toom_start; 1662 1.1.1.2 mrg int something_changed; 1663 1.1 mrg 1664 1.1 mrg param.function = speed_mpn_sqr; 1665 1.1.1.2 mrg param.noprint = 1; 1666 1.1 mrg 1667 1.1.1.2 mrg /* Threshold sequence loop. Disable functions that would be used in a very 1668 1.1.1.2 mrg narrow range, re-measuring things when that happens. */ 1669 1.1.1.2 mrg something_changed = 1; 1670 1.1.1.2 mrg while (something_changed) 1671 1.1.1.2 mrg { 1672 1.1.1.2 mrg something_changed = 0; 1673 1.1.1.2 mrg 1674 1.1.1.2 mrg next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); 1675 1.1.1.2 mrg 1676 1.1.1.2 mrg sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 1677 1.1.1.2 mrg param.name = "SQR_TOOM3_THRESHOLD"; 1678 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE); 1679 1.1.1.2 mrg param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; 1680 1.1.1.2 mrg one (&sqr_toom3_threshold, ¶m); 1681 1.1.1.2 mrg 1682 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom3_threshold); 1683 1.1.1.2 mrg 1684 1.1.1.2 mrg if (sqr_toom4_threshold != 0) 1685 1.1.1.2 mrg { 1686 1.1.1.2 mrg param.name = "SQR_TOOM4_THRESHOLD"; 1687 1.1.1.2 mrg sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 1688 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE); 1689 1.1.1.2 mrg param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; 1690 1.1.1.2 mrg one (&sqr_toom4_threshold, ¶m); 1691 1.1.1.2 mrg 1692 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom4_threshold) 1693 1.1.1.2 mrg { 1694 1.1.1.2 mrg sqr_toom4_threshold = 0; 1695 1.1.1.2 mrg something_changed = 1; 1696 1.1.1.2 mrg } 1697 1.1.1.2 mrg } 1698 1.1.1.2 mrg 1699 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom4_threshold); 1700 1.1.1.2 mrg 1701 1.1.1.2 mrg if (sqr_toom6_threshold != 0) 1702 1.1.1.2 mrg { 1703 1.1.1.2 mrg param.name = "SQR_TOOM6_THRESHOLD"; 1704 1.1.1.2 mrg sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 1705 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE); 1706 1.1.1.2 mrg param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; 1707 1.1.1.2 mrg one (&sqr_toom6_threshold, ¶m); 1708 1.1.1.2 mrg 1709 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom6_threshold) 1710 1.1.1.2 mrg { 1711 1.1.1.2 mrg sqr_toom6_threshold = 0; 1712 1.1.1.2 mrg something_changed = 1; 1713 1.1.1.2 mrg } 1714 1.1.1.2 mrg } 1715 1.1.1.2 mrg 1716 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom6_threshold); 1717 1.1.1.2 mrg 1718 1.1.1.2 mrg if (sqr_toom8_threshold != 0) 1719 1.1.1.2 mrg { 1720 1.1.1.2 mrg param.name = "SQR_TOOM8_THRESHOLD"; 1721 1.1.1.2 mrg sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 1722 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE); 1723 1.1.1.2 mrg param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; 1724 1.1.1.2 mrg one (&sqr_toom8_threshold, ¶m); 1725 1.1.1.2 mrg 1726 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom8_threshold) 1727 1.1.1.2 mrg { 1728 1.1.1.2 mrg sqr_toom8_threshold = 0; 1729 1.1.1.2 mrg something_changed = 1; 1730 1.1.1.2 mrg } 1731 1.1.1.2 mrg } 1732 1.1.1.2 mrg } 1733 1.1.1.2 mrg 1734 1.1.1.2 mrg print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD); 1735 1.1.1.2 mrg print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD); 1736 1.1.1.2 mrg print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD); 1737 1.1.1.2 mrg print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD); 1738 1.1 mrg } 1739 1.1 mrg } 1740 1.1 mrg 1741 1.1 mrg 1742 1.1 mrg void 1743 1.1 mrg tune_dc_div (void) 1744 1.1 mrg { 1745 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */ 1746 1.1 mrg { 1747 1.1 mrg static struct param_t param; 1748 1.1 mrg param.name = "DC_DIV_QR_THRESHOLD"; 1749 1.1 mrg param.function = speed_mpn_sbpi1_div_qr; 1750 1.1 mrg param.function2 = speed_mpn_dcpi1_div_qr; 1751 1.1 mrg param.min_size = 6; 1752 1.1 mrg one (&dc_div_qr_threshold, ¶m); 1753 1.1 mrg } 1754 1.1 mrg { 1755 1.1 mrg static struct param_t param; 1756 1.1 mrg param.name = "DC_DIVAPPR_Q_THRESHOLD"; 1757 1.1 mrg param.function = speed_mpn_sbpi1_divappr_q; 1758 1.1 mrg param.function2 = speed_mpn_dcpi1_divappr_q; 1759 1.1 mrg param.min_size = 6; 1760 1.1 mrg one (&dc_divappr_q_threshold, ¶m); 1761 1.1 mrg } 1762 1.1 mrg } 1763 1.1 mrg 1764 1.1 mrg static double 1765 1.1 mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s) 1766 1.1 mrg { 1767 1.1 mrg if (s->size < DC_DIV_QR_THRESHOLD) 1768 1.1 mrg return speed_mpn_sbpi1_div_qr (s); 1769 1.1 mrg else 1770 1.1 mrg return speed_mpn_dcpi1_div_qr (s); 1771 1.1 mrg } 1772 1.1 mrg 1773 1.1 mrg void 1774 1.1 mrg tune_mu_div (void) 1775 1.1 mrg { 1776 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */ 1777 1.1 mrg { 1778 1.1 mrg static struct param_t param; 1779 1.1 mrg param.name = "MU_DIV_QR_THRESHOLD"; 1780 1.1 mrg param.function = speed_mpn_dcpi1_div_qr; 1781 1.1 mrg param.function2 = speed_mpn_mu_div_qr; 1782 1.1.1.2 mrg param.min_size = mul_toom22_threshold; 1783 1.1 mrg param.max_size = 5000; 1784 1.1 mrg param.step_factor = 0.02; 1785 1.1 mrg one (&mu_div_qr_threshold, ¶m); 1786 1.1 mrg } 1787 1.1 mrg { 1788 1.1 mrg static struct param_t param; 1789 1.1 mrg param.name = "MU_DIVAPPR_Q_THRESHOLD"; 1790 1.1 mrg param.function = speed_mpn_dcpi1_divappr_q; 1791 1.1 mrg param.function2 = speed_mpn_mu_divappr_q; 1792 1.1.1.2 mrg param.min_size = mul_toom22_threshold; 1793 1.1 mrg param.max_size = 5000; 1794 1.1 mrg param.step_factor = 0.02; 1795 1.1 mrg one (&mu_divappr_q_threshold, ¶m); 1796 1.1 mrg } 1797 1.1 mrg { 1798 1.1 mrg static struct param_t param; 1799 1.1 mrg param.name = "MUPI_DIV_QR_THRESHOLD"; 1800 1.1 mrg param.function = speed_mpn_sbordcpi1_div_qr; 1801 1.1 mrg param.function2 = speed_mpn_mupi_div_qr; 1802 1.1 mrg param.min_size = 6; 1803 1.1 mrg param.min_is_always = 1; 1804 1.1 mrg param.max_size = 1000; 1805 1.1 mrg param.step_factor = 0.02; 1806 1.1 mrg one (&mupi_div_qr_threshold, ¶m); 1807 1.1 mrg } 1808 1.1 mrg } 1809 1.1 mrg 1810 1.1 mrg void 1811 1.1 mrg tune_dc_bdiv (void) 1812 1.1 mrg { 1813 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/ 1814 1.1 mrg { 1815 1.1 mrg static struct param_t param; 1816 1.1 mrg param.name = "DC_BDIV_QR_THRESHOLD"; 1817 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_qr; 1818 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_qr; 1819 1.1 mrg param.min_size = 4; 1820 1.1 mrg one (&dc_bdiv_qr_threshold, ¶m); 1821 1.1 mrg } 1822 1.1 mrg { 1823 1.1 mrg static struct param_t param; 1824 1.1 mrg param.name = "DC_BDIV_Q_THRESHOLD"; 1825 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_q; 1826 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_q; 1827 1.1 mrg param.min_size = 4; 1828 1.1 mrg one (&dc_bdiv_q_threshold, ¶m); 1829 1.1 mrg } 1830 1.1 mrg } 1831 1.1 mrg 1832 1.1 mrg void 1833 1.1 mrg tune_mu_bdiv (void) 1834 1.1 mrg { 1835 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/ 1836 1.1 mrg { 1837 1.1 mrg static struct param_t param; 1838 1.1 mrg param.name = "MU_BDIV_QR_THRESHOLD"; 1839 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_qr; 1840 1.1 mrg param.function2 = speed_mpn_mu_bdiv_qr; 1841 1.1.1.3 mrg param.min_size = dc_bdiv_qr_threshold; 1842 1.1 mrg param.max_size = 5000; 1843 1.1 mrg param.step_factor = 0.02; 1844 1.1 mrg one (&mu_bdiv_qr_threshold, ¶m); 1845 1.1 mrg } 1846 1.1 mrg { 1847 1.1 mrg static struct param_t param; 1848 1.1 mrg param.name = "MU_BDIV_Q_THRESHOLD"; 1849 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_q; 1850 1.1 mrg param.function2 = speed_mpn_mu_bdiv_q; 1851 1.1.1.3 mrg param.min_size = dc_bdiv_q_threshold; 1852 1.1 mrg param.max_size = 5000; 1853 1.1 mrg param.step_factor = 0.02; 1854 1.1 mrg one (&mu_bdiv_q_threshold, ¶m); 1855 1.1 mrg } 1856 1.1 mrg } 1857 1.1 mrg 1858 1.1 mrg void 1859 1.1 mrg tune_invertappr (void) 1860 1.1 mrg { 1861 1.1 mrg static struct param_t param; 1862 1.1 mrg 1863 1.1 mrg param.function = speed_mpn_ni_invertappr; 1864 1.1 mrg param.name = "INV_MULMOD_BNM1_THRESHOLD"; 1865 1.1.1.3 mrg param.min_size = 5; 1866 1.1 mrg one (&inv_mulmod_bnm1_threshold, ¶m); 1867 1.1 mrg 1868 1.1 mrg param.function = speed_mpn_invertappr; 1869 1.1 mrg param.name = "INV_NEWTON_THRESHOLD"; 1870 1.1.1.3 mrg param.min_size = 5; 1871 1.1 mrg one (&inv_newton_threshold, ¶m); 1872 1.1 mrg } 1873 1.1 mrg 1874 1.1 mrg void 1875 1.1 mrg tune_invert (void) 1876 1.1 mrg { 1877 1.1 mrg static struct param_t param; 1878 1.1 mrg 1879 1.1 mrg param.function = speed_mpn_invert; 1880 1.1 mrg param.name = "INV_APPR_THRESHOLD"; 1881 1.1.1.3 mrg param.min_size = 5; 1882 1.1 mrg one (&inv_appr_threshold, ¶m); 1883 1.1 mrg } 1884 1.1 mrg 1885 1.1 mrg void 1886 1.1 mrg tune_binvert (void) 1887 1.1 mrg { 1888 1.1 mrg static struct param_t param; 1889 1.1 mrg 1890 1.1 mrg param.function = speed_mpn_binvert; 1891 1.1 mrg param.name = "BINV_NEWTON_THRESHOLD"; 1892 1.1 mrg param.min_size = 8; /* pointless with smaller operands */ 1893 1.1 mrg one (&binv_newton_threshold, ¶m); 1894 1.1 mrg } 1895 1.1 mrg 1896 1.1 mrg void 1897 1.1 mrg tune_redc (void) 1898 1.1 mrg { 1899 1.1 mrg #define TUNE_REDC_2_MAX 100 1900 1.1 mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 1901 1.1 mrg #define WANT_REDC_2 1 1902 1.1 mrg #endif 1903 1.1 mrg 1904 1.1 mrg #if WANT_REDC_2 1905 1.1 mrg { 1906 1.1 mrg static struct param_t param; 1907 1.1 mrg param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1908 1.1 mrg param.function = speed_mpn_redc_1; 1909 1.1 mrg param.function2 = speed_mpn_redc_2; 1910 1.1 mrg param.min_size = 1; 1911 1.1 mrg param.min_is_always = 1; 1912 1.1 mrg param.max_size = TUNE_REDC_2_MAX; 1913 1.1 mrg param.noprint = 1; 1914 1.1.1.2 mrg param.stop_factor = 1.5; 1915 1.1 mrg one (&redc_1_to_redc_2_threshold, ¶m); 1916 1.1 mrg } 1917 1.1 mrg { 1918 1.1 mrg static struct param_t param; 1919 1.1 mrg param.name = "REDC_2_TO_REDC_N_THRESHOLD"; 1920 1.1 mrg param.function = speed_mpn_redc_2; 1921 1.1 mrg param.function2 = speed_mpn_redc_n; 1922 1.1 mrg param.min_size = 16; 1923 1.1 mrg param.noprint = 1; 1924 1.1 mrg one (&redc_2_to_redc_n_threshold, ¶m); 1925 1.1 mrg } 1926 1.1.1.2 mrg if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold) 1927 1.1 mrg { 1928 1.1.1.2 mrg redc_2_to_redc_n_threshold = 0; /* disable redc_2 */ 1929 1.1.1.2 mrg 1930 1.1.1.2 mrg /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as 1931 1.1.1.2 mrg REDC_1_TO_REDC_2_THRESHOLD. */ 1932 1.1.1.2 mrg { 1933 1.1.1.2 mrg static struct param_t param; 1934 1.1.1.2 mrg param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1935 1.1.1.2 mrg param.function = speed_mpn_redc_1; 1936 1.1.1.2 mrg param.function2 = speed_mpn_redc_n; 1937 1.1.1.2 mrg param.min_size = 16; 1938 1.1.1.2 mrg param.noprint = 1; 1939 1.1.1.2 mrg one (&redc_1_to_redc_2_threshold, ¶m); 1940 1.1.1.2 mrg } 1941 1.1 mrg } 1942 1.1.1.2 mrg print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); 1943 1.1.1.2 mrg print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1944 1.1 mrg #else 1945 1.1 mrg { 1946 1.1 mrg static struct param_t param; 1947 1.1 mrg param.name = "REDC_1_TO_REDC_N_THRESHOLD"; 1948 1.1 mrg param.function = speed_mpn_redc_1; 1949 1.1 mrg param.function2 = speed_mpn_redc_n; 1950 1.1 mrg param.min_size = 16; 1951 1.1 mrg one (&redc_1_to_redc_n_threshold, ¶m); 1952 1.1 mrg } 1953 1.1 mrg #endif 1954 1.1 mrg } 1955 1.1 mrg 1956 1.1 mrg void 1957 1.1 mrg tune_matrix22_mul (void) 1958 1.1 mrg { 1959 1.1 mrg static struct param_t param; 1960 1.1 mrg param.name = "MATRIX22_STRASSEN_THRESHOLD"; 1961 1.1 mrg param.function = speed_mpn_matrix22_mul; 1962 1.1 mrg param.min_size = 2; 1963 1.1 mrg one (&matrix22_strassen_threshold, ¶m); 1964 1.1 mrg } 1965 1.1 mrg 1966 1.1 mrg void 1967 1.1.1.4 mrg tune_hgcd2 (void) 1968 1.1.1.4 mrg { 1969 1.1.1.4 mrg static struct param_t param; 1970 1.1.1.4 mrg hgcd2_func_t *f[5] = 1971 1.1.1.4 mrg { mpn_hgcd2_1, 1972 1.1.1.4 mrg mpn_hgcd2_2, 1973 1.1.1.4 mrg mpn_hgcd2_3, 1974 1.1.1.4 mrg mpn_hgcd2_4, 1975 1.1.1.4 mrg mpn_hgcd2_5 }; 1976 1.1.1.4 mrg speed_function_t speed_f[5] = 1977 1.1.1.4 mrg { speed_mpn_hgcd2_1, 1978 1.1.1.4 mrg speed_mpn_hgcd2_2, 1979 1.1.1.4 mrg speed_mpn_hgcd2_3, 1980 1.1.1.4 mrg speed_mpn_hgcd2_4, 1981 1.1.1.4 mrg speed_mpn_hgcd2_5 }; 1982 1.1.1.4 mrg int best; 1983 1.1.1.4 mrg 1984 1.1.1.4 mrg s.size = 1; 1985 1.1.1.4 mrg best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", ¶m); 1986 1.1.1.4 mrg 1987 1.1.1.4 mrg /* Use selected function when tuning hgcd and gcd */ 1988 1.1.1.4 mrg hgcd2_func = f[best]; 1989 1.1.1.4 mrg } 1990 1.1.1.4 mrg 1991 1.1.1.4 mrg void 1992 1.1 mrg tune_hgcd (void) 1993 1.1 mrg { 1994 1.1 mrg static struct param_t param; 1995 1.1 mrg param.name = "HGCD_THRESHOLD"; 1996 1.1 mrg param.function = speed_mpn_hgcd; 1997 1.1 mrg /* We seem to get strange results for small sizes */ 1998 1.1 mrg param.min_size = 30; 1999 1.1 mrg one (&hgcd_threshold, ¶m); 2000 1.1 mrg } 2001 1.1 mrg 2002 1.1 mrg void 2003 1.1.1.2 mrg tune_hgcd_appr (void) 2004 1.1.1.2 mrg { 2005 1.1.1.2 mrg static struct param_t param; 2006 1.1.1.2 mrg param.name = "HGCD_APPR_THRESHOLD"; 2007 1.1.1.2 mrg param.function = speed_mpn_hgcd_appr; 2008 1.1.1.2 mrg /* We seem to get strange results for small sizes */ 2009 1.1.1.2 mrg param.min_size = 50; 2010 1.1.1.2 mrg param.stop_since_change = 150; 2011 1.1.1.2 mrg one (&hgcd_appr_threshold, ¶m); 2012 1.1.1.2 mrg } 2013 1.1.1.2 mrg 2014 1.1.1.2 mrg void 2015 1.1.1.2 mrg tune_hgcd_reduce (void) 2016 1.1.1.2 mrg { 2017 1.1.1.2 mrg static struct param_t param; 2018 1.1.1.2 mrg param.name = "HGCD_REDUCE_THRESHOLD"; 2019 1.1.1.2 mrg param.function = speed_mpn_hgcd_reduce; 2020 1.1.1.2 mrg param.min_size = 30; 2021 1.1.1.2 mrg param.max_size = 7000; 2022 1.1.1.2 mrg param.step_factor = 0.04; 2023 1.1.1.2 mrg one (&hgcd_reduce_threshold, ¶m); 2024 1.1.1.2 mrg } 2025 1.1.1.2 mrg 2026 1.1.1.2 mrg void 2027 1.1 mrg tune_gcd_dc (void) 2028 1.1 mrg { 2029 1.1 mrg static struct param_t param; 2030 1.1 mrg param.name = "GCD_DC_THRESHOLD"; 2031 1.1 mrg param.function = speed_mpn_gcd; 2032 1.1 mrg param.min_size = hgcd_threshold; 2033 1.1 mrg param.max_size = 3000; 2034 1.1 mrg param.step_factor = 0.02; 2035 1.1 mrg one (&gcd_dc_threshold, ¶m); 2036 1.1 mrg } 2037 1.1 mrg 2038 1.1 mrg void 2039 1.1 mrg tune_gcdext_dc (void) 2040 1.1 mrg { 2041 1.1 mrg static struct param_t param; 2042 1.1 mrg param.name = "GCDEXT_DC_THRESHOLD"; 2043 1.1 mrg param.function = speed_mpn_gcdext; 2044 1.1 mrg param.min_size = hgcd_threshold; 2045 1.1 mrg param.max_size = 3000; 2046 1.1 mrg param.step_factor = 0.02; 2047 1.1 mrg one (&gcdext_dc_threshold, ¶m); 2048 1.1 mrg } 2049 1.1 mrg 2050 1.1.1.2 mrg /* In tune_powm_sec we compute the table used by the win_size function. The 2051 1.1.1.2 mrg cutoff points are in exponent bits, disregarding other operand sizes. It is 2052 1.1.1.3 mrg not possible to use the one framework since it currently uses a granularity 2053 1.1.1.2 mrg of full limbs. 2054 1.1.1.2 mrg */ 2055 1.1.1.2 mrg 2056 1.1.1.2 mrg /* This win_size replaces the variant in the powm code, allowing us to 2057 1.1.1.2 mrg control k in the k-ary algorithms. */ 2058 1.1.1.2 mrg int winsize; 2059 1.1.1.2 mrg int 2060 1.1.1.2 mrg win_size (mp_bitcnt_t eb) 2061 1.1.1.2 mrg { 2062 1.1.1.2 mrg return winsize; 2063 1.1.1.2 mrg } 2064 1.1.1.2 mrg 2065 1.1.1.2 mrg void 2066 1.1.1.2 mrg tune_powm_sec (void) 2067 1.1.1.2 mrg { 2068 1.1.1.2 mrg mp_size_t n; 2069 1.1.1.2 mrg int k, i; 2070 1.1.1.2 mrg mp_size_t itch; 2071 1.1.1.2 mrg mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; 2072 1.1.1.2 mrg const int n_max = 3000 / GMP_NUMB_BITS; 2073 1.1.1.5 mrg #define n_measurements 5 2074 1.1.1.2 mrg mp_ptr rp, bp, ep, mp, tp; 2075 1.1.1.2 mrg double ttab[n_measurements], tk, tkp1; 2076 1.1.1.2 mrg TMP_DECL; 2077 1.1.1.2 mrg TMP_MARK; 2078 1.1.1.2 mrg 2079 1.1.1.2 mrg possible_nbits_cutoff = 0; 2080 1.1.1.2 mrg 2081 1.1.1.2 mrg k = 1; 2082 1.1.1.2 mrg 2083 1.1.1.2 mrg winsize = 10; /* the itch function needs this */ 2084 1.1.1.3 mrg itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max); 2085 1.1.1.2 mrg 2086 1.1.1.2 mrg rp = TMP_ALLOC_LIMBS (n_max); 2087 1.1.1.2 mrg bp = TMP_ALLOC_LIMBS (n_max); 2088 1.1.1.2 mrg ep = TMP_ALLOC_LIMBS (n_max); 2089 1.1.1.2 mrg mp = TMP_ALLOC_LIMBS (n_max); 2090 1.1.1.2 mrg tp = TMP_ALLOC_LIMBS (itch); 2091 1.1.1.2 mrg 2092 1.1.1.2 mrg mpn_random (bp, n_max); 2093 1.1.1.2 mrg mpn_random (mp, n_max); 2094 1.1.1.2 mrg mp[0] |= 1; 2095 1.1.1.2 mrg 2096 1.1.1.2 mrg /* How about taking the M operand size into account? 2097 1.1.1.2 mrg 2098 1.1.1.2 mrg An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming 2099 1.1.1.2 mrg B = O(M)). 2100 1.1.1.2 mrg 2101 1.1.1.2 mrg Using k-ary and no sliding window, the precomputation will need time 2102 1.1.1.2 mrg O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + 2103 1.1.1.2 mrg O(log(E)/k*M(N)), for the squarings, multiplications, respectively. 2104 1.1.1.2 mrg 2105 1.1.1.2 mrg An operation R=powm_sec(B,E,N) will take time like powm. 2106 1.1.1.2 mrg 2107 1.1.1.2 mrg Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the 2108 1.1.1.2 mrg main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + 2109 1.1.1.2 mrg O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full 2110 1.1.1.2 mrg table reads, respectively. */ 2111 1.1.1.2 mrg 2112 1.1.1.2 mrg printf ("#define POWM_SEC_TABLE "); 2113 1.1.1.2 mrg 2114 1.1.1.3 mrg /* For nbits == 1, we should always use k == 1, so no need to tune 2115 1.1.1.3 mrg that. Starting with nbits == 2 also ensure that nbits always is 2116 1.1.1.3 mrg larger than the windowsize k+1. */ 2117 1.1.1.3 mrg for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; ) 2118 1.1.1.2 mrg { 2119 1.1.1.2 mrg n = (nbits - 1) / GMP_NUMB_BITS + 1; 2120 1.1.1.2 mrg 2121 1.1.1.2 mrg /* Generate E such that sliding-window for k and k+1 works equally 2122 1.1.1.2 mrg well/poorly (but sliding is not used in powm_sec, of course). */ 2123 1.1.1.2 mrg for (i = 0; i < n; i++) 2124 1.1.1.2 mrg ep[i] = ~CNST_LIMB(0); 2125 1.1.1.2 mrg 2126 1.1.1.2 mrg winsize = k; 2127 1.1.1.2 mrg for (i = 0; i < n_measurements; i++) 2128 1.1.1.2 mrg { 2129 1.1.1.2 mrg speed_starttime (); 2130 1.1.1.3 mrg mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 2131 1.1.1.2 mrg ttab[i] = speed_endtime (); 2132 1.1.1.2 mrg } 2133 1.1.1.2 mrg tk = median (ttab, n_measurements); 2134 1.1.1.2 mrg 2135 1.1.1.2 mrg winsize = k + 1; 2136 1.1.1.2 mrg speed_starttime (); 2137 1.1.1.2 mrg for (i = 0; i < n_measurements; i++) 2138 1.1.1.2 mrg { 2139 1.1.1.2 mrg speed_starttime (); 2140 1.1.1.3 mrg mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 2141 1.1.1.2 mrg ttab[i] = speed_endtime (); 2142 1.1.1.2 mrg } 2143 1.1.1.2 mrg tkp1 = median (ttab, n_measurements); 2144 1.1.1.2 mrg /* 2145 1.1.1.2 mrg printf ("testing: %ld, %d", nbits, k, ep[n-1]); 2146 1.1.1.2 mrg printf (" %10.5f %10.5f\n", tk, tkp1); 2147 1.1.1.2 mrg */ 2148 1.1.1.2 mrg if (tkp1 < tk) 2149 1.1.1.2 mrg { 2150 1.1.1.2 mrg if (possible_nbits_cutoff) 2151 1.1.1.2 mrg { 2152 1.1.1.2 mrg /* Two consecutive sizes indicate k increase, obey. */ 2153 1.1.1.3 mrg 2154 1.1.1.3 mrg /* Must always have x[k] >= k */ 2155 1.1.1.3 mrg ASSERT_ALWAYS (possible_nbits_cutoff >= k); 2156 1.1.1.3 mrg 2157 1.1.1.2 mrg if (k > 1) 2158 1.1.1.2 mrg printf (","); 2159 1.1.1.2 mrg printf ("%ld", (long) possible_nbits_cutoff); 2160 1.1.1.2 mrg k++; 2161 1.1.1.2 mrg possible_nbits_cutoff = 0; 2162 1.1.1.2 mrg } 2163 1.1.1.2 mrg else 2164 1.1.1.2 mrg { 2165 1.1.1.2 mrg /* One measurement indicate k increase, save nbits for further 2166 1.1.1.2 mrg consideration. */ 2167 1.1.1.3 mrg /* The new larger k gets used for sizes > the cutoff 2168 1.1.1.3 mrg value, hence the cutoff should be one less than the 2169 1.1.1.3 mrg smallest size where it gives a speedup. */ 2170 1.1.1.3 mrg possible_nbits_cutoff = nbits - 1; 2171 1.1.1.2 mrg } 2172 1.1.1.2 mrg } 2173 1.1.1.2 mrg else 2174 1.1.1.2 mrg possible_nbits_cutoff = 0; 2175 1.1.1.2 mrg 2176 1.1.1.2 mrg nbits_next = nbits * 65 / 64; 2177 1.1.1.2 mrg nbits = nbits_next + (nbits_next == nbits); 2178 1.1.1.2 mrg } 2179 1.1.1.2 mrg printf ("\n"); 2180 1.1.1.2 mrg TMP_FREE; 2181 1.1.1.2 mrg } 2182 1.1.1.2 mrg 2183 1.1 mrg 2184 1.1 mrg /* size_extra==1 reflects the fact that with high<divisor one division is 2185 1.1 mrg always skipped. Forcing high<divisor while testing ensures consistency 2186 1.1 mrg while stepping through sizes, ie. that size-1 divides will be done each 2187 1.1 mrg time. 2188 1.1 mrg 2189 1.1 mrg min_size==2 and min_is_always are used so that if plain division is only 2190 1.1 mrg better at size==1 then don't bother including that code just for that 2191 1.1 mrg case, instead go with preinv always and get a size saving. */ 2192 1.1 mrg 2193 1.1 mrg #define DIV_1_PARAMS \ 2194 1.1 mrg param.check_size = 256; \ 2195 1.1 mrg param.min_size = 2; \ 2196 1.1 mrg param.min_is_always = 1; \ 2197 1.1 mrg param.data_high = DATA_HIGH_LT_R; \ 2198 1.1 mrg param.size_extra = 1; \ 2199 1.1 mrg param.stop_factor = 2.0; 2200 1.1 mrg 2201 1.1 mrg 2202 1.1.1.2 mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *); 2203 1.1 mrg 2204 1.1 mrg void 2205 1.1 mrg tune_divrem_1 (void) 2206 1.1 mrg { 2207 1.1 mrg /* plain version by default */ 2208 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; 2209 1.1 mrg 2210 1.1 mrg /* No support for tuning native assembler code, do that by hand and put 2211 1.1 mrg the results in the .asm file, there's no need for such thresholds to 2212 1.1 mrg appear in gmp-mparam.h. */ 2213 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1) 2214 1.1 mrg return; 2215 1.1 mrg 2216 1.1 mrg if (GMP_NAIL_BITS != 0) 2217 1.1 mrg { 2218 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2219 1.1 mrg "no preinv with nails"); 2220 1.1 mrg print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2221 1.1 mrg "no preinv with nails"); 2222 1.1 mrg return; 2223 1.1 mrg } 2224 1.1 mrg 2225 1.1 mrg if (UDIV_PREINV_ALWAYS) 2226 1.1 mrg { 2227 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); 2228 1.1 mrg print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); 2229 1.1 mrg return; 2230 1.1 mrg } 2231 1.1 mrg 2232 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; 2233 1.1 mrg 2234 1.1 mrg /* Tune for the integer part of mpn_divrem_1. This will very possibly be 2235 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part 2236 1.1 mrg is more important. */ 2237 1.1 mrg { 2238 1.1 mrg static struct param_t param; 2239 1.1 mrg param.name = "DIVREM_1_NORM_THRESHOLD"; 2240 1.1 mrg DIV_1_PARAMS; 2241 1.1 mrg s.r = randlimb_norm (); 2242 1.1 mrg param.function = speed_mpn_divrem_1_tune; 2243 1.1 mrg one (&divrem_1_norm_threshold, ¶m); 2244 1.1 mrg } 2245 1.1 mrg { 2246 1.1 mrg static struct param_t param; 2247 1.1 mrg param.name = "DIVREM_1_UNNORM_THRESHOLD"; 2248 1.1 mrg DIV_1_PARAMS; 2249 1.1 mrg s.r = randlimb_half (); 2250 1.1 mrg param.function = speed_mpn_divrem_1_tune; 2251 1.1 mrg one (&divrem_1_unnorm_threshold, ¶m); 2252 1.1 mrg } 2253 1.1 mrg } 2254 1.1 mrg 2255 1.1.1.3 mrg void 2256 1.1.1.3 mrg tune_div_qr_1 (void) 2257 1.1.1.3 mrg { 2258 1.1.1.3 mrg if (!HAVE_NATIVE_mpn_div_qr_1n_pi1) 2259 1.1.1.3 mrg { 2260 1.1.1.3 mrg static struct param_t param; 2261 1.1.1.4 mrg speed_function_t f[2] = 2262 1.1.1.4 mrg { 2263 1.1.1.4 mrg speed_mpn_div_qr_1n_pi1_1, 2264 1.1.1.4 mrg speed_mpn_div_qr_1n_pi1_2, 2265 1.1.1.4 mrg }; 2266 1.1.1.3 mrg 2267 1.1.1.3 mrg s.size = 10; 2268 1.1.1.3 mrg s.r = randlimb_norm (); 2269 1.1.1.3 mrg 2270 1.1.1.4 mrg one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", ¶m); 2271 1.1.1.3 mrg } 2272 1.1.1.3 mrg 2273 1.1.1.3 mrg { 2274 1.1.1.3 mrg static struct param_t param; 2275 1.1.1.3 mrg param.name = "DIV_QR_1_NORM_THRESHOLD"; 2276 1.1.1.3 mrg DIV_1_PARAMS; 2277 1.1.1.3 mrg param.min_size = 1; 2278 1.1.1.3 mrg param.min_is_always = 0; 2279 1.1.1.3 mrg s.r = randlimb_norm (); 2280 1.1.1.3 mrg param.function = speed_mpn_div_qr_1_tune; 2281 1.1.1.3 mrg one (&div_qr_1_norm_threshold, ¶m); 2282 1.1.1.3 mrg } 2283 1.1.1.3 mrg { 2284 1.1.1.3 mrg static struct param_t param; 2285 1.1.1.3 mrg param.name = "DIV_QR_1_UNNORM_THRESHOLD"; 2286 1.1.1.3 mrg DIV_1_PARAMS; 2287 1.1.1.3 mrg param.min_size = 1; 2288 1.1.1.3 mrg param.min_is_always = 0; 2289 1.1.1.3 mrg s.r = randlimb_half(); 2290 1.1.1.3 mrg param.function = speed_mpn_div_qr_1_tune; 2291 1.1.1.3 mrg one (&div_qr_1_unnorm_threshold, ¶m); 2292 1.1.1.3 mrg } 2293 1.1.1.3 mrg } 2294 1.1.1.3 mrg 2295 1.1 mrg 2296 1.1 mrg void 2297 1.1 mrg tune_mod_1 (void) 2298 1.1 mrg { 2299 1.1 mrg /* No support for tuning native assembler code, do that by hand and put 2300 1.1 mrg the results in the .asm file, there's no need for such thresholds to 2301 1.1 mrg appear in gmp-mparam.h. */ 2302 1.1 mrg if (HAVE_NATIVE_mpn_mod_1) 2303 1.1 mrg return; 2304 1.1 mrg 2305 1.1 mrg if (GMP_NAIL_BITS != 0) 2306 1.1 mrg { 2307 1.1 mrg print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2308 1.1 mrg "no preinv with nails"); 2309 1.1 mrg print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2310 1.1 mrg "no preinv with nails"); 2311 1.1 mrg return; 2312 1.1 mrg } 2313 1.1 mrg 2314 1.1.1.2 mrg if (!HAVE_NATIVE_mpn_mod_1_1p) 2315 1.1.1.2 mrg { 2316 1.1.1.2 mrg static struct param_t param; 2317 1.1.1.4 mrg speed_function_t f[2] = 2318 1.1.1.4 mrg { 2319 1.1.1.4 mrg speed_mpn_mod_1_1_1, 2320 1.1.1.4 mrg speed_mpn_mod_1_1_2, 2321 1.1.1.4 mrg }; 2322 1.1.1.2 mrg 2323 1.1.1.2 mrg s.size = 10; 2324 1.1.1.2 mrg s.r = randlimb_half (); 2325 1.1.1.4 mrg one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", ¶m); 2326 1.1.1.2 mrg } 2327 1.1.1.2 mrg 2328 1.1 mrg if (UDIV_PREINV_ALWAYS) 2329 1.1 mrg { 2330 1.1 mrg print_define ("MOD_1_NORM_THRESHOLD", 0L); 2331 1.1 mrg print_define ("MOD_1_UNNORM_THRESHOLD", 0L); 2332 1.1 mrg } 2333 1.1 mrg else 2334 1.1 mrg { 2335 1.1 mrg { 2336 1.1 mrg static struct param_t param; 2337 1.1 mrg param.name = "MOD_1_NORM_THRESHOLD"; 2338 1.1 mrg DIV_1_PARAMS; 2339 1.1 mrg s.r = randlimb_norm (); 2340 1.1 mrg param.function = speed_mpn_mod_1_tune; 2341 1.1 mrg one (&mod_1_norm_threshold, ¶m); 2342 1.1 mrg } 2343 1.1 mrg { 2344 1.1 mrg static struct param_t param; 2345 1.1 mrg param.name = "MOD_1_UNNORM_THRESHOLD"; 2346 1.1 mrg DIV_1_PARAMS; 2347 1.1 mrg s.r = randlimb_half (); 2348 1.1 mrg param.function = speed_mpn_mod_1_tune; 2349 1.1 mrg one (&mod_1_unnorm_threshold, ¶m); 2350 1.1 mrg } 2351 1.1 mrg } 2352 1.1 mrg { 2353 1.1 mrg static struct param_t param; 2354 1.1 mrg 2355 1.1 mrg param.check_size = 256; 2356 1.1 mrg 2357 1.1 mrg s.r = randlimb_norm (); 2358 1.1 mrg param.function = speed_mpn_mod_1_tune; 2359 1.1 mrg 2360 1.1 mrg param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD"; 2361 1.1 mrg param.min_size = 2; 2362 1.1 mrg one (&mod_1n_to_mod_1_1_threshold, ¶m); 2363 1.1 mrg } 2364 1.1.1.2 mrg 2365 1.1 mrg { 2366 1.1 mrg static struct param_t param; 2367 1.1 mrg 2368 1.1 mrg param.check_size = 256; 2369 1.1.1.2 mrg s.r = randlimb_half (); 2370 1.1 mrg param.noprint = 1; 2371 1.1 mrg 2372 1.1.1.2 mrg param.function = speed_mpn_mod_1_1; 2373 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_2; 2374 1.1.1.2 mrg param.min_is_always = 1; 2375 1.1 mrg param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD"; 2376 1.1.1.2 mrg param.min_size = 2; 2377 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2378 1.1 mrg 2379 1.1.1.2 mrg param.function = speed_mpn_mod_1_2; 2380 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_4; 2381 1.1.1.2 mrg param.min_is_always = 1; 2382 1.1 mrg param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD"; 2383 1.1.1.2 mrg param.min_size = 1; 2384 1.1 mrg one (&mod_1_2_to_mod_1_4_threshold, ¶m); 2385 1.1 mrg 2386 1.1.1.2 mrg if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold) 2387 1.1 mrg { 2388 1.1.1.2 mrg /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */ 2389 1.1 mrg mod_1_2_to_mod_1_4_threshold = 0; 2390 1.1 mrg 2391 1.1.1.2 mrg param.function = speed_mpn_mod_1_1; 2392 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_4; 2393 1.1.1.2 mrg param.min_is_always = 1; 2394 1.1.1.2 mrg param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake"; 2395 1.1.1.2 mrg param.min_size = 2; 2396 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2397 1.1 mrg } 2398 1.1.1.2 mrg 2399 1.1.1.2 mrg param.function = speed_mpn_mod_1_tune; 2400 1.1.1.2 mrg param.function2 = NULL; 2401 1.1.1.2 mrg param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; 2402 1.1.1.2 mrg param.min_size = 2; 2403 1.1.1.2 mrg param.min_is_always = 0; 2404 1.1.1.2 mrg one (&mod_1u_to_mod_1_1_threshold, ¶m); 2405 1.1.1.2 mrg 2406 1.1.1.2 mrg if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold) 2407 1.1.1.2 mrg mod_1_1_to_mod_1_2_threshold = 0; 2408 1.1.1.2 mrg if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold) 2409 1.1.1.2 mrg mod_1_2_to_mod_1_4_threshold = 0; 2410 1.1.1.2 mrg 2411 1.1.1.2 mrg print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); 2412 1.1.1.2 mrg print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, 2413 1.1.1.2 mrg mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL); 2414 1.1.1.2 mrg print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, 2415 1.1.1.2 mrg mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL); 2416 1.1 mrg } 2417 1.1 mrg 2418 1.1 mrg { 2419 1.1 mrg static struct param_t param; 2420 1.1 mrg 2421 1.1 mrg param.check_size = 256; 2422 1.1 mrg 2423 1.1 mrg param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; 2424 1.1 mrg s.r = randlimb_norm (); 2425 1.1 mrg param.function = speed_mpn_preinv_mod_1; 2426 1.1 mrg param.function2 = speed_mpn_mod_1_tune; 2427 1.1 mrg param.min_size = 1; 2428 1.1 mrg one (&preinv_mod_1_to_mod_1_threshold, ¶m); 2429 1.1 mrg } 2430 1.1 mrg } 2431 1.1 mrg 2432 1.1 mrg 2433 1.1 mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would 2434 1.1 mrg imply that udiv_qrnnd_preinv is worth using, but it seems most 2435 1.1 mrg straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div 2436 1.1 mrg directly. */ 2437 1.1 mrg 2438 1.1 mrg void 2439 1.1 mrg tune_preinv_divrem_1 (void) 2440 1.1 mrg { 2441 1.1 mrg static struct param_t param; 2442 1.1 mrg speed_function_t divrem_1; 2443 1.1 mrg const char *divrem_1_name; 2444 1.1 mrg double t1, t2; 2445 1.1 mrg 2446 1.1 mrg if (GMP_NAIL_BITS != 0) 2447 1.1 mrg { 2448 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); 2449 1.1 mrg return; 2450 1.1 mrg } 2451 1.1 mrg 2452 1.1 mrg /* Any native version of mpn_preinv_divrem_1 is assumed to exist because 2453 1.1 mrg it's faster than mpn_divrem_1. */ 2454 1.1 mrg if (HAVE_NATIVE_mpn_preinv_divrem_1) 2455 1.1 mrg { 2456 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); 2457 1.1 mrg return; 2458 1.1 mrg } 2459 1.1 mrg 2460 1.1 mrg /* If udiv_qrnnd_preinv is the only division method then of course 2461 1.1 mrg mpn_preinv_divrem_1 should be used. */ 2462 1.1 mrg if (UDIV_PREINV_ALWAYS) 2463 1.1 mrg { 2464 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); 2465 1.1 mrg return; 2466 1.1 mrg } 2467 1.1 mrg 2468 1.1 mrg /* If we've got an assembler version of mpn_divrem_1, then compare against 2469 1.1 mrg that, not the mpn_divrem_1_div generic C. */ 2470 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1) 2471 1.1 mrg { 2472 1.1 mrg divrem_1 = speed_mpn_divrem_1; 2473 1.1 mrg divrem_1_name = "mpn_divrem_1"; 2474 1.1 mrg } 2475 1.1 mrg else 2476 1.1 mrg { 2477 1.1 mrg divrem_1 = speed_mpn_divrem_1_div; 2478 1.1 mrg divrem_1_name = "mpn_divrem_1_div"; 2479 1.1 mrg } 2480 1.1 mrg 2481 1.1 mrg param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ 2482 1.1 mrg s.size = 200; /* generous but not too big */ 2483 1.1 mrg /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, 2484 1.1 mrg since in general that's probably most common, though in fact for a 2485 1.1 mrg 64-bit limb mp_bases[10].big_base is normalized. */ 2486 1.1 mrg s.r = urandom() & (GMP_NUMB_MASK >> 4); 2487 1.1 mrg if (s.r == 0) s.r = 123; 2488 1.1 mrg 2489 1.1 mrg t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); 2490 1.1 mrg t2 = tuneup_measure (divrem_1, ¶m, &s); 2491 1.1 mrg if (t1 == -1.0 || t2 == -1.0) 2492 1.1 mrg { 2493 1.1 mrg printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", 2494 1.1 mrg divrem_1_name, (long) s.size); 2495 1.1 mrg abort (); 2496 1.1 mrg } 2497 1.1 mrg if (option_trace >= 1) 2498 1.1 mrg printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", 2499 1.1 mrg (long) s.size, t1, divrem_1_name, t2); 2500 1.1 mrg 2501 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); 2502 1.1 mrg } 2503 1.1 mrg 2504 1.1 mrg 2505 1.1 mrg 2506 1.1 mrg void 2507 1.1 mrg tune_divrem_2 (void) 2508 1.1 mrg { 2509 1.1 mrg static struct param_t param; 2510 1.1 mrg 2511 1.1 mrg /* No support for tuning native assembler code, do that by hand and put 2512 1.1 mrg the results in the .asm file, and there's no need for such thresholds 2513 1.1 mrg to appear in gmp-mparam.h. */ 2514 1.1 mrg if (HAVE_NATIVE_mpn_divrem_2) 2515 1.1 mrg return; 2516 1.1 mrg 2517 1.1 mrg if (GMP_NAIL_BITS != 0) 2518 1.1 mrg { 2519 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, 2520 1.1 mrg "no preinv with nails"); 2521 1.1 mrg return; 2522 1.1 mrg } 2523 1.1 mrg 2524 1.1 mrg if (UDIV_PREINV_ALWAYS) 2525 1.1 mrg { 2526 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); 2527 1.1 mrg return; 2528 1.1 mrg } 2529 1.1 mrg 2530 1.1 mrg /* Tune for the integer part of mpn_divrem_2. This will very possibly be 2531 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part 2532 1.1 mrg is more important. 2533 1.1 mrg 2534 1.1 mrg min_size must be >=2 since nsize>=2 is required, but is set to 4 to save 2535 1.1 mrg code space if plain division is better only at size==2 or size==3. */ 2536 1.1 mrg param.name = "DIVREM_2_THRESHOLD"; 2537 1.1 mrg param.check_size = 256; 2538 1.1 mrg param.min_size = 4; 2539 1.1 mrg param.min_is_always = 1; 2540 1.1 mrg param.size_extra = 2; /* does qsize==nsize-2 divisions */ 2541 1.1 mrg param.stop_factor = 2.0; 2542 1.1 mrg 2543 1.1 mrg s.r = randlimb_norm (); 2544 1.1 mrg param.function = speed_mpn_divrem_2; 2545 1.1 mrg one (&divrem_2_threshold, ¶m); 2546 1.1 mrg } 2547 1.1 mrg 2548 1.1.1.2 mrg void 2549 1.1.1.2 mrg tune_div_qr_2 (void) 2550 1.1.1.2 mrg { 2551 1.1.1.2 mrg static struct param_t param; 2552 1.1.1.2 mrg param.name = "DIV_QR_2_PI2_THRESHOLD"; 2553 1.1.1.2 mrg param.function = speed_mpn_div_qr_2n; 2554 1.1.1.2 mrg param.check_size = 500; 2555 1.1.1.2 mrg param.min_size = 4; 2556 1.1.1.2 mrg one (&div_qr_2_pi2_threshold, ¶m); 2557 1.1.1.2 mrg } 2558 1.1 mrg 2559 1.1 mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so 2560 1.1 mrg tune for that. Its speed can differ on odd or even divisor, so take an 2561 1.1 mrg average threshold for the two. 2562 1.1 mrg 2563 1.1 mrg mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 2564 1.1 mrg might not vary that way, but don't test this since high<divisor isn't 2565 1.1 mrg expected to occur often with small divisors. */ 2566 1.1 mrg 2567 1.1 mrg void 2568 1.1 mrg tune_divexact_1 (void) 2569 1.1 mrg { 2570 1.1 mrg static struct param_t param; 2571 1.1 mrg mp_size_t thresh[2], average; 2572 1.1 mrg int low, i; 2573 1.1 mrg 2574 1.1 mrg /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a 2575 1.1 mrg full mpn_divrem_1. */ 2576 1.1 mrg if (HAVE_NATIVE_mpn_divexact_1) 2577 1.1 mrg { 2578 1.1 mrg print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); 2579 1.1 mrg return; 2580 1.1 mrg } 2581 1.1 mrg 2582 1.1 mrg ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); 2583 1.1 mrg 2584 1.1 mrg param.name = "DIVEXACT_1_THRESHOLD"; 2585 1.1 mrg param.data_high = DATA_HIGH_GE_R; 2586 1.1 mrg param.check_size = 256; 2587 1.1 mrg param.min_size = 2; 2588 1.1 mrg param.stop_factor = 1.5; 2589 1.1 mrg param.function = tuned_speed_mpn_divrem_1; 2590 1.1 mrg param.function2 = speed_mpn_divexact_1; 2591 1.1 mrg param.noprint = 1; 2592 1.1 mrg 2593 1.1 mrg print_define_start (param.name); 2594 1.1 mrg 2595 1.1 mrg for (low = 0; low <= 1; low++) 2596 1.1 mrg { 2597 1.1 mrg s.r = randlimb_half(); 2598 1.1 mrg if (low == 0) 2599 1.1 mrg s.r |= 1; 2600 1.1 mrg else 2601 1.1 mrg s.r &= ~CNST_LIMB(7); 2602 1.1 mrg 2603 1.1 mrg one (&thresh[low], ¶m); 2604 1.1 mrg if (option_trace) 2605 1.1 mrg printf ("low=%d thresh %ld\n", low, (long) thresh[low]); 2606 1.1 mrg 2607 1.1 mrg if (thresh[low] == MP_SIZE_T_MAX) 2608 1.1 mrg { 2609 1.1 mrg average = MP_SIZE_T_MAX; 2610 1.1 mrg goto divexact_1_done; 2611 1.1 mrg } 2612 1.1 mrg } 2613 1.1 mrg 2614 1.1 mrg if (option_trace) 2615 1.1 mrg { 2616 1.1 mrg printf ("average of:"); 2617 1.1 mrg for (i = 0; i < numberof(thresh); i++) 2618 1.1 mrg printf (" %ld", (long) thresh[i]); 2619 1.1 mrg printf ("\n"); 2620 1.1 mrg } 2621 1.1 mrg 2622 1.1 mrg average = 0; 2623 1.1 mrg for (i = 0; i < numberof(thresh); i++) 2624 1.1 mrg average += thresh[i]; 2625 1.1 mrg average /= numberof(thresh); 2626 1.1 mrg 2627 1.1 mrg /* If divexact turns out to be better as early as 3 limbs, then use it 2628 1.1 mrg always, so as to reduce code size and conditional jumps. */ 2629 1.1 mrg if (average <= 3) 2630 1.1 mrg average = 0; 2631 1.1 mrg 2632 1.1 mrg divexact_1_done: 2633 1.1 mrg print_define_end (param.name, average); 2634 1.1 mrg } 2635 1.1 mrg 2636 1.1 mrg 2637 1.1 mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the 2638 1.1 mrg same as mpn_mod_1, but this might not be true of an assembler 2639 1.1 mrg implementation. The threshold used is an average based on data where a 2640 1.1 mrg divide can be skipped and where it can't. 2641 1.1 mrg 2642 1.1 mrg If modexact turns out to be better as early as 3 limbs, then use it 2643 1.1 mrg always, so as to reduce code size and conditional jumps. */ 2644 1.1 mrg 2645 1.1 mrg void 2646 1.1 mrg tune_modexact_1_odd (void) 2647 1.1 mrg { 2648 1.1 mrg static struct param_t param; 2649 1.1 mrg mp_size_t thresh_lt, thresh_ge, average; 2650 1.1 mrg 2651 1.1 mrg #if 0 2652 1.1 mrg /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed 2653 1.1 mrg of a full mpn_mod_1. */ 2654 1.1 mrg if (HAVE_NATIVE_mpn_modexact_1_odd) 2655 1.1 mrg { 2656 1.1 mrg print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1"); 2657 1.1 mrg return; 2658 1.1 mrg } 2659 1.1 mrg #endif 2660 1.1 mrg 2661 1.1 mrg param.name = "BMOD_1_TO_MOD_1_THRESHOLD"; 2662 1.1 mrg param.check_size = 256; 2663 1.1 mrg param.min_size = 2; 2664 1.1 mrg param.stop_factor = 1.5; 2665 1.1 mrg param.function = speed_mpn_modexact_1c_odd; 2666 1.1 mrg param.function2 = speed_mpn_mod_1_tune; 2667 1.1 mrg param.noprint = 1; 2668 1.1 mrg s.r = randlimb_half () | 1; 2669 1.1 mrg 2670 1.1 mrg print_define_start (param.name); 2671 1.1 mrg 2672 1.1 mrg param.data_high = DATA_HIGH_LT_R; 2673 1.1 mrg one (&thresh_lt, ¶m); 2674 1.1 mrg if (option_trace) 2675 1.1 mrg printf ("lt thresh %ld\n", (long) thresh_lt); 2676 1.1 mrg 2677 1.1 mrg average = thresh_lt; 2678 1.1 mrg if (thresh_lt != MP_SIZE_T_MAX) 2679 1.1 mrg { 2680 1.1 mrg param.data_high = DATA_HIGH_GE_R; 2681 1.1 mrg one (&thresh_ge, ¶m); 2682 1.1 mrg if (option_trace) 2683 1.1 mrg printf ("ge thresh %ld\n", (long) thresh_ge); 2684 1.1 mrg 2685 1.1 mrg if (thresh_ge != MP_SIZE_T_MAX) 2686 1.1 mrg { 2687 1.1 mrg average = (thresh_ge + thresh_lt) / 2; 2688 1.1 mrg if (thresh_ge <= 3) 2689 1.1 mrg average = 0; 2690 1.1 mrg } 2691 1.1 mrg } 2692 1.1 mrg 2693 1.1 mrg print_define_end (param.name, average); 2694 1.1 mrg } 2695 1.1 mrg 2696 1.1 mrg 2697 1.1 mrg void 2698 1.1 mrg tune_jacobi_base (void) 2699 1.1 mrg { 2700 1.1 mrg static struct param_t param; 2701 1.1.1.4 mrg speed_function_t f[4] = 2702 1.1 mrg { 2703 1.1.1.4 mrg speed_mpn_jacobi_base_1, 2704 1.1.1.4 mrg speed_mpn_jacobi_base_2, 2705 1.1.1.4 mrg speed_mpn_jacobi_base_3, 2706 1.1.1.4 mrg speed_mpn_jacobi_base_4, 2707 1.1.1.4 mrg }; 2708 1.1 mrg 2709 1.1.1.4 mrg s.size = GMP_LIMB_BITS * 3 / 4; 2710 1.1 mrg 2711 1.1.1.4 mrg one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", ¶m); 2712 1.1 mrg } 2713 1.1 mrg 2714 1.1 mrg 2715 1.1 mrg void 2716 1.1 mrg tune_get_str (void) 2717 1.1 mrg { 2718 1.1 mrg /* Tune for decimal, it being most common. Some rough testing suggests 2719 1.1 mrg other bases are different, but not by very much. */ 2720 1.1 mrg s.r = 10; 2721 1.1 mrg { 2722 1.1 mrg static struct param_t param; 2723 1.1 mrg GET_STR_PRECOMPUTE_THRESHOLD = 0; 2724 1.1 mrg param.name = "GET_STR_DC_THRESHOLD"; 2725 1.1 mrg param.function = speed_mpn_get_str; 2726 1.1 mrg param.min_size = 4; 2727 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT; 2728 1.1 mrg one (&get_str_dc_threshold, ¶m); 2729 1.1 mrg } 2730 1.1 mrg { 2731 1.1 mrg static struct param_t param; 2732 1.1 mrg param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; 2733 1.1 mrg param.function = speed_mpn_get_str; 2734 1.1 mrg param.min_size = GET_STR_DC_THRESHOLD; 2735 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT; 2736 1.1 mrg one (&get_str_precompute_threshold, ¶m); 2737 1.1 mrg } 2738 1.1 mrg } 2739 1.1 mrg 2740 1.1 mrg 2741 1.1 mrg double 2742 1.1 mrg speed_mpn_pre_set_str (struct speed_params *s) 2743 1.1 mrg { 2744 1.1 mrg unsigned char *str; 2745 1.1 mrg mp_ptr wp; 2746 1.1 mrg mp_size_t wn; 2747 1.1 mrg unsigned i; 2748 1.1 mrg int base; 2749 1.1 mrg double t; 2750 1.1 mrg mp_ptr powtab_mem, tp; 2751 1.1 mrg powers_t powtab[GMP_LIMB_BITS]; 2752 1.1 mrg mp_size_t un; 2753 1.1 mrg int chars_per_limb; 2754 1.1.1.5 mrg size_t n_pows; 2755 1.1.1.5 mrg powers_t *pt; 2756 1.1 mrg TMP_DECL; 2757 1.1 mrg 2758 1.1 mrg SPEED_RESTRICT_COND (s->size >= 1); 2759 1.1 mrg 2760 1.1 mrg base = s->r == 0 ? 10 : s->r; 2761 1.1 mrg SPEED_RESTRICT_COND (base >= 2 && base <= 256); 2762 1.1 mrg 2763 1.1 mrg TMP_MARK; 2764 1.1 mrg 2765 1.1.1.3 mrg str = (unsigned char *) TMP_ALLOC (s->size); 2766 1.1 mrg for (i = 0; i < s->size; i++) 2767 1.1 mrg str[i] = s->xp[i] % base; 2768 1.1 mrg 2769 1.1.1.2 mrg LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); 2770 1.1 mrg SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); 2771 1.1 mrg 2772 1.1 mrg /* use this during development to check wn is big enough */ 2773 1.1 mrg /* 2774 1.1 mrg ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); 2775 1.1 mrg */ 2776 1.1 mrg 2777 1.1.1.3 mrg speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES); 2778 1.1 mrg speed_operand_dst (s, wp, wn); 2779 1.1 mrg speed_cache_fill (s); 2780 1.1 mrg 2781 1.1 mrg chars_per_limb = mp_bases[base].chars_per_limb; 2782 1.1 mrg un = s->size / chars_per_limb + 1; 2783 1.1.1.4 mrg powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); 2784 1.1.1.5 mrg n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base); 2785 1.1.1.5 mrg pt = powtab + n_pows; 2786 1.1 mrg tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); 2787 1.1 mrg 2788 1.1 mrg speed_starttime (); 2789 1.1 mrg i = s->reps; 2790 1.1 mrg do 2791 1.1 mrg { 2792 1.1.1.4 mrg mpn_pre_set_str (wp, str, s->size, pt, tp); 2793 1.1 mrg } 2794 1.1 mrg while (--i != 0); 2795 1.1 mrg t = speed_endtime (); 2796 1.1 mrg 2797 1.1 mrg TMP_FREE; 2798 1.1 mrg return t; 2799 1.1 mrg } 2800 1.1 mrg 2801 1.1 mrg void 2802 1.1 mrg tune_set_str (void) 2803 1.1 mrg { 2804 1.1 mrg s.r = 10; /* decimal */ 2805 1.1 mrg { 2806 1.1 mrg static struct param_t param; 2807 1.1 mrg SET_STR_PRECOMPUTE_THRESHOLD = 0; 2808 1.1 mrg param.step_factor = 0.01; 2809 1.1 mrg param.name = "SET_STR_DC_THRESHOLD"; 2810 1.1 mrg param.function = speed_mpn_pre_set_str; 2811 1.1 mrg param.min_size = 100; 2812 1.1 mrg param.max_size = 50000; 2813 1.1 mrg one (&set_str_dc_threshold, ¶m); 2814 1.1 mrg } 2815 1.1 mrg { 2816 1.1 mrg static struct param_t param; 2817 1.1 mrg param.step_factor = 0.02; 2818 1.1 mrg param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; 2819 1.1 mrg param.function = speed_mpn_set_str; 2820 1.1 mrg param.min_size = SET_STR_DC_THRESHOLD; 2821 1.1 mrg param.max_size = 100000; 2822 1.1 mrg one (&set_str_precompute_threshold, ¶m); 2823 1.1 mrg } 2824 1.1 mrg } 2825 1.1 mrg 2826 1.1 mrg 2827 1.1 mrg void 2828 1.1 mrg tune_fft_mul (void) 2829 1.1 mrg { 2830 1.1 mrg static struct fft_param_t param; 2831 1.1 mrg 2832 1.1 mrg if (option_fft_max_size == 0) 2833 1.1 mrg return; 2834 1.1 mrg 2835 1.1 mrg param.table_name = "MUL_FFT_TABLE3"; 2836 1.1 mrg param.threshold_name = "MUL_FFT_THRESHOLD"; 2837 1.1 mrg param.p_threshold = &mul_fft_threshold; 2838 1.1 mrg param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; 2839 1.1 mrg param.p_modf_threshold = &mul_fft_modf_threshold; 2840 1.1 mrg param.first_size = MUL_TOOM33_THRESHOLD / 2; 2841 1.1 mrg param.max_size = option_fft_max_size; 2842 1.1 mrg param.function = speed_mpn_fft_mul; 2843 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft; 2844 1.1 mrg param.mul_function = speed_mpn_mul_n; 2845 1.1 mrg param.sqr = 0; 2846 1.1 mrg fft (¶m); 2847 1.1 mrg } 2848 1.1 mrg 2849 1.1 mrg 2850 1.1 mrg void 2851 1.1 mrg tune_fft_sqr (void) 2852 1.1 mrg { 2853 1.1 mrg static struct fft_param_t param; 2854 1.1 mrg 2855 1.1 mrg if (option_fft_max_size == 0) 2856 1.1 mrg return; 2857 1.1 mrg 2858 1.1 mrg param.table_name = "SQR_FFT_TABLE3"; 2859 1.1 mrg param.threshold_name = "SQR_FFT_THRESHOLD"; 2860 1.1 mrg param.p_threshold = &sqr_fft_threshold; 2861 1.1 mrg param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; 2862 1.1 mrg param.p_modf_threshold = &sqr_fft_modf_threshold; 2863 1.1 mrg param.first_size = SQR_TOOM3_THRESHOLD / 2; 2864 1.1 mrg param.max_size = option_fft_max_size; 2865 1.1 mrg param.function = speed_mpn_fft_sqr; 2866 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft_sqr; 2867 1.1 mrg param.mul_function = speed_mpn_sqr; 2868 1.1 mrg param.sqr = 1; 2869 1.1 mrg fft (¶m); 2870 1.1 mrg } 2871 1.1 mrg 2872 1.1 mrg void 2873 1.1.1.2 mrg tune_fac_ui (void) 2874 1.1.1.2 mrg { 2875 1.1.1.2 mrg static struct param_t param; 2876 1.1.1.2 mrg 2877 1.1.1.2 mrg param.function = speed_mpz_fac_ui_tune; 2878 1.1.1.2 mrg 2879 1.1.1.2 mrg param.name = "FAC_DSC_THRESHOLD"; 2880 1.1.1.2 mrg param.min_size = 70; 2881 1.1.1.2 mrg param.max_size = FAC_DSC_THRESHOLD_LIMIT; 2882 1.1.1.2 mrg one (&fac_dsc_threshold, ¶m); 2883 1.1.1.2 mrg 2884 1.1.1.2 mrg param.name = "FAC_ODD_THRESHOLD"; 2885 1.1.1.2 mrg param.min_size = 22; 2886 1.1.1.2 mrg param.stop_factor = 1.7; 2887 1.1.1.2 mrg param.min_is_always = 1; 2888 1.1.1.2 mrg one (&fac_odd_threshold, ¶m); 2889 1.1.1.2 mrg } 2890 1.1.1.2 mrg 2891 1.1.1.2 mrg void 2892 1.1 mrg all (void) 2893 1.1 mrg { 2894 1.1 mrg time_t start_time, end_time; 2895 1.1 mrg TMP_DECL; 2896 1.1 mrg 2897 1.1 mrg TMP_MARK; 2898 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); 2899 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); 2900 1.1 mrg 2901 1.1 mrg mpn_random (s.xp_block, SPEED_BLOCK_SIZE); 2902 1.1 mrg mpn_random (s.yp_block, SPEED_BLOCK_SIZE); 2903 1.1 mrg 2904 1.1 mrg fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); 2905 1.1 mrg 2906 1.1 mrg speed_time_init (); 2907 1.1 mrg fprintf (stderr, "Using: %s\n", speed_time_string); 2908 1.1 mrg 2909 1.1 mrg fprintf (stderr, "speed_precision %d", speed_precision); 2910 1.1 mrg if (speed_unittime == 1.0) 2911 1.1 mrg fprintf (stderr, ", speed_unittime 1 cycle"); 2912 1.1 mrg else 2913 1.1 mrg fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); 2914 1.1 mrg if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 2915 1.1 mrg fprintf (stderr, ", CPU freq unknown\n"); 2916 1.1 mrg else 2917 1.1 mrg fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); 2918 1.1 mrg 2919 1.1 mrg fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", 2920 1.1 mrg DEFAULT_MAX_SIZE, (long) option_fft_max_size); 2921 1.1 mrg fprintf (stderr, "\n"); 2922 1.1 mrg 2923 1.1 mrg time (&start_time); 2924 1.1 mrg { 2925 1.1 mrg struct tm *tp; 2926 1.1 mrg tp = localtime (&start_time); 2927 1.1 mrg printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", 2928 1.1 mrg tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 2929 1.1 mrg 2930 1.1 mrg #ifdef __GNUC__ 2931 1.1 mrg /* gcc sub-minor version doesn't seem to come through as a define */ 2932 1.1 mrg printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 2933 1.1 mrg #define PRINTED_COMPILER 2934 1.1 mrg #endif 2935 1.1 mrg #if defined (__SUNPRO_C) 2936 1.1 mrg printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 2937 1.1 mrg #define PRINTED_COMPILER 2938 1.1 mrg #endif 2939 1.1 mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) 2940 1.1 mrg /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ 2941 1.1 mrg printf ("MIPSpro C %d.%d.%d */\n", 2942 1.1 mrg _COMPILER_VERSION / 100, 2943 1.1 mrg _COMPILER_VERSION / 10 % 10, 2944 1.1 mrg _COMPILER_VERSION % 10); 2945 1.1 mrg #define PRINTED_COMPILER 2946 1.1 mrg #endif 2947 1.1 mrg #if defined (__DECC) && defined (__DECC_VER) 2948 1.1 mrg printf ("DEC C %d */\n", __DECC_VER); 2949 1.1 mrg #define PRINTED_COMPILER 2950 1.1 mrg #endif 2951 1.1 mrg #if ! defined (PRINTED_COMPILER) 2952 1.1 mrg printf ("system compiler */\n"); 2953 1.1 mrg #endif 2954 1.1 mrg } 2955 1.1 mrg printf ("\n"); 2956 1.1 mrg 2957 1.1 mrg tune_divrem_1 (); 2958 1.1 mrg tune_mod_1 (); 2959 1.1 mrg tune_preinv_divrem_1 (); 2960 1.1.1.3 mrg tune_div_qr_1 (); 2961 1.1.1.2 mrg #if 0 2962 1.1 mrg tune_divrem_2 (); 2963 1.1.1.2 mrg #endif 2964 1.1.1.2 mrg tune_div_qr_2 (); 2965 1.1 mrg tune_divexact_1 (); 2966 1.1 mrg tune_modexact_1_odd (); 2967 1.1 mrg printf("\n"); 2968 1.1 mrg 2969 1.1.1.4 mrg relspeed_div_1_vs_mul_1 (); 2970 1.1.1.4 mrg printf("\n"); 2971 1.1.1.4 mrg 2972 1.1 mrg tune_mul_n (); 2973 1.1 mrg printf("\n"); 2974 1.1 mrg 2975 1.1 mrg tune_mul (); 2976 1.1 mrg printf("\n"); 2977 1.1 mrg 2978 1.1 mrg tune_sqr (); 2979 1.1 mrg printf("\n"); 2980 1.1 mrg 2981 1.1.1.2 mrg tune_mulmid (); 2982 1.1.1.2 mrg printf("\n"); 2983 1.1.1.2 mrg 2984 1.1 mrg tune_mulmod_bnm1 (); 2985 1.1 mrg tune_sqrmod_bnm1 (); 2986 1.1 mrg printf("\n"); 2987 1.1 mrg 2988 1.1 mrg tune_fft_mul (); 2989 1.1 mrg printf("\n"); 2990 1.1 mrg 2991 1.1 mrg tune_fft_sqr (); 2992 1.1 mrg printf ("\n"); 2993 1.1 mrg 2994 1.1 mrg tune_mullo (); 2995 1.1.1.3 mrg tune_sqrlo (); 2996 1.1 mrg printf("\n"); 2997 1.1 mrg 2998 1.1 mrg tune_dc_div (); 2999 1.1 mrg tune_dc_bdiv (); 3000 1.1 mrg 3001 1.1 mrg printf("\n"); 3002 1.1 mrg tune_invertappr (); 3003 1.1 mrg tune_invert (); 3004 1.1 mrg printf("\n"); 3005 1.1 mrg 3006 1.1 mrg tune_binvert (); 3007 1.1 mrg tune_redc (); 3008 1.1 mrg printf("\n"); 3009 1.1 mrg 3010 1.1 mrg tune_mu_div (); 3011 1.1 mrg tune_mu_bdiv (); 3012 1.1 mrg printf("\n"); 3013 1.1 mrg 3014 1.1.1.2 mrg tune_powm_sec (); 3015 1.1.1.2 mrg printf("\n"); 3016 1.1.1.2 mrg 3017 1.1.1.3 mrg tune_get_str (); 3018 1.1.1.3 mrg tune_set_str (); 3019 1.1.1.3 mrg printf("\n"); 3020 1.1.1.3 mrg 3021 1.1.1.3 mrg tune_fac_ui (); 3022 1.1.1.3 mrg printf("\n"); 3023 1.1.1.3 mrg 3024 1.1 mrg tune_matrix22_mul (); 3025 1.1.1.4 mrg tune_hgcd2 (); 3026 1.1 mrg tune_hgcd (); 3027 1.1.1.2 mrg tune_hgcd_appr (); 3028 1.1.1.2 mrg tune_hgcd_reduce(); 3029 1.1 mrg tune_gcd_dc (); 3030 1.1 mrg tune_gcdext_dc (); 3031 1.1 mrg tune_jacobi_base (); 3032 1.1 mrg printf("\n"); 3033 1.1 mrg 3034 1.1 mrg time (&end_time); 3035 1.1 mrg printf ("/* Tuneup completed successfully, took %ld seconds */\n", 3036 1.1 mrg (long) (end_time - start_time)); 3037 1.1 mrg 3038 1.1 mrg TMP_FREE; 3039 1.1 mrg } 3040 1.1 mrg 3041 1.1 mrg 3042 1.1 mrg int 3043 1.1 mrg main (int argc, char *argv[]) 3044 1.1 mrg { 3045 1.1 mrg int opt; 3046 1.1 mrg 3047 1.1 mrg /* Unbuffered so if output is redirected to a file it isn't lost if the 3048 1.1 mrg program is killed part way through. */ 3049 1.1 mrg setbuf (stdout, NULL); 3050 1.1 mrg setbuf (stderr, NULL); 3051 1.1 mrg 3052 1.1 mrg while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) 3053 1.1 mrg { 3054 1.1 mrg switch (opt) { 3055 1.1 mrg case 'f': 3056 1.1 mrg if (optarg[0] == 't') 3057 1.1 mrg option_fft_trace = 2; 3058 1.1 mrg else 3059 1.1 mrg option_fft_max_size = atol (optarg); 3060 1.1 mrg break; 3061 1.1 mrg case 'o': 3062 1.1 mrg speed_option_set (optarg); 3063 1.1 mrg break; 3064 1.1 mrg case 'p': 3065 1.1 mrg speed_precision = atoi (optarg); 3066 1.1 mrg break; 3067 1.1 mrg case 't': 3068 1.1 mrg option_trace++; 3069 1.1 mrg break; 3070 1.1 mrg case '?': 3071 1.1 mrg exit(1); 3072 1.1 mrg } 3073 1.1 mrg } 3074 1.1 mrg 3075 1.1 mrg all (); 3076 1.1 mrg exit (0); 3077 1.1 mrg } 3078