tuneup.c revision 1.1.1.4 1 1.1 mrg /* Create tuned thresholds for various algorithms.
2 1.1 mrg
3 1.1.1.4 mrg Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
4 1.1 mrg
5 1.1 mrg This file is part of the GNU MP Library.
6 1.1 mrg
7 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
8 1.1.1.3 mrg it under the terms of either:
9 1.1.1.3 mrg
10 1.1.1.3 mrg * the GNU Lesser General Public License as published by the Free
11 1.1.1.3 mrg Software Foundation; either version 3 of the License, or (at your
12 1.1.1.3 mrg option) any later version.
13 1.1.1.3 mrg
14 1.1.1.3 mrg or
15 1.1.1.3 mrg
16 1.1.1.3 mrg * the GNU General Public License as published by the Free Software
17 1.1.1.3 mrg Foundation; either version 2 of the License, or (at your option) any
18 1.1.1.3 mrg later version.
19 1.1.1.3 mrg
20 1.1.1.3 mrg or both in parallel, as here.
21 1.1 mrg
22 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
23 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 1.1.1.3 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 1.1.1.3 mrg for more details.
26 1.1 mrg
27 1.1.1.3 mrg You should have received copies of the GNU General Public License and the
28 1.1.1.3 mrg GNU Lesser General Public License along with the GNU MP Library. If not,
29 1.1.1.3 mrg see https://www.gnu.org/licenses/. */
30 1.1 mrg
31 1.1 mrg
32 1.1 mrg /* Usage: tuneup [-t] [-t] [-p precision]
33 1.1 mrg
34 1.1 mrg -t turns on some diagnostic traces, a second -t turns on more traces.
35 1.1 mrg
36 1.1 mrg Notes:
37 1.1 mrg
38 1.1 mrg The code here isn't a vision of loveliness, mainly because it's subject
39 1.1 mrg to ongoing changes according to new things wanting to be tuned, and
40 1.1 mrg practical requirements of systems tested.
41 1.1 mrg
42 1.1 mrg Sometimes running the program twice produces slightly different results.
43 1.1 mrg This is probably because there's so little separating algorithms near
44 1.1 mrg their crossover, and on that basis it should make little or no difference
45 1.1 mrg to the final speed of the relevant routines, but nothing has been done to
46 1.1 mrg check that carefully.
47 1.1 mrg
48 1.1 mrg Algorithm:
49 1.1 mrg
50 1.1 mrg The thresholds are determined as follows. A crossover may not be a
51 1.1 mrg single size but rather a range where it oscillates between method A or
52 1.1 mrg method B faster. If the threshold is set making B used where A is faster
53 1.1 mrg (or vice versa) that's bad. Badness is the percentage time lost and
54 1.1 mrg total badness is the sum of this over all sizes measured. The threshold
55 1.1 mrg is set to minimize total badness.
56 1.1 mrg
57 1.1 mrg Suppose, as sizes increase, method B becomes faster than method A. The
58 1.1 mrg effect of the rule is that, as you look at increasing sizes, isolated
59 1.1 mrg points where B is faster are ignored, but when it's consistently faster,
60 1.1 mrg or faster on balance, then the threshold is set there. The same result
61 1.1 mrg is obtained thinking in the other direction of A becoming faster at
62 1.1 mrg smaller sizes.
63 1.1 mrg
64 1.1 mrg In practice the thresholds tend to be chosen to bring on the next
65 1.1 mrg algorithm fairly quickly.
66 1.1 mrg
67 1.1 mrg This rule is attractive because it's got a basis in reason and is fairly
68 1.1 mrg easy to implement, but no work has been done to actually compare it in
69 1.1 mrg absolute terms to other possibilities.
70 1.1 mrg
71 1.1 mrg Implementation:
72 1.1 mrg
73 1.1 mrg In a normal library build the thresholds are constants. To tune them
74 1.1 mrg selected objects are recompiled with the thresholds as global variables
75 1.1 mrg instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
76 1.1 mrg the end of gmp-impl.h, and rules in tune/Makefile.am.
77 1.1 mrg
78 1.1 mrg MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
79 1.1 mrg threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
80 1.1 mrg level, but recurse into the basecase.
81 1.1 mrg
82 1.1 mrg MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
83 1.1 mrg Other routines in turn will make use of both of those. Naturally the
84 1.1 mrg dependants must be tuned first.
85 1.1 mrg
86 1.1 mrg In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
87 1.1 mrg just a threshold based on comparing two routines (mpn_divrem_1 and
88 1.1 mrg mpn_divexact_1), and no further use of the value determined.
89 1.1 mrg
90 1.1 mrg Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
91 1.1 mrg just comparisons between certain routines on representative data.
92 1.1 mrg
93 1.1 mrg Shortcuts are applied when native (assembler) versions of routines exist.
94 1.1 mrg For instance a native mpn_sqr_basecase is assumed to be always faster
95 1.1 mrg than mpn_mul_basecase, with no measuring.
96 1.1 mrg
97 1.1 mrg No attempt is made to tune within assembler routines, for instance
98 1.1 mrg DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
99 1.1 mrg written and tuned all by hand. Assembler routines that might have hard
100 1.1 mrg limits are recompiled though, to make them accept a bigger range of sizes
101 1.1 mrg than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
102 1.1 mrg
103 1.1 mrg Limitations:
104 1.1 mrg
105 1.1 mrg The FFTs aren't subject to the same badness rule as the other thresholds,
106 1.1 mrg so each k is probably being brought on a touch early. This isn't likely
107 1.1 mrg to make a difference, and the simpler probing means fewer tests.
108 1.1 mrg
109 1.1 mrg */
110 1.1 mrg
111 1.1 mrg #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
112 1.1 mrg
113 1.1 mrg #include "config.h"
114 1.1 mrg
115 1.1 mrg #include <math.h>
116 1.1 mrg #include <stdio.h>
117 1.1 mrg #include <stdlib.h>
118 1.1 mrg #include <time.h>
119 1.1 mrg #if HAVE_UNISTD_H
120 1.1 mrg #include <unistd.h>
121 1.1 mrg #endif
122 1.1 mrg
123 1.1 mrg #include "gmp-impl.h"
124 1.1 mrg #include "longlong.h"
125 1.1 mrg
126 1.1 mrg #include "tests.h"
127 1.1 mrg #include "speed.h"
128 1.1 mrg
129 1.1 mrg #if !HAVE_DECL_OPTARG
130 1.1 mrg extern char *optarg;
131 1.1 mrg extern int optind, opterr;
132 1.1 mrg #endif
133 1.1 mrg
134 1.1 mrg
135 1.1 mrg #define DEFAULT_MAX_SIZE 1000 /* limbs */
136 1.1 mrg
137 1.1 mrg #if WANT_FFT
138 1.1 mrg mp_size_t option_fft_max_size = 50000; /* limbs */
139 1.1 mrg #else
140 1.1 mrg mp_size_t option_fft_max_size = 0;
141 1.1 mrg #endif
142 1.1 mrg int option_trace = 0;
143 1.1 mrg int option_fft_trace = 0;
144 1.1 mrg struct speed_params s;
145 1.1 mrg
146 1.1 mrg struct dat_t {
147 1.1 mrg mp_size_t size;
148 1.1 mrg double d;
149 1.1 mrg } *dat = NULL;
150 1.1 mrg int ndat = 0;
151 1.1 mrg int allocdat = 0;
152 1.1 mrg
153 1.1 mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
154 1.1 mrg case use zero here, which for params.max_size means no limit. */
155 1.1 mrg #ifndef TUNE_SQR_TOOM2_MAX
156 1.1 mrg #define TUNE_SQR_TOOM2_MAX 0
157 1.1 mrg #endif
158 1.1 mrg
159 1.1 mrg mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
160 1.1 mrg mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
161 1.1 mrg mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
162 1.1 mrg mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
163 1.1 mrg mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
164 1.1 mrg mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
165 1.1 mrg mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
166 1.1 mrg mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
167 1.1 mrg mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
168 1.1.1.2 mrg mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
169 1.1 mrg mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
170 1.1 mrg mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
171 1.1 mrg mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
172 1.1 mrg mp_size_t sqr_toom2_threshold
173 1.1 mrg = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
174 1.1 mrg mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
175 1.1 mrg mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
176 1.1 mrg mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
177 1.1 mrg mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
178 1.1 mrg mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
179 1.1 mrg mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
180 1.1 mrg mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
181 1.1 mrg mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
182 1.1 mrg mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
183 1.1.1.3 mrg mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX;
184 1.1.1.3 mrg mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX;
185 1.1.1.3 mrg mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX;
186 1.1.1.2 mrg mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
187 1.1 mrg mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
188 1.1 mrg mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
189 1.1.1.2 mrg mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX;
190 1.1 mrg mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
191 1.1 mrg mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
192 1.1 mrg mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
193 1.1 mrg mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
194 1.1 mrg mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
195 1.1 mrg mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
196 1.1 mrg mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
197 1.1 mrg mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
198 1.1 mrg mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
199 1.1 mrg mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
200 1.1 mrg mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
201 1.1 mrg mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
202 1.1 mrg mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
203 1.1 mrg mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
204 1.1 mrg mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
205 1.1 mrg mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
206 1.1 mrg mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
207 1.1 mrg mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
208 1.1 mrg mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
209 1.1.1.2 mrg mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
210 1.1.1.2 mrg mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
211 1.1 mrg mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
212 1.1 mrg mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
213 1.1.1.3 mrg int div_qr_1n_pi1_method = 0;
214 1.1.1.3 mrg mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX;
215 1.1.1.3 mrg mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX;
216 1.1 mrg mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
217 1.1 mrg mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
218 1.1 mrg mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
219 1.1 mrg mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
220 1.1.1.2 mrg int mod_1_1p_method = 0;
221 1.1 mrg mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
222 1.1 mrg mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
223 1.1 mrg mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
224 1.1 mrg mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
225 1.1 mrg mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
226 1.1 mrg mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
227 1.1 mrg mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
228 1.1 mrg mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
229 1.1 mrg mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
230 1.1 mrg mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
231 1.1.1.2 mrg mp_size_t fac_odd_threshold = 0;
232 1.1.1.2 mrg mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
233 1.1 mrg
234 1.1 mrg mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
235 1.1 mrg mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
236 1.1 mrg
237 1.1 mrg struct param_t {
238 1.1 mrg const char *name;
239 1.1 mrg speed_function_t function;
240 1.1 mrg speed_function_t function2;
241 1.1 mrg double step_factor; /* how much to step relatively */
242 1.1 mrg int step; /* how much to step absolutely */
243 1.1 mrg double function_fudge; /* multiplier for "function" speeds */
244 1.1 mrg int stop_since_change;
245 1.1 mrg double stop_factor;
246 1.1 mrg mp_size_t min_size;
247 1.1 mrg int min_is_always;
248 1.1 mrg mp_size_t max_size;
249 1.1 mrg mp_size_t check_size;
250 1.1 mrg mp_size_t size_extra;
251 1.1 mrg
252 1.1 mrg #define DATA_HIGH_LT_R 1
253 1.1 mrg #define DATA_HIGH_GE_R 2
254 1.1 mrg int data_high;
255 1.1 mrg
256 1.1 mrg int noprint;
257 1.1 mrg };
258 1.1 mrg
259 1.1 mrg
260 1.1 mrg /* These are normally undefined when false, which suits "#if" fine.
261 1.1 mrg But give them zero values so they can be used in plain C "if"s. */
262 1.1 mrg #ifndef UDIV_PREINV_ALWAYS
263 1.1 mrg #define UDIV_PREINV_ALWAYS 0
264 1.1 mrg #endif
265 1.1 mrg #ifndef HAVE_NATIVE_mpn_divexact_1
266 1.1 mrg #define HAVE_NATIVE_mpn_divexact_1 0
267 1.1 mrg #endif
268 1.1.1.3 mrg #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
269 1.1.1.3 mrg #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
270 1.1.1.3 mrg #endif
271 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_1
272 1.1 mrg #define HAVE_NATIVE_mpn_divrem_1 0
273 1.1 mrg #endif
274 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_2
275 1.1 mrg #define HAVE_NATIVE_mpn_divrem_2 0
276 1.1 mrg #endif
277 1.1 mrg #ifndef HAVE_NATIVE_mpn_mod_1
278 1.1 mrg #define HAVE_NATIVE_mpn_mod_1 0
279 1.1 mrg #endif
280 1.1.1.2 mrg #ifndef HAVE_NATIVE_mpn_mod_1_1p
281 1.1.1.2 mrg #define HAVE_NATIVE_mpn_mod_1_1p 0
282 1.1.1.2 mrg #endif
283 1.1 mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
284 1.1 mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
285 1.1 mrg #endif
286 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
287 1.1 mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
288 1.1 mrg #endif
289 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
290 1.1 mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
291 1.1 mrg #endif
292 1.1 mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
293 1.1 mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
294 1.1 mrg #endif
295 1.1 mrg
296 1.1 mrg
297 1.1 mrg #define MAX3(a,b,c) MAX (MAX (a, b), c)
298 1.1 mrg
299 1.1 mrg mp_limb_t
300 1.1 mrg randlimb_norm (void)
301 1.1 mrg {
302 1.1 mrg mp_limb_t n;
303 1.1 mrg mpn_random (&n, 1);
304 1.1 mrg n |= GMP_NUMB_HIGHBIT;
305 1.1 mrg return n;
306 1.1 mrg }
307 1.1 mrg
308 1.1 mrg #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
309 1.1 mrg
310 1.1 mrg mp_limb_t
311 1.1 mrg randlimb_half (void)
312 1.1 mrg {
313 1.1 mrg mp_limb_t n;
314 1.1 mrg mpn_random (&n, 1);
315 1.1 mrg n &= GMP_NUMB_HALFMASK;
316 1.1 mrg n += (n==0);
317 1.1 mrg return n;
318 1.1 mrg }
319 1.1 mrg
320 1.1 mrg
321 1.1 mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
322 1.1 mrg if necessary. */
323 1.1 mrg void
324 1.1 mrg add_dat (mp_size_t size, double d)
325 1.1 mrg {
326 1.1 mrg #define ALLOCDAT_STEP 500
327 1.1 mrg
328 1.1 mrg ASSERT_ALWAYS (ndat <= allocdat);
329 1.1 mrg
330 1.1 mrg if (ndat == allocdat)
331 1.1 mrg {
332 1.1 mrg dat = (struct dat_t *) __gmp_allocate_or_reallocate
333 1.1 mrg (dat, allocdat * sizeof(dat[0]),
334 1.1 mrg (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
335 1.1 mrg allocdat += ALLOCDAT_STEP;
336 1.1 mrg }
337 1.1 mrg
338 1.1 mrg dat[ndat].size = size;
339 1.1 mrg dat[ndat].d = d;
340 1.1 mrg ndat++;
341 1.1 mrg }
342 1.1 mrg
343 1.1 mrg
344 1.1 mrg /* Return the threshold size based on the data accumulated. */
345 1.1 mrg mp_size_t
346 1.1 mrg analyze_dat (int final)
347 1.1 mrg {
348 1.1 mrg double x, min_x;
349 1.1 mrg int j, min_j;
350 1.1 mrg
351 1.1 mrg /* If the threshold is set at dat[0].size, any positive values are bad. */
352 1.1 mrg x = 0.0;
353 1.1 mrg for (j = 0; j < ndat; j++)
354 1.1 mrg if (dat[j].d > 0.0)
355 1.1 mrg x += dat[j].d;
356 1.1 mrg
357 1.1 mrg if (option_trace >= 2 && final)
358 1.1 mrg {
359 1.1 mrg printf ("\n");
360 1.1 mrg printf ("x is the sum of the badness from setting thresh at given size\n");
361 1.1 mrg printf (" (minimum x is sought)\n");
362 1.1 mrg printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
363 1.1 mrg }
364 1.1 mrg
365 1.1 mrg min_x = x;
366 1.1 mrg min_j = 0;
367 1.1 mrg
368 1.1 mrg
369 1.1 mrg /* When stepping to the next dat[j].size, positive values are no longer
370 1.1 mrg bad (so subtracted), negative values become bad (so add the absolute
371 1.1 mrg value, meaning subtract). */
372 1.1 mrg for (j = 0; j < ndat; x -= dat[j].d, j++)
373 1.1 mrg {
374 1.1 mrg if (option_trace >= 2 && final)
375 1.1 mrg printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
376 1.1 mrg
377 1.1 mrg if (x < min_x)
378 1.1 mrg {
379 1.1 mrg min_x = x;
380 1.1 mrg min_j = j;
381 1.1 mrg }
382 1.1 mrg }
383 1.1 mrg
384 1.1 mrg return min_j;
385 1.1 mrg }
386 1.1 mrg
387 1.1 mrg
388 1.1.1.3 mrg /* Measuring for recompiled mpn/generic/div_qr_1.c,
389 1.1.1.3 mrg * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
390 1.1.1.3 mrg
391 1.1.1.3 mrg mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
392 1.1.1.3 mrg
393 1.1.1.3 mrg #if defined (__cplusplus)
394 1.1.1.3 mrg extern "C" {
395 1.1.1.3 mrg #endif
396 1.1 mrg
397 1.1.1.2 mrg mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
398 1.1.1.2 mrg mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
399 1.1.1.2 mrg void mpz_fac_ui_tune (mpz_ptr, unsigned long);
400 1.1 mrg
401 1.1.1.3 mrg #if defined (__cplusplus)
402 1.1.1.3 mrg }
403 1.1.1.3 mrg #endif
404 1.1.1.3 mrg
405 1.1 mrg double
406 1.1 mrg speed_mpn_mod_1_tune (struct speed_params *s)
407 1.1 mrg {
408 1.1 mrg SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
409 1.1 mrg }
410 1.1 mrg double
411 1.1 mrg speed_mpn_divrem_1_tune (struct speed_params *s)
412 1.1 mrg {
413 1.1 mrg SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
414 1.1 mrg }
415 1.1.1.2 mrg double
416 1.1.1.2 mrg speed_mpz_fac_ui_tune (struct speed_params *s)
417 1.1.1.2 mrg {
418 1.1.1.2 mrg SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
419 1.1.1.2 mrg }
420 1.1.1.3 mrg double
421 1.1.1.3 mrg speed_mpn_div_qr_1_tune (struct speed_params *s)
422 1.1.1.3 mrg {
423 1.1.1.3 mrg SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
424 1.1.1.3 mrg }
425 1.1 mrg
426 1.1 mrg double
427 1.1 mrg tuneup_measure (speed_function_t fun,
428 1.1 mrg const struct param_t *param,
429 1.1 mrg struct speed_params *s)
430 1.1 mrg {
431 1.1 mrg static struct param_t dummy;
432 1.1 mrg double t;
433 1.1 mrg TMP_DECL;
434 1.1 mrg
435 1.1 mrg if (! param)
436 1.1 mrg param = &dummy;
437 1.1 mrg
438 1.1 mrg s->size += param->size_extra;
439 1.1 mrg
440 1.1 mrg TMP_MARK;
441 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
442 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
443 1.1 mrg
444 1.1 mrg mpn_random (s->xp, s->size);
445 1.1 mrg mpn_random (s->yp, s->size);
446 1.1 mrg
447 1.1 mrg switch (param->data_high) {
448 1.1 mrg case DATA_HIGH_LT_R:
449 1.1 mrg s->xp[s->size-1] %= s->r;
450 1.1 mrg s->yp[s->size-1] %= s->r;
451 1.1 mrg break;
452 1.1 mrg case DATA_HIGH_GE_R:
453 1.1 mrg s->xp[s->size-1] |= s->r;
454 1.1 mrg s->yp[s->size-1] |= s->r;
455 1.1 mrg break;
456 1.1 mrg }
457 1.1 mrg
458 1.1 mrg t = speed_measure (fun, s);
459 1.1 mrg
460 1.1 mrg s->size -= param->size_extra;
461 1.1 mrg
462 1.1 mrg TMP_FREE;
463 1.1 mrg return t;
464 1.1 mrg }
465 1.1 mrg
466 1.1 mrg
467 1.1 mrg #define PRINT_WIDTH 31
468 1.1 mrg
469 1.1 mrg void
470 1.1 mrg print_define_start (const char *name)
471 1.1 mrg {
472 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name);
473 1.1 mrg if (option_trace)
474 1.1 mrg printf ("...\n");
475 1.1 mrg }
476 1.1 mrg
477 1.1 mrg void
478 1.1 mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
479 1.1 mrg {
480 1.1 mrg if (option_trace)
481 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name);
482 1.1 mrg
483 1.1 mrg if (value == MP_SIZE_T_MAX)
484 1.1 mrg printf ("MP_SIZE_T_MAX");
485 1.1 mrg else
486 1.1 mrg printf ("%5ld", (long) value);
487 1.1 mrg
488 1.1 mrg if (remark != NULL)
489 1.1 mrg printf (" /* %s */", remark);
490 1.1 mrg printf ("\n");
491 1.1 mrg fflush (stdout);
492 1.1 mrg }
493 1.1 mrg
494 1.1 mrg void
495 1.1 mrg print_define_end (const char *name, mp_size_t value)
496 1.1 mrg {
497 1.1 mrg const char *remark;
498 1.1 mrg if (value == MP_SIZE_T_MAX)
499 1.1 mrg remark = "never";
500 1.1 mrg else if (value == 0)
501 1.1 mrg remark = "always";
502 1.1 mrg else
503 1.1 mrg remark = NULL;
504 1.1 mrg print_define_end_remark (name, value, remark);
505 1.1 mrg }
506 1.1 mrg
507 1.1 mrg void
508 1.1 mrg print_define (const char *name, mp_size_t value)
509 1.1 mrg {
510 1.1 mrg print_define_start (name);
511 1.1 mrg print_define_end (name, value);
512 1.1 mrg }
513 1.1 mrg
514 1.1 mrg void
515 1.1 mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
516 1.1 mrg {
517 1.1 mrg print_define_start (name);
518 1.1 mrg print_define_end_remark (name, value, remark);
519 1.1 mrg }
520 1.1 mrg
521 1.1.1.4 mrg void
522 1.1.1.4 mrg print_define_with_speedup (const char *name, mp_size_t value,
523 1.1.1.4 mrg mp_size_t runner_up, double speedup)
524 1.1.1.4 mrg {
525 1.1.1.4 mrg char buf[100];
526 1.1.1.4 mrg snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
527 1.1.1.4 mrg 100.0 * (speedup - 1), runner_up);
528 1.1.1.4 mrg print_define_remark (name, value, buf);
529 1.1.1.4 mrg }
530 1.1 mrg
531 1.1 mrg void
532 1.1 mrg one (mp_size_t *threshold, struct param_t *param)
533 1.1 mrg {
534 1.1 mrg int since_positive, since_thresh_change;
535 1.1 mrg int thresh_idx, new_thresh_idx;
536 1.1 mrg
537 1.1 mrg #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
538 1.1 mrg
539 1.1 mrg DEFAULT (param->function_fudge, 1.0);
540 1.1 mrg DEFAULT (param->function2, param->function);
541 1.1 mrg DEFAULT (param->step_factor, 0.01); /* small steps by default */
542 1.1 mrg DEFAULT (param->step, 1); /* small steps by default */
543 1.1 mrg DEFAULT (param->stop_since_change, 80);
544 1.1 mrg DEFAULT (param->stop_factor, 1.2);
545 1.1 mrg DEFAULT (param->min_size, 10);
546 1.1 mrg DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
547 1.1 mrg
548 1.1 mrg if (param->check_size != 0)
549 1.1 mrg {
550 1.1 mrg double t1, t2;
551 1.1 mrg s.size = param->check_size;
552 1.1 mrg
553 1.1 mrg *threshold = s.size+1;
554 1.1 mrg t1 = tuneup_measure (param->function, param, &s);
555 1.1 mrg
556 1.1 mrg *threshold = s.size;
557 1.1 mrg t2 = tuneup_measure (param->function2, param, &s);
558 1.1 mrg if (t1 == -1.0 || t2 == -1.0)
559 1.1 mrg {
560 1.1 mrg printf ("Oops, can't run both functions at size %ld\n",
561 1.1 mrg (long) s.size);
562 1.1 mrg abort ();
563 1.1 mrg }
564 1.1 mrg t1 *= param->function_fudge;
565 1.1 mrg
566 1.1 mrg /* ask that t2 is at least 4% below t1 */
567 1.1 mrg if (t1 < t2*1.04)
568 1.1 mrg {
569 1.1 mrg if (option_trace)
570 1.1 mrg printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
571 1.1 mrg *threshold = MP_SIZE_T_MAX;
572 1.1 mrg if (! param->noprint)
573 1.1 mrg print_define (param->name, *threshold);
574 1.1 mrg return;
575 1.1 mrg }
576 1.1 mrg
577 1.1 mrg if (option_trace >= 2)
578 1.1 mrg printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
579 1.1 mrg (long) s.size, t1, t2);
580 1.1 mrg }
581 1.1 mrg
582 1.1 mrg if (! param->noprint || option_trace)
583 1.1 mrg print_define_start (param->name);
584 1.1 mrg
585 1.1 mrg ndat = 0;
586 1.1 mrg since_positive = 0;
587 1.1 mrg since_thresh_change = 0;
588 1.1 mrg thresh_idx = 0;
589 1.1 mrg
590 1.1 mrg if (option_trace >= 2)
591 1.1 mrg {
592 1.1 mrg printf (" algorithm-A algorithm-B ratio possible\n");
593 1.1 mrg printf (" (seconds) (seconds) diff thresh\n");
594 1.1 mrg }
595 1.1 mrg
596 1.1 mrg for (s.size = param->min_size;
597 1.1 mrg s.size < param->max_size;
598 1.1 mrg s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
599 1.1 mrg {
600 1.1 mrg double ti, tiplus1, d;
601 1.1 mrg
602 1.1 mrg /*
603 1.1 mrg FIXME: check minimum size requirements are met, possibly by just
604 1.1 mrg checking for the -1 returns from the speed functions.
605 1.1 mrg */
606 1.1 mrg
607 1.1 mrg /* using method A at this size */
608 1.1 mrg *threshold = s.size+1;
609 1.1 mrg ti = tuneup_measure (param->function, param, &s);
610 1.1 mrg if (ti == -1.0)
611 1.1 mrg abort ();
612 1.1 mrg ti *= param->function_fudge;
613 1.1 mrg
614 1.1 mrg /* using method B at this size */
615 1.1 mrg *threshold = s.size;
616 1.1 mrg tiplus1 = tuneup_measure (param->function2, param, &s);
617 1.1 mrg if (tiplus1 == -1.0)
618 1.1 mrg abort ();
619 1.1 mrg
620 1.1 mrg /* Calculate the fraction by which the one or the other routine is
621 1.1 mrg slower. */
622 1.1 mrg if (tiplus1 >= ti)
623 1.1 mrg d = (tiplus1 - ti) / tiplus1; /* negative */
624 1.1 mrg else
625 1.1 mrg d = (tiplus1 - ti) / ti; /* positive */
626 1.1 mrg
627 1.1 mrg add_dat (s.size, d);
628 1.1 mrg
629 1.1 mrg new_thresh_idx = analyze_dat (0);
630 1.1 mrg
631 1.1 mrg if (option_trace >= 2)
632 1.1 mrg printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
633 1.1 mrg (long) s.size, ti, tiplus1, d,
634 1.1 mrg ti > tiplus1 ? '#' : ' ',
635 1.1 mrg (long) dat[new_thresh_idx].size);
636 1.1 mrg
637 1.1 mrg /* Stop if the last time method i was faster was more than a
638 1.1 mrg certain number of measurements ago. */
639 1.1 mrg #define STOP_SINCE_POSITIVE 200
640 1.1 mrg if (d >= 0)
641 1.1 mrg since_positive = 0;
642 1.1 mrg else
643 1.1 mrg if (++since_positive > STOP_SINCE_POSITIVE)
644 1.1 mrg {
645 1.1 mrg if (option_trace >= 1)
646 1.1 mrg printf ("stopped due to since_positive (%d)\n",
647 1.1 mrg STOP_SINCE_POSITIVE);
648 1.1 mrg break;
649 1.1 mrg }
650 1.1 mrg
651 1.1 mrg /* Stop if method A has become slower by a certain factor. */
652 1.1 mrg if (ti >= tiplus1 * param->stop_factor)
653 1.1 mrg {
654 1.1 mrg if (option_trace >= 1)
655 1.1 mrg printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
656 1.1 mrg param->stop_factor);
657 1.1 mrg break;
658 1.1 mrg }
659 1.1 mrg
660 1.1 mrg /* Stop if the threshold implied hasn't changed in a certain
661 1.1 mrg number of measurements. (It's this condition that usually
662 1.1 mrg stops the loop.) */
663 1.1 mrg if (thresh_idx != new_thresh_idx)
664 1.1 mrg since_thresh_change = 0, thresh_idx = new_thresh_idx;
665 1.1 mrg else
666 1.1 mrg if (++since_thresh_change > param->stop_since_change)
667 1.1 mrg {
668 1.1 mrg if (option_trace >= 1)
669 1.1 mrg printf ("stopped due to since_thresh_change (%d)\n",
670 1.1 mrg param->stop_since_change);
671 1.1 mrg break;
672 1.1 mrg }
673 1.1 mrg
674 1.1 mrg /* Stop if the threshold implied is more than a certain number of
675 1.1 mrg measurements ago. */
676 1.1 mrg #define STOP_SINCE_AFTER 500
677 1.1 mrg if (ndat - thresh_idx > STOP_SINCE_AFTER)
678 1.1 mrg {
679 1.1 mrg if (option_trace >= 1)
680 1.1 mrg printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
681 1.1 mrg STOP_SINCE_AFTER);
682 1.1 mrg break;
683 1.1 mrg }
684 1.1 mrg
685 1.1 mrg /* Stop when the size limit is reached before the end of the
686 1.1 mrg crossover, but only show this as an error for >= the default max
687 1.1 mrg size. FIXME: Maybe should make it a param choice whether this is
688 1.1 mrg an error. */
689 1.1 mrg if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
690 1.1 mrg {
691 1.1 mrg fprintf (stderr, "%s\n", param->name);
692 1.1 mrg fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
693 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat);
694 1.1 mrg fprintf (stderr, " max size reached before end of crossover\n");
695 1.1 mrg break;
696 1.1 mrg }
697 1.1 mrg }
698 1.1 mrg
699 1.1 mrg if (option_trace >= 1)
700 1.1 mrg printf ("sizes %ld to %ld total %d measurements\n",
701 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat);
702 1.1 mrg
703 1.1 mrg *threshold = dat[analyze_dat (1)].size;
704 1.1 mrg
705 1.1 mrg if (param->min_is_always)
706 1.1 mrg {
707 1.1 mrg if (*threshold == param->min_size)
708 1.1 mrg *threshold = 0;
709 1.1 mrg }
710 1.1 mrg
711 1.1 mrg if (! param->noprint || option_trace)
712 1.1 mrg print_define_end (param->name, *threshold);
713 1.1 mrg }
714 1.1 mrg
715 1.1.1.4 mrg /* Time N different FUNCTIONS with the same parameters and size, to
716 1.1.1.4 mrg select the fastest. Since *_METHOD defines start numbering from
717 1.1.1.4 mrg one, if functions[i] is fastest, the value of the define is i+1.
718 1.1.1.4 mrg Also output a comment with speedup compared to the next fastest
719 1.1.1.4 mrg function. The NAME argument is used only for trace output.
720 1.1.1.4 mrg
721 1.1.1.4 mrg Returns the index of the fastest function.
722 1.1.1.4 mrg */
723 1.1.1.4 mrg int
724 1.1.1.4 mrg one_method (int n, speed_function_t *functions,
725 1.1.1.4 mrg const char *name, const char *define,
726 1.1.1.4 mrg const struct param_t *param)
727 1.1.1.4 mrg {
728 1.1.1.4 mrg double *t;
729 1.1.1.4 mrg int i;
730 1.1.1.4 mrg int method;
731 1.1.1.4 mrg int method_runner_up;
732 1.1.1.4 mrg
733 1.1.1.4 mrg TMP_DECL;
734 1.1.1.4 mrg TMP_MARK;
735 1.1.1.4 mrg t = (double*) TMP_ALLOC (n * sizeof (*t));
736 1.1.1.4 mrg
737 1.1.1.4 mrg for (i = 0; i < n; i++)
738 1.1.1.4 mrg {
739 1.1.1.4 mrg t[i] = tuneup_measure (functions[i], param, &s);
740 1.1.1.4 mrg if (option_trace >= 1)
741 1.1.1.4 mrg printf ("size=%ld, %s, method %d %.9f\n",
742 1.1.1.4 mrg (long) s.size, name, i + 1, t[i]);
743 1.1.1.4 mrg if (t[i] == -1.0)
744 1.1.1.4 mrg {
745 1.1.1.4 mrg printf ("Oops, can't measure all %s methods\n", name);
746 1.1.1.4 mrg abort ();
747 1.1.1.4 mrg }
748 1.1.1.4 mrg }
749 1.1.1.4 mrg method = 0;
750 1.1.1.4 mrg for (i = 1; i < n; i++)
751 1.1.1.4 mrg if (t[i] < t[method])
752 1.1.1.4 mrg method = i;
753 1.1.1.4 mrg
754 1.1.1.4 mrg method_runner_up = (method == 0);
755 1.1.1.4 mrg for (i = 0; i < n; i++)
756 1.1.1.4 mrg if (i != method && t[i] < t[method_runner_up])
757 1.1.1.4 mrg method_runner_up = i;
758 1.1.1.4 mrg
759 1.1.1.4 mrg print_define_with_speedup (define, method + 1, method_runner_up + 1,
760 1.1.1.4 mrg t[method_runner_up] / t[method]);
761 1.1.1.4 mrg
762 1.1.1.4 mrg TMP_FREE;
763 1.1.1.4 mrg return method;
764 1.1.1.4 mrg }
765 1.1.1.4 mrg
766 1.1 mrg
767 1.1 mrg /* Special probing for the fft thresholds. The size restrictions on the
768 1.1 mrg FFTs mean the graph of time vs size has a step effect. See this for
769 1.1 mrg example using
770 1.1 mrg
771 1.1 mrg ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
772 1.1 mrg gnuplot foo.gnuplot
773 1.1 mrg
774 1.1 mrg The current approach is to compare routines at the midpoint of relevant
775 1.1 mrg steps. Arguably a more sophisticated system of threshold data is wanted
776 1.1 mrg if this step effect remains. */
777 1.1 mrg
778 1.1 mrg struct fft_param_t {
779 1.1 mrg const char *table_name;
780 1.1 mrg const char *threshold_name;
781 1.1 mrg const char *modf_threshold_name;
782 1.1 mrg mp_size_t *p_threshold;
783 1.1 mrg mp_size_t *p_modf_threshold;
784 1.1 mrg mp_size_t first_size;
785 1.1 mrg mp_size_t max_size;
786 1.1 mrg speed_function_t function;
787 1.1 mrg speed_function_t mul_modf_function;
788 1.1 mrg speed_function_t mul_function;
789 1.1 mrg mp_size_t sqr;
790 1.1 mrg };
791 1.1 mrg
792 1.1 mrg
793 1.1 mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
794 1.1 mrg N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
795 1.1 mrg of 2^(k-1) bits. */
796 1.1 mrg
797 1.1 mrg mp_size_t
798 1.1 mrg fft_step_size (int k)
799 1.1 mrg {
800 1.1 mrg mp_size_t step;
801 1.1 mrg
802 1.1 mrg step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
803 1.1 mrg step *= (mp_size_t) 1 << k;
804 1.1 mrg
805 1.1 mrg if (step <= 0)
806 1.1 mrg {
807 1.1 mrg printf ("Can't handle k=%d\n", k);
808 1.1 mrg abort ();
809 1.1 mrg }
810 1.1 mrg
811 1.1 mrg return step;
812 1.1 mrg }
813 1.1 mrg
814 1.1 mrg mp_size_t
815 1.1 mrg fft_next_size (mp_size_t pl, int k)
816 1.1 mrg {
817 1.1 mrg mp_size_t m = fft_step_size (k);
818 1.1 mrg
819 1.1 mrg /* printf ("[k=%d %ld] %ld ->", k, m, pl); */
820 1.1 mrg
821 1.1 mrg if (pl == 0 || (pl & (m-1)) != 0)
822 1.1 mrg pl = (pl | (m-1)) + 1;
823 1.1 mrg
824 1.1 mrg /* printf (" %ld\n", pl); */
825 1.1 mrg return pl;
826 1.1 mrg }
827 1.1 mrg
828 1.1 mrg #define NMAX_DEFAULT 1000000
829 1.1 mrg #define MAX_REPS 25
830 1.1 mrg #define MIN_REPS 5
831 1.1 mrg
832 1.1 mrg static inline size_t
833 1.1 mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
834 1.1 mrg {
835 1.1 mrg unsigned int l = k;
836 1.1 mrg
837 1.1 mrg while (a % 2 == 0 && k > 0)
838 1.1 mrg {
839 1.1 mrg a >>= 1;
840 1.1 mrg k--;
841 1.1 mrg }
842 1.1 mrg return a << l;
843 1.1 mrg }
844 1.1 mrg
845 1.1 mrg mp_size_t
846 1.1 mrg fftfill (mp_size_t pl, int k, int sqr)
847 1.1 mrg {
848 1.1 mrg mp_size_t maxLK;
849 1.1 mrg mp_bitcnt_t N, Nprime, nprime, M;
850 1.1 mrg
851 1.1 mrg N = pl * GMP_NUMB_BITS;
852 1.1 mrg M = N >> k;
853 1.1 mrg
854 1.1 mrg maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
855 1.1 mrg
856 1.1 mrg Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
857 1.1 mrg nprime = Nprime / GMP_NUMB_BITS;
858 1.1 mrg if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
859 1.1 mrg {
860 1.1 mrg size_t K2;
861 1.1 mrg for (;;)
862 1.1 mrg {
863 1.1 mrg K2 = 1L << mpn_fft_best_k (nprime, sqr);
864 1.1 mrg if ((nprime & (K2 - 1)) == 0)
865 1.1 mrg break;
866 1.1 mrg nprime = (nprime + K2 - 1) & -K2;
867 1.1 mrg Nprime = nprime * GMP_LIMB_BITS;
868 1.1 mrg }
869 1.1 mrg }
870 1.1 mrg ASSERT_ALWAYS (nprime < pl);
871 1.1 mrg
872 1.1 mrg return Nprime;
873 1.1 mrg }
874 1.1 mrg
875 1.1 mrg static int
876 1.1 mrg compare_double (const void *ap, const void *bp)
877 1.1 mrg {
878 1.1 mrg double a = * (const double *) ap;
879 1.1 mrg double b = * (const double *) bp;
880 1.1 mrg
881 1.1 mrg if (a < b)
882 1.1 mrg return -1;
883 1.1 mrg else if (a > b)
884 1.1 mrg return 1;
885 1.1 mrg else
886 1.1 mrg return 0;
887 1.1 mrg }
888 1.1 mrg
889 1.1 mrg double
890 1.1 mrg median (double *times, int n)
891 1.1 mrg {
892 1.1 mrg qsort (times, n, sizeof (double), compare_double);
893 1.1 mrg return times[n/2];
894 1.1 mrg }
895 1.1 mrg
896 1.1 mrg #define FFT_CACHE_SIZE 25
897 1.1 mrg typedef struct fft_cache
898 1.1 mrg {
899 1.1 mrg mp_size_t n;
900 1.1 mrg double time;
901 1.1 mrg } fft_cache_t;
902 1.1 mrg
903 1.1 mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
904 1.1 mrg
905 1.1 mrg double
906 1.1 mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
907 1.1 mrg int n_measurements)
908 1.1 mrg {
909 1.1 mrg int i;
910 1.1 mrg double t, ttab[MAX_REPS];
911 1.1 mrg
912 1.1 mrg if (fft_cache[k].n == n)
913 1.1 mrg return fft_cache[k].time;
914 1.1 mrg
915 1.1 mrg for (i = 0; i < n_measurements; i++)
916 1.1 mrg {
917 1.1 mrg speed_starttime ();
918 1.1 mrg mpn_mul_fft (rp, n, ap, n, bp, n, k);
919 1.1 mrg ttab[i] = speed_endtime ();
920 1.1 mrg }
921 1.1 mrg
922 1.1 mrg t = median (ttab, n_measurements);
923 1.1 mrg fft_cache[k].n = n;
924 1.1 mrg fft_cache[k].time = t;
925 1.1 mrg return t;
926 1.1 mrg }
927 1.1 mrg
928 1.1 mrg #define INSERT_FFTTAB(idx, nval, kval) \
929 1.1 mrg do { \
930 1.1 mrg fft_tab[idx].n = nval; \
931 1.1 mrg fft_tab[idx].k = kval; \
932 1.1.1.3 mrg fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \
933 1.1.1.3 mrg fft_tab[idx+1].k = (1 << 5) - 1; \
934 1.1 mrg } while (0)
935 1.1 mrg
936 1.1 mrg int
937 1.1 mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
938 1.1 mrg {
939 1.1 mrg mp_size_t n, n1, prev_n1;
940 1.1 mrg int k, best_k, last_best_k, kmax;
941 1.1 mrg int eff, prev_eff;
942 1.1 mrg double t0, t1;
943 1.1 mrg int n_measurements;
944 1.1 mrg mp_limb_t *ap, *bp, *rp;
945 1.1 mrg mp_size_t alloc;
946 1.1 mrg struct fft_table_nk *fft_tab;
947 1.1 mrg
948 1.1 mrg fft_tab = mpn_fft_table3[p->sqr];
949 1.1 mrg
950 1.1 mrg for (k = 0; k < FFT_CACHE_SIZE; k++)
951 1.1 mrg fft_cache[k].n = 0;
952 1.1 mrg
953 1.1 mrg if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
954 1.1 mrg {
955 1.1 mrg nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
956 1.1 mrg }
957 1.1 mrg
958 1.1 mrg if (print)
959 1.1 mrg printf ("#define %s%*s", p->table_name, 38, "");
960 1.1 mrg
961 1.1 mrg if (idx == 0)
962 1.1 mrg {
963 1.1 mrg INSERT_FFTTAB (0, nmin, initial_k);
964 1.1 mrg
965 1.1 mrg if (print)
966 1.1 mrg {
967 1.1 mrg printf ("\\\n { ");
968 1.1 mrg printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
969 1.1 mrg }
970 1.1 mrg
971 1.1 mrg idx = 1;
972 1.1 mrg }
973 1.1 mrg
974 1.1.1.3 mrg ap = (mp_ptr) malloc (sizeof (mp_limb_t));
975 1.1 mrg if (p->sqr)
976 1.1 mrg bp = ap;
977 1.1 mrg else
978 1.1.1.3 mrg bp = (mp_ptr) malloc (sizeof (mp_limb_t));
979 1.1.1.3 mrg rp = (mp_ptr) malloc (sizeof (mp_limb_t));
980 1.1 mrg alloc = 1;
981 1.1 mrg
982 1.1 mrg /* Round n to comply to initial k value */
983 1.1 mrg n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
984 1.1 mrg
985 1.1 mrg n_measurements = (18 - initial_k) | 1;
986 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS);
987 1.1 mrg n_measurements = MIN (n_measurements, MAX_REPS);
988 1.1 mrg
989 1.1 mrg last_best_k = initial_k;
990 1.1 mrg best_k = initial_k;
991 1.1 mrg
992 1.1 mrg while (n < nmax)
993 1.1 mrg {
994 1.1 mrg int start_k, end_k;
995 1.1 mrg
996 1.1 mrg /* Assume the current best k is best until we hit its next FFT step. */
997 1.1 mrg t0 = 99999;
998 1.1 mrg
999 1.1 mrg prev_n1 = n + 1;
1000 1.1 mrg
1001 1.1 mrg start_k = MAX (4, best_k - 4);
1002 1.1 mrg end_k = MIN (24, best_k + 4);
1003 1.1 mrg for (k = start_k; k <= end_k; k++)
1004 1.1 mrg {
1005 1.1 mrg n1 = mpn_fft_next_size (prev_n1, k);
1006 1.1 mrg
1007 1.1 mrg eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
1008 1.1 mrg
1009 1.1 mrg if (eff < 70) /* avoid measuring too slow fft:s */
1010 1.1 mrg continue;
1011 1.1 mrg
1012 1.1 mrg if (n1 > alloc)
1013 1.1 mrg {
1014 1.1 mrg alloc = n1;
1015 1.1 mrg if (p->sqr)
1016 1.1 mrg {
1017 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1018 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1019 1.1.1.3 mrg ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1020 1.1 mrg mpn_random (ap, alloc);
1021 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1022 1.1 mrg }
1023 1.1 mrg else
1024 1.1 mrg {
1025 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1026 1.1.1.3 mrg bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
1027 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1028 1.1.1.3 mrg ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1029 1.1 mrg mpn_random (ap, alloc);
1030 1.1.1.3 mrg bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
1031 1.1 mrg mpn_random (bp, alloc);
1032 1.1.1.3 mrg rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1033 1.1 mrg }
1034 1.1 mrg }
1035 1.1 mrg
1036 1.1 mrg t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
1037 1.1 mrg
1038 1.1 mrg if (t1 * n_measurements > 0.3)
1039 1.1 mrg n_measurements -= 2;
1040 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS);
1041 1.1 mrg
1042 1.1 mrg if (t1 < t0)
1043 1.1 mrg {
1044 1.1 mrg best_k = k;
1045 1.1 mrg t0 = t1;
1046 1.1 mrg }
1047 1.1 mrg }
1048 1.1 mrg
1049 1.1 mrg n1 = mpn_fft_next_size (prev_n1, best_k);
1050 1.1 mrg
1051 1.1 mrg if (last_best_k != best_k)
1052 1.1 mrg {
1053 1.1 mrg ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
1054 1.1 mrg
1055 1.1 mrg if (idx >= FFT_TABLE3_SIZE)
1056 1.1 mrg {
1057 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1058 1.1 mrg abort ();
1059 1.1 mrg }
1060 1.1 mrg INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
1061 1.1 mrg
1062 1.1 mrg if (print)
1063 1.1 mrg {
1064 1.1 mrg printf (", ");
1065 1.1 mrg if (idx % 4 == 0)
1066 1.1 mrg printf ("\\\n ");
1067 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1068 1.1 mrg }
1069 1.1 mrg
1070 1.1 mrg if (option_trace >= 2)
1071 1.1 mrg {
1072 1.1 mrg printf ("{%lu,%u}\n", prev_n1, best_k);
1073 1.1 mrg fflush (stdout);
1074 1.1 mrg }
1075 1.1 mrg
1076 1.1 mrg last_best_k = best_k;
1077 1.1 mrg idx++;
1078 1.1 mrg }
1079 1.1 mrg
1080 1.1 mrg for (;;)
1081 1.1 mrg {
1082 1.1 mrg prev_n1 = n1;
1083 1.1 mrg prev_eff = fftfill (prev_n1, best_k, p->sqr);
1084 1.1 mrg n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
1085 1.1 mrg eff = fftfill (n1, best_k, p->sqr);
1086 1.1 mrg
1087 1.1 mrg if (eff != prev_eff)
1088 1.1 mrg break;
1089 1.1 mrg }
1090 1.1 mrg
1091 1.1 mrg n = prev_n1;
1092 1.1 mrg }
1093 1.1 mrg
1094 1.1 mrg kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
1095 1.1 mrg kmax = MIN (kmax, 25-1);
1096 1.1 mrg for (k = last_best_k + 1; k <= kmax; k++)
1097 1.1 mrg {
1098 1.1 mrg if (idx >= FFT_TABLE3_SIZE)
1099 1.1 mrg {
1100 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1101 1.1 mrg abort ();
1102 1.1 mrg }
1103 1.1 mrg INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1104 1.1 mrg
1105 1.1 mrg if (print)
1106 1.1 mrg {
1107 1.1 mrg printf (", ");
1108 1.1 mrg if (idx % 4 == 0)
1109 1.1 mrg printf ("\\\n ");
1110 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1111 1.1 mrg }
1112 1.1 mrg
1113 1.1 mrg idx++;
1114 1.1 mrg }
1115 1.1 mrg
1116 1.1 mrg if (print)
1117 1.1 mrg printf (" }\n");
1118 1.1 mrg
1119 1.1 mrg free (ap);
1120 1.1 mrg if (! p->sqr)
1121 1.1 mrg free (bp);
1122 1.1 mrg free (rp);
1123 1.1 mrg
1124 1.1 mrg return idx;
1125 1.1 mrg }
1126 1.1 mrg
1127 1.1 mrg void
1128 1.1 mrg fft (struct fft_param_t *p)
1129 1.1 mrg {
1130 1.1 mrg mp_size_t size;
1131 1.1 mrg int k, idx, initial_k;
1132 1.1 mrg
1133 1.1 mrg /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1134 1.1 mrg
1135 1.1 mrg #if 1
1136 1.1 mrg {
1137 1.1 mrg /* Use plain one() mechanism, for some reasonable initial values of k. The
1138 1.1 mrg advantage is that we don't depend on mpn_fft_table3, which can therefore
1139 1.1 mrg leave it completely uninitialized. */
1140 1.1 mrg
1141 1.1 mrg static struct param_t param;
1142 1.1 mrg mp_size_t thres, best_thres;
1143 1.1 mrg int best_k;
1144 1.1 mrg char buf[20];
1145 1.1 mrg
1146 1.1 mrg best_thres = MP_SIZE_T_MAX;
1147 1.1 mrg best_k = -1;
1148 1.1 mrg
1149 1.1 mrg for (k = 5; k <= 7; k++)
1150 1.1 mrg {
1151 1.1 mrg param.name = p->modf_threshold_name;
1152 1.1 mrg param.min_size = 100;
1153 1.1 mrg param.max_size = 2000;
1154 1.1 mrg param.function = p->mul_function;
1155 1.1 mrg param.step_factor = 0.0;
1156 1.1 mrg param.step = 4;
1157 1.1 mrg param.function2 = p->mul_modf_function;
1158 1.1 mrg param.noprint = 1;
1159 1.1 mrg s.r = k;
1160 1.1 mrg one (&thres, ¶m);
1161 1.1 mrg if (thres < best_thres)
1162 1.1 mrg {
1163 1.1 mrg best_thres = thres;
1164 1.1 mrg best_k = k;
1165 1.1 mrg }
1166 1.1 mrg }
1167 1.1 mrg
1168 1.1 mrg *(p->p_modf_threshold) = best_thres;
1169 1.1 mrg sprintf (buf, "k = %d", best_k);
1170 1.1 mrg print_define_remark (p->modf_threshold_name, best_thres, buf);
1171 1.1 mrg initial_k = best_k;
1172 1.1 mrg }
1173 1.1 mrg #else
1174 1.1 mrg size = p->first_size;
1175 1.1 mrg for (;;)
1176 1.1 mrg {
1177 1.1 mrg double tk, tm;
1178 1.1 mrg
1179 1.1 mrg size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1180 1.1 mrg k = mpn_fft_best_k (size, p->sqr);
1181 1.1 mrg
1182 1.1 mrg if (size >= p->max_size)
1183 1.1 mrg break;
1184 1.1 mrg
1185 1.1 mrg s.size = size + fft_step_size (k) / 2;
1186 1.1 mrg s.r = k;
1187 1.1 mrg tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1188 1.1 mrg if (tk == -1.0)
1189 1.1 mrg abort ();
1190 1.1 mrg
1191 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s);
1192 1.1 mrg if (tm == -1.0)
1193 1.1 mrg abort ();
1194 1.1 mrg
1195 1.1 mrg if (option_trace >= 2)
1196 1.1 mrg printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
1197 1.1 mrg (long) size,
1198 1.1 mrg (long) size + fft_step_size (k) / 2, k, tk,
1199 1.1 mrg (long) s.size, tm);
1200 1.1 mrg
1201 1.1 mrg if (tk < tm)
1202 1.1 mrg {
1203 1.1 mrg *p->p_modf_threshold = s.size;
1204 1.1 mrg print_define (p->modf_threshold_name, *p->p_modf_threshold);
1205 1.1 mrg break;
1206 1.1 mrg }
1207 1.1 mrg }
1208 1.1 mrg initial_k = ?;
1209 1.1 mrg #endif
1210 1.1 mrg
1211 1.1 mrg /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1212 1.1 mrg
1213 1.1 mrg idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1214 1.1 mrg printf ("#define %s_SIZE %d\n", p->table_name, idx);
1215 1.1 mrg
1216 1.1 mrg /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1217 1.1 mrg
1218 1.1 mrg size = 2 * *p->p_modf_threshold; /* OK? */
1219 1.1 mrg for (;;)
1220 1.1 mrg {
1221 1.1 mrg double tk, tm;
1222 1.1 mrg mp_size_t mulmod_size, mul_size;;
1223 1.1 mrg
1224 1.1 mrg if (size >= p->max_size)
1225 1.1 mrg break;
1226 1.1 mrg
1227 1.1 mrg mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1228 1.1 mrg mul_size = (size + mulmod_size) / 2; /* middle of step */
1229 1.1 mrg
1230 1.1 mrg s.size = mulmod_size;
1231 1.1 mrg tk = tuneup_measure (p->function, NULL, &s);
1232 1.1 mrg if (tk == -1.0)
1233 1.1 mrg abort ();
1234 1.1 mrg
1235 1.1 mrg s.size = mul_size;
1236 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s);
1237 1.1 mrg if (tm == -1.0)
1238 1.1 mrg abort ();
1239 1.1 mrg
1240 1.1 mrg if (option_trace >= 2)
1241 1.1 mrg printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
1242 1.1 mrg (long) size,
1243 1.1 mrg (long) mulmod_size, tk,
1244 1.1 mrg (long) mul_size, tm);
1245 1.1 mrg
1246 1.1 mrg size = mulmod_size;
1247 1.1 mrg
1248 1.1 mrg if (tk < tm)
1249 1.1 mrg {
1250 1.1 mrg *p->p_threshold = s.size;
1251 1.1 mrg print_define (p->threshold_name, *p->p_threshold);
1252 1.1 mrg break;
1253 1.1 mrg }
1254 1.1 mrg }
1255 1.1 mrg }
1256 1.1 mrg
1257 1.1.1.4 mrg /* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This
1258 1.1.1.4 mrg is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
1259 1.1.1.4 mrg This is used in get_str and set_str. */
1260 1.1.1.4 mrg void
1261 1.1.1.4 mrg relspeed_div_1_vs_mul_1 (void)
1262 1.1.1.4 mrg {
1263 1.1.1.4 mrg const size_t max_opsize = 100;
1264 1.1.1.4 mrg mp_size_t n;
1265 1.1.1.4 mrg long j;
1266 1.1.1.4 mrg mp_limb_t rp[max_opsize];
1267 1.1.1.4 mrg mp_limb_t ap[max_opsize];
1268 1.1.1.4 mrg double multime, divtime;
1269 1.1.1.4 mrg
1270 1.1.1.4 mrg mpn_random (ap, max_opsize);
1271 1.1.1.4 mrg
1272 1.1.1.4 mrg multime = 0;
1273 1.1.1.4 mrg for (n = max_opsize; n > 1; n--)
1274 1.1.1.4 mrg {
1275 1.1.1.4 mrg mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1276 1.1.1.4 mrg speed_starttime ();
1277 1.1.1.4 mrg for (j = speed_precision; j != 0 ; j--)
1278 1.1.1.4 mrg mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1279 1.1.1.4 mrg multime += speed_endtime () / n;
1280 1.1.1.4 mrg }
1281 1.1.1.4 mrg
1282 1.1.1.4 mrg divtime = 0;
1283 1.1.1.4 mrg for (n = max_opsize; n > 1; n--)
1284 1.1.1.4 mrg {
1285 1.1.1.4 mrg /* Make input divisible for good measure. */
1286 1.1.1.4 mrg ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
1287 1.1.1.4 mrg
1288 1.1.1.4 mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1289 1.1.1.4 mrg mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1290 1.1.1.4 mrg MP_BASES_BIG_BASE_BINVERTED_10,
1291 1.1.1.4 mrg MP_BASES_BIG_BASE_CTZ_10);
1292 1.1.1.4 mrg #else
1293 1.1.1.4 mrg mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1294 1.1.1.4 mrg #endif
1295 1.1.1.4 mrg speed_starttime ();
1296 1.1.1.4 mrg for (j = speed_precision; j != 0 ; j--)
1297 1.1.1.4 mrg {
1298 1.1.1.4 mrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1299 1.1.1.4 mrg mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1300 1.1.1.4 mrg MP_BASES_BIG_BASE_BINVERTED_10,
1301 1.1.1.4 mrg MP_BASES_BIG_BASE_CTZ_10);
1302 1.1.1.4 mrg #else
1303 1.1.1.4 mrg mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1304 1.1.1.4 mrg #endif
1305 1.1.1.4 mrg }
1306 1.1.1.4 mrg divtime += speed_endtime () / n;
1307 1.1.1.4 mrg }
1308 1.1.1.4 mrg
1309 1.1.1.4 mrg print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
1310 1.1.1.4 mrg }
1311 1.1 mrg
1312 1.1 mrg
1313 1.1 mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1314 1.1 mrg giving wrong results. */
1315 1.1 mrg void
1316 1.1 mrg tune_mul_n (void)
1317 1.1 mrg {
1318 1.1 mrg static struct param_t param;
1319 1.1.1.2 mrg mp_size_t next_toom_start;
1320 1.1.1.2 mrg int something_changed;
1321 1.1 mrg
1322 1.1 mrg param.function = speed_mpn_mul_n;
1323 1.1 mrg
1324 1.1 mrg param.name = "MUL_TOOM22_THRESHOLD";
1325 1.1 mrg param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1326 1.1 mrg param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1327 1.1 mrg one (&mul_toom22_threshold, ¶m);
1328 1.1 mrg
1329 1.1.1.2 mrg param.noprint = 1;
1330 1.1.1.2 mrg
1331 1.1.1.2 mrg /* Threshold sequence loop. Disable functions that would be used in a very
1332 1.1.1.2 mrg narrow range, re-measuring things when that happens. */
1333 1.1.1.2 mrg something_changed = 1;
1334 1.1.1.2 mrg while (something_changed)
1335 1.1.1.2 mrg {
1336 1.1.1.2 mrg something_changed = 0;
1337 1.1.1.2 mrg
1338 1.1.1.2 mrg next_toom_start = mul_toom22_threshold;
1339 1.1.1.2 mrg
1340 1.1.1.2 mrg if (mul_toom33_threshold != 0)
1341 1.1.1.2 mrg {
1342 1.1.1.2 mrg param.name = "MUL_TOOM33_THRESHOLD";
1343 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
1344 1.1.1.2 mrg param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1345 1.1.1.2 mrg one (&mul_toom33_threshold, ¶m);
1346 1.1.1.2 mrg
1347 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom33_threshold)
1348 1.1.1.2 mrg {
1349 1.1.1.2 mrg mul_toom33_threshold = 0;
1350 1.1.1.2 mrg something_changed = 1;
1351 1.1.1.2 mrg }
1352 1.1.1.2 mrg }
1353 1.1.1.2 mrg
1354 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
1355 1.1.1.2 mrg
1356 1.1.1.2 mrg if (mul_toom44_threshold != 0)
1357 1.1.1.2 mrg {
1358 1.1.1.2 mrg param.name = "MUL_TOOM44_THRESHOLD";
1359 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
1360 1.1.1.2 mrg param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1361 1.1.1.2 mrg one (&mul_toom44_threshold, ¶m);
1362 1.1.1.2 mrg
1363 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom44_threshold)
1364 1.1.1.2 mrg {
1365 1.1.1.2 mrg mul_toom44_threshold = 0;
1366 1.1.1.2 mrg something_changed = 1;
1367 1.1.1.2 mrg }
1368 1.1.1.2 mrg }
1369 1.1.1.2 mrg
1370 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
1371 1.1.1.2 mrg
1372 1.1.1.2 mrg if (mul_toom6h_threshold != 0)
1373 1.1.1.2 mrg {
1374 1.1.1.2 mrg param.name = "MUL_TOOM6H_THRESHOLD";
1375 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
1376 1.1.1.2 mrg param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1377 1.1.1.2 mrg one (&mul_toom6h_threshold, ¶m);
1378 1.1.1.2 mrg
1379 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom6h_threshold)
1380 1.1.1.2 mrg {
1381 1.1.1.2 mrg mul_toom6h_threshold = 0;
1382 1.1.1.2 mrg something_changed = 1;
1383 1.1.1.2 mrg }
1384 1.1.1.2 mrg }
1385 1.1.1.2 mrg
1386 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
1387 1.1.1.2 mrg
1388 1.1.1.2 mrg if (mul_toom8h_threshold != 0)
1389 1.1.1.2 mrg {
1390 1.1.1.2 mrg param.name = "MUL_TOOM8H_THRESHOLD";
1391 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
1392 1.1.1.2 mrg param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1393 1.1.1.2 mrg one (&mul_toom8h_threshold, ¶m);
1394 1.1.1.2 mrg
1395 1.1.1.2 mrg if (next_toom_start * 1.05 >= mul_toom8h_threshold)
1396 1.1.1.2 mrg {
1397 1.1.1.2 mrg mul_toom8h_threshold = 0;
1398 1.1.1.2 mrg something_changed = 1;
1399 1.1.1.2 mrg }
1400 1.1.1.2 mrg }
1401 1.1.1.2 mrg }
1402 1.1.1.2 mrg
1403 1.1.1.2 mrg print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
1404 1.1.1.2 mrg print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
1405 1.1.1.2 mrg print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
1406 1.1.1.2 mrg print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
1407 1.1 mrg
1408 1.1 mrg /* disabled until tuned */
1409 1.1 mrg MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1410 1.1 mrg }
1411 1.1 mrg
1412 1.1 mrg void
1413 1.1 mrg tune_mul (void)
1414 1.1 mrg {
1415 1.1 mrg static struct param_t param;
1416 1.1 mrg mp_size_t thres;
1417 1.1 mrg
1418 1.1 mrg param.noprint = 1;
1419 1.1 mrg
1420 1.1 mrg param.function = speed_mpn_toom32_for_toom43_mul;
1421 1.1 mrg param.function2 = speed_mpn_toom43_for_toom32_mul;
1422 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1423 1.1.1.2 mrg param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
1424 1.1 mrg one (&thres, ¶m);
1425 1.1.1.2 mrg mul_toom32_to_toom43_threshold = thres * 17 / 24;
1426 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1427 1.1 mrg
1428 1.1 mrg param.function = speed_mpn_toom32_for_toom53_mul;
1429 1.1 mrg param.function2 = speed_mpn_toom53_for_toom32_mul;
1430 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1431 1.1.1.2 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
1432 1.1 mrg one (&thres, ¶m);
1433 1.1.1.2 mrg mul_toom32_to_toom53_threshold = thres * 19 / 30;
1434 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1435 1.1 mrg
1436 1.1 mrg param.function = speed_mpn_toom42_for_toom53_mul;
1437 1.1 mrg param.function2 = speed_mpn_toom53_for_toom42_mul;
1438 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1439 1.1.1.2 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
1440 1.1 mrg one (&thres, ¶m);
1441 1.1.1.2 mrg mul_toom42_to_toom53_threshold = thres * 11 / 20;
1442 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1443 1.1 mrg
1444 1.1 mrg param.function = speed_mpn_toom42_mul;
1445 1.1 mrg param.function2 = speed_mpn_toom63_mul;
1446 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1447 1.1.1.2 mrg param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
1448 1.1 mrg one (&thres, ¶m);
1449 1.1.1.2 mrg mul_toom42_to_toom63_threshold = thres / 2;
1450 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1451 1.1.1.2 mrg
1452 1.1.1.2 mrg /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
1453 1.1.1.2 mrg param.function = speed_mpn_toom43_for_toom54_mul;
1454 1.1.1.2 mrg param.function2 = speed_mpn_toom54_for_toom43_mul;
1455 1.1.1.2 mrg param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
1456 1.1.1.2 mrg param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
1457 1.1.1.2 mrg one (&thres, ¶m);
1458 1.1.1.2 mrg mul_toom43_to_toom54_threshold = thres * 5 / 6;
1459 1.1.1.2 mrg print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
1460 1.1 mrg }
1461 1.1 mrg
1462 1.1 mrg
1463 1.1 mrg void
1464 1.1 mrg tune_mullo (void)
1465 1.1 mrg {
1466 1.1 mrg static struct param_t param;
1467 1.1 mrg
1468 1.1 mrg param.function = speed_mpn_mullo_n;
1469 1.1 mrg
1470 1.1 mrg param.name = "MULLO_BASECASE_THRESHOLD";
1471 1.1.1.4 mrg param.min_size = 2;
1472 1.1 mrg param.min_is_always = 1;
1473 1.1 mrg param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1474 1.1 mrg param.stop_factor = 1.5;
1475 1.1 mrg param.noprint = 1;
1476 1.1 mrg one (&mullo_basecase_threshold, ¶m);
1477 1.1 mrg
1478 1.1 mrg param.name = "MULLO_DC_THRESHOLD";
1479 1.1 mrg param.min_size = 8;
1480 1.1 mrg param.min_is_always = 0;
1481 1.1 mrg param.max_size = 1000;
1482 1.1 mrg one (&mullo_dc_threshold, ¶m);
1483 1.1 mrg
1484 1.1 mrg if (mullo_basecase_threshold >= mullo_dc_threshold)
1485 1.1 mrg {
1486 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1487 1.1 mrg print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1488 1.1 mrg }
1489 1.1 mrg else
1490 1.1 mrg {
1491 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1492 1.1 mrg print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1493 1.1 mrg }
1494 1.1 mrg
1495 1.1.1.3 mrg if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
1496 1.1.1.3 mrg {
1497 1.1.1.3 mrg param.name = "MULLO_MUL_N_THRESHOLD";
1498 1.1.1.3 mrg param.min_size = mullo_dc_threshold;
1499 1.1.1.3 mrg param.max_size = 2 * mul_fft_threshold;
1500 1.1.1.3 mrg param.noprint = 0;
1501 1.1.1.3 mrg param.step_factor = 0.03;
1502 1.1.1.3 mrg one (&mullo_mul_n_threshold, ¶m);
1503 1.1.1.3 mrg }
1504 1.1.1.3 mrg else
1505 1.1.1.3 mrg print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1506 1.1.1.3 mrg "without FFT use mullo forever");
1507 1.1.1.3 mrg }
1508 1.1.1.3 mrg
1509 1.1.1.3 mrg void
1510 1.1.1.3 mrg tune_sqrlo (void)
1511 1.1.1.3 mrg {
1512 1.1.1.3 mrg static struct param_t param;
1513 1.1.1.3 mrg
1514 1.1.1.3 mrg param.function = speed_mpn_sqrlo;
1515 1.1.1.3 mrg
1516 1.1.1.3 mrg param.name = "SQRLO_BASECASE_THRESHOLD";
1517 1.1.1.4 mrg param.min_size = 2;
1518 1.1.1.3 mrg param.min_is_always = 1;
1519 1.1.1.3 mrg param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
1520 1.1.1.3 mrg param.stop_factor = 1.5;
1521 1.1.1.3 mrg param.noprint = 1;
1522 1.1.1.3 mrg one (&sqrlo_basecase_threshold, ¶m);
1523 1.1.1.3 mrg
1524 1.1.1.3 mrg param.name = "SQRLO_DC_THRESHOLD";
1525 1.1.1.3 mrg param.min_size = 8;
1526 1.1.1.3 mrg param.min_is_always = 0;
1527 1.1.1.3 mrg param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
1528 1.1.1.3 mrg one (&sqrlo_dc_threshold, ¶m);
1529 1.1.1.3 mrg
1530 1.1.1.3 mrg if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
1531 1.1.1.3 mrg {
1532 1.1.1.3 mrg print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
1533 1.1.1.3 mrg print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
1534 1.1.1.3 mrg }
1535 1.1.1.3 mrg else
1536 1.1.1.3 mrg {
1537 1.1.1.3 mrg print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
1538 1.1.1.3 mrg print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
1539 1.1.1.3 mrg }
1540 1.1.1.3 mrg
1541 1.1.1.3 mrg if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
1542 1.1.1.3 mrg {
1543 1.1.1.3 mrg param.name = "SQRLO_SQR_THRESHOLD";
1544 1.1.1.3 mrg param.min_size = sqrlo_dc_threshold;
1545 1.1.1.3 mrg param.max_size = 2 * sqr_fft_threshold;
1546 1.1.1.3 mrg param.noprint = 0;
1547 1.1.1.3 mrg param.step_factor = 0.03;
1548 1.1.1.3 mrg one (&sqrlo_sqr_threshold, ¶m);
1549 1.1.1.3 mrg }
1550 1.1.1.3 mrg else
1551 1.1.1.3 mrg print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
1552 1.1.1.3 mrg "without FFT use sqrlo forever");
1553 1.1 mrg }
1554 1.1 mrg
1555 1.1 mrg void
1556 1.1.1.2 mrg tune_mulmid (void)
1557 1.1.1.2 mrg {
1558 1.1.1.2 mrg static struct param_t param;
1559 1.1.1.2 mrg
1560 1.1.1.2 mrg param.name = "MULMID_TOOM42_THRESHOLD";
1561 1.1.1.2 mrg param.function = speed_mpn_mulmid_n;
1562 1.1.1.2 mrg param.min_size = 4;
1563 1.1.1.2 mrg param.max_size = 100;
1564 1.1.1.2 mrg one (&mulmid_toom42_threshold, ¶m);
1565 1.1.1.2 mrg }
1566 1.1.1.2 mrg
1567 1.1.1.2 mrg void
1568 1.1 mrg tune_mulmod_bnm1 (void)
1569 1.1 mrg {
1570 1.1 mrg static struct param_t param;
1571 1.1 mrg
1572 1.1 mrg param.name = "MULMOD_BNM1_THRESHOLD";
1573 1.1 mrg param.function = speed_mpn_mulmod_bnm1;
1574 1.1 mrg param.min_size = 4;
1575 1.1 mrg param.max_size = 100;
1576 1.1 mrg one (&mulmod_bnm1_threshold, ¶m);
1577 1.1 mrg }
1578 1.1 mrg
1579 1.1 mrg void
1580 1.1 mrg tune_sqrmod_bnm1 (void)
1581 1.1 mrg {
1582 1.1 mrg static struct param_t param;
1583 1.1 mrg
1584 1.1 mrg param.name = "SQRMOD_BNM1_THRESHOLD";
1585 1.1 mrg param.function = speed_mpn_sqrmod_bnm1;
1586 1.1 mrg param.min_size = 4;
1587 1.1 mrg param.max_size = 100;
1588 1.1 mrg one (&sqrmod_bnm1_threshold, ¶m);
1589 1.1 mrg }
1590 1.1 mrg
1591 1.1 mrg
1592 1.1 mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1593 1.1 mrg is faster only at size==2 then we don't want to bother with extra code
1594 1.1 mrg just for that. Start karatsuba from 4 same as MUL above. */
1595 1.1 mrg
1596 1.1 mrg void
1597 1.1 mrg tune_sqr (void)
1598 1.1 mrg {
1599 1.1 mrg /* disabled until tuned */
1600 1.1 mrg SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1601 1.1 mrg
1602 1.1 mrg if (HAVE_NATIVE_mpn_sqr_basecase)
1603 1.1 mrg {
1604 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1605 1.1 mrg sqr_basecase_threshold = 0;
1606 1.1 mrg }
1607 1.1 mrg else
1608 1.1 mrg {
1609 1.1 mrg static struct param_t param;
1610 1.1 mrg param.name = "SQR_BASECASE_THRESHOLD";
1611 1.1 mrg param.function = speed_mpn_sqr;
1612 1.1 mrg param.min_size = 3;
1613 1.1 mrg param.min_is_always = 1;
1614 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX;
1615 1.1 mrg param.noprint = 1;
1616 1.1 mrg one (&sqr_basecase_threshold, ¶m);
1617 1.1 mrg }
1618 1.1 mrg
1619 1.1 mrg {
1620 1.1 mrg static struct param_t param;
1621 1.1 mrg param.name = "SQR_TOOM2_THRESHOLD";
1622 1.1 mrg param.function = speed_mpn_sqr;
1623 1.1 mrg param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1624 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX;
1625 1.1 mrg param.noprint = 1;
1626 1.1 mrg one (&sqr_toom2_threshold, ¶m);
1627 1.1 mrg
1628 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase
1629 1.1 mrg && sqr_toom2_threshold < sqr_basecase_threshold)
1630 1.1 mrg {
1631 1.1 mrg /* Karatsuba becomes faster than mul_basecase before
1632 1.1 mrg sqr_basecase does. Arrange for the expression
1633 1.1 mrg "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1634 1.1 mrg selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1635 1.1 mrg SQR_TOOM2_THRESHOLD to zero, making
1636 1.1 mrg SQR_BASECASE_THRESHOLD the toom2 threshold. */
1637 1.1 mrg
1638 1.1 mrg sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1639 1.1 mrg SQR_TOOM2_THRESHOLD = 0;
1640 1.1 mrg
1641 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1642 1.1 mrg "toom2");
1643 1.1 mrg print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1644 1.1 mrg "never sqr_basecase");
1645 1.1 mrg }
1646 1.1 mrg else
1647 1.1 mrg {
1648 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase)
1649 1.1 mrg print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1650 1.1 mrg print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1651 1.1 mrg }
1652 1.1 mrg }
1653 1.1 mrg
1654 1.1 mrg {
1655 1.1 mrg static struct param_t param;
1656 1.1.1.2 mrg mp_size_t next_toom_start;
1657 1.1.1.2 mrg int something_changed;
1658 1.1 mrg
1659 1.1 mrg param.function = speed_mpn_sqr;
1660 1.1.1.2 mrg param.noprint = 1;
1661 1.1 mrg
1662 1.1.1.2 mrg /* Threshold sequence loop. Disable functions that would be used in a very
1663 1.1.1.2 mrg narrow range, re-measuring things when that happens. */
1664 1.1.1.2 mrg something_changed = 1;
1665 1.1.1.2 mrg while (something_changed)
1666 1.1.1.2 mrg {
1667 1.1.1.2 mrg something_changed = 0;
1668 1.1.1.2 mrg
1669 1.1.1.2 mrg next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1670 1.1.1.2 mrg
1671 1.1.1.2 mrg sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
1672 1.1.1.2 mrg param.name = "SQR_TOOM3_THRESHOLD";
1673 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
1674 1.1.1.2 mrg param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1675 1.1.1.2 mrg one (&sqr_toom3_threshold, ¶m);
1676 1.1.1.2 mrg
1677 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
1678 1.1.1.2 mrg
1679 1.1.1.2 mrg if (sqr_toom4_threshold != 0)
1680 1.1.1.2 mrg {
1681 1.1.1.2 mrg param.name = "SQR_TOOM4_THRESHOLD";
1682 1.1.1.2 mrg sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
1683 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
1684 1.1.1.2 mrg param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1685 1.1.1.2 mrg one (&sqr_toom4_threshold, ¶m);
1686 1.1.1.2 mrg
1687 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom4_threshold)
1688 1.1.1.2 mrg {
1689 1.1.1.2 mrg sqr_toom4_threshold = 0;
1690 1.1.1.2 mrg something_changed = 1;
1691 1.1.1.2 mrg }
1692 1.1.1.2 mrg }
1693 1.1.1.2 mrg
1694 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
1695 1.1.1.2 mrg
1696 1.1.1.2 mrg if (sqr_toom6_threshold != 0)
1697 1.1.1.2 mrg {
1698 1.1.1.2 mrg param.name = "SQR_TOOM6_THRESHOLD";
1699 1.1.1.2 mrg sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
1700 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
1701 1.1.1.2 mrg param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1702 1.1.1.2 mrg one (&sqr_toom6_threshold, ¶m);
1703 1.1.1.2 mrg
1704 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom6_threshold)
1705 1.1.1.2 mrg {
1706 1.1.1.2 mrg sqr_toom6_threshold = 0;
1707 1.1.1.2 mrg something_changed = 1;
1708 1.1.1.2 mrg }
1709 1.1.1.2 mrg }
1710 1.1.1.2 mrg
1711 1.1.1.2 mrg next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
1712 1.1.1.2 mrg
1713 1.1.1.2 mrg if (sqr_toom8_threshold != 0)
1714 1.1.1.2 mrg {
1715 1.1.1.2 mrg param.name = "SQR_TOOM8_THRESHOLD";
1716 1.1.1.2 mrg sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
1717 1.1.1.2 mrg param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
1718 1.1.1.2 mrg param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1719 1.1.1.2 mrg one (&sqr_toom8_threshold, ¶m);
1720 1.1.1.2 mrg
1721 1.1.1.2 mrg if (next_toom_start * 1.05 >= sqr_toom8_threshold)
1722 1.1.1.2 mrg {
1723 1.1.1.2 mrg sqr_toom8_threshold = 0;
1724 1.1.1.2 mrg something_changed = 1;
1725 1.1.1.2 mrg }
1726 1.1.1.2 mrg }
1727 1.1.1.2 mrg }
1728 1.1.1.2 mrg
1729 1.1.1.2 mrg print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
1730 1.1.1.2 mrg print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
1731 1.1.1.2 mrg print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
1732 1.1.1.2 mrg print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
1733 1.1 mrg }
1734 1.1 mrg }
1735 1.1 mrg
1736 1.1 mrg
1737 1.1 mrg void
1738 1.1 mrg tune_dc_div (void)
1739 1.1 mrg {
1740 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */
1741 1.1 mrg {
1742 1.1 mrg static struct param_t param;
1743 1.1 mrg param.name = "DC_DIV_QR_THRESHOLD";
1744 1.1 mrg param.function = speed_mpn_sbpi1_div_qr;
1745 1.1 mrg param.function2 = speed_mpn_dcpi1_div_qr;
1746 1.1 mrg param.min_size = 6;
1747 1.1 mrg one (&dc_div_qr_threshold, ¶m);
1748 1.1 mrg }
1749 1.1 mrg {
1750 1.1 mrg static struct param_t param;
1751 1.1 mrg param.name = "DC_DIVAPPR_Q_THRESHOLD";
1752 1.1 mrg param.function = speed_mpn_sbpi1_divappr_q;
1753 1.1 mrg param.function2 = speed_mpn_dcpi1_divappr_q;
1754 1.1 mrg param.min_size = 6;
1755 1.1 mrg one (&dc_divappr_q_threshold, ¶m);
1756 1.1 mrg }
1757 1.1 mrg }
1758 1.1 mrg
1759 1.1 mrg static double
1760 1.1 mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1761 1.1 mrg {
1762 1.1 mrg if (s->size < DC_DIV_QR_THRESHOLD)
1763 1.1 mrg return speed_mpn_sbpi1_div_qr (s);
1764 1.1 mrg else
1765 1.1 mrg return speed_mpn_dcpi1_div_qr (s);
1766 1.1 mrg }
1767 1.1 mrg
1768 1.1 mrg void
1769 1.1 mrg tune_mu_div (void)
1770 1.1 mrg {
1771 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */
1772 1.1 mrg {
1773 1.1 mrg static struct param_t param;
1774 1.1 mrg param.name = "MU_DIV_QR_THRESHOLD";
1775 1.1 mrg param.function = speed_mpn_dcpi1_div_qr;
1776 1.1 mrg param.function2 = speed_mpn_mu_div_qr;
1777 1.1.1.2 mrg param.min_size = mul_toom22_threshold;
1778 1.1 mrg param.max_size = 5000;
1779 1.1 mrg param.step_factor = 0.02;
1780 1.1 mrg one (&mu_div_qr_threshold, ¶m);
1781 1.1 mrg }
1782 1.1 mrg {
1783 1.1 mrg static struct param_t param;
1784 1.1 mrg param.name = "MU_DIVAPPR_Q_THRESHOLD";
1785 1.1 mrg param.function = speed_mpn_dcpi1_divappr_q;
1786 1.1 mrg param.function2 = speed_mpn_mu_divappr_q;
1787 1.1.1.2 mrg param.min_size = mul_toom22_threshold;
1788 1.1 mrg param.max_size = 5000;
1789 1.1 mrg param.step_factor = 0.02;
1790 1.1 mrg one (&mu_divappr_q_threshold, ¶m);
1791 1.1 mrg }
1792 1.1 mrg {
1793 1.1 mrg static struct param_t param;
1794 1.1 mrg param.name = "MUPI_DIV_QR_THRESHOLD";
1795 1.1 mrg param.function = speed_mpn_sbordcpi1_div_qr;
1796 1.1 mrg param.function2 = speed_mpn_mupi_div_qr;
1797 1.1 mrg param.min_size = 6;
1798 1.1 mrg param.min_is_always = 1;
1799 1.1 mrg param.max_size = 1000;
1800 1.1 mrg param.step_factor = 0.02;
1801 1.1 mrg one (&mupi_div_qr_threshold, ¶m);
1802 1.1 mrg }
1803 1.1 mrg }
1804 1.1 mrg
1805 1.1 mrg void
1806 1.1 mrg tune_dc_bdiv (void)
1807 1.1 mrg {
1808 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/
1809 1.1 mrg {
1810 1.1 mrg static struct param_t param;
1811 1.1 mrg param.name = "DC_BDIV_QR_THRESHOLD";
1812 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_qr;
1813 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_qr;
1814 1.1 mrg param.min_size = 4;
1815 1.1 mrg one (&dc_bdiv_qr_threshold, ¶m);
1816 1.1 mrg }
1817 1.1 mrg {
1818 1.1 mrg static struct param_t param;
1819 1.1 mrg param.name = "DC_BDIV_Q_THRESHOLD";
1820 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_q;
1821 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_q;
1822 1.1 mrg param.min_size = 4;
1823 1.1 mrg one (&dc_bdiv_q_threshold, ¶m);
1824 1.1 mrg }
1825 1.1 mrg }
1826 1.1 mrg
1827 1.1 mrg void
1828 1.1 mrg tune_mu_bdiv (void)
1829 1.1 mrg {
1830 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/
1831 1.1 mrg {
1832 1.1 mrg static struct param_t param;
1833 1.1 mrg param.name = "MU_BDIV_QR_THRESHOLD";
1834 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_qr;
1835 1.1 mrg param.function2 = speed_mpn_mu_bdiv_qr;
1836 1.1.1.3 mrg param.min_size = dc_bdiv_qr_threshold;
1837 1.1 mrg param.max_size = 5000;
1838 1.1 mrg param.step_factor = 0.02;
1839 1.1 mrg one (&mu_bdiv_qr_threshold, ¶m);
1840 1.1 mrg }
1841 1.1 mrg {
1842 1.1 mrg static struct param_t param;
1843 1.1 mrg param.name = "MU_BDIV_Q_THRESHOLD";
1844 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_q;
1845 1.1 mrg param.function2 = speed_mpn_mu_bdiv_q;
1846 1.1.1.3 mrg param.min_size = dc_bdiv_q_threshold;
1847 1.1 mrg param.max_size = 5000;
1848 1.1 mrg param.step_factor = 0.02;
1849 1.1 mrg one (&mu_bdiv_q_threshold, ¶m);
1850 1.1 mrg }
1851 1.1 mrg }
1852 1.1 mrg
1853 1.1 mrg void
1854 1.1 mrg tune_invertappr (void)
1855 1.1 mrg {
1856 1.1 mrg static struct param_t param;
1857 1.1 mrg
1858 1.1 mrg param.function = speed_mpn_ni_invertappr;
1859 1.1 mrg param.name = "INV_MULMOD_BNM1_THRESHOLD";
1860 1.1.1.3 mrg param.min_size = 5;
1861 1.1 mrg one (&inv_mulmod_bnm1_threshold, ¶m);
1862 1.1 mrg
1863 1.1 mrg param.function = speed_mpn_invertappr;
1864 1.1 mrg param.name = "INV_NEWTON_THRESHOLD";
1865 1.1.1.3 mrg param.min_size = 5;
1866 1.1 mrg one (&inv_newton_threshold, ¶m);
1867 1.1 mrg }
1868 1.1 mrg
1869 1.1 mrg void
1870 1.1 mrg tune_invert (void)
1871 1.1 mrg {
1872 1.1 mrg static struct param_t param;
1873 1.1 mrg
1874 1.1 mrg param.function = speed_mpn_invert;
1875 1.1 mrg param.name = "INV_APPR_THRESHOLD";
1876 1.1.1.3 mrg param.min_size = 5;
1877 1.1 mrg one (&inv_appr_threshold, ¶m);
1878 1.1 mrg }
1879 1.1 mrg
1880 1.1 mrg void
1881 1.1 mrg tune_binvert (void)
1882 1.1 mrg {
1883 1.1 mrg static struct param_t param;
1884 1.1 mrg
1885 1.1 mrg param.function = speed_mpn_binvert;
1886 1.1 mrg param.name = "BINV_NEWTON_THRESHOLD";
1887 1.1 mrg param.min_size = 8; /* pointless with smaller operands */
1888 1.1 mrg one (&binv_newton_threshold, ¶m);
1889 1.1 mrg }
1890 1.1 mrg
1891 1.1 mrg void
1892 1.1 mrg tune_redc (void)
1893 1.1 mrg {
1894 1.1 mrg #define TUNE_REDC_2_MAX 100
1895 1.1 mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1896 1.1 mrg #define WANT_REDC_2 1
1897 1.1 mrg #endif
1898 1.1 mrg
1899 1.1 mrg #if WANT_REDC_2
1900 1.1 mrg {
1901 1.1 mrg static struct param_t param;
1902 1.1 mrg param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1903 1.1 mrg param.function = speed_mpn_redc_1;
1904 1.1 mrg param.function2 = speed_mpn_redc_2;
1905 1.1 mrg param.min_size = 1;
1906 1.1 mrg param.min_is_always = 1;
1907 1.1 mrg param.max_size = TUNE_REDC_2_MAX;
1908 1.1 mrg param.noprint = 1;
1909 1.1.1.2 mrg param.stop_factor = 1.5;
1910 1.1 mrg one (&redc_1_to_redc_2_threshold, ¶m);
1911 1.1 mrg }
1912 1.1 mrg {
1913 1.1 mrg static struct param_t param;
1914 1.1 mrg param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1915 1.1 mrg param.function = speed_mpn_redc_2;
1916 1.1 mrg param.function2 = speed_mpn_redc_n;
1917 1.1 mrg param.min_size = 16;
1918 1.1 mrg param.noprint = 1;
1919 1.1 mrg one (&redc_2_to_redc_n_threshold, ¶m);
1920 1.1 mrg }
1921 1.1.1.2 mrg if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1922 1.1 mrg {
1923 1.1.1.2 mrg redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
1924 1.1.1.2 mrg
1925 1.1.1.2 mrg /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1926 1.1.1.2 mrg REDC_1_TO_REDC_2_THRESHOLD. */
1927 1.1.1.2 mrg {
1928 1.1.1.2 mrg static struct param_t param;
1929 1.1.1.2 mrg param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1930 1.1.1.2 mrg param.function = speed_mpn_redc_1;
1931 1.1.1.2 mrg param.function2 = speed_mpn_redc_n;
1932 1.1.1.2 mrg param.min_size = 16;
1933 1.1.1.2 mrg param.noprint = 1;
1934 1.1.1.2 mrg one (&redc_1_to_redc_2_threshold, ¶m);
1935 1.1.1.2 mrg }
1936 1.1 mrg }
1937 1.1.1.2 mrg print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1938 1.1.1.2 mrg print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1939 1.1 mrg #else
1940 1.1 mrg {
1941 1.1 mrg static struct param_t param;
1942 1.1 mrg param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1943 1.1 mrg param.function = speed_mpn_redc_1;
1944 1.1 mrg param.function2 = speed_mpn_redc_n;
1945 1.1 mrg param.min_size = 16;
1946 1.1 mrg one (&redc_1_to_redc_n_threshold, ¶m);
1947 1.1 mrg }
1948 1.1 mrg #endif
1949 1.1 mrg }
1950 1.1 mrg
1951 1.1 mrg void
1952 1.1 mrg tune_matrix22_mul (void)
1953 1.1 mrg {
1954 1.1 mrg static struct param_t param;
1955 1.1 mrg param.name = "MATRIX22_STRASSEN_THRESHOLD";
1956 1.1 mrg param.function = speed_mpn_matrix22_mul;
1957 1.1 mrg param.min_size = 2;
1958 1.1 mrg one (&matrix22_strassen_threshold, ¶m);
1959 1.1 mrg }
1960 1.1 mrg
1961 1.1 mrg void
1962 1.1.1.4 mrg tune_hgcd2 (void)
1963 1.1.1.4 mrg {
1964 1.1.1.4 mrg static struct param_t param;
1965 1.1.1.4 mrg hgcd2_func_t *f[5] =
1966 1.1.1.4 mrg { mpn_hgcd2_1,
1967 1.1.1.4 mrg mpn_hgcd2_2,
1968 1.1.1.4 mrg mpn_hgcd2_3,
1969 1.1.1.4 mrg mpn_hgcd2_4,
1970 1.1.1.4 mrg mpn_hgcd2_5 };
1971 1.1.1.4 mrg speed_function_t speed_f[5] =
1972 1.1.1.4 mrg { speed_mpn_hgcd2_1,
1973 1.1.1.4 mrg speed_mpn_hgcd2_2,
1974 1.1.1.4 mrg speed_mpn_hgcd2_3,
1975 1.1.1.4 mrg speed_mpn_hgcd2_4,
1976 1.1.1.4 mrg speed_mpn_hgcd2_5 };
1977 1.1.1.4 mrg int best;
1978 1.1.1.4 mrg
1979 1.1.1.4 mrg s.size = 1;
1980 1.1.1.4 mrg best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", ¶m);
1981 1.1.1.4 mrg
1982 1.1.1.4 mrg /* Use selected function when tuning hgcd and gcd */
1983 1.1.1.4 mrg hgcd2_func = f[best];
1984 1.1.1.4 mrg }
1985 1.1.1.4 mrg
1986 1.1.1.4 mrg void
1987 1.1 mrg tune_hgcd (void)
1988 1.1 mrg {
1989 1.1 mrg static struct param_t param;
1990 1.1 mrg param.name = "HGCD_THRESHOLD";
1991 1.1 mrg param.function = speed_mpn_hgcd;
1992 1.1 mrg /* We seem to get strange results for small sizes */
1993 1.1 mrg param.min_size = 30;
1994 1.1 mrg one (&hgcd_threshold, ¶m);
1995 1.1 mrg }
1996 1.1 mrg
1997 1.1 mrg void
1998 1.1.1.2 mrg tune_hgcd_appr (void)
1999 1.1.1.2 mrg {
2000 1.1.1.2 mrg static struct param_t param;
2001 1.1.1.2 mrg param.name = "HGCD_APPR_THRESHOLD";
2002 1.1.1.2 mrg param.function = speed_mpn_hgcd_appr;
2003 1.1.1.2 mrg /* We seem to get strange results for small sizes */
2004 1.1.1.2 mrg param.min_size = 50;
2005 1.1.1.2 mrg param.stop_since_change = 150;
2006 1.1.1.2 mrg one (&hgcd_appr_threshold, ¶m);
2007 1.1.1.2 mrg }
2008 1.1.1.2 mrg
2009 1.1.1.2 mrg void
2010 1.1.1.2 mrg tune_hgcd_reduce (void)
2011 1.1.1.2 mrg {
2012 1.1.1.2 mrg static struct param_t param;
2013 1.1.1.2 mrg param.name = "HGCD_REDUCE_THRESHOLD";
2014 1.1.1.2 mrg param.function = speed_mpn_hgcd_reduce;
2015 1.1.1.2 mrg param.min_size = 30;
2016 1.1.1.2 mrg param.max_size = 7000;
2017 1.1.1.2 mrg param.step_factor = 0.04;
2018 1.1.1.2 mrg one (&hgcd_reduce_threshold, ¶m);
2019 1.1.1.2 mrg }
2020 1.1.1.2 mrg
2021 1.1.1.2 mrg void
2022 1.1 mrg tune_gcd_dc (void)
2023 1.1 mrg {
2024 1.1 mrg static struct param_t param;
2025 1.1 mrg param.name = "GCD_DC_THRESHOLD";
2026 1.1 mrg param.function = speed_mpn_gcd;
2027 1.1 mrg param.min_size = hgcd_threshold;
2028 1.1 mrg param.max_size = 3000;
2029 1.1 mrg param.step_factor = 0.02;
2030 1.1 mrg one (&gcd_dc_threshold, ¶m);
2031 1.1 mrg }
2032 1.1 mrg
2033 1.1 mrg void
2034 1.1 mrg tune_gcdext_dc (void)
2035 1.1 mrg {
2036 1.1 mrg static struct param_t param;
2037 1.1 mrg param.name = "GCDEXT_DC_THRESHOLD";
2038 1.1 mrg param.function = speed_mpn_gcdext;
2039 1.1 mrg param.min_size = hgcd_threshold;
2040 1.1 mrg param.max_size = 3000;
2041 1.1 mrg param.step_factor = 0.02;
2042 1.1 mrg one (&gcdext_dc_threshold, ¶m);
2043 1.1 mrg }
2044 1.1 mrg
2045 1.1.1.2 mrg /* In tune_powm_sec we compute the table used by the win_size function. The
2046 1.1.1.2 mrg cutoff points are in exponent bits, disregarding other operand sizes. It is
2047 1.1.1.3 mrg not possible to use the one framework since it currently uses a granularity
2048 1.1.1.2 mrg of full limbs.
2049 1.1.1.2 mrg */
2050 1.1.1.2 mrg
2051 1.1.1.2 mrg /* This win_size replaces the variant in the powm code, allowing us to
2052 1.1.1.2 mrg control k in the k-ary algorithms. */
2053 1.1.1.2 mrg int winsize;
2054 1.1.1.2 mrg int
2055 1.1.1.2 mrg win_size (mp_bitcnt_t eb)
2056 1.1.1.2 mrg {
2057 1.1.1.2 mrg return winsize;
2058 1.1.1.2 mrg }
2059 1.1.1.2 mrg
2060 1.1.1.2 mrg void
2061 1.1.1.2 mrg tune_powm_sec (void)
2062 1.1.1.2 mrg {
2063 1.1.1.2 mrg mp_size_t n;
2064 1.1.1.2 mrg int k, i;
2065 1.1.1.2 mrg mp_size_t itch;
2066 1.1.1.2 mrg mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
2067 1.1.1.2 mrg const int n_max = 3000 / GMP_NUMB_BITS;
2068 1.1.1.2 mrg const int n_measurements = 5;
2069 1.1.1.2 mrg mp_ptr rp, bp, ep, mp, tp;
2070 1.1.1.2 mrg double ttab[n_measurements], tk, tkp1;
2071 1.1.1.2 mrg TMP_DECL;
2072 1.1.1.2 mrg TMP_MARK;
2073 1.1.1.2 mrg
2074 1.1.1.2 mrg possible_nbits_cutoff = 0;
2075 1.1.1.2 mrg
2076 1.1.1.2 mrg k = 1;
2077 1.1.1.2 mrg
2078 1.1.1.2 mrg winsize = 10; /* the itch function needs this */
2079 1.1.1.3 mrg itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
2080 1.1.1.2 mrg
2081 1.1.1.2 mrg rp = TMP_ALLOC_LIMBS (n_max);
2082 1.1.1.2 mrg bp = TMP_ALLOC_LIMBS (n_max);
2083 1.1.1.2 mrg ep = TMP_ALLOC_LIMBS (n_max);
2084 1.1.1.2 mrg mp = TMP_ALLOC_LIMBS (n_max);
2085 1.1.1.2 mrg tp = TMP_ALLOC_LIMBS (itch);
2086 1.1.1.2 mrg
2087 1.1.1.2 mrg mpn_random (bp, n_max);
2088 1.1.1.2 mrg mpn_random (mp, n_max);
2089 1.1.1.2 mrg mp[0] |= 1;
2090 1.1.1.2 mrg
2091 1.1.1.2 mrg /* How about taking the M operand size into account?
2092 1.1.1.2 mrg
2093 1.1.1.2 mrg An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
2094 1.1.1.2 mrg B = O(M)).
2095 1.1.1.2 mrg
2096 1.1.1.2 mrg Using k-ary and no sliding window, the precomputation will need time
2097 1.1.1.2 mrg O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
2098 1.1.1.2 mrg O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
2099 1.1.1.2 mrg
2100 1.1.1.2 mrg An operation R=powm_sec(B,E,N) will take time like powm.
2101 1.1.1.2 mrg
2102 1.1.1.2 mrg Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
2103 1.1.1.2 mrg main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
2104 1.1.1.2 mrg O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
2105 1.1.1.2 mrg table reads, respectively. */
2106 1.1.1.2 mrg
2107 1.1.1.2 mrg printf ("#define POWM_SEC_TABLE ");
2108 1.1.1.2 mrg
2109 1.1.1.3 mrg /* For nbits == 1, we should always use k == 1, so no need to tune
2110 1.1.1.3 mrg that. Starting with nbits == 2 also ensure that nbits always is
2111 1.1.1.3 mrg larger than the windowsize k+1. */
2112 1.1.1.3 mrg for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
2113 1.1.1.2 mrg {
2114 1.1.1.2 mrg n = (nbits - 1) / GMP_NUMB_BITS + 1;
2115 1.1.1.2 mrg
2116 1.1.1.2 mrg /* Generate E such that sliding-window for k and k+1 works equally
2117 1.1.1.2 mrg well/poorly (but sliding is not used in powm_sec, of course). */
2118 1.1.1.2 mrg for (i = 0; i < n; i++)
2119 1.1.1.2 mrg ep[i] = ~CNST_LIMB(0);
2120 1.1.1.2 mrg
2121 1.1.1.2 mrg winsize = k;
2122 1.1.1.2 mrg for (i = 0; i < n_measurements; i++)
2123 1.1.1.2 mrg {
2124 1.1.1.2 mrg speed_starttime ();
2125 1.1.1.3 mrg mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2126 1.1.1.2 mrg ttab[i] = speed_endtime ();
2127 1.1.1.2 mrg }
2128 1.1.1.2 mrg tk = median (ttab, n_measurements);
2129 1.1.1.2 mrg
2130 1.1.1.2 mrg winsize = k + 1;
2131 1.1.1.2 mrg speed_starttime ();
2132 1.1.1.2 mrg for (i = 0; i < n_measurements; i++)
2133 1.1.1.2 mrg {
2134 1.1.1.2 mrg speed_starttime ();
2135 1.1.1.3 mrg mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2136 1.1.1.2 mrg ttab[i] = speed_endtime ();
2137 1.1.1.2 mrg }
2138 1.1.1.2 mrg tkp1 = median (ttab, n_measurements);
2139 1.1.1.2 mrg /*
2140 1.1.1.2 mrg printf ("testing: %ld, %d", nbits, k, ep[n-1]);
2141 1.1.1.2 mrg printf (" %10.5f %10.5f\n", tk, tkp1);
2142 1.1.1.2 mrg */
2143 1.1.1.2 mrg if (tkp1 < tk)
2144 1.1.1.2 mrg {
2145 1.1.1.2 mrg if (possible_nbits_cutoff)
2146 1.1.1.2 mrg {
2147 1.1.1.2 mrg /* Two consecutive sizes indicate k increase, obey. */
2148 1.1.1.3 mrg
2149 1.1.1.3 mrg /* Must always have x[k] >= k */
2150 1.1.1.3 mrg ASSERT_ALWAYS (possible_nbits_cutoff >= k);
2151 1.1.1.3 mrg
2152 1.1.1.2 mrg if (k > 1)
2153 1.1.1.2 mrg printf (",");
2154 1.1.1.2 mrg printf ("%ld", (long) possible_nbits_cutoff);
2155 1.1.1.2 mrg k++;
2156 1.1.1.2 mrg possible_nbits_cutoff = 0;
2157 1.1.1.2 mrg }
2158 1.1.1.2 mrg else
2159 1.1.1.2 mrg {
2160 1.1.1.2 mrg /* One measurement indicate k increase, save nbits for further
2161 1.1.1.2 mrg consideration. */
2162 1.1.1.3 mrg /* The new larger k gets used for sizes > the cutoff
2163 1.1.1.3 mrg value, hence the cutoff should be one less than the
2164 1.1.1.3 mrg smallest size where it gives a speedup. */
2165 1.1.1.3 mrg possible_nbits_cutoff = nbits - 1;
2166 1.1.1.2 mrg }
2167 1.1.1.2 mrg }
2168 1.1.1.2 mrg else
2169 1.1.1.2 mrg possible_nbits_cutoff = 0;
2170 1.1.1.2 mrg
2171 1.1.1.2 mrg nbits_next = nbits * 65 / 64;
2172 1.1.1.2 mrg nbits = nbits_next + (nbits_next == nbits);
2173 1.1.1.2 mrg }
2174 1.1.1.2 mrg printf ("\n");
2175 1.1.1.2 mrg TMP_FREE;
2176 1.1.1.2 mrg }
2177 1.1.1.2 mrg
2178 1.1 mrg
2179 1.1 mrg /* size_extra==1 reflects the fact that with high<divisor one division is
2180 1.1 mrg always skipped. Forcing high<divisor while testing ensures consistency
2181 1.1 mrg while stepping through sizes, ie. that size-1 divides will be done each
2182 1.1 mrg time.
2183 1.1 mrg
2184 1.1 mrg min_size==2 and min_is_always are used so that if plain division is only
2185 1.1 mrg better at size==1 then don't bother including that code just for that
2186 1.1 mrg case, instead go with preinv always and get a size saving. */
2187 1.1 mrg
2188 1.1 mrg #define DIV_1_PARAMS \
2189 1.1 mrg param.check_size = 256; \
2190 1.1 mrg param.min_size = 2; \
2191 1.1 mrg param.min_is_always = 1; \
2192 1.1 mrg param.data_high = DATA_HIGH_LT_R; \
2193 1.1 mrg param.size_extra = 1; \
2194 1.1 mrg param.stop_factor = 2.0;
2195 1.1 mrg
2196 1.1 mrg
2197 1.1.1.2 mrg double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
2198 1.1 mrg
2199 1.1 mrg void
2200 1.1 mrg tune_divrem_1 (void)
2201 1.1 mrg {
2202 1.1 mrg /* plain version by default */
2203 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
2204 1.1 mrg
2205 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
2206 1.1 mrg the results in the .asm file, there's no need for such thresholds to
2207 1.1 mrg appear in gmp-mparam.h. */
2208 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1)
2209 1.1 mrg return;
2210 1.1 mrg
2211 1.1 mrg if (GMP_NAIL_BITS != 0)
2212 1.1 mrg {
2213 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2214 1.1 mrg "no preinv with nails");
2215 1.1 mrg print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2216 1.1 mrg "no preinv with nails");
2217 1.1 mrg return;
2218 1.1 mrg }
2219 1.1 mrg
2220 1.1 mrg if (UDIV_PREINV_ALWAYS)
2221 1.1 mrg {
2222 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
2223 1.1 mrg print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
2224 1.1 mrg return;
2225 1.1 mrg }
2226 1.1 mrg
2227 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
2228 1.1 mrg
2229 1.1 mrg /* Tune for the integer part of mpn_divrem_1. This will very possibly be
2230 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part
2231 1.1 mrg is more important. */
2232 1.1 mrg {
2233 1.1 mrg static struct param_t param;
2234 1.1 mrg param.name = "DIVREM_1_NORM_THRESHOLD";
2235 1.1 mrg DIV_1_PARAMS;
2236 1.1 mrg s.r = randlimb_norm ();
2237 1.1 mrg param.function = speed_mpn_divrem_1_tune;
2238 1.1 mrg one (&divrem_1_norm_threshold, ¶m);
2239 1.1 mrg }
2240 1.1 mrg {
2241 1.1 mrg static struct param_t param;
2242 1.1 mrg param.name = "DIVREM_1_UNNORM_THRESHOLD";
2243 1.1 mrg DIV_1_PARAMS;
2244 1.1 mrg s.r = randlimb_half ();
2245 1.1 mrg param.function = speed_mpn_divrem_1_tune;
2246 1.1 mrg one (&divrem_1_unnorm_threshold, ¶m);
2247 1.1 mrg }
2248 1.1 mrg }
2249 1.1 mrg
2250 1.1.1.3 mrg void
2251 1.1.1.3 mrg tune_div_qr_1 (void)
2252 1.1.1.3 mrg {
2253 1.1.1.3 mrg if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
2254 1.1.1.3 mrg {
2255 1.1.1.3 mrg static struct param_t param;
2256 1.1.1.4 mrg speed_function_t f[2] =
2257 1.1.1.4 mrg {
2258 1.1.1.4 mrg speed_mpn_div_qr_1n_pi1_1,
2259 1.1.1.4 mrg speed_mpn_div_qr_1n_pi1_2,
2260 1.1.1.4 mrg };
2261 1.1.1.3 mrg
2262 1.1.1.3 mrg s.size = 10;
2263 1.1.1.3 mrg s.r = randlimb_norm ();
2264 1.1.1.3 mrg
2265 1.1.1.4 mrg one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", ¶m);
2266 1.1.1.3 mrg }
2267 1.1.1.3 mrg
2268 1.1.1.3 mrg {
2269 1.1.1.3 mrg static struct param_t param;
2270 1.1.1.3 mrg param.name = "DIV_QR_1_NORM_THRESHOLD";
2271 1.1.1.3 mrg DIV_1_PARAMS;
2272 1.1.1.3 mrg param.min_size = 1;
2273 1.1.1.3 mrg param.min_is_always = 0;
2274 1.1.1.3 mrg s.r = randlimb_norm ();
2275 1.1.1.3 mrg param.function = speed_mpn_div_qr_1_tune;
2276 1.1.1.3 mrg one (&div_qr_1_norm_threshold, ¶m);
2277 1.1.1.3 mrg }
2278 1.1.1.3 mrg {
2279 1.1.1.3 mrg static struct param_t param;
2280 1.1.1.3 mrg param.name = "DIV_QR_1_UNNORM_THRESHOLD";
2281 1.1.1.3 mrg DIV_1_PARAMS;
2282 1.1.1.3 mrg param.min_size = 1;
2283 1.1.1.3 mrg param.min_is_always = 0;
2284 1.1.1.3 mrg s.r = randlimb_half();
2285 1.1.1.3 mrg param.function = speed_mpn_div_qr_1_tune;
2286 1.1.1.3 mrg one (&div_qr_1_unnorm_threshold, ¶m);
2287 1.1.1.3 mrg }
2288 1.1.1.3 mrg }
2289 1.1.1.3 mrg
2290 1.1 mrg
2291 1.1 mrg void
2292 1.1 mrg tune_mod_1 (void)
2293 1.1 mrg {
2294 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
2295 1.1 mrg the results in the .asm file, there's no need for such thresholds to
2296 1.1 mrg appear in gmp-mparam.h. */
2297 1.1 mrg if (HAVE_NATIVE_mpn_mod_1)
2298 1.1 mrg return;
2299 1.1 mrg
2300 1.1 mrg if (GMP_NAIL_BITS != 0)
2301 1.1 mrg {
2302 1.1 mrg print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2303 1.1 mrg "no preinv with nails");
2304 1.1 mrg print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2305 1.1 mrg "no preinv with nails");
2306 1.1 mrg return;
2307 1.1 mrg }
2308 1.1 mrg
2309 1.1.1.2 mrg if (!HAVE_NATIVE_mpn_mod_1_1p)
2310 1.1.1.2 mrg {
2311 1.1.1.2 mrg static struct param_t param;
2312 1.1.1.4 mrg speed_function_t f[2] =
2313 1.1.1.4 mrg {
2314 1.1.1.4 mrg speed_mpn_mod_1_1_1,
2315 1.1.1.4 mrg speed_mpn_mod_1_1_2,
2316 1.1.1.4 mrg };
2317 1.1.1.2 mrg
2318 1.1.1.2 mrg s.size = 10;
2319 1.1.1.2 mrg s.r = randlimb_half ();
2320 1.1.1.4 mrg one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", ¶m);
2321 1.1.1.2 mrg }
2322 1.1.1.2 mrg
2323 1.1 mrg if (UDIV_PREINV_ALWAYS)
2324 1.1 mrg {
2325 1.1 mrg print_define ("MOD_1_NORM_THRESHOLD", 0L);
2326 1.1 mrg print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
2327 1.1 mrg }
2328 1.1 mrg else
2329 1.1 mrg {
2330 1.1 mrg {
2331 1.1 mrg static struct param_t param;
2332 1.1 mrg param.name = "MOD_1_NORM_THRESHOLD";
2333 1.1 mrg DIV_1_PARAMS;
2334 1.1 mrg s.r = randlimb_norm ();
2335 1.1 mrg param.function = speed_mpn_mod_1_tune;
2336 1.1 mrg one (&mod_1_norm_threshold, ¶m);
2337 1.1 mrg }
2338 1.1 mrg {
2339 1.1 mrg static struct param_t param;
2340 1.1 mrg param.name = "MOD_1_UNNORM_THRESHOLD";
2341 1.1 mrg DIV_1_PARAMS;
2342 1.1 mrg s.r = randlimb_half ();
2343 1.1 mrg param.function = speed_mpn_mod_1_tune;
2344 1.1 mrg one (&mod_1_unnorm_threshold, ¶m);
2345 1.1 mrg }
2346 1.1 mrg }
2347 1.1 mrg {
2348 1.1 mrg static struct param_t param;
2349 1.1 mrg
2350 1.1 mrg param.check_size = 256;
2351 1.1 mrg
2352 1.1 mrg s.r = randlimb_norm ();
2353 1.1 mrg param.function = speed_mpn_mod_1_tune;
2354 1.1 mrg
2355 1.1 mrg param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
2356 1.1 mrg param.min_size = 2;
2357 1.1 mrg one (&mod_1n_to_mod_1_1_threshold, ¶m);
2358 1.1 mrg }
2359 1.1.1.2 mrg
2360 1.1 mrg {
2361 1.1 mrg static struct param_t param;
2362 1.1 mrg
2363 1.1 mrg param.check_size = 256;
2364 1.1.1.2 mrg s.r = randlimb_half ();
2365 1.1 mrg param.noprint = 1;
2366 1.1 mrg
2367 1.1.1.2 mrg param.function = speed_mpn_mod_1_1;
2368 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_2;
2369 1.1.1.2 mrg param.min_is_always = 1;
2370 1.1 mrg param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
2371 1.1.1.2 mrg param.min_size = 2;
2372 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m);
2373 1.1 mrg
2374 1.1.1.2 mrg param.function = speed_mpn_mod_1_2;
2375 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_4;
2376 1.1.1.2 mrg param.min_is_always = 1;
2377 1.1 mrg param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
2378 1.1.1.2 mrg param.min_size = 1;
2379 1.1 mrg one (&mod_1_2_to_mod_1_4_threshold, ¶m);
2380 1.1 mrg
2381 1.1.1.2 mrg if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
2382 1.1 mrg {
2383 1.1.1.2 mrg /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
2384 1.1 mrg mod_1_2_to_mod_1_4_threshold = 0;
2385 1.1 mrg
2386 1.1.1.2 mrg param.function = speed_mpn_mod_1_1;
2387 1.1.1.2 mrg param.function2 = speed_mpn_mod_1_4;
2388 1.1.1.2 mrg param.min_is_always = 1;
2389 1.1.1.2 mrg param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
2390 1.1.1.2 mrg param.min_size = 2;
2391 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m);
2392 1.1 mrg }
2393 1.1.1.2 mrg
2394 1.1.1.2 mrg param.function = speed_mpn_mod_1_tune;
2395 1.1.1.2 mrg param.function2 = NULL;
2396 1.1.1.2 mrg param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
2397 1.1.1.2 mrg param.min_size = 2;
2398 1.1.1.2 mrg param.min_is_always = 0;
2399 1.1.1.2 mrg one (&mod_1u_to_mod_1_1_threshold, ¶m);
2400 1.1.1.2 mrg
2401 1.1.1.2 mrg if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
2402 1.1.1.2 mrg mod_1_1_to_mod_1_2_threshold = 0;
2403 1.1.1.2 mrg if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
2404 1.1.1.2 mrg mod_1_2_to_mod_1_4_threshold = 0;
2405 1.1.1.2 mrg
2406 1.1.1.2 mrg print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
2407 1.1.1.2 mrg print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
2408 1.1.1.2 mrg mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
2409 1.1.1.2 mrg print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
2410 1.1.1.2 mrg mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
2411 1.1 mrg }
2412 1.1 mrg
2413 1.1 mrg {
2414 1.1 mrg static struct param_t param;
2415 1.1 mrg
2416 1.1 mrg param.check_size = 256;
2417 1.1 mrg
2418 1.1 mrg param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
2419 1.1 mrg s.r = randlimb_norm ();
2420 1.1 mrg param.function = speed_mpn_preinv_mod_1;
2421 1.1 mrg param.function2 = speed_mpn_mod_1_tune;
2422 1.1 mrg param.min_size = 1;
2423 1.1 mrg one (&preinv_mod_1_to_mod_1_threshold, ¶m);
2424 1.1 mrg }
2425 1.1 mrg }
2426 1.1 mrg
2427 1.1 mrg
2428 1.1 mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
2429 1.1 mrg imply that udiv_qrnnd_preinv is worth using, but it seems most
2430 1.1 mrg straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
2431 1.1 mrg directly. */
2432 1.1 mrg
2433 1.1 mrg void
2434 1.1 mrg tune_preinv_divrem_1 (void)
2435 1.1 mrg {
2436 1.1 mrg static struct param_t param;
2437 1.1 mrg speed_function_t divrem_1;
2438 1.1 mrg const char *divrem_1_name;
2439 1.1 mrg double t1, t2;
2440 1.1 mrg
2441 1.1 mrg if (GMP_NAIL_BITS != 0)
2442 1.1 mrg {
2443 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
2444 1.1 mrg return;
2445 1.1 mrg }
2446 1.1 mrg
2447 1.1 mrg /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
2448 1.1 mrg it's faster than mpn_divrem_1. */
2449 1.1 mrg if (HAVE_NATIVE_mpn_preinv_divrem_1)
2450 1.1 mrg {
2451 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
2452 1.1 mrg return;
2453 1.1 mrg }
2454 1.1 mrg
2455 1.1 mrg /* If udiv_qrnnd_preinv is the only division method then of course
2456 1.1 mrg mpn_preinv_divrem_1 should be used. */
2457 1.1 mrg if (UDIV_PREINV_ALWAYS)
2458 1.1 mrg {
2459 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
2460 1.1 mrg return;
2461 1.1 mrg }
2462 1.1 mrg
2463 1.1 mrg /* If we've got an assembler version of mpn_divrem_1, then compare against
2464 1.1 mrg that, not the mpn_divrem_1_div generic C. */
2465 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1)
2466 1.1 mrg {
2467 1.1 mrg divrem_1 = speed_mpn_divrem_1;
2468 1.1 mrg divrem_1_name = "mpn_divrem_1";
2469 1.1 mrg }
2470 1.1 mrg else
2471 1.1 mrg {
2472 1.1 mrg divrem_1 = speed_mpn_divrem_1_div;
2473 1.1 mrg divrem_1_name = "mpn_divrem_1_div";
2474 1.1 mrg }
2475 1.1 mrg
2476 1.1 mrg param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
2477 1.1 mrg s.size = 200; /* generous but not too big */
2478 1.1 mrg /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
2479 1.1 mrg since in general that's probably most common, though in fact for a
2480 1.1 mrg 64-bit limb mp_bases[10].big_base is normalized. */
2481 1.1 mrg s.r = urandom() & (GMP_NUMB_MASK >> 4);
2482 1.1 mrg if (s.r == 0) s.r = 123;
2483 1.1 mrg
2484 1.1 mrg t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s);
2485 1.1 mrg t2 = tuneup_measure (divrem_1, ¶m, &s);
2486 1.1 mrg if (t1 == -1.0 || t2 == -1.0)
2487 1.1 mrg {
2488 1.1 mrg printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
2489 1.1 mrg divrem_1_name, (long) s.size);
2490 1.1 mrg abort ();
2491 1.1 mrg }
2492 1.1 mrg if (option_trace >= 1)
2493 1.1 mrg printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
2494 1.1 mrg (long) s.size, t1, divrem_1_name, t2);
2495 1.1 mrg
2496 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
2497 1.1 mrg }
2498 1.1 mrg
2499 1.1 mrg
2500 1.1 mrg
2501 1.1 mrg void
2502 1.1 mrg tune_divrem_2 (void)
2503 1.1 mrg {
2504 1.1 mrg static struct param_t param;
2505 1.1 mrg
2506 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
2507 1.1 mrg the results in the .asm file, and there's no need for such thresholds
2508 1.1 mrg to appear in gmp-mparam.h. */
2509 1.1 mrg if (HAVE_NATIVE_mpn_divrem_2)
2510 1.1 mrg return;
2511 1.1 mrg
2512 1.1 mrg if (GMP_NAIL_BITS != 0)
2513 1.1 mrg {
2514 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
2515 1.1 mrg "no preinv with nails");
2516 1.1 mrg return;
2517 1.1 mrg }
2518 1.1 mrg
2519 1.1 mrg if (UDIV_PREINV_ALWAYS)
2520 1.1 mrg {
2521 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
2522 1.1 mrg return;
2523 1.1 mrg }
2524 1.1 mrg
2525 1.1 mrg /* Tune for the integer part of mpn_divrem_2. This will very possibly be
2526 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part
2527 1.1 mrg is more important.
2528 1.1 mrg
2529 1.1 mrg min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
2530 1.1 mrg code space if plain division is better only at size==2 or size==3. */
2531 1.1 mrg param.name = "DIVREM_2_THRESHOLD";
2532 1.1 mrg param.check_size = 256;
2533 1.1 mrg param.min_size = 4;
2534 1.1 mrg param.min_is_always = 1;
2535 1.1 mrg param.size_extra = 2; /* does qsize==nsize-2 divisions */
2536 1.1 mrg param.stop_factor = 2.0;
2537 1.1 mrg
2538 1.1 mrg s.r = randlimb_norm ();
2539 1.1 mrg param.function = speed_mpn_divrem_2;
2540 1.1 mrg one (&divrem_2_threshold, ¶m);
2541 1.1 mrg }
2542 1.1 mrg
2543 1.1.1.2 mrg void
2544 1.1.1.2 mrg tune_div_qr_2 (void)
2545 1.1.1.2 mrg {
2546 1.1.1.2 mrg static struct param_t param;
2547 1.1.1.2 mrg param.name = "DIV_QR_2_PI2_THRESHOLD";
2548 1.1.1.2 mrg param.function = speed_mpn_div_qr_2n;
2549 1.1.1.2 mrg param.check_size = 500;
2550 1.1.1.2 mrg param.min_size = 4;
2551 1.1.1.2 mrg one (&div_qr_2_pi2_threshold, ¶m);
2552 1.1.1.2 mrg }
2553 1.1 mrg
2554 1.1 mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
2555 1.1 mrg tune for that. Its speed can differ on odd or even divisor, so take an
2556 1.1 mrg average threshold for the two.
2557 1.1 mrg
2558 1.1 mrg mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
2559 1.1 mrg might not vary that way, but don't test this since high<divisor isn't
2560 1.1 mrg expected to occur often with small divisors. */
2561 1.1 mrg
2562 1.1 mrg void
2563 1.1 mrg tune_divexact_1 (void)
2564 1.1 mrg {
2565 1.1 mrg static struct param_t param;
2566 1.1 mrg mp_size_t thresh[2], average;
2567 1.1 mrg int low, i;
2568 1.1 mrg
2569 1.1 mrg /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
2570 1.1 mrg full mpn_divrem_1. */
2571 1.1 mrg if (HAVE_NATIVE_mpn_divexact_1)
2572 1.1 mrg {
2573 1.1 mrg print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
2574 1.1 mrg return;
2575 1.1 mrg }
2576 1.1 mrg
2577 1.1 mrg ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
2578 1.1 mrg
2579 1.1 mrg param.name = "DIVEXACT_1_THRESHOLD";
2580 1.1 mrg param.data_high = DATA_HIGH_GE_R;
2581 1.1 mrg param.check_size = 256;
2582 1.1 mrg param.min_size = 2;
2583 1.1 mrg param.stop_factor = 1.5;
2584 1.1 mrg param.function = tuned_speed_mpn_divrem_1;
2585 1.1 mrg param.function2 = speed_mpn_divexact_1;
2586 1.1 mrg param.noprint = 1;
2587 1.1 mrg
2588 1.1 mrg print_define_start (param.name);
2589 1.1 mrg
2590 1.1 mrg for (low = 0; low <= 1; low++)
2591 1.1 mrg {
2592 1.1 mrg s.r = randlimb_half();
2593 1.1 mrg if (low == 0)
2594 1.1 mrg s.r |= 1;
2595 1.1 mrg else
2596 1.1 mrg s.r &= ~CNST_LIMB(7);
2597 1.1 mrg
2598 1.1 mrg one (&thresh[low], ¶m);
2599 1.1 mrg if (option_trace)
2600 1.1 mrg printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
2601 1.1 mrg
2602 1.1 mrg if (thresh[low] == MP_SIZE_T_MAX)
2603 1.1 mrg {
2604 1.1 mrg average = MP_SIZE_T_MAX;
2605 1.1 mrg goto divexact_1_done;
2606 1.1 mrg }
2607 1.1 mrg }
2608 1.1 mrg
2609 1.1 mrg if (option_trace)
2610 1.1 mrg {
2611 1.1 mrg printf ("average of:");
2612 1.1 mrg for (i = 0; i < numberof(thresh); i++)
2613 1.1 mrg printf (" %ld", (long) thresh[i]);
2614 1.1 mrg printf ("\n");
2615 1.1 mrg }
2616 1.1 mrg
2617 1.1 mrg average = 0;
2618 1.1 mrg for (i = 0; i < numberof(thresh); i++)
2619 1.1 mrg average += thresh[i];
2620 1.1 mrg average /= numberof(thresh);
2621 1.1 mrg
2622 1.1 mrg /* If divexact turns out to be better as early as 3 limbs, then use it
2623 1.1 mrg always, so as to reduce code size and conditional jumps. */
2624 1.1 mrg if (average <= 3)
2625 1.1 mrg average = 0;
2626 1.1 mrg
2627 1.1 mrg divexact_1_done:
2628 1.1 mrg print_define_end (param.name, average);
2629 1.1 mrg }
2630 1.1 mrg
2631 1.1 mrg
2632 1.1 mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2633 1.1 mrg same as mpn_mod_1, but this might not be true of an assembler
2634 1.1 mrg implementation. The threshold used is an average based on data where a
2635 1.1 mrg divide can be skipped and where it can't.
2636 1.1 mrg
2637 1.1 mrg If modexact turns out to be better as early as 3 limbs, then use it
2638 1.1 mrg always, so as to reduce code size and conditional jumps. */
2639 1.1 mrg
2640 1.1 mrg void
2641 1.1 mrg tune_modexact_1_odd (void)
2642 1.1 mrg {
2643 1.1 mrg static struct param_t param;
2644 1.1 mrg mp_size_t thresh_lt, thresh_ge, average;
2645 1.1 mrg
2646 1.1 mrg #if 0
2647 1.1 mrg /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2648 1.1 mrg of a full mpn_mod_1. */
2649 1.1 mrg if (HAVE_NATIVE_mpn_modexact_1_odd)
2650 1.1 mrg {
2651 1.1 mrg print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2652 1.1 mrg return;
2653 1.1 mrg }
2654 1.1 mrg #endif
2655 1.1 mrg
2656 1.1 mrg param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2657 1.1 mrg param.check_size = 256;
2658 1.1 mrg param.min_size = 2;
2659 1.1 mrg param.stop_factor = 1.5;
2660 1.1 mrg param.function = speed_mpn_modexact_1c_odd;
2661 1.1 mrg param.function2 = speed_mpn_mod_1_tune;
2662 1.1 mrg param.noprint = 1;
2663 1.1 mrg s.r = randlimb_half () | 1;
2664 1.1 mrg
2665 1.1 mrg print_define_start (param.name);
2666 1.1 mrg
2667 1.1 mrg param.data_high = DATA_HIGH_LT_R;
2668 1.1 mrg one (&thresh_lt, ¶m);
2669 1.1 mrg if (option_trace)
2670 1.1 mrg printf ("lt thresh %ld\n", (long) thresh_lt);
2671 1.1 mrg
2672 1.1 mrg average = thresh_lt;
2673 1.1 mrg if (thresh_lt != MP_SIZE_T_MAX)
2674 1.1 mrg {
2675 1.1 mrg param.data_high = DATA_HIGH_GE_R;
2676 1.1 mrg one (&thresh_ge, ¶m);
2677 1.1 mrg if (option_trace)
2678 1.1 mrg printf ("ge thresh %ld\n", (long) thresh_ge);
2679 1.1 mrg
2680 1.1 mrg if (thresh_ge != MP_SIZE_T_MAX)
2681 1.1 mrg {
2682 1.1 mrg average = (thresh_ge + thresh_lt) / 2;
2683 1.1 mrg if (thresh_ge <= 3)
2684 1.1 mrg average = 0;
2685 1.1 mrg }
2686 1.1 mrg }
2687 1.1 mrg
2688 1.1 mrg print_define_end (param.name, average);
2689 1.1 mrg }
2690 1.1 mrg
2691 1.1 mrg
2692 1.1 mrg void
2693 1.1 mrg tune_jacobi_base (void)
2694 1.1 mrg {
2695 1.1 mrg static struct param_t param;
2696 1.1.1.4 mrg speed_function_t f[4] =
2697 1.1 mrg {
2698 1.1.1.4 mrg speed_mpn_jacobi_base_1,
2699 1.1.1.4 mrg speed_mpn_jacobi_base_2,
2700 1.1.1.4 mrg speed_mpn_jacobi_base_3,
2701 1.1.1.4 mrg speed_mpn_jacobi_base_4,
2702 1.1.1.4 mrg };
2703 1.1 mrg
2704 1.1.1.4 mrg s.size = GMP_LIMB_BITS * 3 / 4;
2705 1.1 mrg
2706 1.1.1.4 mrg one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", ¶m);
2707 1.1 mrg }
2708 1.1 mrg
2709 1.1 mrg
2710 1.1 mrg void
2711 1.1 mrg tune_get_str (void)
2712 1.1 mrg {
2713 1.1 mrg /* Tune for decimal, it being most common. Some rough testing suggests
2714 1.1 mrg other bases are different, but not by very much. */
2715 1.1 mrg s.r = 10;
2716 1.1 mrg {
2717 1.1 mrg static struct param_t param;
2718 1.1 mrg GET_STR_PRECOMPUTE_THRESHOLD = 0;
2719 1.1 mrg param.name = "GET_STR_DC_THRESHOLD";
2720 1.1 mrg param.function = speed_mpn_get_str;
2721 1.1 mrg param.min_size = 4;
2722 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT;
2723 1.1 mrg one (&get_str_dc_threshold, ¶m);
2724 1.1 mrg }
2725 1.1 mrg {
2726 1.1 mrg static struct param_t param;
2727 1.1 mrg param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2728 1.1 mrg param.function = speed_mpn_get_str;
2729 1.1 mrg param.min_size = GET_STR_DC_THRESHOLD;
2730 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT;
2731 1.1 mrg one (&get_str_precompute_threshold, ¶m);
2732 1.1 mrg }
2733 1.1 mrg }
2734 1.1 mrg
2735 1.1 mrg
2736 1.1 mrg double
2737 1.1 mrg speed_mpn_pre_set_str (struct speed_params *s)
2738 1.1 mrg {
2739 1.1 mrg unsigned char *str;
2740 1.1 mrg mp_ptr wp;
2741 1.1 mrg mp_size_t wn;
2742 1.1 mrg unsigned i;
2743 1.1 mrg int base;
2744 1.1 mrg double t;
2745 1.1 mrg mp_ptr powtab_mem, tp;
2746 1.1 mrg powers_t powtab[GMP_LIMB_BITS];
2747 1.1 mrg mp_size_t un;
2748 1.1 mrg int chars_per_limb;
2749 1.1 mrg TMP_DECL;
2750 1.1 mrg
2751 1.1 mrg SPEED_RESTRICT_COND (s->size >= 1);
2752 1.1 mrg
2753 1.1 mrg base = s->r == 0 ? 10 : s->r;
2754 1.1 mrg SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2755 1.1 mrg
2756 1.1 mrg TMP_MARK;
2757 1.1 mrg
2758 1.1.1.3 mrg str = (unsigned char *) TMP_ALLOC (s->size);
2759 1.1 mrg for (i = 0; i < s->size; i++)
2760 1.1 mrg str[i] = s->xp[i] % base;
2761 1.1 mrg
2762 1.1.1.2 mrg LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
2763 1.1 mrg SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2764 1.1 mrg
2765 1.1 mrg /* use this during development to check wn is big enough */
2766 1.1 mrg /*
2767 1.1 mrg ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2768 1.1 mrg */
2769 1.1 mrg
2770 1.1.1.3 mrg speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
2771 1.1 mrg speed_operand_dst (s, wp, wn);
2772 1.1 mrg speed_cache_fill (s);
2773 1.1 mrg
2774 1.1 mrg chars_per_limb = mp_bases[base].chars_per_limb;
2775 1.1 mrg un = s->size / chars_per_limb + 1;
2776 1.1.1.4 mrg powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
2777 1.1.1.4 mrg size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
2778 1.1.1.4 mrg powers_t *pt = powtab + n_pows;
2779 1.1 mrg tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2780 1.1 mrg
2781 1.1 mrg speed_starttime ();
2782 1.1 mrg i = s->reps;
2783 1.1 mrg do
2784 1.1 mrg {
2785 1.1.1.4 mrg mpn_pre_set_str (wp, str, s->size, pt, tp);
2786 1.1 mrg }
2787 1.1 mrg while (--i != 0);
2788 1.1 mrg t = speed_endtime ();
2789 1.1 mrg
2790 1.1 mrg TMP_FREE;
2791 1.1 mrg return t;
2792 1.1 mrg }
2793 1.1 mrg
2794 1.1 mrg void
2795 1.1 mrg tune_set_str (void)
2796 1.1 mrg {
2797 1.1 mrg s.r = 10; /* decimal */
2798 1.1 mrg {
2799 1.1 mrg static struct param_t param;
2800 1.1 mrg SET_STR_PRECOMPUTE_THRESHOLD = 0;
2801 1.1 mrg param.step_factor = 0.01;
2802 1.1 mrg param.name = "SET_STR_DC_THRESHOLD";
2803 1.1 mrg param.function = speed_mpn_pre_set_str;
2804 1.1 mrg param.min_size = 100;
2805 1.1 mrg param.max_size = 50000;
2806 1.1 mrg one (&set_str_dc_threshold, ¶m);
2807 1.1 mrg }
2808 1.1 mrg {
2809 1.1 mrg static struct param_t param;
2810 1.1 mrg param.step_factor = 0.02;
2811 1.1 mrg param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2812 1.1 mrg param.function = speed_mpn_set_str;
2813 1.1 mrg param.min_size = SET_STR_DC_THRESHOLD;
2814 1.1 mrg param.max_size = 100000;
2815 1.1 mrg one (&set_str_precompute_threshold, ¶m);
2816 1.1 mrg }
2817 1.1 mrg }
2818 1.1 mrg
2819 1.1 mrg
2820 1.1 mrg void
2821 1.1 mrg tune_fft_mul (void)
2822 1.1 mrg {
2823 1.1 mrg static struct fft_param_t param;
2824 1.1 mrg
2825 1.1 mrg if (option_fft_max_size == 0)
2826 1.1 mrg return;
2827 1.1 mrg
2828 1.1 mrg param.table_name = "MUL_FFT_TABLE3";
2829 1.1 mrg param.threshold_name = "MUL_FFT_THRESHOLD";
2830 1.1 mrg param.p_threshold = &mul_fft_threshold;
2831 1.1 mrg param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2832 1.1 mrg param.p_modf_threshold = &mul_fft_modf_threshold;
2833 1.1 mrg param.first_size = MUL_TOOM33_THRESHOLD / 2;
2834 1.1 mrg param.max_size = option_fft_max_size;
2835 1.1 mrg param.function = speed_mpn_fft_mul;
2836 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft;
2837 1.1 mrg param.mul_function = speed_mpn_mul_n;
2838 1.1 mrg param.sqr = 0;
2839 1.1 mrg fft (¶m);
2840 1.1 mrg }
2841 1.1 mrg
2842 1.1 mrg
2843 1.1 mrg void
2844 1.1 mrg tune_fft_sqr (void)
2845 1.1 mrg {
2846 1.1 mrg static struct fft_param_t param;
2847 1.1 mrg
2848 1.1 mrg if (option_fft_max_size == 0)
2849 1.1 mrg return;
2850 1.1 mrg
2851 1.1 mrg param.table_name = "SQR_FFT_TABLE3";
2852 1.1 mrg param.threshold_name = "SQR_FFT_THRESHOLD";
2853 1.1 mrg param.p_threshold = &sqr_fft_threshold;
2854 1.1 mrg param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2855 1.1 mrg param.p_modf_threshold = &sqr_fft_modf_threshold;
2856 1.1 mrg param.first_size = SQR_TOOM3_THRESHOLD / 2;
2857 1.1 mrg param.max_size = option_fft_max_size;
2858 1.1 mrg param.function = speed_mpn_fft_sqr;
2859 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft_sqr;
2860 1.1 mrg param.mul_function = speed_mpn_sqr;
2861 1.1 mrg param.sqr = 1;
2862 1.1 mrg fft (¶m);
2863 1.1 mrg }
2864 1.1 mrg
2865 1.1 mrg void
2866 1.1.1.2 mrg tune_fac_ui (void)
2867 1.1.1.2 mrg {
2868 1.1.1.2 mrg static struct param_t param;
2869 1.1.1.2 mrg
2870 1.1.1.2 mrg param.function = speed_mpz_fac_ui_tune;
2871 1.1.1.2 mrg
2872 1.1.1.2 mrg param.name = "FAC_DSC_THRESHOLD";
2873 1.1.1.2 mrg param.min_size = 70;
2874 1.1.1.2 mrg param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2875 1.1.1.2 mrg one (&fac_dsc_threshold, ¶m);
2876 1.1.1.2 mrg
2877 1.1.1.2 mrg param.name = "FAC_ODD_THRESHOLD";
2878 1.1.1.2 mrg param.min_size = 22;
2879 1.1.1.2 mrg param.stop_factor = 1.7;
2880 1.1.1.2 mrg param.min_is_always = 1;
2881 1.1.1.2 mrg one (&fac_odd_threshold, ¶m);
2882 1.1.1.2 mrg }
2883 1.1.1.2 mrg
2884 1.1.1.2 mrg void
2885 1.1 mrg all (void)
2886 1.1 mrg {
2887 1.1 mrg time_t start_time, end_time;
2888 1.1 mrg TMP_DECL;
2889 1.1 mrg
2890 1.1 mrg TMP_MARK;
2891 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2892 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2893 1.1 mrg
2894 1.1 mrg mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2895 1.1 mrg mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2896 1.1 mrg
2897 1.1 mrg fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2898 1.1 mrg
2899 1.1 mrg speed_time_init ();
2900 1.1 mrg fprintf (stderr, "Using: %s\n", speed_time_string);
2901 1.1 mrg
2902 1.1 mrg fprintf (stderr, "speed_precision %d", speed_precision);
2903 1.1 mrg if (speed_unittime == 1.0)
2904 1.1 mrg fprintf (stderr, ", speed_unittime 1 cycle");
2905 1.1 mrg else
2906 1.1 mrg fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2907 1.1 mrg if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2908 1.1 mrg fprintf (stderr, ", CPU freq unknown\n");
2909 1.1 mrg else
2910 1.1 mrg fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2911 1.1 mrg
2912 1.1 mrg fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2913 1.1 mrg DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2914 1.1 mrg fprintf (stderr, "\n");
2915 1.1 mrg
2916 1.1 mrg time (&start_time);
2917 1.1 mrg {
2918 1.1 mrg struct tm *tp;
2919 1.1 mrg tp = localtime (&start_time);
2920 1.1 mrg printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2921 1.1 mrg tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2922 1.1 mrg
2923 1.1 mrg #ifdef __GNUC__
2924 1.1 mrg /* gcc sub-minor version doesn't seem to come through as a define */
2925 1.1 mrg printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2926 1.1 mrg #define PRINTED_COMPILER
2927 1.1 mrg #endif
2928 1.1 mrg #if defined (__SUNPRO_C)
2929 1.1 mrg printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2930 1.1 mrg #define PRINTED_COMPILER
2931 1.1 mrg #endif
2932 1.1 mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2933 1.1 mrg /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2934 1.1 mrg printf ("MIPSpro C %d.%d.%d */\n",
2935 1.1 mrg _COMPILER_VERSION / 100,
2936 1.1 mrg _COMPILER_VERSION / 10 % 10,
2937 1.1 mrg _COMPILER_VERSION % 10);
2938 1.1 mrg #define PRINTED_COMPILER
2939 1.1 mrg #endif
2940 1.1 mrg #if defined (__DECC) && defined (__DECC_VER)
2941 1.1 mrg printf ("DEC C %d */\n", __DECC_VER);
2942 1.1 mrg #define PRINTED_COMPILER
2943 1.1 mrg #endif
2944 1.1 mrg #if ! defined (PRINTED_COMPILER)
2945 1.1 mrg printf ("system compiler */\n");
2946 1.1 mrg #endif
2947 1.1 mrg }
2948 1.1 mrg printf ("\n");
2949 1.1 mrg
2950 1.1 mrg tune_divrem_1 ();
2951 1.1 mrg tune_mod_1 ();
2952 1.1 mrg tune_preinv_divrem_1 ();
2953 1.1.1.3 mrg tune_div_qr_1 ();
2954 1.1.1.2 mrg #if 0
2955 1.1 mrg tune_divrem_2 ();
2956 1.1.1.2 mrg #endif
2957 1.1.1.2 mrg tune_div_qr_2 ();
2958 1.1 mrg tune_divexact_1 ();
2959 1.1 mrg tune_modexact_1_odd ();
2960 1.1 mrg printf("\n");
2961 1.1 mrg
2962 1.1.1.4 mrg relspeed_div_1_vs_mul_1 ();
2963 1.1.1.4 mrg printf("\n");
2964 1.1.1.4 mrg
2965 1.1 mrg tune_mul_n ();
2966 1.1 mrg printf("\n");
2967 1.1 mrg
2968 1.1 mrg tune_mul ();
2969 1.1 mrg printf("\n");
2970 1.1 mrg
2971 1.1 mrg tune_sqr ();
2972 1.1 mrg printf("\n");
2973 1.1 mrg
2974 1.1.1.2 mrg tune_mulmid ();
2975 1.1.1.2 mrg printf("\n");
2976 1.1.1.2 mrg
2977 1.1 mrg tune_mulmod_bnm1 ();
2978 1.1 mrg tune_sqrmod_bnm1 ();
2979 1.1 mrg printf("\n");
2980 1.1 mrg
2981 1.1 mrg tune_fft_mul ();
2982 1.1 mrg printf("\n");
2983 1.1 mrg
2984 1.1 mrg tune_fft_sqr ();
2985 1.1 mrg printf ("\n");
2986 1.1 mrg
2987 1.1 mrg tune_mullo ();
2988 1.1.1.3 mrg tune_sqrlo ();
2989 1.1 mrg printf("\n");
2990 1.1 mrg
2991 1.1 mrg tune_dc_div ();
2992 1.1 mrg tune_dc_bdiv ();
2993 1.1 mrg
2994 1.1 mrg printf("\n");
2995 1.1 mrg tune_invertappr ();
2996 1.1 mrg tune_invert ();
2997 1.1 mrg printf("\n");
2998 1.1 mrg
2999 1.1 mrg tune_binvert ();
3000 1.1 mrg tune_redc ();
3001 1.1 mrg printf("\n");
3002 1.1 mrg
3003 1.1 mrg tune_mu_div ();
3004 1.1 mrg tune_mu_bdiv ();
3005 1.1 mrg printf("\n");
3006 1.1 mrg
3007 1.1.1.2 mrg tune_powm_sec ();
3008 1.1.1.2 mrg printf("\n");
3009 1.1.1.2 mrg
3010 1.1.1.3 mrg tune_get_str ();
3011 1.1.1.3 mrg tune_set_str ();
3012 1.1.1.3 mrg printf("\n");
3013 1.1.1.3 mrg
3014 1.1.1.3 mrg tune_fac_ui ();
3015 1.1.1.3 mrg printf("\n");
3016 1.1.1.3 mrg
3017 1.1 mrg tune_matrix22_mul ();
3018 1.1.1.4 mrg tune_hgcd2 ();
3019 1.1 mrg tune_hgcd ();
3020 1.1.1.2 mrg tune_hgcd_appr ();
3021 1.1.1.2 mrg tune_hgcd_reduce();
3022 1.1 mrg tune_gcd_dc ();
3023 1.1 mrg tune_gcdext_dc ();
3024 1.1 mrg tune_jacobi_base ();
3025 1.1 mrg printf("\n");
3026 1.1 mrg
3027 1.1 mrg time (&end_time);
3028 1.1 mrg printf ("/* Tuneup completed successfully, took %ld seconds */\n",
3029 1.1 mrg (long) (end_time - start_time));
3030 1.1 mrg
3031 1.1 mrg TMP_FREE;
3032 1.1 mrg }
3033 1.1 mrg
3034 1.1 mrg
3035 1.1 mrg int
3036 1.1 mrg main (int argc, char *argv[])
3037 1.1 mrg {
3038 1.1 mrg int opt;
3039 1.1 mrg
3040 1.1 mrg /* Unbuffered so if output is redirected to a file it isn't lost if the
3041 1.1 mrg program is killed part way through. */
3042 1.1 mrg setbuf (stdout, NULL);
3043 1.1 mrg setbuf (stderr, NULL);
3044 1.1 mrg
3045 1.1 mrg while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
3046 1.1 mrg {
3047 1.1 mrg switch (opt) {
3048 1.1 mrg case 'f':
3049 1.1 mrg if (optarg[0] == 't')
3050 1.1 mrg option_fft_trace = 2;
3051 1.1 mrg else
3052 1.1 mrg option_fft_max_size = atol (optarg);
3053 1.1 mrg break;
3054 1.1 mrg case 'o':
3055 1.1 mrg speed_option_set (optarg);
3056 1.1 mrg break;
3057 1.1 mrg case 'p':
3058 1.1 mrg speed_precision = atoi (optarg);
3059 1.1 mrg break;
3060 1.1 mrg case 't':
3061 1.1 mrg option_trace++;
3062 1.1 mrg break;
3063 1.1 mrg case '?':
3064 1.1 mrg exit(1);
3065 1.1 mrg }
3066 1.1 mrg }
3067 1.1 mrg
3068 1.1 mrg all ();
3069 1.1 mrg exit (0);
3070 1.1 mrg }
3071