tuneup.c revision 1.1 1 1.1 mrg /* Create tuned thresholds for various algorithms.
2 1.1 mrg
3 1.1 mrg Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 1.1 mrg Software Foundation, Inc.
5 1.1 mrg
6 1.1 mrg This file is part of the GNU MP Library.
7 1.1 mrg
8 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
10 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
11 1.1 mrg option) any later version.
12 1.1 mrg
13 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
14 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 1.1 mrg License for more details.
17 1.1 mrg
18 1.1 mrg You should have received a copy of the GNU Lesser General Public License
19 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
20 1.1 mrg
21 1.1 mrg
22 1.1 mrg /* Usage: tuneup [-t] [-t] [-p precision]
23 1.1 mrg
24 1.1 mrg -t turns on some diagnostic traces, a second -t turns on more traces.
25 1.1 mrg
26 1.1 mrg Notes:
27 1.1 mrg
28 1.1 mrg The code here isn't a vision of loveliness, mainly because it's subject
29 1.1 mrg to ongoing changes according to new things wanting to be tuned, and
30 1.1 mrg practical requirements of systems tested.
31 1.1 mrg
32 1.1 mrg Sometimes running the program twice produces slightly different results.
33 1.1 mrg This is probably because there's so little separating algorithms near
34 1.1 mrg their crossover, and on that basis it should make little or no difference
35 1.1 mrg to the final speed of the relevant routines, but nothing has been done to
36 1.1 mrg check that carefully.
37 1.1 mrg
38 1.1 mrg Algorithm:
39 1.1 mrg
40 1.1 mrg The thresholds are determined as follows. A crossover may not be a
41 1.1 mrg single size but rather a range where it oscillates between method A or
42 1.1 mrg method B faster. If the threshold is set making B used where A is faster
43 1.1 mrg (or vice versa) that's bad. Badness is the percentage time lost and
44 1.1 mrg total badness is the sum of this over all sizes measured. The threshold
45 1.1 mrg is set to minimize total badness.
46 1.1 mrg
47 1.1 mrg Suppose, as sizes increase, method B becomes faster than method A. The
48 1.1 mrg effect of the rule is that, as you look at increasing sizes, isolated
49 1.1 mrg points where B is faster are ignored, but when it's consistently faster,
50 1.1 mrg or faster on balance, then the threshold is set there. The same result
51 1.1 mrg is obtained thinking in the other direction of A becoming faster at
52 1.1 mrg smaller sizes.
53 1.1 mrg
54 1.1 mrg In practice the thresholds tend to be chosen to bring on the next
55 1.1 mrg algorithm fairly quickly.
56 1.1 mrg
57 1.1 mrg This rule is attractive because it's got a basis in reason and is fairly
58 1.1 mrg easy to implement, but no work has been done to actually compare it in
59 1.1 mrg absolute terms to other possibilities.
60 1.1 mrg
61 1.1 mrg Implementation:
62 1.1 mrg
63 1.1 mrg In a normal library build the thresholds are constants. To tune them
64 1.1 mrg selected objects are recompiled with the thresholds as global variables
65 1.1 mrg instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
66 1.1 mrg the end of gmp-impl.h, and rules in tune/Makefile.am.
67 1.1 mrg
68 1.1 mrg MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
69 1.1 mrg threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
70 1.1 mrg level, but recurse into the basecase.
71 1.1 mrg
72 1.1 mrg MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
73 1.1 mrg Other routines in turn will make use of both of those. Naturally the
74 1.1 mrg dependants must be tuned first.
75 1.1 mrg
76 1.1 mrg In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
77 1.1 mrg just a threshold based on comparing two routines (mpn_divrem_1 and
78 1.1 mrg mpn_divexact_1), and no further use of the value determined.
79 1.1 mrg
80 1.1 mrg Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
81 1.1 mrg just comparisons between certain routines on representative data.
82 1.1 mrg
83 1.1 mrg Shortcuts are applied when native (assembler) versions of routines exist.
84 1.1 mrg For instance a native mpn_sqr_basecase is assumed to be always faster
85 1.1 mrg than mpn_mul_basecase, with no measuring.
86 1.1 mrg
87 1.1 mrg No attempt is made to tune within assembler routines, for instance
88 1.1 mrg DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
89 1.1 mrg written and tuned all by hand. Assembler routines that might have hard
90 1.1 mrg limits are recompiled though, to make them accept a bigger range of sizes
91 1.1 mrg than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
92 1.1 mrg
93 1.1 mrg Limitations:
94 1.1 mrg
95 1.1 mrg The FFTs aren't subject to the same badness rule as the other thresholds,
96 1.1 mrg so each k is probably being brought on a touch early. This isn't likely
97 1.1 mrg to make a difference, and the simpler probing means fewer tests.
98 1.1 mrg
99 1.1 mrg */
100 1.1 mrg
101 1.1 mrg #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
102 1.1 mrg
103 1.1 mrg #include "config.h"
104 1.1 mrg
105 1.1 mrg #include <math.h>
106 1.1 mrg #include <stdio.h>
107 1.1 mrg #include <stdlib.h>
108 1.1 mrg #include <time.h>
109 1.1 mrg #if HAVE_UNISTD_H
110 1.1 mrg #include <unistd.h>
111 1.1 mrg #endif
112 1.1 mrg
113 1.1 mrg #include "gmp.h"
114 1.1 mrg #include "gmp-impl.h"
115 1.1 mrg #include "longlong.h"
116 1.1 mrg
117 1.1 mrg #include "tests.h"
118 1.1 mrg #include "speed.h"
119 1.1 mrg
120 1.1 mrg #if !HAVE_DECL_OPTARG
121 1.1 mrg extern char *optarg;
122 1.1 mrg extern int optind, opterr;
123 1.1 mrg #endif
124 1.1 mrg
125 1.1 mrg
126 1.1 mrg #define DEFAULT_MAX_SIZE 1000 /* limbs */
127 1.1 mrg
128 1.1 mrg #if WANT_FFT
129 1.1 mrg mp_size_t option_fft_max_size = 50000; /* limbs */
130 1.1 mrg #else
131 1.1 mrg mp_size_t option_fft_max_size = 0;
132 1.1 mrg #endif
133 1.1 mrg int option_trace = 0;
134 1.1 mrg int option_fft_trace = 0;
135 1.1 mrg struct speed_params s;
136 1.1 mrg
137 1.1 mrg struct dat_t {
138 1.1 mrg mp_size_t size;
139 1.1 mrg double d;
140 1.1 mrg } *dat = NULL;
141 1.1 mrg int ndat = 0;
142 1.1 mrg int allocdat = 0;
143 1.1 mrg
144 1.1 mrg /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
145 1.1 mrg case use zero here, which for params.max_size means no limit. */
146 1.1 mrg #ifndef TUNE_SQR_TOOM2_MAX
147 1.1 mrg #define TUNE_SQR_TOOM2_MAX 0
148 1.1 mrg #endif
149 1.1 mrg
150 1.1 mrg mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
151 1.1 mrg mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
152 1.1 mrg mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
153 1.1 mrg mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
154 1.1 mrg mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
155 1.1 mrg mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
156 1.1 mrg mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
157 1.1 mrg mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
158 1.1 mrg mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
159 1.1 mrg mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
160 1.1 mrg mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
161 1.1 mrg mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
162 1.1 mrg mp_size_t sqr_toom2_threshold
163 1.1 mrg = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
164 1.1 mrg mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
165 1.1 mrg mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
166 1.1 mrg mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
167 1.1 mrg mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
168 1.1 mrg mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
169 1.1 mrg mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
170 1.1 mrg mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
171 1.1 mrg mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
172 1.1 mrg mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
173 1.1 mrg mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
174 1.1 mrg mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
175 1.1 mrg mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
176 1.1 mrg mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
177 1.1 mrg mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
178 1.1 mrg mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
179 1.1 mrg mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
180 1.1 mrg mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
181 1.1 mrg mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
182 1.1 mrg mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
183 1.1 mrg mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
184 1.1 mrg mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
185 1.1 mrg mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
186 1.1 mrg mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
187 1.1 mrg mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
188 1.1 mrg mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
189 1.1 mrg mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
190 1.1 mrg mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
191 1.1 mrg mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
192 1.1 mrg mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
193 1.1 mrg mp_size_t powm_threshold = MP_SIZE_T_MAX;
194 1.1 mrg mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
195 1.1 mrg mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
196 1.1 mrg mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
197 1.1 mrg mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
198 1.1 mrg mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
199 1.1 mrg mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
200 1.1 mrg mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
201 1.1 mrg mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
202 1.1 mrg mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
203 1.1 mrg mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
204 1.1 mrg mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
205 1.1 mrg mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
206 1.1 mrg mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
207 1.1 mrg mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
208 1.1 mrg mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
209 1.1 mrg mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
210 1.1 mrg mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
211 1.1 mrg mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
212 1.1 mrg mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
213 1.1 mrg
214 1.1 mrg mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
215 1.1 mrg mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
216 1.1 mrg
217 1.1 mrg struct param_t {
218 1.1 mrg const char *name;
219 1.1 mrg speed_function_t function;
220 1.1 mrg speed_function_t function2;
221 1.1 mrg double step_factor; /* how much to step relatively */
222 1.1 mrg int step; /* how much to step absolutely */
223 1.1 mrg double function_fudge; /* multiplier for "function" speeds */
224 1.1 mrg int stop_since_change;
225 1.1 mrg double stop_factor;
226 1.1 mrg mp_size_t min_size;
227 1.1 mrg int min_is_always;
228 1.1 mrg mp_size_t max_size;
229 1.1 mrg mp_size_t check_size;
230 1.1 mrg mp_size_t size_extra;
231 1.1 mrg
232 1.1 mrg #define DATA_HIGH_LT_R 1
233 1.1 mrg #define DATA_HIGH_GE_R 2
234 1.1 mrg int data_high;
235 1.1 mrg
236 1.1 mrg int noprint;
237 1.1 mrg };
238 1.1 mrg
239 1.1 mrg
240 1.1 mrg /* These are normally undefined when false, which suits "#if" fine.
241 1.1 mrg But give them zero values so they can be used in plain C "if"s. */
242 1.1 mrg #ifndef UDIV_PREINV_ALWAYS
243 1.1 mrg #define UDIV_PREINV_ALWAYS 0
244 1.1 mrg #endif
245 1.1 mrg #ifndef HAVE_NATIVE_mpn_divexact_1
246 1.1 mrg #define HAVE_NATIVE_mpn_divexact_1 0
247 1.1 mrg #endif
248 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_1
249 1.1 mrg #define HAVE_NATIVE_mpn_divrem_1 0
250 1.1 mrg #endif
251 1.1 mrg #ifndef HAVE_NATIVE_mpn_divrem_2
252 1.1 mrg #define HAVE_NATIVE_mpn_divrem_2 0
253 1.1 mrg #endif
254 1.1 mrg #ifndef HAVE_NATIVE_mpn_mod_1
255 1.1 mrg #define HAVE_NATIVE_mpn_mod_1 0
256 1.1 mrg #endif
257 1.1 mrg #ifndef HAVE_NATIVE_mpn_modexact_1_odd
258 1.1 mrg #define HAVE_NATIVE_mpn_modexact_1_odd 0
259 1.1 mrg #endif
260 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
261 1.1 mrg #define HAVE_NATIVE_mpn_preinv_divrem_1 0
262 1.1 mrg #endif
263 1.1 mrg #ifndef HAVE_NATIVE_mpn_preinv_mod_1
264 1.1 mrg #define HAVE_NATIVE_mpn_preinv_mod_1 0
265 1.1 mrg #endif
266 1.1 mrg #ifndef HAVE_NATIVE_mpn_sqr_basecase
267 1.1 mrg #define HAVE_NATIVE_mpn_sqr_basecase 0
268 1.1 mrg #endif
269 1.1 mrg
270 1.1 mrg
271 1.1 mrg #define MAX3(a,b,c) MAX (MAX (a, b), c)
272 1.1 mrg
273 1.1 mrg mp_limb_t
274 1.1 mrg randlimb_norm (void)
275 1.1 mrg {
276 1.1 mrg mp_limb_t n;
277 1.1 mrg mpn_random (&n, 1);
278 1.1 mrg n |= GMP_NUMB_HIGHBIT;
279 1.1 mrg return n;
280 1.1 mrg }
281 1.1 mrg
282 1.1 mrg #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
283 1.1 mrg
284 1.1 mrg mp_limb_t
285 1.1 mrg randlimb_half (void)
286 1.1 mrg {
287 1.1 mrg mp_limb_t n;
288 1.1 mrg mpn_random (&n, 1);
289 1.1 mrg n &= GMP_NUMB_HALFMASK;
290 1.1 mrg n += (n==0);
291 1.1 mrg return n;
292 1.1 mrg }
293 1.1 mrg
294 1.1 mrg
295 1.1 mrg /* Add an entry to the end of the dat[] array, reallocing to make it bigger
296 1.1 mrg if necessary. */
297 1.1 mrg void
298 1.1 mrg add_dat (mp_size_t size, double d)
299 1.1 mrg {
300 1.1 mrg #define ALLOCDAT_STEP 500
301 1.1 mrg
302 1.1 mrg ASSERT_ALWAYS (ndat <= allocdat);
303 1.1 mrg
304 1.1 mrg if (ndat == allocdat)
305 1.1 mrg {
306 1.1 mrg dat = (struct dat_t *) __gmp_allocate_or_reallocate
307 1.1 mrg (dat, allocdat * sizeof(dat[0]),
308 1.1 mrg (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
309 1.1 mrg allocdat += ALLOCDAT_STEP;
310 1.1 mrg }
311 1.1 mrg
312 1.1 mrg dat[ndat].size = size;
313 1.1 mrg dat[ndat].d = d;
314 1.1 mrg ndat++;
315 1.1 mrg }
316 1.1 mrg
317 1.1 mrg
318 1.1 mrg /* Return the threshold size based on the data accumulated. */
319 1.1 mrg mp_size_t
320 1.1 mrg analyze_dat (int final)
321 1.1 mrg {
322 1.1 mrg double x, min_x;
323 1.1 mrg int j, min_j;
324 1.1 mrg
325 1.1 mrg /* If the threshold is set at dat[0].size, any positive values are bad. */
326 1.1 mrg x = 0.0;
327 1.1 mrg for (j = 0; j < ndat; j++)
328 1.1 mrg if (dat[j].d > 0.0)
329 1.1 mrg x += dat[j].d;
330 1.1 mrg
331 1.1 mrg if (option_trace >= 2 && final)
332 1.1 mrg {
333 1.1 mrg printf ("\n");
334 1.1 mrg printf ("x is the sum of the badness from setting thresh at given size\n");
335 1.1 mrg printf (" (minimum x is sought)\n");
336 1.1 mrg printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
337 1.1 mrg }
338 1.1 mrg
339 1.1 mrg min_x = x;
340 1.1 mrg min_j = 0;
341 1.1 mrg
342 1.1 mrg
343 1.1 mrg /* When stepping to the next dat[j].size, positive values are no longer
344 1.1 mrg bad (so subtracted), negative values become bad (so add the absolute
345 1.1 mrg value, meaning subtract). */
346 1.1 mrg for (j = 0; j < ndat; x -= dat[j].d, j++)
347 1.1 mrg {
348 1.1 mrg if (option_trace >= 2 && final)
349 1.1 mrg printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
350 1.1 mrg
351 1.1 mrg if (x < min_x)
352 1.1 mrg {
353 1.1 mrg min_x = x;
354 1.1 mrg min_j = j;
355 1.1 mrg }
356 1.1 mrg }
357 1.1 mrg
358 1.1 mrg return min_j;
359 1.1 mrg }
360 1.1 mrg
361 1.1 mrg
362 1.1 mrg /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
363 1.1 mrg
364 1.1 mrg mp_limb_t mpn_divrem_1_tune
365 1.1 mrg __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
366 1.1 mrg mp_limb_t mpn_mod_1_tune
367 1.1 mrg __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
368 1.1 mrg
369 1.1 mrg double
370 1.1 mrg speed_mpn_mod_1_tune (struct speed_params *s)
371 1.1 mrg {
372 1.1 mrg SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
373 1.1 mrg }
374 1.1 mrg double
375 1.1 mrg speed_mpn_divrem_1_tune (struct speed_params *s)
376 1.1 mrg {
377 1.1 mrg SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
378 1.1 mrg }
379 1.1 mrg
380 1.1 mrg
381 1.1 mrg double
382 1.1 mrg tuneup_measure (speed_function_t fun,
383 1.1 mrg const struct param_t *param,
384 1.1 mrg struct speed_params *s)
385 1.1 mrg {
386 1.1 mrg static struct param_t dummy;
387 1.1 mrg double t;
388 1.1 mrg TMP_DECL;
389 1.1 mrg
390 1.1 mrg if (! param)
391 1.1 mrg param = &dummy;
392 1.1 mrg
393 1.1 mrg s->size += param->size_extra;
394 1.1 mrg
395 1.1 mrg TMP_MARK;
396 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
397 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
398 1.1 mrg
399 1.1 mrg mpn_random (s->xp, s->size);
400 1.1 mrg mpn_random (s->yp, s->size);
401 1.1 mrg
402 1.1 mrg switch (param->data_high) {
403 1.1 mrg case DATA_HIGH_LT_R:
404 1.1 mrg s->xp[s->size-1] %= s->r;
405 1.1 mrg s->yp[s->size-1] %= s->r;
406 1.1 mrg break;
407 1.1 mrg case DATA_HIGH_GE_R:
408 1.1 mrg s->xp[s->size-1] |= s->r;
409 1.1 mrg s->yp[s->size-1] |= s->r;
410 1.1 mrg break;
411 1.1 mrg }
412 1.1 mrg
413 1.1 mrg t = speed_measure (fun, s);
414 1.1 mrg
415 1.1 mrg s->size -= param->size_extra;
416 1.1 mrg
417 1.1 mrg TMP_FREE;
418 1.1 mrg return t;
419 1.1 mrg }
420 1.1 mrg
421 1.1 mrg
422 1.1 mrg #define PRINT_WIDTH 31
423 1.1 mrg
424 1.1 mrg void
425 1.1 mrg print_define_start (const char *name)
426 1.1 mrg {
427 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name);
428 1.1 mrg if (option_trace)
429 1.1 mrg printf ("...\n");
430 1.1 mrg }
431 1.1 mrg
432 1.1 mrg void
433 1.1 mrg print_define_end_remark (const char *name, mp_size_t value, const char *remark)
434 1.1 mrg {
435 1.1 mrg if (option_trace)
436 1.1 mrg printf ("#define %-*s ", PRINT_WIDTH, name);
437 1.1 mrg
438 1.1 mrg if (value == MP_SIZE_T_MAX)
439 1.1 mrg printf ("MP_SIZE_T_MAX");
440 1.1 mrg else
441 1.1 mrg printf ("%5ld", (long) value);
442 1.1 mrg
443 1.1 mrg if (remark != NULL)
444 1.1 mrg printf (" /* %s */", remark);
445 1.1 mrg printf ("\n");
446 1.1 mrg fflush (stdout);
447 1.1 mrg }
448 1.1 mrg
449 1.1 mrg void
450 1.1 mrg print_define_end (const char *name, mp_size_t value)
451 1.1 mrg {
452 1.1 mrg const char *remark;
453 1.1 mrg if (value == MP_SIZE_T_MAX)
454 1.1 mrg remark = "never";
455 1.1 mrg else if (value == 0)
456 1.1 mrg remark = "always";
457 1.1 mrg else
458 1.1 mrg remark = NULL;
459 1.1 mrg print_define_end_remark (name, value, remark);
460 1.1 mrg }
461 1.1 mrg
462 1.1 mrg void
463 1.1 mrg print_define (const char *name, mp_size_t value)
464 1.1 mrg {
465 1.1 mrg print_define_start (name);
466 1.1 mrg print_define_end (name, value);
467 1.1 mrg }
468 1.1 mrg
469 1.1 mrg void
470 1.1 mrg print_define_remark (const char *name, mp_size_t value, const char *remark)
471 1.1 mrg {
472 1.1 mrg print_define_start (name);
473 1.1 mrg print_define_end_remark (name, value, remark);
474 1.1 mrg }
475 1.1 mrg
476 1.1 mrg
477 1.1 mrg void
478 1.1 mrg one (mp_size_t *threshold, struct param_t *param)
479 1.1 mrg {
480 1.1 mrg int since_positive, since_thresh_change;
481 1.1 mrg int thresh_idx, new_thresh_idx;
482 1.1 mrg
483 1.1 mrg #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
484 1.1 mrg
485 1.1 mrg DEFAULT (param->function_fudge, 1.0);
486 1.1 mrg DEFAULT (param->function2, param->function);
487 1.1 mrg DEFAULT (param->step_factor, 0.01); /* small steps by default */
488 1.1 mrg DEFAULT (param->step, 1); /* small steps by default */
489 1.1 mrg DEFAULT (param->stop_since_change, 80);
490 1.1 mrg DEFAULT (param->stop_factor, 1.2);
491 1.1 mrg DEFAULT (param->min_size, 10);
492 1.1 mrg DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
493 1.1 mrg
494 1.1 mrg if (param->check_size != 0)
495 1.1 mrg {
496 1.1 mrg double t1, t2;
497 1.1 mrg s.size = param->check_size;
498 1.1 mrg
499 1.1 mrg *threshold = s.size+1;
500 1.1 mrg t1 = tuneup_measure (param->function, param, &s);
501 1.1 mrg
502 1.1 mrg *threshold = s.size;
503 1.1 mrg t2 = tuneup_measure (param->function2, param, &s);
504 1.1 mrg if (t1 == -1.0 || t2 == -1.0)
505 1.1 mrg {
506 1.1 mrg printf ("Oops, can't run both functions at size %ld\n",
507 1.1 mrg (long) s.size);
508 1.1 mrg abort ();
509 1.1 mrg }
510 1.1 mrg t1 *= param->function_fudge;
511 1.1 mrg
512 1.1 mrg /* ask that t2 is at least 4% below t1 */
513 1.1 mrg if (t1 < t2*1.04)
514 1.1 mrg {
515 1.1 mrg if (option_trace)
516 1.1 mrg printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
517 1.1 mrg *threshold = MP_SIZE_T_MAX;
518 1.1 mrg if (! param->noprint)
519 1.1 mrg print_define (param->name, *threshold);
520 1.1 mrg return;
521 1.1 mrg }
522 1.1 mrg
523 1.1 mrg if (option_trace >= 2)
524 1.1 mrg printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
525 1.1 mrg (long) s.size, t1, t2);
526 1.1 mrg }
527 1.1 mrg
528 1.1 mrg if (! param->noprint || option_trace)
529 1.1 mrg print_define_start (param->name);
530 1.1 mrg
531 1.1 mrg ndat = 0;
532 1.1 mrg since_positive = 0;
533 1.1 mrg since_thresh_change = 0;
534 1.1 mrg thresh_idx = 0;
535 1.1 mrg
536 1.1 mrg if (option_trace >= 2)
537 1.1 mrg {
538 1.1 mrg printf (" algorithm-A algorithm-B ratio possible\n");
539 1.1 mrg printf (" (seconds) (seconds) diff thresh\n");
540 1.1 mrg }
541 1.1 mrg
542 1.1 mrg for (s.size = param->min_size;
543 1.1 mrg s.size < param->max_size;
544 1.1 mrg s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
545 1.1 mrg {
546 1.1 mrg double ti, tiplus1, d;
547 1.1 mrg
548 1.1 mrg /*
549 1.1 mrg FIXME: check minimum size requirements are met, possibly by just
550 1.1 mrg checking for the -1 returns from the speed functions.
551 1.1 mrg */
552 1.1 mrg
553 1.1 mrg /* using method A at this size */
554 1.1 mrg *threshold = s.size+1;
555 1.1 mrg ti = tuneup_measure (param->function, param, &s);
556 1.1 mrg if (ti == -1.0)
557 1.1 mrg abort ();
558 1.1 mrg ti *= param->function_fudge;
559 1.1 mrg
560 1.1 mrg /* using method B at this size */
561 1.1 mrg *threshold = s.size;
562 1.1 mrg tiplus1 = tuneup_measure (param->function2, param, &s);
563 1.1 mrg if (tiplus1 == -1.0)
564 1.1 mrg abort ();
565 1.1 mrg
566 1.1 mrg /* Calculate the fraction by which the one or the other routine is
567 1.1 mrg slower. */
568 1.1 mrg if (tiplus1 >= ti)
569 1.1 mrg d = (tiplus1 - ti) / tiplus1; /* negative */
570 1.1 mrg else
571 1.1 mrg d = (tiplus1 - ti) / ti; /* positive */
572 1.1 mrg
573 1.1 mrg add_dat (s.size, d);
574 1.1 mrg
575 1.1 mrg new_thresh_idx = analyze_dat (0);
576 1.1 mrg
577 1.1 mrg if (option_trace >= 2)
578 1.1 mrg printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
579 1.1 mrg (long) s.size, ti, tiplus1, d,
580 1.1 mrg ti > tiplus1 ? '#' : ' ',
581 1.1 mrg (long) dat[new_thresh_idx].size);
582 1.1 mrg
583 1.1 mrg /* Stop if the last time method i was faster was more than a
584 1.1 mrg certain number of measurements ago. */
585 1.1 mrg #define STOP_SINCE_POSITIVE 200
586 1.1 mrg if (d >= 0)
587 1.1 mrg since_positive = 0;
588 1.1 mrg else
589 1.1 mrg if (++since_positive > STOP_SINCE_POSITIVE)
590 1.1 mrg {
591 1.1 mrg if (option_trace >= 1)
592 1.1 mrg printf ("stopped due to since_positive (%d)\n",
593 1.1 mrg STOP_SINCE_POSITIVE);
594 1.1 mrg break;
595 1.1 mrg }
596 1.1 mrg
597 1.1 mrg /* Stop if method A has become slower by a certain factor. */
598 1.1 mrg if (ti >= tiplus1 * param->stop_factor)
599 1.1 mrg {
600 1.1 mrg if (option_trace >= 1)
601 1.1 mrg printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
602 1.1 mrg param->stop_factor);
603 1.1 mrg break;
604 1.1 mrg }
605 1.1 mrg
606 1.1 mrg /* Stop if the threshold implied hasn't changed in a certain
607 1.1 mrg number of measurements. (It's this condition that usually
608 1.1 mrg stops the loop.) */
609 1.1 mrg if (thresh_idx != new_thresh_idx)
610 1.1 mrg since_thresh_change = 0, thresh_idx = new_thresh_idx;
611 1.1 mrg else
612 1.1 mrg if (++since_thresh_change > param->stop_since_change)
613 1.1 mrg {
614 1.1 mrg if (option_trace >= 1)
615 1.1 mrg printf ("stopped due to since_thresh_change (%d)\n",
616 1.1 mrg param->stop_since_change);
617 1.1 mrg break;
618 1.1 mrg }
619 1.1 mrg
620 1.1 mrg /* Stop if the threshold implied is more than a certain number of
621 1.1 mrg measurements ago. */
622 1.1 mrg #define STOP_SINCE_AFTER 500
623 1.1 mrg if (ndat - thresh_idx > STOP_SINCE_AFTER)
624 1.1 mrg {
625 1.1 mrg if (option_trace >= 1)
626 1.1 mrg printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
627 1.1 mrg STOP_SINCE_AFTER);
628 1.1 mrg break;
629 1.1 mrg }
630 1.1 mrg
631 1.1 mrg /* Stop when the size limit is reached before the end of the
632 1.1 mrg crossover, but only show this as an error for >= the default max
633 1.1 mrg size. FIXME: Maybe should make it a param choice whether this is
634 1.1 mrg an error. */
635 1.1 mrg if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
636 1.1 mrg {
637 1.1 mrg fprintf (stderr, "%s\n", param->name);
638 1.1 mrg fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
639 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat);
640 1.1 mrg fprintf (stderr, " max size reached before end of crossover\n");
641 1.1 mrg break;
642 1.1 mrg }
643 1.1 mrg }
644 1.1 mrg
645 1.1 mrg if (option_trace >= 1)
646 1.1 mrg printf ("sizes %ld to %ld total %d measurements\n",
647 1.1 mrg (long) dat[0].size, (long) dat[ndat-1].size, ndat);
648 1.1 mrg
649 1.1 mrg *threshold = dat[analyze_dat (1)].size;
650 1.1 mrg
651 1.1 mrg if (param->min_is_always)
652 1.1 mrg {
653 1.1 mrg if (*threshold == param->min_size)
654 1.1 mrg *threshold = 0;
655 1.1 mrg }
656 1.1 mrg
657 1.1 mrg if (! param->noprint || option_trace)
658 1.1 mrg print_define_end (param->name, *threshold);
659 1.1 mrg }
660 1.1 mrg
661 1.1 mrg
662 1.1 mrg /* Special probing for the fft thresholds. The size restrictions on the
663 1.1 mrg FFTs mean the graph of time vs size has a step effect. See this for
664 1.1 mrg example using
665 1.1 mrg
666 1.1 mrg ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
667 1.1 mrg gnuplot foo.gnuplot
668 1.1 mrg
669 1.1 mrg The current approach is to compare routines at the midpoint of relevant
670 1.1 mrg steps. Arguably a more sophisticated system of threshold data is wanted
671 1.1 mrg if this step effect remains. */
672 1.1 mrg
673 1.1 mrg struct fft_param_t {
674 1.1 mrg const char *table_name;
675 1.1 mrg const char *threshold_name;
676 1.1 mrg const char *modf_threshold_name;
677 1.1 mrg mp_size_t *p_threshold;
678 1.1 mrg mp_size_t *p_modf_threshold;
679 1.1 mrg mp_size_t first_size;
680 1.1 mrg mp_size_t max_size;
681 1.1 mrg speed_function_t function;
682 1.1 mrg speed_function_t mul_modf_function;
683 1.1 mrg speed_function_t mul_function;
684 1.1 mrg mp_size_t sqr;
685 1.1 mrg };
686 1.1 mrg
687 1.1 mrg
688 1.1 mrg /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
689 1.1 mrg N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
690 1.1 mrg of 2^(k-1) bits. */
691 1.1 mrg
692 1.1 mrg mp_size_t
693 1.1 mrg fft_step_size (int k)
694 1.1 mrg {
695 1.1 mrg mp_size_t step;
696 1.1 mrg
697 1.1 mrg step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
698 1.1 mrg step *= (mp_size_t) 1 << k;
699 1.1 mrg
700 1.1 mrg if (step <= 0)
701 1.1 mrg {
702 1.1 mrg printf ("Can't handle k=%d\n", k);
703 1.1 mrg abort ();
704 1.1 mrg }
705 1.1 mrg
706 1.1 mrg return step;
707 1.1 mrg }
708 1.1 mrg
709 1.1 mrg mp_size_t
710 1.1 mrg fft_next_size (mp_size_t pl, int k)
711 1.1 mrg {
712 1.1 mrg mp_size_t m = fft_step_size (k);
713 1.1 mrg
714 1.1 mrg /* printf ("[k=%d %ld] %ld ->", k, m, pl); */
715 1.1 mrg
716 1.1 mrg if (pl == 0 || (pl & (m-1)) != 0)
717 1.1 mrg pl = (pl | (m-1)) + 1;
718 1.1 mrg
719 1.1 mrg /* printf (" %ld\n", pl); */
720 1.1 mrg return pl;
721 1.1 mrg }
722 1.1 mrg
723 1.1 mrg #define NMAX_DEFAULT 1000000
724 1.1 mrg #define MAX_REPS 25
725 1.1 mrg #define MIN_REPS 5
726 1.1 mrg
727 1.1 mrg static inline size_t
728 1.1 mrg mpn_mul_fft_lcm (size_t a, unsigned int k)
729 1.1 mrg {
730 1.1 mrg unsigned int l = k;
731 1.1 mrg
732 1.1 mrg while (a % 2 == 0 && k > 0)
733 1.1 mrg {
734 1.1 mrg a >>= 1;
735 1.1 mrg k--;
736 1.1 mrg }
737 1.1 mrg return a << l;
738 1.1 mrg }
739 1.1 mrg
740 1.1 mrg mp_size_t
741 1.1 mrg fftfill (mp_size_t pl, int k, int sqr)
742 1.1 mrg {
743 1.1 mrg mp_size_t maxLK;
744 1.1 mrg mp_bitcnt_t N, Nprime, nprime, M;
745 1.1 mrg
746 1.1 mrg N = pl * GMP_NUMB_BITS;
747 1.1 mrg M = N >> k;
748 1.1 mrg
749 1.1 mrg maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
750 1.1 mrg
751 1.1 mrg Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
752 1.1 mrg nprime = Nprime / GMP_NUMB_BITS;
753 1.1 mrg if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
754 1.1 mrg {
755 1.1 mrg size_t K2;
756 1.1 mrg for (;;)
757 1.1 mrg {
758 1.1 mrg K2 = 1L << mpn_fft_best_k (nprime, sqr);
759 1.1 mrg if ((nprime & (K2 - 1)) == 0)
760 1.1 mrg break;
761 1.1 mrg nprime = (nprime + K2 - 1) & -K2;
762 1.1 mrg Nprime = nprime * GMP_LIMB_BITS;
763 1.1 mrg }
764 1.1 mrg }
765 1.1 mrg ASSERT_ALWAYS (nprime < pl);
766 1.1 mrg
767 1.1 mrg return Nprime;
768 1.1 mrg }
769 1.1 mrg
770 1.1 mrg static int
771 1.1 mrg compare_double (const void *ap, const void *bp)
772 1.1 mrg {
773 1.1 mrg double a = * (const double *) ap;
774 1.1 mrg double b = * (const double *) bp;
775 1.1 mrg
776 1.1 mrg if (a < b)
777 1.1 mrg return -1;
778 1.1 mrg else if (a > b)
779 1.1 mrg return 1;
780 1.1 mrg else
781 1.1 mrg return 0;
782 1.1 mrg }
783 1.1 mrg
784 1.1 mrg double
785 1.1 mrg median (double *times, int n)
786 1.1 mrg {
787 1.1 mrg qsort (times, n, sizeof (double), compare_double);
788 1.1 mrg return times[n/2];
789 1.1 mrg }
790 1.1 mrg
791 1.1 mrg #define FFT_CACHE_SIZE 25
792 1.1 mrg typedef struct fft_cache
793 1.1 mrg {
794 1.1 mrg mp_size_t n;
795 1.1 mrg double time;
796 1.1 mrg } fft_cache_t;
797 1.1 mrg
798 1.1 mrg fft_cache_t fft_cache[FFT_CACHE_SIZE];
799 1.1 mrg
800 1.1 mrg double
801 1.1 mrg cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
802 1.1 mrg int n_measurements)
803 1.1 mrg {
804 1.1 mrg int i;
805 1.1 mrg double t, ttab[MAX_REPS];
806 1.1 mrg
807 1.1 mrg if (fft_cache[k].n == n)
808 1.1 mrg return fft_cache[k].time;
809 1.1 mrg
810 1.1 mrg for (i = 0; i < n_measurements; i++)
811 1.1 mrg {
812 1.1 mrg speed_starttime ();
813 1.1 mrg mpn_mul_fft (rp, n, ap, n, bp, n, k);
814 1.1 mrg ttab[i] = speed_endtime ();
815 1.1 mrg }
816 1.1 mrg
817 1.1 mrg t = median (ttab, n_measurements);
818 1.1 mrg fft_cache[k].n = n;
819 1.1 mrg fft_cache[k].time = t;
820 1.1 mrg return t;
821 1.1 mrg }
822 1.1 mrg
823 1.1 mrg #define INSERT_FFTTAB(idx, nval, kval) \
824 1.1 mrg do { \
825 1.1 mrg fft_tab[idx].n = nval; \
826 1.1 mrg fft_tab[idx].k = kval; \
827 1.1 mrg fft_tab[idx+1].n = -1; /* sentinel */ \
828 1.1 mrg fft_tab[idx+1].k = -1; \
829 1.1 mrg } while (0)
830 1.1 mrg
831 1.1 mrg int
832 1.1 mrg fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
833 1.1 mrg {
834 1.1 mrg mp_size_t n, n1, prev_n1;
835 1.1 mrg int k, best_k, last_best_k, kmax;
836 1.1 mrg int eff, prev_eff;
837 1.1 mrg double t0, t1;
838 1.1 mrg int n_measurements;
839 1.1 mrg mp_limb_t *ap, *bp, *rp;
840 1.1 mrg mp_size_t alloc;
841 1.1 mrg char *linepref;
842 1.1 mrg struct fft_table_nk *fft_tab;
843 1.1 mrg
844 1.1 mrg fft_tab = mpn_fft_table3[p->sqr];
845 1.1 mrg
846 1.1 mrg for (k = 0; k < FFT_CACHE_SIZE; k++)
847 1.1 mrg fft_cache[k].n = 0;
848 1.1 mrg
849 1.1 mrg if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
850 1.1 mrg {
851 1.1 mrg nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
852 1.1 mrg }
853 1.1 mrg
854 1.1 mrg if (print)
855 1.1 mrg printf ("#define %s%*s", p->table_name, 38, "");
856 1.1 mrg
857 1.1 mrg if (idx == 0)
858 1.1 mrg {
859 1.1 mrg INSERT_FFTTAB (0, nmin, initial_k);
860 1.1 mrg
861 1.1 mrg if (print)
862 1.1 mrg {
863 1.1 mrg printf ("\\\n { ");
864 1.1 mrg printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
865 1.1 mrg linepref = " ";
866 1.1 mrg }
867 1.1 mrg
868 1.1 mrg idx = 1;
869 1.1 mrg }
870 1.1 mrg
871 1.1 mrg ap = malloc (sizeof (mp_limb_t));
872 1.1 mrg if (p->sqr)
873 1.1 mrg bp = ap;
874 1.1 mrg else
875 1.1 mrg bp = malloc (sizeof (mp_limb_t));
876 1.1 mrg rp = malloc (sizeof (mp_limb_t));
877 1.1 mrg alloc = 1;
878 1.1 mrg
879 1.1 mrg /* Round n to comply to initial k value */
880 1.1 mrg n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
881 1.1 mrg
882 1.1 mrg n_measurements = (18 - initial_k) | 1;
883 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS);
884 1.1 mrg n_measurements = MIN (n_measurements, MAX_REPS);
885 1.1 mrg
886 1.1 mrg last_best_k = initial_k;
887 1.1 mrg best_k = initial_k;
888 1.1 mrg
889 1.1 mrg while (n < nmax)
890 1.1 mrg {
891 1.1 mrg int start_k, end_k;
892 1.1 mrg
893 1.1 mrg /* Assume the current best k is best until we hit its next FFT step. */
894 1.1 mrg t0 = 99999;
895 1.1 mrg
896 1.1 mrg prev_n1 = n + 1;
897 1.1 mrg
898 1.1 mrg start_k = MAX (4, best_k - 4);
899 1.1 mrg end_k = MIN (24, best_k + 4);
900 1.1 mrg for (k = start_k; k <= end_k; k++)
901 1.1 mrg {
902 1.1 mrg n1 = mpn_fft_next_size (prev_n1, k);
903 1.1 mrg
904 1.1 mrg eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
905 1.1 mrg
906 1.1 mrg if (eff < 70) /* avoid measuring too slow fft:s */
907 1.1 mrg continue;
908 1.1 mrg
909 1.1 mrg if (n1 > alloc)
910 1.1 mrg {
911 1.1 mrg alloc = n1;
912 1.1 mrg if (p->sqr)
913 1.1 mrg {
914 1.1 mrg ap = realloc (ap, sizeof (mp_limb_t));
915 1.1 mrg rp = realloc (rp, sizeof (mp_limb_t));
916 1.1 mrg ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
917 1.1 mrg mpn_random (ap, alloc);
918 1.1 mrg rp = realloc (rp, alloc * sizeof (mp_limb_t));
919 1.1 mrg }
920 1.1 mrg else
921 1.1 mrg {
922 1.1 mrg ap = realloc (ap, sizeof (mp_limb_t));
923 1.1 mrg bp = realloc (bp, sizeof (mp_limb_t));
924 1.1 mrg rp = realloc (rp, sizeof (mp_limb_t));
925 1.1 mrg ap = realloc (ap, alloc * sizeof (mp_limb_t));
926 1.1 mrg mpn_random (ap, alloc);
927 1.1 mrg bp = realloc (bp, alloc * sizeof (mp_limb_t));
928 1.1 mrg mpn_random (bp, alloc);
929 1.1 mrg rp = realloc (rp, alloc * sizeof (mp_limb_t));
930 1.1 mrg }
931 1.1 mrg }
932 1.1 mrg
933 1.1 mrg t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
934 1.1 mrg
935 1.1 mrg if (t1 * n_measurements > 0.3)
936 1.1 mrg n_measurements -= 2;
937 1.1 mrg n_measurements = MAX (n_measurements, MIN_REPS);
938 1.1 mrg
939 1.1 mrg if (t1 < t0)
940 1.1 mrg {
941 1.1 mrg best_k = k;
942 1.1 mrg t0 = t1;
943 1.1 mrg }
944 1.1 mrg }
945 1.1 mrg
946 1.1 mrg n1 = mpn_fft_next_size (prev_n1, best_k);
947 1.1 mrg
948 1.1 mrg if (last_best_k != best_k)
949 1.1 mrg {
950 1.1 mrg ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
951 1.1 mrg
952 1.1 mrg if (idx >= FFT_TABLE3_SIZE)
953 1.1 mrg {
954 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
955 1.1 mrg abort ();
956 1.1 mrg }
957 1.1 mrg INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
958 1.1 mrg
959 1.1 mrg if (print)
960 1.1 mrg {
961 1.1 mrg printf (", ");
962 1.1 mrg if (idx % 4 == 0)
963 1.1 mrg printf ("\\\n ");
964 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
965 1.1 mrg }
966 1.1 mrg
967 1.1 mrg if (option_trace >= 2)
968 1.1 mrg {
969 1.1 mrg printf ("{%lu,%u}\n", prev_n1, best_k);
970 1.1 mrg fflush (stdout);
971 1.1 mrg }
972 1.1 mrg
973 1.1 mrg last_best_k = best_k;
974 1.1 mrg idx++;
975 1.1 mrg }
976 1.1 mrg
977 1.1 mrg for (;;)
978 1.1 mrg {
979 1.1 mrg prev_n1 = n1;
980 1.1 mrg prev_eff = fftfill (prev_n1, best_k, p->sqr);
981 1.1 mrg n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
982 1.1 mrg eff = fftfill (n1, best_k, p->sqr);
983 1.1 mrg
984 1.1 mrg if (eff != prev_eff)
985 1.1 mrg break;
986 1.1 mrg }
987 1.1 mrg
988 1.1 mrg n = prev_n1;
989 1.1 mrg }
990 1.1 mrg
991 1.1 mrg kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
992 1.1 mrg kmax = MIN (kmax, 25-1);
993 1.1 mrg for (k = last_best_k + 1; k <= kmax; k++)
994 1.1 mrg {
995 1.1 mrg if (idx >= FFT_TABLE3_SIZE)
996 1.1 mrg {
997 1.1 mrg printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
998 1.1 mrg abort ();
999 1.1 mrg }
1000 1.1 mrg INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1001 1.1 mrg
1002 1.1 mrg if (print)
1003 1.1 mrg {
1004 1.1 mrg printf (", ");
1005 1.1 mrg if (idx % 4 == 0)
1006 1.1 mrg printf ("\\\n ");
1007 1.1 mrg printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1008 1.1 mrg }
1009 1.1 mrg
1010 1.1 mrg idx++;
1011 1.1 mrg }
1012 1.1 mrg
1013 1.1 mrg if (print)
1014 1.1 mrg printf (" }\n");
1015 1.1 mrg
1016 1.1 mrg free (ap);
1017 1.1 mrg if (! p->sqr)
1018 1.1 mrg free (bp);
1019 1.1 mrg free (rp);
1020 1.1 mrg
1021 1.1 mrg return idx;
1022 1.1 mrg }
1023 1.1 mrg
1024 1.1 mrg void
1025 1.1 mrg fft (struct fft_param_t *p)
1026 1.1 mrg {
1027 1.1 mrg mp_size_t size;
1028 1.1 mrg int k, idx, initial_k;
1029 1.1 mrg
1030 1.1 mrg /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1031 1.1 mrg
1032 1.1 mrg #if 1
1033 1.1 mrg {
1034 1.1 mrg /* Use plain one() mechanism, for some reasonable initial values of k. The
1035 1.1 mrg advantage is that we don't depend on mpn_fft_table3, which can therefore
1036 1.1 mrg leave it completely uninitialized. */
1037 1.1 mrg
1038 1.1 mrg static struct param_t param;
1039 1.1 mrg mp_size_t thres, best_thres;
1040 1.1 mrg int best_k;
1041 1.1 mrg char buf[20];
1042 1.1 mrg
1043 1.1 mrg best_thres = MP_SIZE_T_MAX;
1044 1.1 mrg best_k = -1;
1045 1.1 mrg
1046 1.1 mrg for (k = 5; k <= 7; k++)
1047 1.1 mrg {
1048 1.1 mrg param.name = p->modf_threshold_name;
1049 1.1 mrg param.min_size = 100;
1050 1.1 mrg param.max_size = 2000;
1051 1.1 mrg param.function = p->mul_function;
1052 1.1 mrg param.step_factor = 0.0;
1053 1.1 mrg param.step = 4;
1054 1.1 mrg param.function2 = p->mul_modf_function;
1055 1.1 mrg param.noprint = 1;
1056 1.1 mrg s.r = k;
1057 1.1 mrg one (&thres, ¶m);
1058 1.1 mrg if (thres < best_thres)
1059 1.1 mrg {
1060 1.1 mrg best_thres = thres;
1061 1.1 mrg best_k = k;
1062 1.1 mrg }
1063 1.1 mrg }
1064 1.1 mrg
1065 1.1 mrg *(p->p_modf_threshold) = best_thres;
1066 1.1 mrg sprintf (buf, "k = %d", best_k);
1067 1.1 mrg print_define_remark (p->modf_threshold_name, best_thres, buf);
1068 1.1 mrg initial_k = best_k;
1069 1.1 mrg }
1070 1.1 mrg #else
1071 1.1 mrg size = p->first_size;
1072 1.1 mrg for (;;)
1073 1.1 mrg {
1074 1.1 mrg double tk, tm;
1075 1.1 mrg
1076 1.1 mrg size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1077 1.1 mrg k = mpn_fft_best_k (size, p->sqr);
1078 1.1 mrg
1079 1.1 mrg if (size >= p->max_size)
1080 1.1 mrg break;
1081 1.1 mrg
1082 1.1 mrg s.size = size + fft_step_size (k) / 2;
1083 1.1 mrg s.r = k;
1084 1.1 mrg tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1085 1.1 mrg if (tk == -1.0)
1086 1.1 mrg abort ();
1087 1.1 mrg
1088 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s);
1089 1.1 mrg if (tm == -1.0)
1090 1.1 mrg abort ();
1091 1.1 mrg
1092 1.1 mrg if (option_trace >= 2)
1093 1.1 mrg printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
1094 1.1 mrg (long) size,
1095 1.1 mrg (long) size + fft_step_size (k) / 2, k, tk,
1096 1.1 mrg (long) s.size, tm);
1097 1.1 mrg
1098 1.1 mrg if (tk < tm)
1099 1.1 mrg {
1100 1.1 mrg *p->p_modf_threshold = s.size;
1101 1.1 mrg print_define (p->modf_threshold_name, *p->p_modf_threshold);
1102 1.1 mrg break;
1103 1.1 mrg }
1104 1.1 mrg }
1105 1.1 mrg initial_k = ?;
1106 1.1 mrg #endif
1107 1.1 mrg
1108 1.1 mrg /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1109 1.1 mrg
1110 1.1 mrg idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1111 1.1 mrg printf ("#define %s_SIZE %d\n", p->table_name, idx);
1112 1.1 mrg
1113 1.1 mrg /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1114 1.1 mrg
1115 1.1 mrg size = 2 * *p->p_modf_threshold; /* OK? */
1116 1.1 mrg for (;;)
1117 1.1 mrg {
1118 1.1 mrg double tk, tm;
1119 1.1 mrg mp_size_t mulmod_size, mul_size;;
1120 1.1 mrg
1121 1.1 mrg if (size >= p->max_size)
1122 1.1 mrg break;
1123 1.1 mrg
1124 1.1 mrg mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1125 1.1 mrg mul_size = (size + mulmod_size) / 2; /* middle of step */
1126 1.1 mrg
1127 1.1 mrg s.size = mulmod_size;
1128 1.1 mrg tk = tuneup_measure (p->function, NULL, &s);
1129 1.1 mrg if (tk == -1.0)
1130 1.1 mrg abort ();
1131 1.1 mrg
1132 1.1 mrg s.size = mul_size;
1133 1.1 mrg tm = tuneup_measure (p->mul_function, NULL, &s);
1134 1.1 mrg if (tm == -1.0)
1135 1.1 mrg abort ();
1136 1.1 mrg
1137 1.1 mrg if (option_trace >= 2)
1138 1.1 mrg printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
1139 1.1 mrg (long) size,
1140 1.1 mrg (long) mulmod_size, tk,
1141 1.1 mrg (long) mul_size, tm);
1142 1.1 mrg
1143 1.1 mrg size = mulmod_size;
1144 1.1 mrg
1145 1.1 mrg if (tk < tm)
1146 1.1 mrg {
1147 1.1 mrg *p->p_threshold = s.size;
1148 1.1 mrg print_define (p->threshold_name, *p->p_threshold);
1149 1.1 mrg break;
1150 1.1 mrg }
1151 1.1 mrg }
1152 1.1 mrg }
1153 1.1 mrg
1154 1.1 mrg
1155 1.1 mrg
1156 1.1 mrg /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1157 1.1 mrg giving wrong results. */
1158 1.1 mrg void
1159 1.1 mrg tune_mul_n (void)
1160 1.1 mrg {
1161 1.1 mrg static struct param_t param;
1162 1.1 mrg
1163 1.1 mrg param.function = speed_mpn_mul_n;
1164 1.1 mrg
1165 1.1 mrg param.name = "MUL_TOOM22_THRESHOLD";
1166 1.1 mrg param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1167 1.1 mrg param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1168 1.1 mrg one (&mul_toom22_threshold, ¶m);
1169 1.1 mrg
1170 1.1 mrg param.name = "MUL_TOOM33_THRESHOLD";
1171 1.1 mrg param.min_size = MAX (mul_toom22_threshold, MPN_TOOM33_MUL_MINSIZE);
1172 1.1 mrg param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1173 1.1 mrg one (&mul_toom33_threshold, ¶m);
1174 1.1 mrg
1175 1.1 mrg param.name = "MUL_TOOM44_THRESHOLD";
1176 1.1 mrg param.min_size = MAX (mul_toom33_threshold, MPN_TOOM44_MUL_MINSIZE);
1177 1.1 mrg param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1178 1.1 mrg one (&mul_toom44_threshold, ¶m);
1179 1.1 mrg
1180 1.1 mrg param.name = "MUL_TOOM6H_THRESHOLD";
1181 1.1 mrg param.min_size = MAX (mul_toom44_threshold, MPN_TOOM6H_MUL_MINSIZE);
1182 1.1 mrg param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1183 1.1 mrg one (&mul_toom6h_threshold, ¶m);
1184 1.1 mrg
1185 1.1 mrg param.name = "MUL_TOOM8H_THRESHOLD";
1186 1.1 mrg param.min_size = MAX (mul_toom6h_threshold, MPN_TOOM8H_MUL_MINSIZE);
1187 1.1 mrg param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1188 1.1 mrg one (&mul_toom8h_threshold, ¶m);
1189 1.1 mrg
1190 1.1 mrg /* disabled until tuned */
1191 1.1 mrg MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1192 1.1 mrg }
1193 1.1 mrg
1194 1.1 mrg void
1195 1.1 mrg tune_mul (void)
1196 1.1 mrg {
1197 1.1 mrg static struct param_t param;
1198 1.1 mrg mp_size_t thres;
1199 1.1 mrg
1200 1.1 mrg param.noprint = 1;
1201 1.1 mrg
1202 1.1 mrg param.function = speed_mpn_toom32_for_toom43_mul;
1203 1.1 mrg param.function2 = speed_mpn_toom43_for_toom32_mul;
1204 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1205 1.1 mrg param.min_size = MPN_TOOM43_MUL_MINSIZE;
1206 1.1 mrg one (&thres, ¶m);
1207 1.1 mrg mul_toom32_to_toom43_threshold = 17*thres/24;
1208 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1209 1.1 mrg
1210 1.1 mrg param.function = speed_mpn_toom32_for_toom53_mul;
1211 1.1 mrg param.function2 = speed_mpn_toom53_for_toom32_mul;
1212 1.1 mrg param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1213 1.1 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE;
1214 1.1 mrg one (&thres, ¶m);
1215 1.1 mrg mul_toom32_to_toom53_threshold = 19*thres/30;
1216 1.1 mrg print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1217 1.1 mrg
1218 1.1 mrg param.function = speed_mpn_toom42_for_toom53_mul;
1219 1.1 mrg param.function2 = speed_mpn_toom53_for_toom42_mul;
1220 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1221 1.1 mrg param.min_size = MPN_TOOM53_MUL_MINSIZE;
1222 1.1 mrg one (&thres, ¶m);
1223 1.1 mrg mul_toom42_to_toom53_threshold = 11*thres/20;
1224 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1225 1.1 mrg
1226 1.1 mrg param.function = speed_mpn_toom42_mul;
1227 1.1 mrg param.function2 = speed_mpn_toom63_mul;
1228 1.1 mrg param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1229 1.1 mrg param.min_size = MPN_TOOM63_MUL_MINSIZE;
1230 1.1 mrg one (&thres, ¶m);
1231 1.1 mrg mul_toom42_to_toom63_threshold = thres/2;
1232 1.1 mrg print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1233 1.1 mrg }
1234 1.1 mrg
1235 1.1 mrg
1236 1.1 mrg void
1237 1.1 mrg tune_mullo (void)
1238 1.1 mrg {
1239 1.1 mrg static struct param_t param;
1240 1.1 mrg
1241 1.1 mrg param.function = speed_mpn_mullo_n;
1242 1.1 mrg
1243 1.1 mrg param.name = "MULLO_BASECASE_THRESHOLD";
1244 1.1 mrg param.min_size = 1;
1245 1.1 mrg param.min_is_always = 1;
1246 1.1 mrg param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1247 1.1 mrg param.stop_factor = 1.5;
1248 1.1 mrg param.noprint = 1;
1249 1.1 mrg one (&mullo_basecase_threshold, ¶m);
1250 1.1 mrg
1251 1.1 mrg param.name = "MULLO_DC_THRESHOLD";
1252 1.1 mrg param.min_size = 8;
1253 1.1 mrg param.min_is_always = 0;
1254 1.1 mrg param.max_size = 1000;
1255 1.1 mrg one (&mullo_dc_threshold, ¶m);
1256 1.1 mrg
1257 1.1 mrg if (mullo_basecase_threshold >= mullo_dc_threshold)
1258 1.1 mrg {
1259 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1260 1.1 mrg print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1261 1.1 mrg }
1262 1.1 mrg else
1263 1.1 mrg {
1264 1.1 mrg print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1265 1.1 mrg print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1266 1.1 mrg }
1267 1.1 mrg
1268 1.1 mrg #if WANT_FFT
1269 1.1 mrg param.name = "MULLO_MUL_N_THRESHOLD";
1270 1.1 mrg param.min_size = mullo_dc_threshold;
1271 1.1 mrg param.max_size = 2 * mul_fft_threshold;
1272 1.1 mrg param.noprint = 0;
1273 1.1 mrg param.step_factor = 0.03;
1274 1.1 mrg one (&mullo_mul_n_threshold, ¶m);
1275 1.1 mrg #else
1276 1.1 mrg print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1277 1.1 mrg "without FFT use mullo forever");
1278 1.1 mrg #endif
1279 1.1 mrg }
1280 1.1 mrg
1281 1.1 mrg void
1282 1.1 mrg tune_mulmod_bnm1 (void)
1283 1.1 mrg {
1284 1.1 mrg static struct param_t param;
1285 1.1 mrg
1286 1.1 mrg param.name = "MULMOD_BNM1_THRESHOLD";
1287 1.1 mrg param.function = speed_mpn_mulmod_bnm1;
1288 1.1 mrg param.min_size = 4;
1289 1.1 mrg param.max_size = 100;
1290 1.1 mrg one (&mulmod_bnm1_threshold, ¶m);
1291 1.1 mrg }
1292 1.1 mrg
1293 1.1 mrg void
1294 1.1 mrg tune_sqrmod_bnm1 (void)
1295 1.1 mrg {
1296 1.1 mrg static struct param_t param;
1297 1.1 mrg
1298 1.1 mrg param.name = "SQRMOD_BNM1_THRESHOLD";
1299 1.1 mrg param.function = speed_mpn_sqrmod_bnm1;
1300 1.1 mrg param.min_size = 4;
1301 1.1 mrg param.max_size = 100;
1302 1.1 mrg one (&sqrmod_bnm1_threshold, ¶m);
1303 1.1 mrg }
1304 1.1 mrg
1305 1.1 mrg
1306 1.1 mrg /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1307 1.1 mrg is faster only at size==2 then we don't want to bother with extra code
1308 1.1 mrg just for that. Start karatsuba from 4 same as MUL above. */
1309 1.1 mrg
1310 1.1 mrg void
1311 1.1 mrg tune_sqr (void)
1312 1.1 mrg {
1313 1.1 mrg /* disabled until tuned */
1314 1.1 mrg SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1315 1.1 mrg
1316 1.1 mrg if (HAVE_NATIVE_mpn_sqr_basecase)
1317 1.1 mrg {
1318 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1319 1.1 mrg sqr_basecase_threshold = 0;
1320 1.1 mrg }
1321 1.1 mrg else
1322 1.1 mrg {
1323 1.1 mrg static struct param_t param;
1324 1.1 mrg param.name = "SQR_BASECASE_THRESHOLD";
1325 1.1 mrg param.function = speed_mpn_sqr;
1326 1.1 mrg param.min_size = 3;
1327 1.1 mrg param.min_is_always = 1;
1328 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX;
1329 1.1 mrg param.noprint = 1;
1330 1.1 mrg one (&sqr_basecase_threshold, ¶m);
1331 1.1 mrg }
1332 1.1 mrg
1333 1.1 mrg {
1334 1.1 mrg static struct param_t param;
1335 1.1 mrg param.name = "SQR_TOOM2_THRESHOLD";
1336 1.1 mrg param.function = speed_mpn_sqr;
1337 1.1 mrg param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1338 1.1 mrg param.max_size = TUNE_SQR_TOOM2_MAX;
1339 1.1 mrg param.noprint = 1;
1340 1.1 mrg one (&sqr_toom2_threshold, ¶m);
1341 1.1 mrg
1342 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase
1343 1.1 mrg && sqr_toom2_threshold < sqr_basecase_threshold)
1344 1.1 mrg {
1345 1.1 mrg /* Karatsuba becomes faster than mul_basecase before
1346 1.1 mrg sqr_basecase does. Arrange for the expression
1347 1.1 mrg "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1348 1.1 mrg selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1349 1.1 mrg SQR_TOOM2_THRESHOLD to zero, making
1350 1.1 mrg SQR_BASECASE_THRESHOLD the toom2 threshold. */
1351 1.1 mrg
1352 1.1 mrg sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1353 1.1 mrg SQR_TOOM2_THRESHOLD = 0;
1354 1.1 mrg
1355 1.1 mrg print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1356 1.1 mrg "toom2");
1357 1.1 mrg print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1358 1.1 mrg "never sqr_basecase");
1359 1.1 mrg }
1360 1.1 mrg else
1361 1.1 mrg {
1362 1.1 mrg if (! HAVE_NATIVE_mpn_sqr_basecase)
1363 1.1 mrg print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1364 1.1 mrg print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1365 1.1 mrg }
1366 1.1 mrg }
1367 1.1 mrg
1368 1.1 mrg {
1369 1.1 mrg static struct param_t param;
1370 1.1 mrg mp_size_t toom3_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1371 1.1 mrg
1372 1.1 mrg param.function = speed_mpn_sqr;
1373 1.1 mrg
1374 1.1 mrg param.name = "SQR_TOOM3_THRESHOLD";
1375 1.1 mrg param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_MINSIZE);
1376 1.1 mrg param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1377 1.1 mrg one (&sqr_toom3_threshold, ¶m);
1378 1.1 mrg
1379 1.1 mrg param.name = "SQR_TOOM4_THRESHOLD";
1380 1.1 mrg param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_MINSIZE);
1381 1.1 mrg param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1382 1.1 mrg one (&sqr_toom4_threshold, ¶m);
1383 1.1 mrg
1384 1.1 mrg param.name = "SQR_TOOM6_THRESHOLD";
1385 1.1 mrg param.min_size = MAX (sqr_toom4_threshold, MPN_TOOM6_SQR_MINSIZE);
1386 1.1 mrg param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1387 1.1 mrg one (&sqr_toom6_threshold, ¶m);
1388 1.1 mrg
1389 1.1 mrg param.name = "SQR_TOOM8_THRESHOLD";
1390 1.1 mrg param.min_size = MAX (sqr_toom6_threshold, MPN_TOOM8_SQR_MINSIZE);
1391 1.1 mrg param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1392 1.1 mrg one (&sqr_toom8_threshold, ¶m);
1393 1.1 mrg }
1394 1.1 mrg }
1395 1.1 mrg
1396 1.1 mrg
1397 1.1 mrg void
1398 1.1 mrg tune_dc_div (void)
1399 1.1 mrg {
1400 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */
1401 1.1 mrg {
1402 1.1 mrg static struct param_t param;
1403 1.1 mrg param.name = "DC_DIV_QR_THRESHOLD";
1404 1.1 mrg param.function = speed_mpn_sbpi1_div_qr;
1405 1.1 mrg param.function2 = speed_mpn_dcpi1_div_qr;
1406 1.1 mrg param.min_size = 6;
1407 1.1 mrg one (&dc_div_qr_threshold, ¶m);
1408 1.1 mrg }
1409 1.1 mrg {
1410 1.1 mrg static struct param_t param;
1411 1.1 mrg param.name = "DC_DIVAPPR_Q_THRESHOLD";
1412 1.1 mrg param.function = speed_mpn_sbpi1_divappr_q;
1413 1.1 mrg param.function2 = speed_mpn_dcpi1_divappr_q;
1414 1.1 mrg param.min_size = 6;
1415 1.1 mrg one (&dc_divappr_q_threshold, ¶m);
1416 1.1 mrg }
1417 1.1 mrg }
1418 1.1 mrg
1419 1.1 mrg static double
1420 1.1 mrg speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1421 1.1 mrg {
1422 1.1 mrg if (s->size < DC_DIV_QR_THRESHOLD)
1423 1.1 mrg return speed_mpn_sbpi1_div_qr (s);
1424 1.1 mrg else
1425 1.1 mrg return speed_mpn_dcpi1_div_qr (s);
1426 1.1 mrg }
1427 1.1 mrg
1428 1.1 mrg void
1429 1.1 mrg tune_mu_div (void)
1430 1.1 mrg {
1431 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n */
1432 1.1 mrg {
1433 1.1 mrg static struct param_t param;
1434 1.1 mrg param.name = "MU_DIV_QR_THRESHOLD";
1435 1.1 mrg param.function = speed_mpn_dcpi1_div_qr;
1436 1.1 mrg param.function2 = speed_mpn_mu_div_qr;
1437 1.1 mrg param.min_size = 6;
1438 1.1 mrg param.max_size = 5000;
1439 1.1 mrg param.step_factor = 0.02;
1440 1.1 mrg one (&mu_div_qr_threshold, ¶m);
1441 1.1 mrg }
1442 1.1 mrg {
1443 1.1 mrg static struct param_t param;
1444 1.1 mrg param.name = "MU_DIVAPPR_Q_THRESHOLD";
1445 1.1 mrg param.function = speed_mpn_dcpi1_divappr_q;
1446 1.1 mrg param.function2 = speed_mpn_mu_divappr_q;
1447 1.1 mrg param.min_size = 6;
1448 1.1 mrg param.max_size = 5000;
1449 1.1 mrg param.step_factor = 0.02;
1450 1.1 mrg one (&mu_divappr_q_threshold, ¶m);
1451 1.1 mrg }
1452 1.1 mrg {
1453 1.1 mrg static struct param_t param;
1454 1.1 mrg param.name = "MUPI_DIV_QR_THRESHOLD";
1455 1.1 mrg param.function = speed_mpn_sbordcpi1_div_qr;
1456 1.1 mrg param.function2 = speed_mpn_mupi_div_qr;
1457 1.1 mrg param.min_size = 6;
1458 1.1 mrg param.min_is_always = 1;
1459 1.1 mrg param.max_size = 1000;
1460 1.1 mrg param.step_factor = 0.02;
1461 1.1 mrg one (&mupi_div_qr_threshold, ¶m);
1462 1.1 mrg }
1463 1.1 mrg }
1464 1.1 mrg
1465 1.1 mrg void
1466 1.1 mrg tune_dc_bdiv (void)
1467 1.1 mrg {
1468 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/
1469 1.1 mrg {
1470 1.1 mrg static struct param_t param;
1471 1.1 mrg param.name = "DC_BDIV_QR_THRESHOLD";
1472 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_qr;
1473 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_qr;
1474 1.1 mrg param.min_size = 4;
1475 1.1 mrg one (&dc_bdiv_qr_threshold, ¶m);
1476 1.1 mrg }
1477 1.1 mrg {
1478 1.1 mrg static struct param_t param;
1479 1.1 mrg param.name = "DC_BDIV_Q_THRESHOLD";
1480 1.1 mrg param.function = speed_mpn_sbpi1_bdiv_q;
1481 1.1 mrg param.function2 = speed_mpn_dcpi1_bdiv_q;
1482 1.1 mrg param.min_size = 4;
1483 1.1 mrg one (&dc_bdiv_q_threshold, ¶m);
1484 1.1 mrg }
1485 1.1 mrg }
1486 1.1 mrg
1487 1.1 mrg void
1488 1.1 mrg tune_mu_bdiv (void)
1489 1.1 mrg {
1490 1.1 mrg s.r = 0; /* clear to make speed function do 2n/n*/
1491 1.1 mrg {
1492 1.1 mrg static struct param_t param;
1493 1.1 mrg param.name = "MU_BDIV_QR_THRESHOLD";
1494 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_qr;
1495 1.1 mrg param.function2 = speed_mpn_mu_bdiv_qr;
1496 1.1 mrg param.min_size = 4;
1497 1.1 mrg param.max_size = 5000;
1498 1.1 mrg param.step_factor = 0.02;
1499 1.1 mrg one (&mu_bdiv_qr_threshold, ¶m);
1500 1.1 mrg }
1501 1.1 mrg {
1502 1.1 mrg static struct param_t param;
1503 1.1 mrg param.name = "MU_BDIV_Q_THRESHOLD";
1504 1.1 mrg param.function = speed_mpn_dcpi1_bdiv_q;
1505 1.1 mrg param.function2 = speed_mpn_mu_bdiv_q;
1506 1.1 mrg param.min_size = 4;
1507 1.1 mrg param.max_size = 5000;
1508 1.1 mrg param.step_factor = 0.02;
1509 1.1 mrg one (&mu_bdiv_q_threshold, ¶m);
1510 1.1 mrg }
1511 1.1 mrg }
1512 1.1 mrg
1513 1.1 mrg void
1514 1.1 mrg tune_invertappr (void)
1515 1.1 mrg {
1516 1.1 mrg static struct param_t param;
1517 1.1 mrg
1518 1.1 mrg param.function = speed_mpn_ni_invertappr;
1519 1.1 mrg param.name = "INV_MULMOD_BNM1_THRESHOLD";
1520 1.1 mrg param.min_size = 4;
1521 1.1 mrg one (&inv_mulmod_bnm1_threshold, ¶m);
1522 1.1 mrg
1523 1.1 mrg param.function = speed_mpn_invertappr;
1524 1.1 mrg param.name = "INV_NEWTON_THRESHOLD";
1525 1.1 mrg param.min_size = 3;
1526 1.1 mrg one (&inv_newton_threshold, ¶m);
1527 1.1 mrg }
1528 1.1 mrg
1529 1.1 mrg void
1530 1.1 mrg tune_invert (void)
1531 1.1 mrg {
1532 1.1 mrg static struct param_t param;
1533 1.1 mrg
1534 1.1 mrg param.function = speed_mpn_invert;
1535 1.1 mrg param.name = "INV_APPR_THRESHOLD";
1536 1.1 mrg param.min_size = 3;
1537 1.1 mrg one (&inv_appr_threshold, ¶m);
1538 1.1 mrg }
1539 1.1 mrg
1540 1.1 mrg void
1541 1.1 mrg tune_binvert (void)
1542 1.1 mrg {
1543 1.1 mrg static struct param_t param;
1544 1.1 mrg
1545 1.1 mrg param.function = speed_mpn_binvert;
1546 1.1 mrg param.name = "BINV_NEWTON_THRESHOLD";
1547 1.1 mrg param.min_size = 8; /* pointless with smaller operands */
1548 1.1 mrg one (&binv_newton_threshold, ¶m);
1549 1.1 mrg }
1550 1.1 mrg
1551 1.1 mrg void
1552 1.1 mrg tune_redc (void)
1553 1.1 mrg {
1554 1.1 mrg #define TUNE_REDC_2_MAX 100
1555 1.1 mrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1556 1.1 mrg #define WANT_REDC_2 1
1557 1.1 mrg #endif
1558 1.1 mrg
1559 1.1 mrg #if WANT_REDC_2
1560 1.1 mrg {
1561 1.1 mrg static struct param_t param;
1562 1.1 mrg param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1563 1.1 mrg param.function = speed_mpn_redc_1;
1564 1.1 mrg param.function2 = speed_mpn_redc_2;
1565 1.1 mrg param.min_size = 1;
1566 1.1 mrg param.min_is_always = 1;
1567 1.1 mrg param.max_size = TUNE_REDC_2_MAX;
1568 1.1 mrg param.noprint = 1;
1569 1.1 mrg one (&redc_1_to_redc_2_threshold, ¶m);
1570 1.1 mrg }
1571 1.1 mrg {
1572 1.1 mrg static struct param_t param;
1573 1.1 mrg param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1574 1.1 mrg param.function = speed_mpn_redc_2;
1575 1.1 mrg param.function2 = speed_mpn_redc_n;
1576 1.1 mrg param.min_size = 16;
1577 1.1 mrg param.noprint = 1;
1578 1.1 mrg one (&redc_2_to_redc_n_threshold, ¶m);
1579 1.1 mrg }
1580 1.1 mrg if (redc_1_to_redc_2_threshold >= TUNE_REDC_2_MAX - 1)
1581 1.1 mrg {
1582 1.1 mrg /* Disable REDC_2. This is not supposed to happen. */
1583 1.1 mrg print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1584 1.1 mrg print_define_remark ("REDC_2_TO_REDC_N_THRESHOLD", 0, "anomaly: never REDC_2");
1585 1.1 mrg }
1586 1.1 mrg else
1587 1.1 mrg {
1588 1.1 mrg print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1589 1.1 mrg print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1590 1.1 mrg }
1591 1.1 mrg #else
1592 1.1 mrg {
1593 1.1 mrg static struct param_t param;
1594 1.1 mrg param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1595 1.1 mrg param.function = speed_mpn_redc_1;
1596 1.1 mrg param.function2 = speed_mpn_redc_n;
1597 1.1 mrg param.min_size = 16;
1598 1.1 mrg one (&redc_1_to_redc_n_threshold, ¶m);
1599 1.1 mrg }
1600 1.1 mrg #endif
1601 1.1 mrg }
1602 1.1 mrg
1603 1.1 mrg void
1604 1.1 mrg tune_matrix22_mul (void)
1605 1.1 mrg {
1606 1.1 mrg static struct param_t param;
1607 1.1 mrg param.name = "MATRIX22_STRASSEN_THRESHOLD";
1608 1.1 mrg param.function = speed_mpn_matrix22_mul;
1609 1.1 mrg param.min_size = 2;
1610 1.1 mrg one (&matrix22_strassen_threshold, ¶m);
1611 1.1 mrg }
1612 1.1 mrg
1613 1.1 mrg void
1614 1.1 mrg tune_hgcd (void)
1615 1.1 mrg {
1616 1.1 mrg static struct param_t param;
1617 1.1 mrg param.name = "HGCD_THRESHOLD";
1618 1.1 mrg param.function = speed_mpn_hgcd;
1619 1.1 mrg /* We seem to get strange results for small sizes */
1620 1.1 mrg param.min_size = 30;
1621 1.1 mrg one (&hgcd_threshold, ¶m);
1622 1.1 mrg }
1623 1.1 mrg
1624 1.1 mrg void
1625 1.1 mrg tune_gcd_dc (void)
1626 1.1 mrg {
1627 1.1 mrg static struct param_t param;
1628 1.1 mrg param.name = "GCD_DC_THRESHOLD";
1629 1.1 mrg param.function = speed_mpn_gcd;
1630 1.1 mrg param.min_size = hgcd_threshold;
1631 1.1 mrg param.max_size = 3000;
1632 1.1 mrg param.step_factor = 0.02;
1633 1.1 mrg one (&gcd_dc_threshold, ¶m);
1634 1.1 mrg }
1635 1.1 mrg
1636 1.1 mrg void
1637 1.1 mrg tune_gcdext_dc (void)
1638 1.1 mrg {
1639 1.1 mrg static struct param_t param;
1640 1.1 mrg param.name = "GCDEXT_DC_THRESHOLD";
1641 1.1 mrg param.function = speed_mpn_gcdext;
1642 1.1 mrg param.min_size = hgcd_threshold;
1643 1.1 mrg param.max_size = 3000;
1644 1.1 mrg param.step_factor = 0.02;
1645 1.1 mrg one (&gcdext_dc_threshold, ¶m);
1646 1.1 mrg }
1647 1.1 mrg
1648 1.1 mrg
1649 1.1 mrg /* size_extra==1 reflects the fact that with high<divisor one division is
1650 1.1 mrg always skipped. Forcing high<divisor while testing ensures consistency
1651 1.1 mrg while stepping through sizes, ie. that size-1 divides will be done each
1652 1.1 mrg time.
1653 1.1 mrg
1654 1.1 mrg min_size==2 and min_is_always are used so that if plain division is only
1655 1.1 mrg better at size==1 then don't bother including that code just for that
1656 1.1 mrg case, instead go with preinv always and get a size saving. */
1657 1.1 mrg
1658 1.1 mrg #define DIV_1_PARAMS \
1659 1.1 mrg param.check_size = 256; \
1660 1.1 mrg param.min_size = 2; \
1661 1.1 mrg param.min_is_always = 1; \
1662 1.1 mrg param.data_high = DATA_HIGH_LT_R; \
1663 1.1 mrg param.size_extra = 1; \
1664 1.1 mrg param.stop_factor = 2.0;
1665 1.1 mrg
1666 1.1 mrg
1667 1.1 mrg double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *));
1668 1.1 mrg
1669 1.1 mrg void
1670 1.1 mrg tune_divrem_1 (void)
1671 1.1 mrg {
1672 1.1 mrg /* plain version by default */
1673 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1674 1.1 mrg
1675 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
1676 1.1 mrg the results in the .asm file, there's no need for such thresholds to
1677 1.1 mrg appear in gmp-mparam.h. */
1678 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1)
1679 1.1 mrg return;
1680 1.1 mrg
1681 1.1 mrg if (GMP_NAIL_BITS != 0)
1682 1.1 mrg {
1683 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1684 1.1 mrg "no preinv with nails");
1685 1.1 mrg print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1686 1.1 mrg "no preinv with nails");
1687 1.1 mrg return;
1688 1.1 mrg }
1689 1.1 mrg
1690 1.1 mrg if (UDIV_PREINV_ALWAYS)
1691 1.1 mrg {
1692 1.1 mrg print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1693 1.1 mrg print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1694 1.1 mrg return;
1695 1.1 mrg }
1696 1.1 mrg
1697 1.1 mrg tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1698 1.1 mrg
1699 1.1 mrg /* Tune for the integer part of mpn_divrem_1. This will very possibly be
1700 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part
1701 1.1 mrg is more important. */
1702 1.1 mrg {
1703 1.1 mrg static struct param_t param;
1704 1.1 mrg param.name = "DIVREM_1_NORM_THRESHOLD";
1705 1.1 mrg DIV_1_PARAMS;
1706 1.1 mrg s.r = randlimb_norm ();
1707 1.1 mrg param.function = speed_mpn_divrem_1_tune;
1708 1.1 mrg one (&divrem_1_norm_threshold, ¶m);
1709 1.1 mrg }
1710 1.1 mrg {
1711 1.1 mrg static struct param_t param;
1712 1.1 mrg param.name = "DIVREM_1_UNNORM_THRESHOLD";
1713 1.1 mrg DIV_1_PARAMS;
1714 1.1 mrg s.r = randlimb_half ();
1715 1.1 mrg param.function = speed_mpn_divrem_1_tune;
1716 1.1 mrg one (&divrem_1_unnorm_threshold, ¶m);
1717 1.1 mrg }
1718 1.1 mrg }
1719 1.1 mrg
1720 1.1 mrg
1721 1.1 mrg void
1722 1.1 mrg tune_mod_1 (void)
1723 1.1 mrg {
1724 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
1725 1.1 mrg the results in the .asm file, there's no need for such thresholds to
1726 1.1 mrg appear in gmp-mparam.h. */
1727 1.1 mrg if (HAVE_NATIVE_mpn_mod_1)
1728 1.1 mrg return;
1729 1.1 mrg
1730 1.1 mrg if (GMP_NAIL_BITS != 0)
1731 1.1 mrg {
1732 1.1 mrg print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1733 1.1 mrg "no preinv with nails");
1734 1.1 mrg print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1735 1.1 mrg "no preinv with nails");
1736 1.1 mrg return;
1737 1.1 mrg }
1738 1.1 mrg
1739 1.1 mrg if (UDIV_PREINV_ALWAYS)
1740 1.1 mrg {
1741 1.1 mrg print_define ("MOD_1_NORM_THRESHOLD", 0L);
1742 1.1 mrg print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1743 1.1 mrg }
1744 1.1 mrg else
1745 1.1 mrg {
1746 1.1 mrg {
1747 1.1 mrg static struct param_t param;
1748 1.1 mrg param.name = "MOD_1_NORM_THRESHOLD";
1749 1.1 mrg DIV_1_PARAMS;
1750 1.1 mrg s.r = randlimb_norm ();
1751 1.1 mrg param.function = speed_mpn_mod_1_tune;
1752 1.1 mrg one (&mod_1_norm_threshold, ¶m);
1753 1.1 mrg }
1754 1.1 mrg {
1755 1.1 mrg static struct param_t param;
1756 1.1 mrg param.name = "MOD_1_UNNORM_THRESHOLD";
1757 1.1 mrg DIV_1_PARAMS;
1758 1.1 mrg s.r = randlimb_half ();
1759 1.1 mrg param.function = speed_mpn_mod_1_tune;
1760 1.1 mrg one (&mod_1_unnorm_threshold, ¶m);
1761 1.1 mrg }
1762 1.1 mrg }
1763 1.1 mrg {
1764 1.1 mrg static struct param_t param;
1765 1.1 mrg
1766 1.1 mrg param.check_size = 256;
1767 1.1 mrg
1768 1.1 mrg s.r = randlimb_norm ();
1769 1.1 mrg param.function = speed_mpn_mod_1_tune;
1770 1.1 mrg
1771 1.1 mrg param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
1772 1.1 mrg param.min_size = 2;
1773 1.1 mrg one (&mod_1n_to_mod_1_1_threshold, ¶m);
1774 1.1 mrg }
1775 1.1 mrg {
1776 1.1 mrg static struct param_t param;
1777 1.1 mrg
1778 1.1 mrg param.check_size = 256;
1779 1.1 mrg
1780 1.1 mrg s.r = randlimb_norm () / 5;
1781 1.1 mrg param.function = speed_mpn_mod_1_tune;
1782 1.1 mrg param.noprint = 1;
1783 1.1 mrg
1784 1.1 mrg param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
1785 1.1 mrg param.min_size = 2;
1786 1.1 mrg one (&mod_1u_to_mod_1_1_threshold, ¶m);
1787 1.1 mrg
1788 1.1 mrg param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
1789 1.1 mrg param.min_size = mod_1u_to_mod_1_1_threshold;
1790 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m);
1791 1.1 mrg
1792 1.1 mrg if (mod_1u_to_mod_1_1_threshold + 2 >= mod_1_1_to_mod_1_2_threshold)
1793 1.1 mrg {
1794 1.1 mrg /* Disable mod_1_1, mod_1_2 is always faster. Measure when to switch
1795 1.1 mrg (from mod_1_unnorm) to mod_1_2. */
1796 1.1 mrg mod_1_1_to_mod_1_2_threshold = 0;
1797 1.1 mrg
1798 1.1 mrg /* This really measures mod_1u -> mod_1_2 */
1799 1.1 mrg param.min_size = 1;
1800 1.1 mrg one (&mod_1u_to_mod_1_1_threshold, ¶m);
1801 1.1 mrg }
1802 1.1 mrg print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
1803 1.1 mrg
1804 1.1 mrg param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
1805 1.1 mrg param.min_size = mod_1_1_to_mod_1_2_threshold;
1806 1.1 mrg one (&mod_1_2_to_mod_1_4_threshold, ¶m);
1807 1.1 mrg
1808 1.1 mrg if (mod_1_1_to_mod_1_2_threshold + 2 >= mod_1_2_to_mod_1_4_threshold)
1809 1.1 mrg {
1810 1.1 mrg /* Disable mod_1_2, mod_1_4 is always faster. Measure when to switch
1811 1.1 mrg (from mod_1_unnorm or mod_1_1) to mod_1_4. */
1812 1.1 mrg mod_1_2_to_mod_1_4_threshold = 0;
1813 1.1 mrg
1814 1.1 mrg param.min_size = 1;
1815 1.1 mrg one (&mod_1_1_to_mod_1_2_threshold, ¶m);
1816 1.1 mrg }
1817 1.1 mrg print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, NULL);
1818 1.1 mrg print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, NULL);
1819 1.1 mrg }
1820 1.1 mrg
1821 1.1 mrg {
1822 1.1 mrg static struct param_t param;
1823 1.1 mrg
1824 1.1 mrg param.check_size = 256;
1825 1.1 mrg
1826 1.1 mrg param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
1827 1.1 mrg s.r = randlimb_norm ();
1828 1.1 mrg param.function = speed_mpn_preinv_mod_1;
1829 1.1 mrg param.function2 = speed_mpn_mod_1_tune;
1830 1.1 mrg param.min_size = 1;
1831 1.1 mrg one (&preinv_mod_1_to_mod_1_threshold, ¶m);
1832 1.1 mrg }
1833 1.1 mrg }
1834 1.1 mrg
1835 1.1 mrg
1836 1.1 mrg /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1837 1.1 mrg imply that udiv_qrnnd_preinv is worth using, but it seems most
1838 1.1 mrg straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1839 1.1 mrg directly. */
1840 1.1 mrg
1841 1.1 mrg void
1842 1.1 mrg tune_preinv_divrem_1 (void)
1843 1.1 mrg {
1844 1.1 mrg static struct param_t param;
1845 1.1 mrg speed_function_t divrem_1;
1846 1.1 mrg const char *divrem_1_name;
1847 1.1 mrg double t1, t2;
1848 1.1 mrg
1849 1.1 mrg if (GMP_NAIL_BITS != 0)
1850 1.1 mrg {
1851 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1852 1.1 mrg return;
1853 1.1 mrg }
1854 1.1 mrg
1855 1.1 mrg /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1856 1.1 mrg it's faster than mpn_divrem_1. */
1857 1.1 mrg if (HAVE_NATIVE_mpn_preinv_divrem_1)
1858 1.1 mrg {
1859 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1860 1.1 mrg return;
1861 1.1 mrg }
1862 1.1 mrg
1863 1.1 mrg /* If udiv_qrnnd_preinv is the only division method then of course
1864 1.1 mrg mpn_preinv_divrem_1 should be used. */
1865 1.1 mrg if (UDIV_PREINV_ALWAYS)
1866 1.1 mrg {
1867 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1868 1.1 mrg return;
1869 1.1 mrg }
1870 1.1 mrg
1871 1.1 mrg /* If we've got an assembler version of mpn_divrem_1, then compare against
1872 1.1 mrg that, not the mpn_divrem_1_div generic C. */
1873 1.1 mrg if (HAVE_NATIVE_mpn_divrem_1)
1874 1.1 mrg {
1875 1.1 mrg divrem_1 = speed_mpn_divrem_1;
1876 1.1 mrg divrem_1_name = "mpn_divrem_1";
1877 1.1 mrg }
1878 1.1 mrg else
1879 1.1 mrg {
1880 1.1 mrg divrem_1 = speed_mpn_divrem_1_div;
1881 1.1 mrg divrem_1_name = "mpn_divrem_1_div";
1882 1.1 mrg }
1883 1.1 mrg
1884 1.1 mrg param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1885 1.1 mrg s.size = 200; /* generous but not too big */
1886 1.1 mrg /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
1887 1.1 mrg since in general that's probably most common, though in fact for a
1888 1.1 mrg 64-bit limb mp_bases[10].big_base is normalized. */
1889 1.1 mrg s.r = urandom() & (GMP_NUMB_MASK >> 4);
1890 1.1 mrg if (s.r == 0) s.r = 123;
1891 1.1 mrg
1892 1.1 mrg t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s);
1893 1.1 mrg t2 = tuneup_measure (divrem_1, ¶m, &s);
1894 1.1 mrg if (t1 == -1.0 || t2 == -1.0)
1895 1.1 mrg {
1896 1.1 mrg printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1897 1.1 mrg divrem_1_name, (long) s.size);
1898 1.1 mrg abort ();
1899 1.1 mrg }
1900 1.1 mrg if (option_trace >= 1)
1901 1.1 mrg printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1902 1.1 mrg (long) s.size, t1, divrem_1_name, t2);
1903 1.1 mrg
1904 1.1 mrg print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1905 1.1 mrg }
1906 1.1 mrg
1907 1.1 mrg
1908 1.1 mrg
1909 1.1 mrg void
1910 1.1 mrg tune_divrem_2 (void)
1911 1.1 mrg {
1912 1.1 mrg static struct param_t param;
1913 1.1 mrg
1914 1.1 mrg /* No support for tuning native assembler code, do that by hand and put
1915 1.1 mrg the results in the .asm file, and there's no need for such thresholds
1916 1.1 mrg to appear in gmp-mparam.h. */
1917 1.1 mrg if (HAVE_NATIVE_mpn_divrem_2)
1918 1.1 mrg return;
1919 1.1 mrg
1920 1.1 mrg if (GMP_NAIL_BITS != 0)
1921 1.1 mrg {
1922 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1923 1.1 mrg "no preinv with nails");
1924 1.1 mrg return;
1925 1.1 mrg }
1926 1.1 mrg
1927 1.1 mrg if (UDIV_PREINV_ALWAYS)
1928 1.1 mrg {
1929 1.1 mrg print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1930 1.1 mrg return;
1931 1.1 mrg }
1932 1.1 mrg
1933 1.1 mrg /* Tune for the integer part of mpn_divrem_2. This will very possibly be
1934 1.1 mrg a bit out for the fractional part, but that's too bad, the integer part
1935 1.1 mrg is more important.
1936 1.1 mrg
1937 1.1 mrg min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1938 1.1 mrg code space if plain division is better only at size==2 or size==3. */
1939 1.1 mrg param.name = "DIVREM_2_THRESHOLD";
1940 1.1 mrg param.check_size = 256;
1941 1.1 mrg param.min_size = 4;
1942 1.1 mrg param.min_is_always = 1;
1943 1.1 mrg param.size_extra = 2; /* does qsize==nsize-2 divisions */
1944 1.1 mrg param.stop_factor = 2.0;
1945 1.1 mrg
1946 1.1 mrg s.r = randlimb_norm ();
1947 1.1 mrg param.function = speed_mpn_divrem_2;
1948 1.1 mrg one (&divrem_2_threshold, ¶m);
1949 1.1 mrg }
1950 1.1 mrg
1951 1.1 mrg
1952 1.1 mrg /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1953 1.1 mrg tune for that. Its speed can differ on odd or even divisor, so take an
1954 1.1 mrg average threshold for the two.
1955 1.1 mrg
1956 1.1 mrg mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1957 1.1 mrg might not vary that way, but don't test this since high<divisor isn't
1958 1.1 mrg expected to occur often with small divisors. */
1959 1.1 mrg
1960 1.1 mrg void
1961 1.1 mrg tune_divexact_1 (void)
1962 1.1 mrg {
1963 1.1 mrg static struct param_t param;
1964 1.1 mrg mp_size_t thresh[2], average;
1965 1.1 mrg int low, i;
1966 1.1 mrg
1967 1.1 mrg /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1968 1.1 mrg full mpn_divrem_1. */
1969 1.1 mrg if (HAVE_NATIVE_mpn_divexact_1)
1970 1.1 mrg {
1971 1.1 mrg print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1972 1.1 mrg return;
1973 1.1 mrg }
1974 1.1 mrg
1975 1.1 mrg ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1976 1.1 mrg
1977 1.1 mrg param.name = "DIVEXACT_1_THRESHOLD";
1978 1.1 mrg param.data_high = DATA_HIGH_GE_R;
1979 1.1 mrg param.check_size = 256;
1980 1.1 mrg param.min_size = 2;
1981 1.1 mrg param.stop_factor = 1.5;
1982 1.1 mrg param.function = tuned_speed_mpn_divrem_1;
1983 1.1 mrg param.function2 = speed_mpn_divexact_1;
1984 1.1 mrg param.noprint = 1;
1985 1.1 mrg
1986 1.1 mrg print_define_start (param.name);
1987 1.1 mrg
1988 1.1 mrg for (low = 0; low <= 1; low++)
1989 1.1 mrg {
1990 1.1 mrg s.r = randlimb_half();
1991 1.1 mrg if (low == 0)
1992 1.1 mrg s.r |= 1;
1993 1.1 mrg else
1994 1.1 mrg s.r &= ~CNST_LIMB(7);
1995 1.1 mrg
1996 1.1 mrg one (&thresh[low], ¶m);
1997 1.1 mrg if (option_trace)
1998 1.1 mrg printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1999 1.1 mrg
2000 1.1 mrg if (thresh[low] == MP_SIZE_T_MAX)
2001 1.1 mrg {
2002 1.1 mrg average = MP_SIZE_T_MAX;
2003 1.1 mrg goto divexact_1_done;
2004 1.1 mrg }
2005 1.1 mrg }
2006 1.1 mrg
2007 1.1 mrg if (option_trace)
2008 1.1 mrg {
2009 1.1 mrg printf ("average of:");
2010 1.1 mrg for (i = 0; i < numberof(thresh); i++)
2011 1.1 mrg printf (" %ld", (long) thresh[i]);
2012 1.1 mrg printf ("\n");
2013 1.1 mrg }
2014 1.1 mrg
2015 1.1 mrg average = 0;
2016 1.1 mrg for (i = 0; i < numberof(thresh); i++)
2017 1.1 mrg average += thresh[i];
2018 1.1 mrg average /= numberof(thresh);
2019 1.1 mrg
2020 1.1 mrg /* If divexact turns out to be better as early as 3 limbs, then use it
2021 1.1 mrg always, so as to reduce code size and conditional jumps. */
2022 1.1 mrg if (average <= 3)
2023 1.1 mrg average = 0;
2024 1.1 mrg
2025 1.1 mrg divexact_1_done:
2026 1.1 mrg print_define_end (param.name, average);
2027 1.1 mrg }
2028 1.1 mrg
2029 1.1 mrg
2030 1.1 mrg /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2031 1.1 mrg same as mpn_mod_1, but this might not be true of an assembler
2032 1.1 mrg implementation. The threshold used is an average based on data where a
2033 1.1 mrg divide can be skipped and where it can't.
2034 1.1 mrg
2035 1.1 mrg If modexact turns out to be better as early as 3 limbs, then use it
2036 1.1 mrg always, so as to reduce code size and conditional jumps. */
2037 1.1 mrg
2038 1.1 mrg void
2039 1.1 mrg tune_modexact_1_odd (void)
2040 1.1 mrg {
2041 1.1 mrg static struct param_t param;
2042 1.1 mrg mp_size_t thresh_lt, thresh_ge, average;
2043 1.1 mrg
2044 1.1 mrg #if 0
2045 1.1 mrg /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2046 1.1 mrg of a full mpn_mod_1. */
2047 1.1 mrg if (HAVE_NATIVE_mpn_modexact_1_odd)
2048 1.1 mrg {
2049 1.1 mrg print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2050 1.1 mrg return;
2051 1.1 mrg }
2052 1.1 mrg #endif
2053 1.1 mrg
2054 1.1 mrg param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2055 1.1 mrg param.check_size = 256;
2056 1.1 mrg param.min_size = 2;
2057 1.1 mrg param.stop_factor = 1.5;
2058 1.1 mrg param.function = speed_mpn_modexact_1c_odd;
2059 1.1 mrg param.function2 = speed_mpn_mod_1_tune;
2060 1.1 mrg param.noprint = 1;
2061 1.1 mrg s.r = randlimb_half () | 1;
2062 1.1 mrg
2063 1.1 mrg print_define_start (param.name);
2064 1.1 mrg
2065 1.1 mrg param.data_high = DATA_HIGH_LT_R;
2066 1.1 mrg one (&thresh_lt, ¶m);
2067 1.1 mrg if (option_trace)
2068 1.1 mrg printf ("lt thresh %ld\n", (long) thresh_lt);
2069 1.1 mrg
2070 1.1 mrg average = thresh_lt;
2071 1.1 mrg if (thresh_lt != MP_SIZE_T_MAX)
2072 1.1 mrg {
2073 1.1 mrg param.data_high = DATA_HIGH_GE_R;
2074 1.1 mrg one (&thresh_ge, ¶m);
2075 1.1 mrg if (option_trace)
2076 1.1 mrg printf ("ge thresh %ld\n", (long) thresh_ge);
2077 1.1 mrg
2078 1.1 mrg if (thresh_ge != MP_SIZE_T_MAX)
2079 1.1 mrg {
2080 1.1 mrg average = (thresh_ge + thresh_lt) / 2;
2081 1.1 mrg if (thresh_ge <= 3)
2082 1.1 mrg average = 0;
2083 1.1 mrg }
2084 1.1 mrg }
2085 1.1 mrg
2086 1.1 mrg print_define_end (param.name, average);
2087 1.1 mrg }
2088 1.1 mrg
2089 1.1 mrg
2090 1.1 mrg void
2091 1.1 mrg tune_jacobi_base (void)
2092 1.1 mrg {
2093 1.1 mrg static struct param_t param;
2094 1.1 mrg double t1, t2, t3;
2095 1.1 mrg int method;
2096 1.1 mrg
2097 1.1 mrg s.size = GMP_LIMB_BITS * 3 / 4;
2098 1.1 mrg
2099 1.1 mrg t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s);
2100 1.1 mrg if (option_trace >= 1)
2101 1.1 mrg printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
2102 1.1 mrg
2103 1.1 mrg t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s);
2104 1.1 mrg if (option_trace >= 1)
2105 1.1 mrg printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
2106 1.1 mrg
2107 1.1 mrg t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s);
2108 1.1 mrg if (option_trace >= 1)
2109 1.1 mrg printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
2110 1.1 mrg
2111 1.1 mrg if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
2112 1.1 mrg {
2113 1.1 mrg printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
2114 1.1 mrg (long) s.size);
2115 1.1 mrg abort ();
2116 1.1 mrg }
2117 1.1 mrg
2118 1.1 mrg if (t1 < t2 && t1 < t3)
2119 1.1 mrg method = 1;
2120 1.1 mrg else if (t2 < t3)
2121 1.1 mrg method = 2;
2122 1.1 mrg else
2123 1.1 mrg method = 3;
2124 1.1 mrg
2125 1.1 mrg print_define ("JACOBI_BASE_METHOD", method);
2126 1.1 mrg }
2127 1.1 mrg
2128 1.1 mrg
2129 1.1 mrg void
2130 1.1 mrg tune_get_str (void)
2131 1.1 mrg {
2132 1.1 mrg /* Tune for decimal, it being most common. Some rough testing suggests
2133 1.1 mrg other bases are different, but not by very much. */
2134 1.1 mrg s.r = 10;
2135 1.1 mrg {
2136 1.1 mrg static struct param_t param;
2137 1.1 mrg GET_STR_PRECOMPUTE_THRESHOLD = 0;
2138 1.1 mrg param.name = "GET_STR_DC_THRESHOLD";
2139 1.1 mrg param.function = speed_mpn_get_str;
2140 1.1 mrg param.min_size = 4;
2141 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT;
2142 1.1 mrg one (&get_str_dc_threshold, ¶m);
2143 1.1 mrg }
2144 1.1 mrg {
2145 1.1 mrg static struct param_t param;
2146 1.1 mrg param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2147 1.1 mrg param.function = speed_mpn_get_str;
2148 1.1 mrg param.min_size = GET_STR_DC_THRESHOLD;
2149 1.1 mrg param.max_size = GET_STR_THRESHOLD_LIMIT;
2150 1.1 mrg one (&get_str_precompute_threshold, ¶m);
2151 1.1 mrg }
2152 1.1 mrg }
2153 1.1 mrg
2154 1.1 mrg
2155 1.1 mrg double
2156 1.1 mrg speed_mpn_pre_set_str (struct speed_params *s)
2157 1.1 mrg {
2158 1.1 mrg unsigned char *str;
2159 1.1 mrg mp_ptr wp;
2160 1.1 mrg mp_size_t wn;
2161 1.1 mrg unsigned i;
2162 1.1 mrg int base;
2163 1.1 mrg double t;
2164 1.1 mrg mp_ptr powtab_mem, tp;
2165 1.1 mrg powers_t powtab[GMP_LIMB_BITS];
2166 1.1 mrg mp_size_t un;
2167 1.1 mrg int chars_per_limb;
2168 1.1 mrg TMP_DECL;
2169 1.1 mrg
2170 1.1 mrg SPEED_RESTRICT_COND (s->size >= 1);
2171 1.1 mrg
2172 1.1 mrg base = s->r == 0 ? 10 : s->r;
2173 1.1 mrg SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2174 1.1 mrg
2175 1.1 mrg TMP_MARK;
2176 1.1 mrg
2177 1.1 mrg str = TMP_ALLOC (s->size);
2178 1.1 mrg for (i = 0; i < s->size; i++)
2179 1.1 mrg str[i] = s->xp[i] % base;
2180 1.1 mrg
2181 1.1 mrg wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
2182 1.1 mrg / GMP_LIMB_BITS + 2;
2183 1.1 mrg SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2184 1.1 mrg
2185 1.1 mrg /* use this during development to check wn is big enough */
2186 1.1 mrg /*
2187 1.1 mrg ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2188 1.1 mrg */
2189 1.1 mrg
2190 1.1 mrg speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
2191 1.1 mrg speed_operand_dst (s, wp, wn);
2192 1.1 mrg speed_cache_fill (s);
2193 1.1 mrg
2194 1.1 mrg chars_per_limb = mp_bases[base].chars_per_limb;
2195 1.1 mrg un = s->size / chars_per_limb + 1;
2196 1.1 mrg powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
2197 1.1 mrg mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
2198 1.1 mrg tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2199 1.1 mrg
2200 1.1 mrg speed_starttime ();
2201 1.1 mrg i = s->reps;
2202 1.1 mrg do
2203 1.1 mrg {
2204 1.1 mrg mpn_pre_set_str (wp, str, s->size, powtab, tp);
2205 1.1 mrg }
2206 1.1 mrg while (--i != 0);
2207 1.1 mrg t = speed_endtime ();
2208 1.1 mrg
2209 1.1 mrg TMP_FREE;
2210 1.1 mrg return t;
2211 1.1 mrg }
2212 1.1 mrg
2213 1.1 mrg void
2214 1.1 mrg tune_set_str (void)
2215 1.1 mrg {
2216 1.1 mrg s.r = 10; /* decimal */
2217 1.1 mrg {
2218 1.1 mrg static struct param_t param;
2219 1.1 mrg SET_STR_PRECOMPUTE_THRESHOLD = 0;
2220 1.1 mrg param.step_factor = 0.01;
2221 1.1 mrg param.name = "SET_STR_DC_THRESHOLD";
2222 1.1 mrg param.function = speed_mpn_pre_set_str;
2223 1.1 mrg param.min_size = 100;
2224 1.1 mrg param.max_size = 50000;
2225 1.1 mrg one (&set_str_dc_threshold, ¶m);
2226 1.1 mrg }
2227 1.1 mrg {
2228 1.1 mrg static struct param_t param;
2229 1.1 mrg param.step_factor = 0.02;
2230 1.1 mrg param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2231 1.1 mrg param.function = speed_mpn_set_str;
2232 1.1 mrg param.min_size = SET_STR_DC_THRESHOLD;
2233 1.1 mrg param.max_size = 100000;
2234 1.1 mrg one (&set_str_precompute_threshold, ¶m);
2235 1.1 mrg }
2236 1.1 mrg }
2237 1.1 mrg
2238 1.1 mrg
2239 1.1 mrg void
2240 1.1 mrg tune_fft_mul (void)
2241 1.1 mrg {
2242 1.1 mrg static struct fft_param_t param;
2243 1.1 mrg
2244 1.1 mrg if (option_fft_max_size == 0)
2245 1.1 mrg return;
2246 1.1 mrg
2247 1.1 mrg param.table_name = "MUL_FFT_TABLE3";
2248 1.1 mrg param.threshold_name = "MUL_FFT_THRESHOLD";
2249 1.1 mrg param.p_threshold = &mul_fft_threshold;
2250 1.1 mrg param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2251 1.1 mrg param.p_modf_threshold = &mul_fft_modf_threshold;
2252 1.1 mrg param.first_size = MUL_TOOM33_THRESHOLD / 2;
2253 1.1 mrg param.max_size = option_fft_max_size;
2254 1.1 mrg param.function = speed_mpn_fft_mul;
2255 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft;
2256 1.1 mrg param.mul_function = speed_mpn_mul_n;
2257 1.1 mrg param.sqr = 0;
2258 1.1 mrg fft (¶m);
2259 1.1 mrg }
2260 1.1 mrg
2261 1.1 mrg
2262 1.1 mrg void
2263 1.1 mrg tune_fft_sqr (void)
2264 1.1 mrg {
2265 1.1 mrg static struct fft_param_t param;
2266 1.1 mrg
2267 1.1 mrg if (option_fft_max_size == 0)
2268 1.1 mrg return;
2269 1.1 mrg
2270 1.1 mrg param.table_name = "SQR_FFT_TABLE3";
2271 1.1 mrg param.threshold_name = "SQR_FFT_THRESHOLD";
2272 1.1 mrg param.p_threshold = &sqr_fft_threshold;
2273 1.1 mrg param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2274 1.1 mrg param.p_modf_threshold = &sqr_fft_modf_threshold;
2275 1.1 mrg param.first_size = SQR_TOOM3_THRESHOLD / 2;
2276 1.1 mrg param.max_size = option_fft_max_size;
2277 1.1 mrg param.function = speed_mpn_fft_sqr;
2278 1.1 mrg param.mul_modf_function = speed_mpn_mul_fft_sqr;
2279 1.1 mrg param.mul_function = speed_mpn_sqr;
2280 1.1 mrg param.sqr = 1;
2281 1.1 mrg fft (¶m);
2282 1.1 mrg }
2283 1.1 mrg
2284 1.1 mrg void
2285 1.1 mrg all (void)
2286 1.1 mrg {
2287 1.1 mrg time_t start_time, end_time;
2288 1.1 mrg TMP_DECL;
2289 1.1 mrg
2290 1.1 mrg TMP_MARK;
2291 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2292 1.1 mrg SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2293 1.1 mrg
2294 1.1 mrg mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2295 1.1 mrg mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2296 1.1 mrg
2297 1.1 mrg fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2298 1.1 mrg
2299 1.1 mrg speed_time_init ();
2300 1.1 mrg fprintf (stderr, "Using: %s\n", speed_time_string);
2301 1.1 mrg
2302 1.1 mrg fprintf (stderr, "speed_precision %d", speed_precision);
2303 1.1 mrg if (speed_unittime == 1.0)
2304 1.1 mrg fprintf (stderr, ", speed_unittime 1 cycle");
2305 1.1 mrg else
2306 1.1 mrg fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2307 1.1 mrg if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2308 1.1 mrg fprintf (stderr, ", CPU freq unknown\n");
2309 1.1 mrg else
2310 1.1 mrg fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2311 1.1 mrg
2312 1.1 mrg fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2313 1.1 mrg DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2314 1.1 mrg fprintf (stderr, "\n");
2315 1.1 mrg
2316 1.1 mrg time (&start_time);
2317 1.1 mrg {
2318 1.1 mrg struct tm *tp;
2319 1.1 mrg tp = localtime (&start_time);
2320 1.1 mrg printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2321 1.1 mrg tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2322 1.1 mrg
2323 1.1 mrg #ifdef __GNUC__
2324 1.1 mrg /* gcc sub-minor version doesn't seem to come through as a define */
2325 1.1 mrg printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2326 1.1 mrg #define PRINTED_COMPILER
2327 1.1 mrg #endif
2328 1.1 mrg #if defined (__SUNPRO_C)
2329 1.1 mrg printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2330 1.1 mrg #define PRINTED_COMPILER
2331 1.1 mrg #endif
2332 1.1 mrg #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2333 1.1 mrg /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2334 1.1 mrg printf ("MIPSpro C %d.%d.%d */\n",
2335 1.1 mrg _COMPILER_VERSION / 100,
2336 1.1 mrg _COMPILER_VERSION / 10 % 10,
2337 1.1 mrg _COMPILER_VERSION % 10);
2338 1.1 mrg #define PRINTED_COMPILER
2339 1.1 mrg #endif
2340 1.1 mrg #if defined (__DECC) && defined (__DECC_VER)
2341 1.1 mrg printf ("DEC C %d */\n", __DECC_VER);
2342 1.1 mrg #define PRINTED_COMPILER
2343 1.1 mrg #endif
2344 1.1 mrg #if ! defined (PRINTED_COMPILER)
2345 1.1 mrg printf ("system compiler */\n");
2346 1.1 mrg #endif
2347 1.1 mrg }
2348 1.1 mrg printf ("\n");
2349 1.1 mrg
2350 1.1 mrg tune_divrem_1 ();
2351 1.1 mrg tune_mod_1 ();
2352 1.1 mrg tune_preinv_divrem_1 ();
2353 1.1 mrg tune_divrem_2 ();
2354 1.1 mrg tune_divexact_1 ();
2355 1.1 mrg tune_modexact_1_odd ();
2356 1.1 mrg printf("\n");
2357 1.1 mrg
2358 1.1 mrg tune_mul_n ();
2359 1.1 mrg printf("\n");
2360 1.1 mrg
2361 1.1 mrg tune_mul ();
2362 1.1 mrg printf("\n");
2363 1.1 mrg
2364 1.1 mrg tune_sqr ();
2365 1.1 mrg printf("\n");
2366 1.1 mrg
2367 1.1 mrg tune_mulmod_bnm1 ();
2368 1.1 mrg tune_sqrmod_bnm1 ();
2369 1.1 mrg printf("\n");
2370 1.1 mrg
2371 1.1 mrg tune_fft_mul ();
2372 1.1 mrg printf("\n");
2373 1.1 mrg
2374 1.1 mrg tune_fft_sqr ();
2375 1.1 mrg printf ("\n");
2376 1.1 mrg
2377 1.1 mrg tune_mullo ();
2378 1.1 mrg printf("\n");
2379 1.1 mrg
2380 1.1 mrg tune_dc_div ();
2381 1.1 mrg tune_dc_bdiv ();
2382 1.1 mrg
2383 1.1 mrg printf("\n");
2384 1.1 mrg tune_invertappr ();
2385 1.1 mrg tune_invert ();
2386 1.1 mrg printf("\n");
2387 1.1 mrg
2388 1.1 mrg tune_binvert ();
2389 1.1 mrg tune_redc ();
2390 1.1 mrg printf("\n");
2391 1.1 mrg
2392 1.1 mrg tune_mu_div ();
2393 1.1 mrg tune_mu_bdiv ();
2394 1.1 mrg printf("\n");
2395 1.1 mrg
2396 1.1 mrg tune_matrix22_mul ();
2397 1.1 mrg tune_hgcd ();
2398 1.1 mrg tune_gcd_dc ();
2399 1.1 mrg tune_gcdext_dc ();
2400 1.1 mrg tune_jacobi_base ();
2401 1.1 mrg printf("\n");
2402 1.1 mrg
2403 1.1 mrg tune_get_str ();
2404 1.1 mrg tune_set_str ();
2405 1.1 mrg printf("\n");
2406 1.1 mrg
2407 1.1 mrg time (&end_time);
2408 1.1 mrg printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2409 1.1 mrg (long) (end_time - start_time));
2410 1.1 mrg
2411 1.1 mrg TMP_FREE;
2412 1.1 mrg }
2413 1.1 mrg
2414 1.1 mrg
2415 1.1 mrg int
2416 1.1 mrg main (int argc, char *argv[])
2417 1.1 mrg {
2418 1.1 mrg int opt;
2419 1.1 mrg
2420 1.1 mrg /* Unbuffered so if output is redirected to a file it isn't lost if the
2421 1.1 mrg program is killed part way through. */
2422 1.1 mrg setbuf (stdout, NULL);
2423 1.1 mrg setbuf (stderr, NULL);
2424 1.1 mrg
2425 1.1 mrg while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2426 1.1 mrg {
2427 1.1 mrg switch (opt) {
2428 1.1 mrg case 'f':
2429 1.1 mrg if (optarg[0] == 't')
2430 1.1 mrg option_fft_trace = 2;
2431 1.1 mrg else
2432 1.1 mrg option_fft_max_size = atol (optarg);
2433 1.1 mrg break;
2434 1.1 mrg case 'o':
2435 1.1 mrg speed_option_set (optarg);
2436 1.1 mrg break;
2437 1.1 mrg case 'p':
2438 1.1 mrg speed_precision = atoi (optarg);
2439 1.1 mrg break;
2440 1.1 mrg case 't':
2441 1.1 mrg option_trace++;
2442 1.1 mrg break;
2443 1.1 mrg case '?':
2444 1.1 mrg exit(1);
2445 1.1 mrg }
2446 1.1 mrg }
2447 1.1 mrg
2448 1.1 mrg all ();
2449 1.1 mrg exit (0);
2450 1.1 mrg }
2451