Home | History | Annotate | Line # | Download | only in tune
      1 /* Shared speed subroutines.
      2 
      3 Copyright 1999-2006, 2008-2017, 2019 Free Software Foundation, Inc.
      4 
      5 This file is part of the GNU MP Library.
      6 
      7 The GNU MP Library is free software; you can redistribute it and/or modify
      8 it under the terms of either:
      9 
     10   * the GNU Lesser General Public License as published by the Free
     11     Software Foundation; either version 3 of the License, or (at your
     12     option) any later version.
     13 
     14 or
     15 
     16   * the GNU General Public License as published by the Free Software
     17     Foundation; either version 2 of the License, or (at your option) any
     18     later version.
     19 
     20 or both in parallel, as here.
     21 
     22 The GNU MP Library is distributed in the hope that it will be useful, but
     23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 for more details.
     26 
     27 You should have received copies of the GNU General Public License and the
     28 GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 see https://www.gnu.org/licenses/.  */
     30 
     31 #define __GMP_NO_ATTRIBUTE_CONST_PURE
     32 
     33 #include <errno.h>
     34 #include <fcntl.h>
     35 #include <math.h>
     36 #include <stdio.h>
     37 #include <stdlib.h> /* for qsort */
     38 #include <string.h>
     39 #include <unistd.h>
     40 #if 0
     41 #include <sys/ioctl.h>
     42 #endif
     43 
     44 #include "gmp-impl.h"
     45 #include "longlong.h"
     46 
     47 #include "tests.h"
     48 #include "speed.h"
     49 
     50 
     51 int   speed_option_addrs = 0;
     52 int   speed_option_verbose = 0;
     53 int   speed_option_cycles_broken = 0;
     54 
     55 
     56 /* Provide __clz_tab even if it's not required, for the benefit of new code
     57    being tested with many.pl. */
     58 #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
     59 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
     60 #include "mp_clz_tab.c"
     61 #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
     62 #endif
     63 
     64 
/* Called by speed_cache_fill() when s->cache is 1.

   Both candidate implementations below are compiled out with "#if 0", so
   as the code stands this function has an empty body and does nothing.
   NOTE(review): the name suggests a cache write-back/invalidate was the
   intent -- confirm before relying on s->cache == 1 having any effect.  */
void
pentium_wbinvd(void)
{
  /* Disabled variant 1: ioctl on /dev/wbinvd.  The descriptor is opened on
     first use; fd == -2 means "not tried yet" and -1 means the open failed
     (reported once through perror).  */
#if 0
  {
    static int  fd = -2;

    if (fd == -2)
      {
	fd = open ("/dev/wbinvd", O_RDWR);
	if (fd == -1)
	  perror ("open /dev/wbinvd");
      }

    if (fd != -1)
      ioctl (fd, 0, 0);
  }
#endif

  /* Disabled variant 2: read through a WBINVDSIZE byte malloc'ed buffer and
     pass the byte sum to mpn_cache_fill_dummy().  The malloc result is not
     checked and the buffer initialization is itself disabled.  */
#if 0
#define WBINVDSIZE  1024*1024*2
  {
    static char  *p = NULL;
    int   i, sum;

    if (p == NULL)
      p = malloc (WBINVDSIZE);

#if 0
    for (i = 0; i < WBINVDSIZE; i++)
      p[i] = i & 0xFF;
#endif

    sum = 0;
    for (i = 0; i < WBINVDSIZE; i++)
      sum += p[i];

    mpn_cache_fill_dummy (sum);
  }
#endif
}
    106 
    107 
/* Three-way comparison of the doubles pointed to by P and Q, in the shape
   qsort() wants: 1 if *p is greater, -1 if *p is smaller, and 0 otherwise
   (equal, or unordered because one side is a NaN).  */
int
double_cmp_ptr (const double *p, const double *q)
{
  const double  lhs = *p;
  const double  rhs = *q;

  /* Each relational test yields 0 or 1, so the difference is -1, 0 or 1.  */
  return (lhs > rhs) - (lhs < rhs);
}
    115 
    116 
/* Measure the speed of a given routine.

   The routine is run with enough repetitions to make it take at least
   speed_precision * speed_unittime.  This aims to minimize the effects of a
   limited accuracy time base and the overhead of the measuring itself.

   Measurements are made looking for 4 results within TOLERANCE of each
   other (or 3 for routines taking longer than 2 seconds).  This aims to get
   an accurate reading even if some runs are bloated by interrupts or task
   switches or whatever.

   The given (*fun)() is expected to run its function "s->reps" many times
   and return the total elapsed time measured using speed_starttime() and
   speed_endtime().  If the function doesn't support the given s->size or
   s->r, -1.0 should be returned.  See the various base routines below.

   Return value:
     - the smallest per-repetition time of the agreeing group, divided by
       s->time_divisor;
     - the first per-repetition time, NOT divided by s->time_divisor, when
       speed_precision is 0;
     - -1.0 if (*fun)() returned -1.0, or if numberof(t) attempts produced
       no agreeing group (the measurements are then dumped to stderr).

   The process is aborted if (*fun)() returns 0.0 more than max_zeros times
   in total, or if the recomputed repetition count is outside 1 .. 2e9.

   s->reps, s->time_divisor, s->src_num and s->dst_num are overwritten.  A
   NULL s is replaced by a zero-filled dummy.  */

double
speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
{
#define TOLERANCE    1.01  /* 1% */
  /* Total number of 0.0 readings tolerated across the whole call.  */
  const int max_zeros = 10;

  struct speed_params  s_dummy;
  int     i, j, e;
  double  t[30];           /* per-rep times; sorted in place by qsort below */
  double  t_unsorted[30];  /* same values in the order measured, for the dump */
  double  reps_d;
  int     zeros = 0;

  /* Use dummy parameters if caller doesn't provide any.  Only a few special
     "fun"s will cope with this, speed_noop() is one.  */
  if (s == NULL)
    {
      memset (&s_dummy, '\0', sizeof (s_dummy));
      s = &s_dummy;
    }

  s->reps = 1;
  s->time_divisor = 1.0;
  for (i = 0; i < numberof (t); i++)
    {
      /* Inner loop: repeat the call, growing s->reps, until one run lasts
	 at least speed_unittime * speed_precision.  s->reps is kept for
	 later values of i.  */
      for (;;)
	{
	  /* Reset the operand lists before every call to (*fun)().  */
	  s->src_num = 0;
	  s->dst_num = 0;

	  t[i] = (*fun) (s);

	  if (speed_option_verbose >= 3)
	    gmp_printf("size=%ld reps=%u r=%Md attempt=%d  %.9f\n",
		       (long) s->size, s->reps, s->r, i, t[i]);

	  /* A 0.0 reading is treated as a failed measurement: retry, with
	     reps doubled while it is still below 10000.  */
	  if (t[i] == 0.0)
	    {
	      zeros++;
	      if (zeros > max_zeros)
		{
		  fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros);
		  abort ();
		}
	     if (s->reps < 10000)
	       s->reps *= 2;

	      continue;
	    }

	  /* Routine reports the parameters as unsupported.  */
	  if (t[i] == -1.0)
	    return -1.0;

	  if (t[i] >= speed_unittime * speed_precision)
	    break;

	  /* go to a value of reps to make t[i] >= precision */
	  /* Scale reps by target/measured with a 10% margin; the measured
	     time is clamped below by speed_unittime.  */
	  reps_d = ceil (1.1 * s->reps
			 * speed_unittime * speed_precision
			 / MAX (t[i], speed_unittime));
	  if (reps_d > 2e9 || reps_d < 1.0)
	    {
	      fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
	      fprintf (stderr, "  (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
		       s->reps, speed_unittime, speed_precision, t[i]);
	      abort ();
	    }
	  s->reps = (unsigned) reps_d;
	}
      /* Convert the total for s->reps runs into a per-repetition time.  */
      t[i] /= s->reps;
      t_unsorted[i] = t[i];

      /* NOTE(review): this early return skips the division by
	 s->time_divisor that the normal return below applies -- confirm
	 that is intended.  */
      if (speed_precision == 0)
	return t[i];

      /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
      /* t[0] is the first measurement until the first sort below, and the
	 smallest one seen so far afterwards.  */
      if (t[0] >= 2.0)
	e = 3;
      else
	e = 4;

      /* Look for e many t[]'s within TOLERANCE of each other to consider a
	 valid measurement.  Return smallest among them.
	 NOTE(review): the check first runs with e+1 samples (i >= e) and
	 the scan stops at j == i-1, so the largest sorted sample t[i] never
	 ends a window -- confirm this is deliberate.  */
      if (i >= e)
	{
	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
	  for (j = e-1; j < i; j++)
	    if (t[j] <= t[j-e+1] * TOLERANCE)
	      return t[j-e+1] / s->time_divisor;
	}
    }

  /* All numberof(t) attempts used without agreement: dump both the
     as-measured and the sorted values, then report failure.  */
  fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
	   e, (TOLERANCE-1.0)*100.0);
  fprintf (stderr, "    unsorted         sorted\n");
  fprintf (stderr, "  %.12f    %.12f    is about %.1f%%\n",
	   t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0),
	   100*(TOLERANCE-1.0));
  for (i = 0; i < numberof (t); i++)
    fprintf (stderr, "  %.09f       %.09f\n", t_unsorted[i], t[i]);

  return -1.0;
}
    236 
    237 
    238 /* Read all of ptr,size to get it into the CPU memory cache.
    239 
    240    A call to mpn_cache_fill_dummy() is used to make sure the compiler
    241    doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
    242    would work too, but the function call means we don't rely on every
    243    compiler actually implementing volatile properly.
    244 
    245    mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
    246    it can inline it.  */
    247 
    248 void
    249 mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
    250 {
    251   mp_limb_t  sum = 0;
    252   mp_size_t  i;
    253 
    254   for (i = 0; i < size; i++)
    255     sum += ptr[i];
    256 
    257   mpn_cache_fill_dummy(sum);
    258 }
    259 
    260 
/* Cache priming for an operand that will be written, {ptr,size}.

   As compiled this only reads the limbs through mpn_cache_fill(); the two
   alternatives that would actually store to the operand (mpn_random, or
   writing the index into each limb) are disabled with "#if 0".  */
void
mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
{
  mpn_cache_fill (ptr, size);

#if 0
  mpn_random (ptr, size);
#endif

#if 0
  mp_size_t  i;

  for (i = 0; i < size; i++)
    ptr[i] = i;
#endif
}
    277 
    278 
    279 void
    280 speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
    281 {
    282   if (s->src_num >= numberof (s->src))
    283     {
    284       fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
    285       abort ();
    286     }
    287   s->src[s->src_num].ptr = ptr;
    288   s->src[s->src_num].size = size;
    289   s->src_num++;
    290 }
    291 
    292 
    293 void
    294 speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
    295 {
    296   if (s->dst_num >= numberof (s->dst))
    297     {
    298       fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
    299       abort ();
    300     }
    301   s->dst[s->dst_num].ptr = ptr;
    302   s->dst[s->dst_num].size = size;
    303   s->dst_num++;
    304 }
    305 
    306 
    307 void
    308 speed_cache_fill (struct speed_params *s)
    309 {
    310   static struct speed_params  prev;
    311   int  i;
    312 
    313   /* FIXME: need a better way to get the format string for a pointer */
    314 
    315   if (speed_option_addrs)
    316     {
    317       int  different;
    318 
    319       different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
    320       for (i = 0; i < s->dst_num; i++)
    321 	different |= (s->dst[i].ptr != prev.dst[i].ptr);
    322       for (i = 0; i < s->src_num; i++)
    323 	different |= (s->src[i].ptr != prev.src[i].ptr);
    324 
    325       if (different)
    326 	{
    327 	  if (s->dst_num != 0)
    328 	    {
    329 	      printf ("dst");
    330 	      for (i = 0; i < s->dst_num; i++)
    331 		printf (" %08lX", (unsigned long) s->dst[i].ptr);
    332 	      printf (" ");
    333 	    }
    334 
    335 	  if (s->src_num != 0)
    336 	    {
    337 	      printf ("src");
    338 	      for (i = 0; i < s->src_num; i++)
    339 		printf (" %08lX", (unsigned long) s->src[i].ptr);
    340 	      printf (" ");
    341 	    }
    342 	  printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
    343 
    344 	}
    345 
    346       memcpy (&prev, s, sizeof(prev));
    347     }
    348 
    349   switch (s->cache) {
    350   case 0:
    351     for (i = 0; i < s->dst_num; i++)
    352       mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
    353     for (i = 0; i < s->src_num; i++)
    354       mpn_cache_fill (s->src[i].ptr, s->src[i].size);
    355     break;
    356   case 1:
    357     pentium_wbinvd();
    358     break;
    359   }
    360 }
    361 
    362 
    363 /* Miscellaneous options accepted by tune and speed programs under -o. */
    364 
    365 void
    366 speed_option_set (const char *s)
    367 {
    368   int  n;
    369 
    370   if (strcmp (s, "addrs") == 0)
    371     {
    372       speed_option_addrs = 1;
    373     }
    374   else if (strcmp (s, "verbose") == 0)
    375     {
    376       speed_option_verbose++;
    377     }
    378   else if (sscanf (s, "verbose=%d", &n) == 1)
    379     {
    380       speed_option_verbose = n;
    381     }
    382   else if (strcmp (s, "cycles-broken") == 0)
    383     {
    384       speed_option_cycles_broken = 1;
    385     }
    386   else
    387     {
    388       printf ("Unrecognised -o option: %s\n", s);
    389       exit (1);
    390     }
    391 }
    392 
    393 
    394 /* The following are basic speed running routines for various gmp functions.
    395    Many are very similar and use speed.h macros.
    396 
   Each routine allocates its own destination space for the result of the
    398    function, because only it can know what the function needs.
    399 
    400    speed_starttime() and speed_endtime() are put tight around the code to be
    401    measured.  Any setups are done outside the timed portion.
    402 
    403    Each routine is responsible for its own cache priming.
    404    speed_cache_fill() is a good way to do this, see examples in speed.h.
    405    One cache priming possibility, for CPUs with write-allocate cache, and
    406    functions that don't take too long, is to do one dummy call before timing
    407    so as to cache everything that gets used.  But speed_measure() runs a
    408    routine at least twice and will take the smaller time, so this might not
    409    be necessary.
    410 
    411    Data alignment will be important, for source, destination and temporary
    412    workspace.  A routine can align its destination and workspace.  Programs
    413    using the routines will ensure s->xp and s->yp are aligned.  Aligning
    414    onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
    415    s->align_wp2 should be respected where it makes sense to do so.
    416    SPEED_TMP_ALLOC_LIMBS is a good way to do this.
    417 
    418    A loop of the following form can be expected to turn into good assembler
    419    code on most CPUs, thereby minimizing overhead in the measurement.  It
    420    can always be assumed s->reps >= 1.
    421 
    422 	  i = s->reps
    423 	  do
    424 	    foo();
    425 	  while (--i != 0);
    426 
    427    Additional parameters might be added to "struct speed_params" in the
    428    future.  Routines should ignore anything they don't use.
    429 
    430    s->size can be used creatively, and s->xp and s->yp can be ignored.  For
    431    example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
    432    just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
    433    speed_mpn_mul_1() uses it as a multiplier.  */
    434 
    435 
    436 /* MPN_COPY etc can be macros, so the _CALL forms are necessary */
/* Speed routine for MPN_COPY, expanded from SPEED_ROUTINE_MPN_COPY.  */
double
speed_MPN_COPY (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY);
}
/* Speed routine for MPN_COPY_INCR, expanded from SPEED_ROUTINE_MPN_COPY.  */
double
speed_MPN_COPY_INCR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
}
/* Speed routine for MPN_COPY_DECR, expanded from SPEED_ROUTINE_MPN_COPY.  */
double
speed_MPN_COPY_DECR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
}
    452 #if HAVE_NATIVE_mpn_copyi
/* Speed routine for mpn_copyi, expanded from SPEED_ROUTINE_MPN_COPY.
   Only compiled when HAVE_NATIVE_mpn_copyi is set.  */
double
speed_mpn_copyi (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_copyi);
}
    458 #endif
    459 #if HAVE_NATIVE_mpn_copyd
/* Speed routine for mpn_copyd, expanded from SPEED_ROUTINE_MPN_COPY.
   Only compiled when HAVE_NATIVE_mpn_copyd is set.  */
double
speed_mpn_copyd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_copyd);
}
    465 #endif
/* Speed routine for the C library memcpy, expanded from
   SPEED_ROUTINE_MPN_COPY_BYTES.  */
double
speed_memcpy (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
}
/* Speed routine for mpn_com, expanded from SPEED_ROUTINE_MPN_COPY.  */
double
speed_mpn_com (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_com);
}
/* Speed routine for mpn_neg, expanded from SPEED_ROUTINE_MPN_COPY.  */
double
speed_mpn_neg (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_neg);
}
/* Speed routine for mpn_sec_tabselect, expanded from
   SPEED_ROUTINE_MPN_TABSELECT.  */
double
speed_mpn_sec_tabselect (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_TABSELECT (mpn_sec_tabselect);
}
    486 
    487 
/* Speed routine for mpn_addmul_1, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_addmul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
}
/* Speed routine for mpn_submul_1, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_submul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
}
    498 
    499 #if HAVE_NATIVE_mpn_addmul_2
/* Speed routine for mpn_addmul_2, expanded from SPEED_ROUTINE_MPN_UNARY_2.
   Only compiled when HAVE_NATIVE_mpn_addmul_2 is set.  */
double
speed_mpn_addmul_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2);
}
    505 #endif
    506 #if HAVE_NATIVE_mpn_addmul_3
/* Speed routine for mpn_addmul_3, expanded from SPEED_ROUTINE_MPN_UNARY_3.
   Only compiled when HAVE_NATIVE_mpn_addmul_3 is set.  */
double
speed_mpn_addmul_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3);
}
    512 #endif
    513 #if HAVE_NATIVE_mpn_addmul_4
/* Speed routine for mpn_addmul_4, expanded from SPEED_ROUTINE_MPN_UNARY_4.
   Only compiled when HAVE_NATIVE_mpn_addmul_4 is set.  */
double
speed_mpn_addmul_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4);
}
    519 #endif
    520 #if HAVE_NATIVE_mpn_addmul_5
/* Speed routine for mpn_addmul_5, expanded from SPEED_ROUTINE_MPN_UNARY_5.
   Only compiled when HAVE_NATIVE_mpn_addmul_5 is set.  */
double
speed_mpn_addmul_5 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5);
}
    526 #endif
    527 #if HAVE_NATIVE_mpn_addmul_6
/* Speed routine for mpn_addmul_6, expanded from SPEED_ROUTINE_MPN_UNARY_6.
   Only compiled when HAVE_NATIVE_mpn_addmul_6 is set.  */
double
speed_mpn_addmul_6 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6);
}
    533 #endif
    534 #if HAVE_NATIVE_mpn_addmul_7
/* Speed routine for mpn_addmul_7, expanded from SPEED_ROUTINE_MPN_UNARY_7.
   Only compiled when HAVE_NATIVE_mpn_addmul_7 is set.  */
double
speed_mpn_addmul_7 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7);
}
    540 #endif
    541 #if HAVE_NATIVE_mpn_addmul_8
/* Speed routine for mpn_addmul_8, expanded from SPEED_ROUTINE_MPN_UNARY_8.
   Only compiled when HAVE_NATIVE_mpn_addmul_8 is set.  */
double
speed_mpn_addmul_8 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8);
}
    547 #endif
    548 
/* Speed routine for mpn_mul_1, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_mul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
}
/* Speed routine for mpn_mul_1 using the in-place harness,
   SPEED_ROUTINE_MPN_UNARY_1_INPLACE.  */
double
speed_mpn_mul_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
}
    559 
    560 #if HAVE_NATIVE_mpn_mul_2
/* Speed routine for mpn_mul_2, expanded from SPEED_ROUTINE_MPN_UNARY_2.
   Only compiled when HAVE_NATIVE_mpn_mul_2 is set.  */
double
speed_mpn_mul_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2);
}
    566 #endif
    567 #if HAVE_NATIVE_mpn_mul_3
/* Speed routine for mpn_mul_3, expanded from SPEED_ROUTINE_MPN_UNARY_3.
   Only compiled when HAVE_NATIVE_mpn_mul_3 is set.  */
double
speed_mpn_mul_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3);
}
    573 #endif
    574 #if HAVE_NATIVE_mpn_mul_4
/* Speed routine for mpn_mul_4, expanded from SPEED_ROUTINE_MPN_UNARY_4.
   Only compiled when HAVE_NATIVE_mpn_mul_4 is set.  */
double
speed_mpn_mul_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4);
}
    580 #endif
    581 #if HAVE_NATIVE_mpn_mul_5
/* Speed routine for mpn_mul_5, expanded from SPEED_ROUTINE_MPN_UNARY_5.
   Only compiled when HAVE_NATIVE_mpn_mul_5 is set.  */
double
speed_mpn_mul_5 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_5 (mpn_mul_5);
}
    587 #endif
    588 #if HAVE_NATIVE_mpn_mul_6
/* Speed routine for mpn_mul_6, expanded from SPEED_ROUTINE_MPN_UNARY_6.
   Only compiled when HAVE_NATIVE_mpn_mul_6 is set.  */
double
speed_mpn_mul_6 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_6 (mpn_mul_6);
}
    594 #endif
    595 
    596 
/* Speed routine for mpn_lshift, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_lshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
}
/* Speed routine for mpn_lshiftc, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_lshiftc (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc);
}
/* Speed routine for mpn_rshift, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_rshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
}
    612 
    613 
    614 /* The carry-in variants (if available) are good for measuring because they
    615    won't skip a division if high<divisor.  Alternately, use -1 as a divisor
    616    with the plain _1 forms. */
/* Speed routine for mpn_divrem_1, expanded from SPEED_ROUTINE_MPN_DIVREM_1.  */
double
speed_mpn_divrem_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
}
/* Speed routine for mpn_divrem_1 run through the "1F" harness,
   SPEED_ROUTINE_MPN_DIVREM_1F.  */
double
speed_mpn_divrem_1f (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
}
    627 #if HAVE_NATIVE_mpn_divrem_1c
/* Speed routine for mpn_divrem_1c, expanded from SPEED_ROUTINE_MPN_DIVREM_1C.
   Only compiled when HAVE_NATIVE_mpn_divrem_1c is set.  */
double
speed_mpn_divrem_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
}
/* Speed routine for mpn_divrem_1c run through the "1CF" harness,
   SPEED_ROUTINE_MPN_DIVREM_1CF.  Only compiled when
   HAVE_NATIVE_mpn_divrem_1c is set.  */
double
speed_mpn_divrem_1cf (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
}
    638 #endif
    639 
/* Speed routine for mpn_divrem_1_div, expanded from
   SPEED_ROUTINE_MPN_DIVREM_1.  */
double
speed_mpn_divrem_1_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
}
/* Speed routine for mpn_divrem_1_div run through the "1F" harness,
   SPEED_ROUTINE_MPN_DIVREM_1F.  */
double
speed_mpn_divrem_1f_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
}
/* Speed routine for mpn_divrem_1_inv, expanded from
   SPEED_ROUTINE_MPN_DIVREM_1.  */
double
speed_mpn_divrem_1_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
}
/* Speed routine for mpn_divrem_1_inv run through the "1F" harness,
   SPEED_ROUTINE_MPN_DIVREM_1F.  */
double
speed_mpn_divrem_1f_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
}
/* Speed routine for mpn_mod_1_div, expanded from SPEED_ROUTINE_MPN_MOD_1.  */
double
speed_mpn_mod_1_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
}
/* Speed routine for mpn_mod_1_inv, expanded from SPEED_ROUTINE_MPN_MOD_1.  */
double
speed_mpn_mod_1_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
}
    670 
/* Speed routine for mpn_preinv_divrem_1, expanded from
   SPEED_ROUTINE_MPN_PREINV_DIVREM_1.  */
double
speed_mpn_preinv_divrem_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
}
/* Speed routine for mpn_preinv_divrem_1 run through the "1F" harness,
   SPEED_ROUTINE_MPN_PREINV_DIVREM_1F.  */
double
speed_mpn_preinv_divrem_1f (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
}
    681 
    682 #if GMP_NUMB_BITS % 4 == 0
/* Speed routine for mpn_mod_34lsub1, expanded from
   SPEED_ROUTINE_MPN_MOD_34LSUB1.  Only compiled when GMP_NUMB_BITS is a
   multiple of 4.  */
double
speed_mpn_mod_34lsub1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
}
    688 #endif
    689 
/* Speed routine for mpn_divrem_2, expanded from SPEED_ROUTINE_MPN_DIVREM_2.  */
double
speed_mpn_divrem_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
}
/* Speed routine for mpn_divrem_2_div, expanded from
   SPEED_ROUTINE_MPN_DIVREM_2.  */
double
speed_mpn_divrem_2_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
}
/* Speed routine for mpn_divrem_2_inv, expanded from
   SPEED_ROUTINE_MPN_DIVREM_2.  */
double
speed_mpn_divrem_2_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
}
    705 
/* Speed routine for mpn_div_qr_1n_pi1, expanded from
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1.  */
double
speed_mpn_div_qr_1n_pi1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1);
}
/* Speed routine for the mpn_div_qr_1n_pi1_1 variant, expanded from
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1.  */
double
speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_1);
}
/* Speed routine for the mpn_div_qr_1n_pi1_2 variant, expanded from
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1.  */
double
speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
}
    721 
/* Speed routine for mpn_div_qr_1, expanded from SPEED_ROUTINE_MPN_DIV_QR_1.  */
double
speed_mpn_div_qr_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1);
}
    727 
/* Speed routine for mpn_div_qr_2, expanded from SPEED_ROUTINE_MPN_DIV_QR_2
   with second macro argument 1.  NOTE(review): the "n" in the name
   presumably means a normalized divisor -- confirm against the macro.  */
double
speed_mpn_div_qr_2n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 1);
}
/* Speed routine for mpn_div_qr_2, expanded from SPEED_ROUTINE_MPN_DIV_QR_2
   with second macro argument 0.  NOTE(review): the "u" in the name
   presumably means an unnormalized divisor -- confirm against the macro.  */
double
speed_mpn_div_qr_2u (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 0);
}
    738 
/* Speed routine for mpn_mod_1, expanded from SPEED_ROUTINE_MPN_MOD_1.  */
double
speed_mpn_mod_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
}
    744 #if HAVE_NATIVE_mpn_mod_1c
/* Speed routine for mpn_mod_1c, expanded from SPEED_ROUTINE_MPN_MOD_1C.
   Only compiled when HAVE_NATIVE_mpn_mod_1c is set.  */
double
speed_mpn_mod_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
}
    750 #endif
/* Speed routine for mpn_preinv_mod_1, expanded from
   SPEED_ROUTINE_MPN_PREINV_MOD_1.  */
double
speed_mpn_preinv_mod_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
}
/* Speed routine for mpn_mod_1_1p, expanded from SPEED_ROUTINE_MPN_MOD_1_1;
   mpn_mod_1_1p_cps is passed as the companion second argument.  */
double
speed_mpn_mod_1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps);
}
/* Speed routine for the mpn_mod_1_1p_1 variant, expanded from
   SPEED_ROUTINE_MPN_MOD_1_1; mpn_mod_1_1p_cps_1 is the companion second
   argument.  */
double
speed_mpn_mod_1_1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_1,mpn_mod_1_1p_cps_1);
}
/* Speed routine for the mpn_mod_1_1p_2 variant, expanded from
   SPEED_ROUTINE_MPN_MOD_1_1; mpn_mod_1_1p_cps_2 is the companion second
   argument.  */
double
speed_mpn_mod_1_1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_2,mpn_mod_1_1p_cps_2);
}
/* Speed routine for mpn_mod_1s_2p, expanded from SPEED_ROUTINE_MPN_MOD_1_N
   with mpn_mod_1s_2p_cps and N = 2.  */
double
speed_mpn_mod_1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2);
}
/* Speed routine for mpn_mod_1s_3p, expanded from SPEED_ROUTINE_MPN_MOD_1_N
   with mpn_mod_1s_3p_cps and N = 3.  */
double
speed_mpn_mod_1_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3);
}
/* Speed routine for mpn_mod_1s_4p, expanded from SPEED_ROUTINE_MPN_MOD_1_N
   with mpn_mod_1s_4p_cps and N = 4.  */
double
speed_mpn_mod_1_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4);
}
    786 
/* Speed routine for mpn_divexact_1, expanded from
   SPEED_ROUTINE_MPN_DIVEXACT_1.  */
double
speed_mpn_divexact_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
}
    792 
/* Speed routine for mpn_divexact_by3; it reuses the copy-style harness
   SPEED_ROUTINE_MPN_COPY.  */
double
speed_mpn_divexact_by3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
}
    798 
/* Speed routine for mpn_bdiv_dbm1c, expanded from
   SPEED_ROUTINE_MPN_BDIV_DBM1C.  */
double
speed_mpn_bdiv_dbm1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c);
}
    804 
/* Speed routine for mpn_bdiv_q_1, expanded from SPEED_ROUTINE_MPN_BDIV_Q_1.  */
double
speed_mpn_bdiv_q_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1);
}
    810 
/* Speed routine for mpn_pi1_bdiv_q_1, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_Q_1.  */
double
speed_mpn_pi1_bdiv_q_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1);
}
    816 
    817 #if HAVE_NATIVE_mpn_modexact_1_odd
/* Speed routine for mpn_modexact_1_odd, expanded from
   SPEED_ROUTINE_MPN_MODEXACT_1_ODD.  Only compiled when
   HAVE_NATIVE_mpn_modexact_1_odd is set.  */
double
speed_mpn_modexact_1_odd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
}
    823 #endif
    824 
/* Speed routine for mpn_modexact_1c_odd, expanded from
   SPEED_ROUTINE_MPN_MODEXACT_1C_ODD.  */
double
speed_mpn_modexact_1c_odd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
}
    830 
/* Speed routine for mpz_mod, expanded from SPEED_ROUTINE_MPZ_MOD.  */
double
speed_mpz_mod (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_MOD (mpz_mod);
}
    836 
/* Speed routine for mpn_sbpi1_div_qr, expanded from
   SPEED_ROUTINE_MPN_PI1_DIV with arguments inv.inv32, 2 and 0.  "inv" is
   not declared here, so it is presumably a local supplied by the macro
   expansion.  */
double
speed_mpn_sbpi1_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0);
}
/* Speed routine for mpn_dcpi1_div_qr, expanded from
   SPEED_ROUTINE_MPN_PI1_DIV with arguments &inv, 6 and 3.  "inv" is not
   declared here, so it is presumably a local supplied by the macro
   expansion.  */
double
speed_mpn_dcpi1_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3);
}
/* Speed routine for mpn_sbpi1_divappr_q, expanded from
   SPEED_ROUTINE_MPN_PI1_DIV with arguments inv.inv32, 2 and 0.  "inv" is
   not declared here, so it is presumably a local supplied by the macro
   expansion.  */
double
speed_mpn_sbpi1_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0);
}
/* Speed routine for mpn_dcpi1_divappr_q, expanded from
   SPEED_ROUTINE_MPN_PI1_DIV with arguments &inv, 6 and 3.  "inv" is not
   declared here, so it is presumably a local supplied by the macro
   expansion.  */
double
speed_mpn_dcpi1_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3);
}
/* Speed routine for mpn_mu_div_qr, expanded from
   SPEED_ROUTINE_MPN_MU_DIV_QR; mpn_mu_div_qr_itch is passed as the second
   argument.  */
double
speed_mpn_mu_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch);
}
/* Speed routine for mpn_mu_divappr_q, expanded from
   SPEED_ROUTINE_MPN_MU_DIV_Q; mpn_mu_divappr_q_itch is passed as the
   second argument.  */
double
speed_mpn_mu_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch);
}
/* Speed routine for mpn_mu_div_q, expanded from SPEED_ROUTINE_MPN_MU_DIV_Q;
   mpn_mu_div_q_itch is passed as the second argument.  */
double
speed_mpn_mu_div_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch);
}
/* Speed routine for mpn_preinv_mu_div_qr, expanded from
   SPEED_ROUTINE_MPN_MUPI_DIV_QR; mpn_preinv_mu_div_qr_itch is passed as
   the second argument.  */
double
speed_mpn_mupi_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch);
}
    877 
/* Speed routine for mpn_sbpi1_bdiv_qr, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_QR.  */
double
speed_mpn_sbpi1_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr);
}
/* Speed routine for mpn_dcpi1_bdiv_qr, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_QR.  */
double
speed_mpn_dcpi1_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr);
}
/* Speed routine for mpn_sbpi1_bdiv_q, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_Q.  */
double
speed_mpn_sbpi1_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q);
}
/* Speed routine for mpn_dcpi1_bdiv_q, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_Q.  */
double
speed_mpn_dcpi1_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q);
}
/* Speed routine for mpn_sbpi1_bdiv_r, expanded from
   SPEED_ROUTINE_MPN_PI1_BDIV_R.  */
double
speed_mpn_sbpi1_bdiv_r (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_R (mpn_sbpi1_bdiv_r);
}
/* Speed routine for mpn_mu_bdiv_q, expanded from
   SPEED_ROUTINE_MPN_MU_BDIV_Q; mpn_mu_bdiv_q_itch is passed as the second
   argument.  */
double
speed_mpn_mu_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch);
}
/* Speed routine for mpn_mu_bdiv_qr, expanded from
   SPEED_ROUTINE_MPN_MU_BDIV_QR; mpn_mu_bdiv_qr_itch is passed as the
   second argument.  */
double
speed_mpn_mu_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch);
}
    913 
/* Speed routine for mpn_broot, expanded from SPEED_ROUTINE_MPN_BROOT.  */
double
speed_mpn_broot (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOT (mpn_broot);
}
/* Speed routine for mpn_broot_invm1, expanded from
   SPEED_ROUTINE_MPN_BROOT.  */
double
speed_mpn_broot_invm1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOT (mpn_broot_invm1);
}
/* Speed routine for mpn_brootinv, expanded from SPEED_ROUTINE_MPN_BROOTINV
   with 5*s->size as the second argument.  NOTE(review): that value is
   presumably a scratch-space size in limbs -- confirm against the macro.  */
double
speed_mpn_brootinv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOTINV (mpn_brootinv, 5*s->size);
}
    929 
/* Speed routine for mpn_binvert, expanded from SPEED_ROUTINE_MPN_BINVERT;
   mpn_binvert_itch is passed as the second argument.  */
double
speed_mpn_binvert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch);
}
    935 
/* Speed routine for mpn_invert, expanded from SPEED_ROUTINE_MPN_INVERT;
   mpn_invert_itch is passed as the second argument.  */
double
speed_mpn_invert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch);
}
    941 
/* Speed routine for mpn_invertappr, expanded from
   SPEED_ROUTINE_MPN_INVERTAPPR; mpn_invertappr_itch is passed as the
   second argument.  */
double
speed_mpn_invertappr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch);
}
    947 
/* Speed routine for mpn_ni_invertappr, expanded from
   SPEED_ROUTINE_MPN_INVERTAPPR.  Note the second argument is
   mpn_invertappr_itch, the same one used for mpn_invertappr.  */
double
speed_mpn_ni_invertappr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch);
}
    953 
/* Speed routine for mpn_sec_invert, expanded from
   SPEED_ROUTINE_MPN_SEC_INVERT; mpn_sec_invert_itch is passed as the
   second argument.  */
double
speed_mpn_sec_invert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SEC_INVERT (mpn_sec_invert, mpn_sec_invert_itch);
}
    959 
/* Speed routine for mpn_redc_1, expanded from SPEED_ROUTINE_REDC_1.  */
double
speed_mpn_redc_1 (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_1 (mpn_redc_1);
}
/* Speed routine for mpn_redc_2, expanded from SPEED_ROUTINE_REDC_2.  */
double
speed_mpn_redc_2 (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_2 (mpn_redc_2);
}
/* Speed routine for mpn_redc_n, expanded from SPEED_ROUTINE_REDC_N.  */
double
speed_mpn_redc_n (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_N (mpn_redc_n);
}
    975 
    976 
/* Speed routine for mpn_popcount, expanded from
   SPEED_ROUTINE_MPN_POPCOUNT.  */
double
speed_mpn_popcount (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
}
/* Speed routine for mpn_hamdist, expanded from SPEED_ROUTINE_MPN_HAMDIST.  */
double
speed_mpn_hamdist (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
}
    987 
    988 
/* Speed routine for mpn_add_n, expanded from SPEED_ROUTINE_MPN_BINARY_N.  */
double
speed_mpn_add_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
}
/* Speed routine for mpn_sub_n, expanded from SPEED_ROUTINE_MPN_BINARY_N.  */
double
speed_mpn_sub_n (struct speed_params *s)
{
SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
}
/* Speed routine for mpn_add_1, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_add_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_add_1);
}
/* Speed routine for mpn_add_1 using the in-place harness,
   SPEED_ROUTINE_MPN_UNARY_1_INPLACE.  */
double
speed_mpn_add_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_add_1);
}
/* Speed routine for mpn_sub_1, expanded from SPEED_ROUTINE_MPN_UNARY_1.  */
double
speed_mpn_sub_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_sub_1);
}
/* Speed routine for mpn_sub_1 using the in-place harness,
   SPEED_ROUTINE_MPN_UNARY_1_INPLACE.  */
double
speed_mpn_sub_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_sub_1);
}
   1019 
/* Speed routine for mpn_add_err1_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR1_N.  */
double
speed_mpn_add_err1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n);
}
/* Speed routine for mpn_sub_err1_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR1_N.  */
double
speed_mpn_sub_err1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n);
}
/* Speed routine for mpn_add_err2_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR2_N.  */
double
speed_mpn_add_err2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n);
}
/* Speed routine for mpn_sub_err2_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR2_N.  */
double
speed_mpn_sub_err2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n);
}
/* Speed routine for mpn_add_err3_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR3_N.  */
double
speed_mpn_add_err3_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n);
}
/* Speed routine for mpn_sub_err3_n, expanded from
   SPEED_ROUTINE_MPN_BINARY_ERR3_N.  */
double
speed_mpn_sub_err3_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n);
}
   1050 
   1051 
   1052 #if HAVE_NATIVE_mpn_add_n_sub_n
/* Speed routine for mpn_add_n_sub_n, expanded from
   SPEED_ROUTINE_MPN_ADDSUB_N_CALL with the full call expression.  "ap" and
   "sp" are not declared here, so they are presumably locals supplied by
   the macro expansion.  Only compiled when HAVE_NATIVE_mpn_add_n_sub_n is
   set.  */
double
speed_mpn_add_n_sub_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size));
}
   1058 #endif
   1059 
   1060 #if HAVE_NATIVE_mpn_addlsh1_n == 1
   /* Timing wrapper for mpn_addlsh1_n, built only when
      HAVE_NATIVE_mpn_addlsh1_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1061 double
   1062 speed_mpn_addlsh1_n (struct speed_params *s)
   1063 {
   1064   SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n);
   1065 }
   1066 #endif
   1067 #if HAVE_NATIVE_mpn_sublsh1_n == 1
   /* Timing wrapper for mpn_sublsh1_n, built only when
      HAVE_NATIVE_mpn_sublsh1_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1068 double
   1069 speed_mpn_sublsh1_n (struct speed_params *s)
   1070 {
   1071   SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n);
   1072 }
   1073 #endif
   1074 #if HAVE_NATIVE_mpn_addlsh1_n_ip1
   /* Timing wrapper for mpn_addlsh1_n_ip1, built only when
      HAVE_NATIVE_mpn_addlsh1_n_ip1 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1075 double
   1076 speed_mpn_addlsh1_n_ip1 (struct speed_params *s)
   1077 {
   1078   SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip1);
   1079 }
   1080 #endif
   1081 #if HAVE_NATIVE_mpn_addlsh1_n_ip2
   /* Timing wrapper for mpn_addlsh1_n_ip2, built only when
      HAVE_NATIVE_mpn_addlsh1_n_ip2 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1082 double
   1083 speed_mpn_addlsh1_n_ip2 (struct speed_params *s)
   1084 {
   1085   SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip2);
   1086 }
   1087 #endif
   1088 #if HAVE_NATIVE_mpn_sublsh1_n_ip1
   /* Timing wrapper for mpn_sublsh1_n_ip1, built only when
      HAVE_NATIVE_mpn_sublsh1_n_ip1 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1089 double
   1090 speed_mpn_sublsh1_n_ip1 (struct speed_params *s)
   1091 {
   1092   SPEED_ROUTINE_MPN_COPY (mpn_sublsh1_n_ip1);
   1093 }
   1094 #endif
   1095 #if HAVE_NATIVE_mpn_rsblsh1_n == 1
   /* Timing wrapper for mpn_rsblsh1_n, built only when
      HAVE_NATIVE_mpn_rsblsh1_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1096 double
   1097 speed_mpn_rsblsh1_n (struct speed_params *s)
   1098 {
   1099   SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n);
   1100 }
   1101 #endif
   1102 #if HAVE_NATIVE_mpn_addlsh2_n == 1
   /* Timing wrapper for mpn_addlsh2_n, built only when
      HAVE_NATIVE_mpn_addlsh2_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1103 double
   1104 speed_mpn_addlsh2_n (struct speed_params *s)
   1105 {
   1106   SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n);
   1107 }
   1108 #endif
   1109 #if HAVE_NATIVE_mpn_sublsh2_n == 1
   /* Timing wrapper for mpn_sublsh2_n, built only when
      HAVE_NATIVE_mpn_sublsh2_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1110 double
   1111 speed_mpn_sublsh2_n (struct speed_params *s)
   1112 {
   1113   SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n);
   1114 }
   1115 #endif
   1116 #if HAVE_NATIVE_mpn_addlsh2_n_ip1
   /* Timing wrapper for mpn_addlsh2_n_ip1, built only when
      HAVE_NATIVE_mpn_addlsh2_n_ip1 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1117 double
   1118 speed_mpn_addlsh2_n_ip1 (struct speed_params *s)
   1119 {
   1120   SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip1);
   1121 }
   1122 #endif
   1123 #if HAVE_NATIVE_mpn_addlsh2_n_ip2
   /* Timing wrapper for mpn_addlsh2_n_ip2, built only when
      HAVE_NATIVE_mpn_addlsh2_n_ip2 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1124 double
   1125 speed_mpn_addlsh2_n_ip2 (struct speed_params *s)
   1126 {
   1127   SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip2);
   1128 }
   1129 #endif
   1130 #if HAVE_NATIVE_mpn_sublsh2_n_ip1
   /* Timing wrapper for mpn_sublsh2_n_ip1, built only when
      HAVE_NATIVE_mpn_sublsh2_n_ip1 is nonzero.  It is measured through
      SPEED_ROUTINE_MPN_COPY (defined elsewhere).  */
   1131 double
   1132 speed_mpn_sublsh2_n_ip1 (struct speed_params *s)
   1133 {
   1134   SPEED_ROUTINE_MPN_COPY (mpn_sublsh2_n_ip1);
   1135 }
   1136 #endif
   1137 #if HAVE_NATIVE_mpn_rsblsh2_n == 1
   /* Timing wrapper for mpn_rsblsh2_n, built only when
      HAVE_NATIVE_mpn_rsblsh2_n equals 1.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1138 double
   1139 speed_mpn_rsblsh2_n (struct speed_params *s)
   1140 {
   1141   SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n);
   1142 }
   1143 #endif
   1144 #if HAVE_NATIVE_mpn_addlsh_n
   /* Timing wrapper for mpn_addlsh_n with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_addlsh_n is nonzero.  wp, xp and yp are not declared
      here; presumably SPEED_ROUTINE_MPN_BINARY_N_CALL provides them.  */
   1145 double
   1146 speed_mpn_addlsh_n (struct speed_params *s)
   1147 {
   1148   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addlsh_n (wp, xp, yp, s->size, 7));
   1149 }
   1150 #endif
   1151 #if HAVE_NATIVE_mpn_sublsh_n
   /* Timing wrapper for mpn_sublsh_n with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_sublsh_n is nonzero.  wp, xp and yp are not declared
      here; presumably SPEED_ROUTINE_MPN_BINARY_N_CALL provides them.  */
   1152 double
   1153 speed_mpn_sublsh_n (struct speed_params *s)
   1154 {
   1155   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_sublsh_n (wp, xp, yp, s->size, 7));
   1156 }
   1157 #endif
   1158 #if HAVE_NATIVE_mpn_addlsh_n_ip1
   /* Timing wrapper for mpn_addlsh_n_ip1 with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_addlsh_n_ip1 is nonzero.  wp is not declared here;
      presumably SPEED_ROUTINE_MPN_UNARY_1_CALL provides it.  */
   1159 double
   1160 speed_mpn_addlsh_n_ip1 (struct speed_params *s)
   1161 {
   1162   SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip1 (wp, s->xp, s->size, 7));
   1163 }
   1164 #endif
   1165 #if HAVE_NATIVE_mpn_addlsh_n_ip2
   /* Timing wrapper for mpn_addlsh_n_ip2 with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_addlsh_n_ip2 is nonzero.  wp is not declared here;
      presumably SPEED_ROUTINE_MPN_UNARY_1_CALL provides it.  */
   1166 double
   1167 speed_mpn_addlsh_n_ip2 (struct speed_params *s)
   1168 {
   1169   SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip2 (wp, s->xp, s->size, 7));
   1170 }
   1171 #endif
   1172 #if HAVE_NATIVE_mpn_sublsh_n_ip1
   /* Timing wrapper for mpn_sublsh_n_ip1 with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_sublsh_n_ip1 is nonzero.  wp is not declared here;
      presumably SPEED_ROUTINE_MPN_UNARY_1_CALL provides it.  */
   1173 double
   1174 speed_mpn_sublsh_n_ip1 (struct speed_params *s)
   1175 {
   1176   SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_sublsh_n_ip1 (wp, s->xp, s->size, 7));
   1177 }
   1178 #endif
   1179 #if HAVE_NATIVE_mpn_rsblsh_n
   /* Timing wrapper for mpn_rsblsh_n with the last argument fixed at 7
      (presumably the shift count), built only when
      HAVE_NATIVE_mpn_rsblsh_n is nonzero.  wp, xp and yp are not declared
      here; presumably SPEED_ROUTINE_MPN_BINARY_N_CALL provides them.  */
   1180 double
   1181 speed_mpn_rsblsh_n (struct speed_params *s)
   1182 {
   1183   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_rsblsh_n (wp, xp, yp, s->size, 7));
   1184 }
   1185 #endif
   1186 #if HAVE_NATIVE_mpn_rsh1add_n
   /* Timing wrapper for mpn_rsh1add_n, built only when
      HAVE_NATIVE_mpn_rsh1add_n is nonzero.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1187 double
   1188 speed_mpn_rsh1add_n (struct speed_params *s)
   1189 {
   1190   SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n);
   1191 }
   1192 #endif
   1193 #if HAVE_NATIVE_mpn_rsh1sub_n
   /* Timing wrapper for mpn_rsh1sub_n, built only when
      HAVE_NATIVE_mpn_rsh1sub_n is nonzero.  SPEED_ROUTINE_MPN_BINARY_N
      (defined elsewhere) supplies the body and the return value.  */
   1194 double
   1195 speed_mpn_rsh1sub_n (struct speed_params *s)
   1196 {
   1197   SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n);
   1198 }
   1199 #endif
   1200 
   /* Timing wrapper for mpn_cnd_add_n with its first (condition) argument
      fixed at 1.  wp, xp and yp are not declared here; presumably
      SPEED_ROUTINE_MPN_BINARY_N_CALL provides them.  */
   1201 double
   1202 speed_mpn_cnd_add_n (struct speed_params *s)
   1203 {
   1204   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_add_n (1, wp, xp, yp, s->size));
   1205 }
   /* Timing wrapper for mpn_cnd_sub_n with its first (condition) argument
      fixed at 1.  wp, xp and yp are not declared here; presumably
      SPEED_ROUTINE_MPN_BINARY_N_CALL provides them.  */
   1206 double
   1207 speed_mpn_cnd_sub_n (struct speed_params *s)
   1208 {
   1209   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_sub_n (1, wp, xp, yp, s->size));
   1210 }
   1211 
   1212 /* mpn_and_n etc can be macros and so have to be handled with
   1213    SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
   /* Timing wrapper for mpn_and_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1214 double
   1215 speed_mpn_and_n (struct speed_params *s)
   1216 {
   1217   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, xp, yp, s->size));
   1218 }
   /* Timing wrapper for mpn_andn_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1219 double
   1220 speed_mpn_andn_n (struct speed_params *s)
   1221 {
   1222   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, xp, yp, s->size));
   1223 }
   /* Timing wrapper for mpn_nand_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1224 double
   1225 speed_mpn_nand_n (struct speed_params *s)
   1226 {
   1227   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, xp, yp, s->size));
   1228 }
   /* Timing wrapper for mpn_ior_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1229 double
   1230 speed_mpn_ior_n (struct speed_params *s)
   1231 {
   1232   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, xp, yp, s->size));
   1233 }
   /* Timing wrapper for mpn_iorn_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1234 double
   1235 speed_mpn_iorn_n (struct speed_params *s)
   1236 {
   1237   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, xp, yp, s->size));
   1238 }
   /* Timing wrapper for mpn_nior_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1239 double
   1240 speed_mpn_nior_n (struct speed_params *s)
   1241 {
   1242   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, xp, yp, s->size));
   1243 }
   /* Timing wrapper for mpn_xor_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1244 double
   1245 speed_mpn_xor_n (struct speed_params *s)
   1246 {
   1247   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, xp, yp, s->size));
   1248 }
   /* Timing wrapper for mpn_xnor_n, written as a full call expression and
      passed to SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere, and
      presumably the source of wp, xp and yp).  */
   1249 double
   1250 speed_mpn_xnor_n (struct speed_params *s)
   1251 {
   1252   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, xp, yp, s->size));
   1253 }
   1254 
   1255 
   /* Timing wrapper for mpn_mul_n; SPEED_ROUTINE_MPN_MUL_N (defined
      elsewhere) supplies the body and the return value.  */
   1256 double
   1257 speed_mpn_mul_n (struct speed_params *s)
   1258 {
   1259   SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
   1260 }
   /* Timing wrapper for mpn_sqr; SPEED_ROUTINE_MPN_SQR (defined elsewhere)
      supplies the body and the return value.  */
   1261 double
   1262 speed_mpn_sqr (struct speed_params *s)
   1263 {
   1264   SPEED_ROUTINE_MPN_SQR (mpn_sqr);
   1265 }
   /* Times mpn_mul_n used as a squaring: both source operands are s->xp.
      wp is not declared here; presumably SPEED_ROUTINE_MPN_SQR_CALL
      provides it.  */
   1266 double
   1267 speed_mpn_mul_n_sqr (struct speed_params *s)
   1268 {
   1269   SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
   1270 }
   1271 
   /* Timing wrapper for mpn_mul_basecase; SPEED_ROUTINE_MPN_MUL (defined
      elsewhere) supplies the body and the return value.  */
   1272 double
   1273 speed_mpn_mul_basecase (struct speed_params *s)
   1274 {
   1275   SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase);
   1276 }
   /* Timing wrapper for mpn_mul; SPEED_ROUTINE_MPN_MUL (defined elsewhere)
      supplies the body and the return value.  */
   1277 double
   1278 speed_mpn_mul (struct speed_params *s)
   1279 {
   1280   SPEED_ROUTINE_MPN_MUL(mpn_mul);
   1281 }
   /* Timing wrapper for mpn_sqr_basecase via SPEED_ROUTINE_MPN_SQR
      (defined elsewhere).  No size restriction is applied in this
      function itself; see the FIXME inside.  */
   1282 double
   1283 speed_mpn_sqr_basecase (struct speed_params *s)
   1284 {
   1285   /* FIXME: size restrictions on some versions of sqr_basecase */
   1286   SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
   1287 }
   1288 
   1289 #if HAVE_NATIVE_mpn_sqr_diagonal
   /* Timing wrapper for mpn_sqr_diagonal, built only when
      HAVE_NATIVE_mpn_sqr_diagonal is nonzero.  SPEED_ROUTINE_MPN_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1290 double
   1291 speed_mpn_sqr_diagonal (struct speed_params *s)
   1292 {
   1293   SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
   1294 }
   1295 #endif
   1296 
   1297 #if HAVE_NATIVE_mpn_sqr_diag_addlsh1
   /* Timing wrapper for mpn_sqr_diag_addlsh1, built only when
      HAVE_NATIVE_mpn_sqr_diag_addlsh1 is nonzero.  wp and tp are not
      declared here; presumably SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL
      provides them -- confirm against the macro.  */
   1298 double
   1299 speed_mpn_sqr_diag_addlsh1 (struct speed_params *s)
   1300 {
   1301   SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL (mpn_sqr_diag_addlsh1 (wp, tp, s->xp, s->size));
   1302 }
   1303 #endif
   1304 
   /* Timing wrapper for mpn_toom2_sqr; SPEED_ROUTINE_MPN_TOOM2_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1305 double
   1306 speed_mpn_toom2_sqr (struct speed_params *s)
   1307 {
   1308   SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr);
   1309 }
   /* Timing wrapper for mpn_toom3_sqr; SPEED_ROUTINE_MPN_TOOM3_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1310 double
   1311 speed_mpn_toom3_sqr (struct speed_params *s)
   1312 {
   1313   SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr);
   1314 }
   /* Timing wrapper for mpn_toom4_sqr; SPEED_ROUTINE_MPN_TOOM4_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1315 double
   1316 speed_mpn_toom4_sqr (struct speed_params *s)
   1317 {
   1318   SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr);
   1319 }
   /* Timing wrapper for mpn_toom6_sqr; SPEED_ROUTINE_MPN_TOOM6_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1320 double
   1321 speed_mpn_toom6_sqr (struct speed_params *s)
   1322 {
   1323   SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr);
   1324 }
   /* Timing wrapper for mpn_toom8_sqr; SPEED_ROUTINE_MPN_TOOM8_SQR
      (defined elsewhere) supplies the body and the return value.  */
   1325 double
   1326 speed_mpn_toom8_sqr (struct speed_params *s)
   1327 {
   1328   SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr);
   1329 }
   /* Timing wrapper for mpn_toom22_mul; SPEED_ROUTINE_MPN_TOOM22_MUL_N
      (defined elsewhere) supplies the body and the return value.  */
   1330 double
   1331 speed_mpn_toom22_mul (struct speed_params *s)
   1332 {
   1333   SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul);
   1334 }
   /* Timing wrapper for mpn_toom33_mul; SPEED_ROUTINE_MPN_TOOM33_MUL_N
      (defined elsewhere) supplies the body and the return value.  */
   1335 double
   1336 speed_mpn_toom33_mul (struct speed_params *s)
   1337 {
   1338   SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul);
   1339 }
   /* Timing wrapper for mpn_toom44_mul; SPEED_ROUTINE_MPN_TOOM44_MUL_N
      (defined elsewhere) supplies the body and the return value.  */
   1340 double
   1341 speed_mpn_toom44_mul (struct speed_params *s)
   1342 {
   1343   SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul);
   1344 }
   /* Timing wrapper for mpn_toom6h_mul; SPEED_ROUTINE_MPN_TOOM6H_MUL_N
      (defined elsewhere) supplies the body and the return value.  */
   1345 double
   1346 speed_mpn_toom6h_mul (struct speed_params *s)
   1347 {
   1348   SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul);
   1349 }
   /* Timing wrapper for mpn_toom8h_mul; SPEED_ROUTINE_MPN_TOOM8H_MUL_N
      (defined elsewhere) supplies the body and the return value.  */
   1350 double
   1351 speed_mpn_toom8h_mul (struct speed_params *s)
   1352 {
   1353   SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul);
   1354 }
   1355 
   /* Timing wrapper for mpn_toom32_mul; SPEED_ROUTINE_MPN_TOOM32_MUL
      (defined elsewhere) supplies the body and the return value.  */
   1356 double
   1357 speed_mpn_toom32_mul (struct speed_params *s)
   1358 {
   1359   SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul);
   1360 }
   /* Timing wrapper for mpn_toom42_mul; SPEED_ROUTINE_MPN_TOOM42_MUL
      (defined elsewhere) supplies the body and the return value.  */
   1361 double
   1362 speed_mpn_toom42_mul (struct speed_params *s)
   1363 {
   1364   SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul);
   1365 }
   /* Timing wrapper for mpn_toom43_mul; SPEED_ROUTINE_MPN_TOOM43_MUL
      (defined elsewhere) supplies the body and the return value.  */
   1366 double
   1367 speed_mpn_toom43_mul (struct speed_params *s)
   1368 {
   1369   SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
   1370 }
   /* Timing wrapper for mpn_toom63_mul; SPEED_ROUTINE_MPN_TOOM63_MUL
      (defined elsewhere) supplies the body and the return value.  */
   1371 double
   1372 speed_mpn_toom63_mul (struct speed_params *s)
   1373 {
   1374   SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
   1375 }
   /* Times mpn_toom32_mul through SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom43 -- confirm against the macro.  */
   1376 double
   1377 speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
   1378 {
   1379   SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
   1380 }
   /* Times mpn_toom43_mul through SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom32 -- confirm against the macro.  */
   1381 double
   1382 speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
   1383 {
   1384   SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
   1385 }
   /* Times mpn_toom32_mul through SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom53 -- confirm against the macro.  */
   1386 double
   1387 speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
   1388 {
   1389   SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
   1390 }
   /* Times mpn_toom53_mul through SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom32 -- confirm against the macro.  */
   1391 double
   1392 speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
   1393 {
   1394   SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
   1395 }
   /* Times mpn_toom42_mul through SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom53 -- confirm against the macro.  */
   1396 double
   1397 speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
   1398 {
   1399   SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
   1400 }
   /* Times mpn_toom53_mul through SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom42 -- confirm against the macro.  */
   1401 double
   1402 speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
   1403 {
   1404   SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
   1405 }
   /* Times mpn_toom43_mul through SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom54 -- confirm against the macro.  */
   1406 double
   1407 speed_mpn_toom43_for_toom54_mul (struct speed_params *s)
   1408 {
   1409   SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul);
   1410 }
   /* Times mpn_toom54_mul through SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL
      (defined elsewhere).  Judging by the name, the operand shape is
      chosen for comparison against toom43 -- confirm against the macro.  */
   1411 double
   1412 speed_mpn_toom54_for_toom43_mul (struct speed_params *s)
   1413 {
   1414   SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul);
   1415 }
   1416 
   /* Times mpn_nussbaumer_mul on two s->size-limb operands, s->xp and
      s->yp.  wp is not declared here; presumably
      SPEED_ROUTINE_MPN_MUL_N_CALL provides it.  */
   1417 double
   1418 speed_mpn_nussbaumer_mul (struct speed_params *s)
   1419 {
   1420   SPEED_ROUTINE_MPN_MUL_N_CALL
   1421     (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size));
   1422 }
   /* Times mpn_nussbaumer_mul used as a squaring: s->xp is passed as both
      operands.  wp is not declared here; presumably
      SPEED_ROUTINE_MPN_SQR_CALL provides it.  */
   1423 double
   1424 speed_mpn_nussbaumer_mul_sqr (struct speed_params *s)
   1425 {
   1426   SPEED_ROUTINE_MPN_SQR_CALL
   1427     (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size));
   1428 }
   1429 
   1430 #if WANT_OLD_FFT_FULL
   /* Both routines in this conditional are built only when
      WANT_OLD_FFT_FULL is nonzero.  This one times mpn_mul_fft_full on
      s->xp times s->yp, each s->size limbs.  */
   1431 double
   1432 speed_mpn_mul_fft_full (struct speed_params *s)
   1433 {
   1434   SPEED_ROUTINE_MPN_MUL_N_CALL
   1435     (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
   1436 }
   /* Times mpn_mul_fft_full used as a squaring: s->xp is passed as both
      operands.  */
   1437 double
   1438 speed_mpn_mul_fft_full_sqr (struct speed_params *s)
   1439 {
   1440   SPEED_ROUTINE_MPN_SQR_CALL
   1441     (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
   1442 }
   1443 #endif
   1444 
   1445 /* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
   1446    used as k, otherwise the best k for the size is used.  If s->size isn't a
   1447    multiple of 2^k it's rounded up to make the effective operation size.  */
   1448 
   /* Expands to a complete brace block for a function taking
      "struct speed_params *s" and returning double.

      call  statement run s->reps times between speed_starttime and
            speed_endtime; it may use the locals wp (destination of pl+1
            limbs), pl (result of mpn_fft_next_size) and k declared here.
      sqr   passed to mpn_fft_best_k when s->r is 0; when nonzero, s->yp
            is not registered as a source operand.

      The block returns early via SPEED_RESTRICT_COND unless s->size >= 1,
      and otherwise returns the elapsed time t after TMP_FREE.  The loop
      is do/while with an unsigned counter, so call runs at least once.  */
   1449 #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
   1450   {                                                     \
   1451     mp_ptr     wp;                                      \
   1452     mp_size_t  pl;                                      \
   1453     int        k;                                       \
   1454     unsigned   i;                                       \
   1455     double     t;                                       \
   1456     TMP_DECL;                                           \
   1457 							\
   1458     SPEED_RESTRICT_COND (s->size >= 1);                 \
   1459 							\
   1460     if (s->r != 0)                                      \
   1461       k = s->r;                                         \
   1462     else                                                \
   1463       k = mpn_fft_best_k (s->size, sqr);                \
   1464 							\
   1465     TMP_MARK;                                           \
   1466     pl = mpn_fft_next_size (s->size, k);                \
   1467     SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp);      \
   1468 							\
   1469     speed_operand_src (s, s->xp, s->size);              \
   1470     if (!sqr)                                           \
   1471       speed_operand_src (s, s->yp, s->size);            \
   1472     speed_operand_dst (s, wp, pl+1);                    \
   1473     speed_cache_fill (s);                               \
   1474 							\
   1475     speed_starttime ();                                 \
   1476     i = s->reps;                                        \
   1477     do                                                  \
   1478       call;                                             \
   1479     while (--i != 0);                                   \
   1480     t = speed_endtime ();                               \
   1481 							\
   1482     TMP_FREE;                                           \
   1483     return t;                                           \
   1484   }
   1485 
   /* Times mpn_mul_fft on s->xp times s->yp (s->size limbs each) using
      SPEED_ROUTINE_MPN_MUL_FFT_CALL with sqr = 0; wp, pl and k are the
      locals declared by that macro.  */
   1486 double
   1487 speed_mpn_mul_fft (struct speed_params *s)
   1488 {
   1489   SPEED_ROUTINE_MPN_MUL_FFT_CALL
   1490     (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
   1491 }
   1492 
   /* Times mpn_mul_fft used as a squaring (s->xp passed as both operands)
      using SPEED_ROUTINE_MPN_MUL_FFT_CALL with sqr = 1; wp, pl and k are
      the locals declared by that macro.  */
   1493 double
   1494 speed_mpn_mul_fft_sqr (struct speed_params *s)
   1495 {
   1496   SPEED_ROUTINE_MPN_MUL_FFT_CALL
   1497     (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
   1498 }
   1499 
   /* Times mpn_fft_mul on s->xp times s->yp, each s->size limbs.  wp is
      not declared here; presumably SPEED_ROUTINE_MPN_MUL_N_CALL provides
      it.  */
   1500 double
   1501 speed_mpn_fft_mul (struct speed_params *s)
   1502 {
   1503   SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size));
   1504 }
   1505 
   /* Times mpn_fft_mul used as a squaring: s->xp is passed as both
      operands.  wp is not declared here; presumably
      SPEED_ROUTINE_MPN_SQR_CALL provides it.  */
   1506 double
   1507 speed_mpn_fft_sqr (struct speed_params *s)
   1508 {
   1509   SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size));
   1510 }
   1511 
   /* Timing wrapper for mpn_sqrlo; SPEED_ROUTINE_MPN_SQRLO (defined
      elsewhere) supplies the body and the return value.  */
   1512 double
   1513 speed_mpn_sqrlo (struct speed_params *s)
   1514 {
   1515   SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo);
   1516 }
   /* Timing wrapper for mpn_sqrlo_basecase.  SPEED_RESTRICT_COND first
      limits s->size to the range that is ABOVE_THRESHOLD of
      MIN (3, SQRLO_BASECASE_THRESHOLD) and BELOW_THRESHOLD of
      SQRLO_DC_THRESHOLD; presumably other sizes are rejected by an early
      return, as that macro's name suggests -- confirm in speed.h.  */
   1517 double
   1518 speed_mpn_sqrlo_basecase (struct speed_params *s)
   1519 {
   1520   SPEED_RESTRICT_COND (ABOVE_THRESHOLD (s->size, MIN (3, SQRLO_BASECASE_THRESHOLD))
   1521 		       && BELOW_THRESHOLD (s->size, SQRLO_DC_THRESHOLD));
   1522   SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo_basecase);
   1523 }
   /* Timing wrapper for mpn_mullo_n; SPEED_ROUTINE_MPN_MULLO_N (defined
      elsewhere) supplies the body and the return value.  */
   1524 double
   1525 speed_mpn_mullo_n (struct speed_params *s)
   1526 {
   1527   SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n);
   1528 }
   /* Timing wrapper for mpn_mullo_basecase;
      SPEED_ROUTINE_MPN_MULLO_BASECASE (defined elsewhere) supplies the
      body and the return value.  */
   1529 double
   1530 speed_mpn_mullo_basecase (struct speed_params *s)
   1531 {
   1532   SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase);
   1533 }
   1534 
   /* Timing wrapper for mpn_mulmid_basecase; SPEED_ROUTINE_MPN_MULMID
      (defined elsewhere) supplies the body and the return value.  */
   1535 double
   1536 speed_mpn_mulmid_basecase (struct speed_params *s)
   1537 {
   1538   SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase);
   1539 }
   1540 
   /* Timing wrapper for mpn_mulmid; SPEED_ROUTINE_MPN_MULMID (defined
      elsewhere) supplies the body and the return value.  */
   1541 double
   1542 speed_mpn_mulmid (struct speed_params *s)
   1543 {
   1544   SPEED_ROUTINE_MPN_MULMID (mpn_mulmid);
   1545 }
   1546 
   /* Timing wrapper for mpn_mulmid_n; SPEED_ROUTINE_MPN_MULMID_N (defined
      elsewhere) supplies the body and the return value.  */
   1547 double
   1548 speed_mpn_mulmid_n (struct speed_params *s)
   1549 {
   1550   SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n);
   1551 }
   1552 
   /* Timing wrapper for mpn_toom42_mulmid; SPEED_ROUTINE_MPN_TOOM42_MULMID
      (defined elsewhere) supplies the body and the return value.  */
   1553 double
   1554 speed_mpn_toom42_mulmid (struct speed_params *s)
   1555 {
   1556   SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid);
   1557 }
   1558 
   /* Times mpn_mulmod_bnm1 with s->size passed as the result size and as
      the size of both operands s->xp and s->yp.  wp and tp are not
      declared here; presumably SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL
      provides the destination and scratch space.  */
   1559 double
   1560 speed_mpn_mulmod_bnm1 (struct speed_params *s)
   1561 {
   1562   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp));
   1563 }
   1564 
   /* Times mpn_bc_mulmod_bnm1 on s->xp and s->yp with size s->size.  wp
      and tp are not declared here; presumably
      SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL provides the destination and
      scratch space.  */
   1565 double
   1566 speed_mpn_bc_mulmod_bnm1 (struct speed_params *s)
   1567 {
   1568   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp));
   1569 }
   1570 
   /* Times mpn_mulmod_bnm1 through SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED
      (defined elsewhere); judging by the name the operation size is
      rounded by that macro -- confirm in speed.h.  */
   1571 double
   1572 speed_mpn_mulmod_bnm1_rounded (struct speed_params *s)
   1573 {
   1574   SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1);
   1575 }
   1576 
   /* Times mpn_sqrmod_bnm1 with s->size passed as the result size and as
      the size of the single operand s->xp.  It reuses
      SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL, which presumably provides wp and
      tp.  */
   1577 double
   1578 speed_mpn_sqrmod_bnm1 (struct speed_params *s)
   1579 {
   1580   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp));
   1581 }
   1582 
   /* Times mpn_matrix22_mul with every matrix entry s->size limbs long.

      a and b each hold four consecutive s->size-limb entries: the first
      is copied from s->xp (resp. s->yp) and the other three are filled
      by mpn_random.  r holds four entries spaced 2*s->size+1 limbs
      apart.  tp is scratch sized by mpn_matrix22_mul_itch.

      Returns the elapsed time reported by speed_endtime for s->reps
      iterations.  The loop is do/while with an unsigned counter, so the
      body runs at least once.  */
   1583 double
   1584 speed_mpn_matrix22_mul (struct speed_params *s)
   1585 {
   1586   /* Speed params only includes 2 inputs, so we have to invent the
   1587      other 6. */
   1588 
   1589   mp_ptr a;
   1590   mp_ptr r;
   1591   mp_ptr b;
   1592   mp_ptr tp;
   1593   mp_size_t itch;
   1594   unsigned i;
   1595   double t;
   1596   TMP_DECL;
   1597 
   1598   TMP_MARK;
   1599   SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp);
   1600   SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp);
   1601   SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp);
   1602 
   1603   MPN_COPY (a, s->xp, s->size);
   1604   mpn_random (a + s->size, 3 * s->size);
   1605   MPN_COPY (b, s->yp, s->size);
   1606   mpn_random (b + s->size, 3 * s->size);
   1607 
   1608   itch = mpn_matrix22_mul_itch (s->size, s->size);
   1609   SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);
   1610 
   1611   speed_operand_src (s, a, 4 * s->size);
   1612   speed_operand_src (s, b, 4 * s->size);
   1613   speed_operand_dst (s, r, 8 * s->size + 4);
   1614   speed_operand_dst (s, tp, itch);
   1615   speed_cache_fill (s);
   1616 
   1617   speed_starttime ();
   1618   i = s->reps;
   1619   do
   1620     {
             /* r is reloaded from a on every repetition, so these four
                copies are part of the measured time.  NOTE(review):
                presumably needed because mpn_matrix22_mul overwrites its
                first matrix in place -- confirm.  */
   1621       mp_size_t sz = s->size;
   1622       MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz);
   1623       MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz);
   1624       MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz);
   1625       MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz);
   1626       mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz,
   1627 			b, b + 1 * sz,     b + 2 * sz,     b + 3 * sz,     sz,
   1628 			tp);
   1629     }
   1630   while (--i != 0);
   1631   t = speed_endtime();
   1632   TMP_FREE;
   1633   return t;
   1634 }
   1635 
   /* Timing wrapper for mpn_hgcd2; SPEED_ROUTINE_MPN_HGCD2 (defined
      elsewhere) supplies the body and the return value.  */
   1636 double
   1637 speed_mpn_hgcd2 (struct speed_params *s)
   1638 {
   1639   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2);
   1640 }
   /* Timing wrapper for the mpn_hgcd2_1 variant, measured with the same
      SPEED_ROUTINE_MPN_HGCD2 macro (defined elsewhere) as mpn_hgcd2.  */
   1641 double
   1642 speed_mpn_hgcd2_1 (struct speed_params *s)
   1643 {
   1644   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_1);
   1645 }
   /* Timing wrapper for the mpn_hgcd2_2 variant, measured with the same
      SPEED_ROUTINE_MPN_HGCD2 macro (defined elsewhere) as mpn_hgcd2.  */
   1646 double
   1647 speed_mpn_hgcd2_2 (struct speed_params *s)
   1648 {
   1649   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_2);
   1650 }
   /* Timing wrapper for the mpn_hgcd2_3 variant, measured with the same
      SPEED_ROUTINE_MPN_HGCD2 macro (defined elsewhere) as mpn_hgcd2.  */
   1651 double
   1652 speed_mpn_hgcd2_3 (struct speed_params *s)
   1653 {
   1654   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_3);
   1655 }
   /* Timing wrapper for the mpn_hgcd2_4 variant, measured with the same
      SPEED_ROUTINE_MPN_HGCD2 macro (defined elsewhere) as mpn_hgcd2.  */
   1656 double
   1657 speed_mpn_hgcd2_4 (struct speed_params *s)
   1658 {
   1659   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_4);
   1660 }
   /* Timing wrapper for the mpn_hgcd2_5 variant, measured with the same
      SPEED_ROUTINE_MPN_HGCD2 macro (defined elsewhere) as mpn_hgcd2.  */
   1661 double
   1662 speed_mpn_hgcd2_5 (struct speed_params *s)
   1663 {
   1664   SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_5);
   1665 }
   1666 
   /* Timing wrapper for mpn_hgcd.  SPEED_ROUTINE_MPN_HGCD_CALL (defined
      elsewhere) receives the function and mpn_hgcd_itch, presumably its
      scratch-size companion -- confirm against the macro.  */
   1667 double
   1668 speed_mpn_hgcd (struct speed_params *s)
   1669 {
   1670   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd, mpn_hgcd_itch);
   1671 }
   1672 
   /* Timing wrapper for mpn_hgcd_lehmer.  SPEED_ROUTINE_MPN_HGCD_CALL
      (defined elsewhere) receives the function and mpn_hgcd_lehmer_itch,
      presumably its scratch-size companion -- confirm against the macro.  */
   1673 double
   1674 speed_mpn_hgcd_lehmer (struct speed_params *s)
   1675 {
   1676   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch);
   1677 }
   1678 
   /* Timing wrapper for mpn_hgcd_appr.  SPEED_ROUTINE_MPN_HGCD_CALL
      (defined elsewhere) receives the function and mpn_hgcd_appr_itch,
      presumably its scratch-size companion -- confirm against the macro.  */
   1679 double
   1680 speed_mpn_hgcd_appr (struct speed_params *s)
   1681 {
   1682   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch);
   1683 }
   1684 
   /* Timing wrapper for mpn_hgcd_appr_lehmer.  SPEED_ROUTINE_MPN_HGCD_CALL
      (defined elsewhere) receives the function and
      mpn_hgcd_appr_lehmer_itch, presumably its scratch-size companion --
      confirm against the macro.  */
   1685 double
   1686 speed_mpn_hgcd_appr_lehmer (struct speed_params *s)
   1687 {
   1688   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch);
   1689 }
   1690 
   /* Timing wrapper for mpn_hgcd_reduce.  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL
      (defined elsewhere) receives the function and mpn_hgcd_reduce_itch,
      presumably its scratch-size companion -- confirm against the macro.  */
   1691 double
   1692 speed_mpn_hgcd_reduce (struct speed_params *s)
   1693 {
   1694   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch);
   1695 }
   /* Timing wrapper for the mpn_hgcd_reduce_1 variant, passed to
      SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (defined elsewhere) together with
      mpn_hgcd_reduce_1_itch, presumably its scratch-size companion.  */
   1696 double
   1697 speed_mpn_hgcd_reduce_1 (struct speed_params *s)
   1698 {
   1699   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch);
   1700 }
   /* Timing wrapper for the mpn_hgcd_reduce_2 variant, passed to
      SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (defined elsewhere) together with
      mpn_hgcd_reduce_2_itch, presumably its scratch-size companion.  */
   1701 double
   1702 speed_mpn_hgcd_reduce_2 (struct speed_params *s)
   1703 {
   1704   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch);
   1705 }
   1706 
   /* Timing wrapper for mpn_gcd; SPEED_ROUTINE_MPN_GCD (defined elsewhere)
      supplies the body and the return value.  */
   1707 double
   1708 speed_mpn_gcd (struct speed_params *s)
   1709 {
   1710   SPEED_ROUTINE_MPN_GCD (mpn_gcd);
   1711 }
   1712 
   /* Timing wrapper for mpn_gcdext; SPEED_ROUTINE_MPN_GCDEXT (defined
      elsewhere) supplies the body and the return value.  */
   1713 double
   1714 speed_mpn_gcdext (struct speed_params *s)
   1715 {
   1716   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
   1717 }
   1718 #if 0
   /* Disabled by the surrounding "#if 0", so never compiled.  It would
      time __gmpn_gcdext_lehmer through SPEED_ROUTINE_MPN_GCDEXT.  */
   1719 double
   1720 speed_mpn_gcdext_lehmer (struct speed_params *s)
   1721 {
   1722   SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
   1723 }
   1724 #endif
   /* Timing wrapper for mpn_gcdext_single; SPEED_ROUTINE_MPN_GCDEXT
      (defined elsewhere) supplies the body and the return value.  */
   1725 double
   1726 speed_mpn_gcdext_single (struct speed_params *s)
   1727 {
   1728   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
   1729 }
   /* Timing wrapper for mpn_gcdext_double; SPEED_ROUTINE_MPN_GCDEXT
      (defined elsewhere) supplies the body and the return value.  */
   1730 double
   1731 speed_mpn_gcdext_double (struct speed_params *s)
   1732 {
   1733   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
   1734 }
   /* Timing wrapper for mpn_gcdext_one_single; SPEED_ROUTINE_MPN_GCDEXT_ONE
      (defined elsewhere) supplies the body and the return value.  */
   1735 double
   1736 speed_mpn_gcdext_one_single (struct speed_params *s)
   1737 {
   1738   SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
   1739 }
   /* Timing wrapper for mpn_gcdext_one_double; SPEED_ROUTINE_MPN_GCDEXT_ONE
      (defined elsewhere) supplies the body and the return value.  */
   1740 double
   1741 speed_mpn_gcdext_one_double (struct speed_params *s)
   1742 {
   1743   SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
   1744 }
   /* Timing wrapper for mpn_gcd_1; SPEED_ROUTINE_MPN_GCD_1 (defined
      elsewhere) supplies the body and the return value.  */
   1745 double
   1746 speed_mpn_gcd_1 (struct speed_params *s)
   1747 {
   1748   SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
   1749 }
   /* Timing wrapper for mpn_gcd_11; SPEED_ROUTINE_MPN_GCD_11 (defined
      elsewhere) supplies the body and the return value.  */
   1750 double
   1751 speed_mpn_gcd_11 (struct speed_params *s)
   1752 {
   1753   SPEED_ROUTINE_MPN_GCD_11 (mpn_gcd_11);
   1754 }
   /* Also times mpn_gcd_1, but through SPEED_ROUTINE_MPN_GCD_1N rather
      than SPEED_ROUTINE_MPN_GCD_1; presumably the macros differ in
      operand sizes -- confirm in speed.h.  */
   1755 double
   1756 speed_mpn_gcd_1N (struct speed_params *s)
   1757 {
   1758   SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
   1759 }
   /* Timing wrapper for mpn_gcd_22; SPEED_ROUTINE_MPN_GCD_22 (defined
      elsewhere) supplies the body and the return value.  */
   1760 double
   1761 speed_mpn_gcd_22 (struct speed_params *s)
   1762 {
   1763   SPEED_ROUTINE_MPN_GCD_22 (mpn_gcd_22);
   1764 }
   1765 
   /* Timing wrapper for mpz_nextprime; SPEED_ROUTINE_MPZ_NEXTPRIME
      (defined elsewhere) supplies the body and the return value.  */
   1766 double
   1767 speed_mpz_nextprime (struct speed_params *s)
   1768 {
   1769   SPEED_ROUTINE_MPZ_NEXTPRIME (mpz_nextprime);
   1770 }
   1771 
   /* Timing wrapper for mpz_jacobi; SPEED_ROUTINE_MPZ_JACOBI (defined
      elsewhere) supplies the body and the return value.  */
   1772 double
   1773 speed_mpz_jacobi (struct speed_params *s)
   1774 {
   1775   SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
   1776 }
   /* Timing wrapper for mpn_jacobi_base; SPEED_ROUTINE_MPN_JACBASE
      (defined elsewhere) supplies the body and the return value.  */
   1777 double
   1778 speed_mpn_jacobi_base (struct speed_params *s)
   1779 {
   1780   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
   1781 }
   /* Timing wrapper for the mpn_jacobi_base_1 variant, measured with the
      same SPEED_ROUTINE_MPN_JACBASE macro as mpn_jacobi_base.  */
   1782 double
   1783 speed_mpn_jacobi_base_1 (struct speed_params *s)
   1784 {
   1785   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
   1786 }
   /* Timing wrapper for the mpn_jacobi_base_2 variant, measured with the
      same SPEED_ROUTINE_MPN_JACBASE macro as mpn_jacobi_base.  */
   1787 double
   1788 speed_mpn_jacobi_base_2 (struct speed_params *s)
   1789 {
   1790   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
   1791 }
   /* Timing wrapper for the mpn_jacobi_base_3 variant, measured with the
      same SPEED_ROUTINE_MPN_JACBASE macro as mpn_jacobi_base.  */
   1792 double
   1793 speed_mpn_jacobi_base_3 (struct speed_params *s)
   1794 {
   1795   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
   1796 }
   /* Timing wrapper for the mpn_jacobi_base_4 variant, measured with the
      same SPEED_ROUTINE_MPN_JACBASE macro as mpn_jacobi_base.  */
   1797 double
   1798 speed_mpn_jacobi_base_4 (struct speed_params *s)
   1799 {
   1800   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4);
   1801 }
   1802 
   1803 
   /* Times mpn_sqrtrem on s->xp (s->size limbs) with both outputs
      requested.  wp and wp2 are not declared here; presumably
      SPEED_ROUTINE_MPN_SQRTROOT_CALL provides them.  */
   1804 double
   1805 speed_mpn_sqrtrem (struct speed_params *s)
   1806 {
   1807   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, wp2, s->xp, s->size));
   1808 }
   1809 
   /* Times mpn_sqrtrem on s->xp (s->size limbs) with NULL passed for the
      second output pointer.  wp is not declared here; presumably
      SPEED_ROUTINE_MPN_SQRTROOT_CALL provides it.  */
   1810 double
   1811 speed_mpn_sqrt (struct speed_params *s)
   1812 {
   1813   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, NULL, s->xp, s->size));
   1814 }
   1815 
   /* Times mpn_rootrem on s->xp (s->size limbs) with s->r as the last
      argument and both outputs requested.  wp and wp2 are not declared
      here; presumably SPEED_ROUTINE_MPN_SQRTROOT_CALL provides them.  */
   1816 double
   1817 speed_mpn_rootrem (struct speed_params *s)
   1818 {
   1819   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, wp2, s->xp, s->size, s->r));
   1820 }
   1821 
   /* Times mpn_rootrem on s->xp (s->size limbs) with s->r as the last
      argument and NULL for the second output pointer.  wp is not declared
      here; presumably SPEED_ROUTINE_MPN_SQRTROOT_CALL provides it.  */
   1822 double
   1823 speed_mpn_root (struct speed_params *s)
   1824 {
   1825   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, NULL, s->xp, s->size, s->r));
   1826 }
   1827 
   1828 
   /* Timing wrapper for mpn_perfect_power_p; SPEED_ROUTINE_MPN_PERFECT_POWER
      (defined elsewhere) supplies the body and the return value.  */
   1829 double
   1830 speed_mpn_perfect_power_p (struct speed_params *s)
   1831 {
   1832   SPEED_ROUTINE_MPN_PERFECT_POWER (mpn_perfect_power_p);
   1833 }
   1834 
   /* Timing wrapper for mpn_perfect_square_p;
      SPEED_ROUTINE_MPN_PERFECT_SQUARE (defined elsewhere) supplies the
      body and the return value.  */
   1835 double
   1836 speed_mpn_perfect_square_p (struct speed_params *s)
   1837 {
   1838   SPEED_ROUTINE_MPN_PERFECT_SQUARE (mpn_perfect_square_p);
   1839 }
   1840 
   1841 
   /* Timing wrapper for mpz_fac_ui; SPEED_ROUTINE_MPZ_FAC_UI (defined
      elsewhere) supplies the body and the return value.  */
   1842 double
   1843 speed_mpz_fac_ui (struct speed_params *s)
   1844 {
   1845   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
   1846 }
   1847 
   /* Timing wrapper for mpz_2fac_ui; SPEED_ROUTINE_MPZ_UI (defined
      elsewhere) supplies the body and the return value.  */
   1848 double
   1849 speed_mpz_2fac_ui (struct speed_params *s)
   1850 {
   1851   SPEED_ROUTINE_MPZ_UI (mpz_2fac_ui);
   1852 }
   1853 
   /* Timing wrapper for mpz_primorial_ui; SPEED_ROUTINE_MPZ_UI (defined
      elsewhere) supplies the body and the return value.  */
   1854 double
   1855 speed_mpz_primorial_ui (struct speed_params *s)
   1856 {
   1857   SPEED_ROUTINE_MPZ_UI (mpz_primorial_ui);
   1858 }
   1859 
   1860 
   /* Timing wrapper for mpn_fib2_ui; SPEED_ROUTINE_MPN_FIB2_UI (defined
      elsewhere) supplies the body and the return value.  */
   1861 double
   1862 speed_mpn_fib2_ui (struct speed_params *s)
   1863 {
   1864   SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
   1865 }
   /* Timing wrapper for mpz_fib_ui; SPEED_ROUTINE_MPZ_FIB_UI (defined
      elsewhere) supplies the body and the return value.  */
   1866 double
   1867 speed_mpz_fib_ui (struct speed_params *s)
   1868 {
   1869   SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
   1870 }
   /* Timing wrapper for mpz_fib2_ui; SPEED_ROUTINE_MPZ_FIB2_UI (defined
      elsewhere) supplies the body and the return value.  */
   1871 double
   1872 speed_mpz_fib2_ui (struct speed_params *s)
   1873 {
   1874   SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
   1875 }
   /* Timing wrapper for mpz_lucnum_ui; SPEED_ROUTINE_MPZ_LUCNUM_UI
      (defined elsewhere) supplies the body and the return value.  */
   1876 double
   1877 speed_mpz_lucnum_ui (struct speed_params *s)
   1878 {
   1879   SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
   1880 }
   /* Timing wrapper for mpz_lucnum2_ui; SPEED_ROUTINE_MPZ_LUCNUM2_UI
      (defined elsewhere) supplies the body and the return value.  */
   1881 double
   1882 speed_mpz_lucnum2_ui (struct speed_params *s)
   1883 {
   1884   SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
   1885 }
   1886 
   1887 
   /* Timing wrapper for mpz_powm; SPEED_ROUTINE_MPZ_POWM (defined
      elsewhere) supplies the body and the return value.  */
   1888 double
   1889 speed_mpz_powm (struct speed_params *s)
   1890 {
   1891   SPEED_ROUTINE_MPZ_POWM (mpz_powm);
   1892 }
   /* Timing wrapper for mpz_powm_mod, measured with the same
      SPEED_ROUTINE_MPZ_POWM macro (defined elsewhere) as mpz_powm.  */
   1893 double
   1894 speed_mpz_powm_mod (struct speed_params *s)
   1895 {
   1896   SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
   1897 }
   /* Timing wrapper for mpz_powm_redc, measured with the same
      SPEED_ROUTINE_MPZ_POWM macro (defined elsewhere) as mpz_powm.  */
   1898 double
   1899 speed_mpz_powm_redc (struct speed_params *s)
   1900 {
   1901   SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
   1902 }
   /* Timing wrapper for mpz_powm_sec, measured with the same
      SPEED_ROUTINE_MPZ_POWM macro (defined elsewhere) as mpz_powm.  */
   1903 double
   1904 speed_mpz_powm_sec (struct speed_params *s)
   1905 {
   1906   SPEED_ROUTINE_MPZ_POWM (mpz_powm_sec);
   1907 }
   /* Timing wrapper for mpz_powm_ui; SPEED_ROUTINE_MPZ_POWM_UI (defined
      elsewhere) supplies the body and the return value.  */
   1908 double
   1909 speed_mpz_powm_ui (struct speed_params *s)
   1910 {
   1911   SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
   1912 }
   1913 
   1914 
   /* Timing wrapper for binvert_limb; SPEED_ROUTINE_MODLIMB_INVERT
      (defined elsewhere) supplies the body and the return value.  */
   1915 double
   1916 speed_binvert_limb (struct speed_params *s)
   1917 {
   1918   SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
   1919 }
   1920 
   1921 
   /* Times s->reps calls to noop () and returns the value of
      speed_endtime.  The loop is do/while with an unsigned counter, so
      noop is called at least once and s->reps must be nonzero (a zero
      count would wrap around).  */
   1922 double
   1923 speed_noop (struct speed_params *s)
   1924 {
   1925   unsigned  i;
   1926 
   1927   speed_starttime ();
   1928   i = s->reps;
   1929   do
   1930     noop ();
   1931   while (--i != 0);
   1932   return speed_endtime ();
   1933 }
   1934 
   /* Times s->reps calls to noop_wxs (wp, s->xp, s->size) and returns the
      elapsed time.  Only one limb is allocated for wp even though s->size
      is passed; presumably noop_wxs does not touch its operands --
      confirm.  s->reps must be nonzero because the do/while counter is
      unsigned.  */
   1935 double
   1936 speed_noop_wxs (struct speed_params *s)
   1937 {
   1938   mp_ptr   wp;
   1939   unsigned i;
   1940   double   t;
   1941   TMP_DECL;
   1942 
   1943   TMP_MARK;
   1944   wp = TMP_ALLOC_LIMBS (1);
   1945 
   1946   speed_starttime ();
   1947   i = s->reps;
   1948   do
   1949     noop_wxs (wp, s->xp, s->size);
   1950   while (--i != 0);
   1951   t = speed_endtime ();
   1952 
   1953   TMP_FREE;
   1954   return t;
   1955 }
   1956 
   /* Times s->reps calls to noop_wxys (wp, s->xp, s->yp, s->size) and
      returns the elapsed time.  Only one limb is allocated for wp even
      though s->size is passed; presumably noop_wxys does not touch its
      operands -- confirm.  s->reps must be nonzero because the do/while
      counter is unsigned.  */
   1957 double
   1958 speed_noop_wxys (struct speed_params *s)
   1959 {
   1960   mp_ptr   wp;
   1961   unsigned i;
   1962   double   t;
   1963   TMP_DECL;
   1964 
   1965   TMP_MARK;
   1966   wp = TMP_ALLOC_LIMBS (1);
   1967 
   1968   speed_starttime ();
   1969   i = s->reps;
   1970   do
   1971     noop_wxys (wp, s->xp, s->yp, s->size);
   1972   while (--i != 0);
   1973   t = speed_endtime ();
   1974 
   1975   TMP_FREE;
   1976   return t;
   1977 }
   1978 
   1979 
   /* Expands to a brace block for a function taking
      "struct speed_params *s" and returning double.

      variables  a declaration placed before the timed region.
      calls      statements executed s->reps times between
                 speed_starttime and speed_endtime.

      The block returns the value of speed_endtime.  The loop is do/while
      with an unsigned counter, so calls runs at least once.  */
   1980 #define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
   1981   {                                                     \
   1982     unsigned  i;                                        \
   1983     variables;                                          \
   1984 							\
   1985     speed_starttime ();                                 \
   1986     i = s->reps;                                        \
   1987     do                                                  \
   1988       {                                                 \
   1989 	calls;                                          \
   1990       }                                                 \
   1991     while (--i != 0);                                   \
   1992     return speed_endtime ();                            \
   1993   }
   1994 
   1995 
   1996 /* Compare these to see how much malloc/free costs and then how much
   1997    __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
   1998    mpq_init/clear will be doing a 1 limb allocate, so use that as the size
   1999    when including them in comparisons.  */
   2000 
   /* Times a malloc of s->size * GMP_LIMB_BYTES bytes followed at once by
      free, repeated s->reps times by SPEED_ROUTINE_ALLOC_FREE.  The
      malloc result is not checked.  */
   2001 double
   2002 speed_malloc_free (struct speed_params *s)
   2003 {
   2004   size_t  bytes = s->size * GMP_LIMB_BYTES;
   2005   SPEED_ROUTINE_ALLOC_FREE (void *p,
   2006 			    p = malloc (bytes);
   2007 			    free (p));
   2008 }
   2009 
   /* Times malloc of GMP_LIMB_BYTES bytes, realloc to
      s->size * GMP_LIMB_BYTES bytes, then free, repeated s->reps times by
      SPEED_ROUTINE_ALLOC_FREE.  Neither allocation result is checked.  */
   2010 double
   2011 speed_malloc_realloc_free (struct speed_params *s)
   2012 {
   2013   size_t  bytes = s->size * GMP_LIMB_BYTES;
   2014   SPEED_ROUTINE_ALLOC_FREE (void *p,
   2015 			    p = malloc (GMP_LIMB_BYTES);
   2016 			    p = realloc (p, bytes);
   2017 			    free (p));
   2018 }
   2019 
   /* Times an allocation of s->size * GMP_LIMB_BYTES bytes through
      __gmp_allocate_func followed by release through __gmp_free_func with
      the same byte count, repeated s->reps times.  */
   2020 double
   2021 speed_gmp_allocate_free (struct speed_params *s)
   2022 {
   2023   size_t  bytes = s->size * GMP_LIMB_BYTES;
   2024   SPEED_ROUTINE_ALLOC_FREE (void *p,
   2025 			    p = (*__gmp_allocate_func) (bytes);
   2026 			    (*__gmp_free_func) (p, bytes));
   2027 }
   2028 
   2029 double
   2030 speed_gmp_allocate_reallocate_free (struct speed_params *s)
   2031 {
   2032   size_t  bytes = s->size * GMP_LIMB_BYTES;
   2033   SPEED_ROUTINE_ALLOC_FREE
   2034     (void *p,
   2035      p = (*__gmp_allocate_func) (GMP_LIMB_BYTES);
   2036      p = (*__gmp_reallocate_func) (p, bytes, GMP_LIMB_BYTES);
   2037      (*__gmp_free_func) (p, bytes));
   2038 }
   2039 
   /* Times an mpz_init immediately followed by mpz_clear on a local
      mpz_t, repeated s->reps times by SPEED_ROUTINE_ALLOC_FREE.  s->size
      is not used.  */
   2040 double
   2041 speed_mpz_init_clear (struct speed_params *s)
   2042 {
   2043   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
   2044 			    mpz_init (z);
   2045 			    mpz_clear (z));
   2046 }
   2047 
   /* Times mpz_init, _mpz_realloc to s->size limbs, then mpz_clear on a
      local mpz_t, repeated s->reps times by SPEED_ROUTINE_ALLOC_FREE.  */
   2048 double
   2049 speed_mpz_init_realloc_clear (struct speed_params *s)
   2050 {
   2051   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
   2052 			    mpz_init (z);
   2053 			    _mpz_realloc (z, s->size);
   2054 			    mpz_clear (z));
   2055 }
   2056 
   /* Times an mpq_init immediately followed by mpq_clear on a local
      mpq_t, repeated s->reps times by SPEED_ROUTINE_ALLOC_FREE.  s->size
      is not used.  */
   2057 double
   2058 speed_mpq_init_clear (struct speed_params *s)
   2059 {
   2060   SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
   2061 			    mpq_init (q);
   2062 			    mpq_clear (q));
   2063 }
   2064 
   /* Times an mpf_init immediately followed by mpf_clear on a local
      mpf_t, repeated s->reps times by SPEED_ROUTINE_ALLOC_FREE.  s->size
      is not used.  */
   2065 double
   2066 speed_mpf_init_clear (struct speed_params *s)
   2067 {
   2068   SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
   2069 			    mpf_init (f);
   2070 			    mpf_clear (f));
   2071 }
   2072 
   2073 
   2074 /* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
   2075    that repeatedly calling mpz_add with the same data gives branch prediction
   2076    in it an advantage.  */
   2077 
   /* Times mpz_add (w, x, y), where x and y are set from s->size limbs at
      s->xp and s->yp.  Returns the elapsed time for s->reps calls; the
      three mpz_t values are cleared before returning.  */
   2078 double
   2079 speed_mpz_add (struct speed_params *s)
   2080 {
   2081   mpz_t     w, x, y;
   2082   unsigned  i;
   2083   double    t;
   2084 
   2085   mpz_init (w);
   2086   mpz_init (x);
   2087   mpz_init (y);
   2088 
   2089   mpz_set_n (x, s->xp, s->size);
   2090   mpz_set_n (y, s->yp, s->size);
           /* One untimed call first.  NOTE(review): presumably so that w is
              already grown to its final size before timing -- confirm.  */
   2091   mpz_add (w, x, y);
   2092 
   2093   speed_starttime ();
   2094   i = s->reps;
   2095   do
   2096     {
   2097       mpz_add (w, x, y);
   2098     }
   2099   while (--i != 0);
   2100   t = speed_endtime ();
   2101 
   2102   mpz_clear (w);
   2103   mpz_clear (x);
   2104   mpz_clear (y);
   2105   return t;
   2106 }
   2107 
   2108 
   2109 /* Time the inverse of a k-limb value modulo an s->size-limb modulus, where k is (s->size)/2 when s->r is 0 and is otherwise derived from s->r.  */
   2110 
   /* Times mpz_invert (r, a, m).

      m is built from s->size limbs at s->yp with bit 0 set.  a is built
      from the first k limbs at s->xp, where k is s->size/2 when s->r is
      0, s->r when s->r is below GMP_LIMB_HIGHBIT, and otherwise
      s->size - (-s->r).  SPEED_RESTRICT_COND is applied with
      0 < k <= s->size before anything is initialised.

      Before timing, a is incremented until mpz_invert returns nonzero, so
      every timed call works on the same a and m as that first successful
      call.  Returns the elapsed time for s->reps calls.  */
   2111 double
   2112 speed_mpz_invert (struct speed_params *s)
   2113 {
   2114   mpz_t     a, m, r;
   2115   mp_size_t k;
   2116   unsigned  i;
   2117   double    t;
   2118 
   2119   if (s->r == 0)
   2120     k = s->size/2;
   2121   else if (s->r < GMP_LIMB_HIGHBIT)
   2122     k = s->r;
   2123   else /* s->r < 0 */
   2124     k = s->size - (-s->r);
   2125 
   2126   SPEED_RESTRICT_COND (k > 0 && k <= s->size);
   2127 
   2128   mpz_init_set_n (m, s->yp, s->size);
   2129   mpz_setbit (m, 0);	/* force m to odd */
   2130 
   2131   mpz_init_set_n (a, s->xp, k);
   2132 
   2133   mpz_init (r);
   2134   while (mpz_invert (r, a, m) == 0)
   2135     mpz_add_ui (a, a, 1);
   2136 
   2137   speed_starttime ();
   2138   i = s->reps;
   2139   do
   2140     mpz_invert (r, a, m);
   2141   while (--i != 0);
   2142   t = speed_endtime ();
   2143 
   2144   mpz_clear (r);
   2145   mpz_clear (a);
   2146   mpz_clear (m);
   2147   return t;
   2148   }
   2149 
   2150 /* If r==0, calculate binomial(size,size/2),
   2151    otherwise calculate binomial(size,r). */
   2152 
   2153 double
   2154 speed_mpz_bin_uiui (struct speed_params *s)
   2155 {
   2156   mpz_t          w;
   2157   unsigned long  k;
   2158   unsigned  i;
   2159   double    t;
   2160 
   2161   mpz_init (w);
   2162   if (s->r != 0)
   2163     k = s->r;
   2164   else
   2165     k = s->size/2;
   2166 
   2167   speed_starttime ();
   2168   i = s->reps;
   2169   do
   2170     {
   2171       mpz_bin_uiui (w, s->size, k);
   2172     }
   2173   while (--i != 0);
   2174   t = speed_endtime ();
   2175 
   2176   mpz_clear (w);
   2177   return t;
   2178 }
   2179 
   2180 /* If r==0, calculate binomial(2^size,size),
   2181    otherwise calculate binomial(2^size,r). */
   2182 
   2183 double
   2184 speed_mpz_bin_ui (struct speed_params *s)
   2185 {
   2186   mpz_t          w, x;
   2187   unsigned long  k;
   2188   unsigned  i;
   2189   double    t;
   2190 
   2191   mpz_init (w);
   2192   mpz_init_set_ui (x, 0);
   2193 
   2194   mpz_setbit (x, s->size);
   2195 
   2196   if (s->r != 0)
   2197     k = s->r;
   2198   else
   2199     k = s->size;
   2200 
   2201   speed_starttime ();
   2202   i = s->reps;
   2203   do
   2204     {
   2205       mpz_bin_ui (w, x, k);
   2206     }
   2207   while (--i != 0);
   2208   t = speed_endtime ();
   2209 
   2210   mpz_clear (w);
   2211   mpz_clear (x);
   2212   return t;
   2213 }
   2214 
   2215 /* If r==0, calculate mfac(size,log(size)),
   2216    otherwise calculate mfac(size,r). */
   2217 
   2218 double
   2219 speed_mpz_mfac_uiui (struct speed_params *s)
   2220 {
   2221   mpz_t          w;
   2222   unsigned long  k;
   2223   unsigned  i;
   2224   double    t;
   2225 
   2226   mpz_init (w);
   2227   if (s->r != 0)
   2228     k = s->r;
   2229   else
   2230     for (k = 1; s->size >> k; ++k);
   2231 
   2232   speed_starttime ();
   2233   i = s->reps;
   2234   do
   2235     {
   2236       mpz_mfac_uiui (w, s->size, k);
   2237     }
   2238   while (--i != 0);
   2239   t = speed_endtime ();
   2240 
   2241   mpz_clear (w);
   2242   return t;
   2243 }
   2244 
   2245 /* The multiplies are successively dependent so the latency is measured, not
   2246    the issue rate.  There's only 10 per loop so the code doesn't get too big
   2247    since umul_ppmm is several instructions on some cpus.
   2248 
   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
   will probably still show up as a bogus extra cycle though.
   2252 
   The measuring function macros are split into three parts to avoid
   overflowing preprocessor expansion space if umul_ppmm is big.
   2255 
   2256    Limitations:
   2257 
   2258    The default umul_ppmm doing h*l will be getting increasing numbers of
   2259    high zero bits in the calculation.  CPUs with data-dependent multipliers
   2260    will want to use umul_ppmm.1 to get some randomization into the
   2261    calculation.  The extra xors and fetches will be a slowdown of course.  */
   2262 
/* Part A: opens the function body, declares the locals (t, i, h, l) that
   the caller's multiply steps use, seeds h and l from s->xp[0] and
   s->yp[0], and opens the timed loop taken when s->r == 1.  The caller
   supplies the loop body (10 steps, matching time_divisor).  */
#define SPEED_MACRO_UMUL_PPMM_A \
  {                             \
    double     t;               \
    unsigned   i;               \
    mp_limb_t  h, l;            \
                                \
    h = s->xp[0];               \
    l = s->yp[0];               \
                                \
    s->time_divisor = 10;       \
                                \
    if (s->r == 1)              \
      {                         \
        speed_starttime ();     \
        i = s->reps;            \
        do                      \
          {

/* Part B: closes the s->r == 1 loop and opens the timed loop used for
   every other value of s->r.  */
#define SPEED_MACRO_UMUL_PPMM_B \
          }                     \
        while (--i != 0);       \
        t = speed_endtime ();   \
      }                         \
    else                        \
      {                         \
        speed_starttime ();     \
        i = s->reps;            \
        do                      \
          {

/* Part C: closes the second loop, passes h and l to noop_1 so the compiler
   cannot discard the computation, and returns the measured time.  */
#define SPEED_MACRO_UMUL_PPMM_C                                         \
          }                                                             \
        while (--i != 0);                                               \
        t = speed_endtime ();                                           \
      }                                                                 \
                                                                        \
    /* keep the results live so the multiplies are not optimized away */ \
    noop_1 (h);                                                         \
    noop_1 (l);                                                         \
                                                                        \
    return t;                                                           \
  }
   2305 
   2306 
   2307 double
   2308 speed_umul_ppmm (struct speed_params *s)
   2309 {
   2310   SPEED_MACRO_UMUL_PPMM_A;
   2311   {
   2312     umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
   2313      umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
   2314      umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
   2315     umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
   2316      umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
   2317      umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
   2318     umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
   2319      umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
   2320      umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
   2321     umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
   2322   }
   2323   SPEED_MACRO_UMUL_PPMM_B;
   2324   {
   2325     umul_ppmm (h, l, l, h);
   2326      umul_ppmm (h, l, l, h);
   2327      umul_ppmm (h, l, l, h);
   2328     umul_ppmm (h, l, l, h);
   2329      umul_ppmm (h, l, l, h);
   2330      umul_ppmm (h, l, l, h);
   2331     umul_ppmm (h, l, l, h);
   2332      umul_ppmm (h, l, l, h);
   2333      umul_ppmm (h, l, l, h);
   2334     umul_ppmm (h, l, l, h);
   2335   }
   2336   SPEED_MACRO_UMUL_PPMM_C;
   2337 }
   2338 
   2339 
#if HAVE_NATIVE_mpn_umul_ppmm
/* Same measurement as speed_umul_ppmm, but calling the native
   mpn_umul_ppmm, which returns the high limb and stores the low limb
   through its first argument.  */
double
speed_mpn_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    /* s->r == 1: xor block data into h and l after every call.  */
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[0];  l ^= s->yp_block[0];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[1];  l ^= s->yp_block[1];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[2];  l ^= s->yp_block[2];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[3];  l ^= s->yp_block[3];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[4];  l ^= s->yp_block[4];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[5];  l ^= s->yp_block[5];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[6];  l ^= s->yp_block[6];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[7];  l ^= s->yp_block[7];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[8];  l ^= s->yp_block[8];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[9];  l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    /* Any other s->r: ten dependent calls with nothing in between.  */
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif
   2373 
#if HAVE_NATIVE_mpn_umul_ppmm_r
/* Same measurement as speed_umul_ppmm, but calling the native
   mpn_umul_ppmm_r, which returns the high limb and stores the low limb
   through its last argument.  */
double
speed_mpn_umul_ppmm_r (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    /* s->r == 1: xor block data into h and l after every call.  */
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[0];  l ^= s->yp_block[0];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[1];  l ^= s->yp_block[1];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[2];  l ^= s->yp_block[2];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[3];  l ^= s->yp_block[3];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[4];  l ^= s->yp_block[4];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[5];  l ^= s->yp_block[5];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[6];  l ^= s->yp_block[6];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[7];  l ^= s->yp_block[7];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[8];  l ^= s->yp_block[8];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[9];  l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    /* Any other s->r: ten dependent calls with nothing in between.  */
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif
   2407 
   2408 
   2409 /* The divisions are successively dependent so latency is measured, not
   2410    issue rate.  There's only 10 per loop so the code doesn't get too big,
   2411    especially for udiv_qrnnd_preinv and preinv2norm, which are several
   2412    instructions each.
   2413 
   2414    Note that it's only the division which is measured here, there's no data
   2415    fetching and no shifting if the divisor gets normalized.
   2416 
   2417    In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
   2418    generate x86 div instructions with nothing in between.
   2419 
   2420    The measuring function macros are in two parts to avoid overflowing
   2421    preprocessor expansion space if udiv_qrnnd etc are big.
   2422 
   2423    Limitations:
   2424 
   2425    Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
   2426    generated first.
   2427 
   2428    CPUs with data-dependent divisions may want more attention paid to the
   2429    randomness of the data used.  Probably the measurement wanted is over
   2430    uniformly distributed numbers, but what's here might not be giving that.  */
   2431 
/* Part A: opens the function body, declares the locals (q, r, d, dinv, i,
   t) used by the caller's division steps, picks the divisor, optionally
   normalizes it and computes its inverse, then opens the timed loop.  The
   caller supplies the loop body (10 steps, matching time_divisor).  */
#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
  {                                                     \
    mp_limb_t  q, r, d;                                 \
    mp_limb_t  dinv;                                    \
    unsigned   i;                                       \
    double     t;                                       \
                                                        \
    s->time_divisor = 10;                               \
                                                        \
    /* the "r" parameter is the divisor; 0 selects a default */ \
    d = s->r;                                           \
    if (d == 0)                                         \
      d = mp_bases[10].big_base;                        \
                                                        \
    if (normalize)                                      \
      {                                                 \
        unsigned  norm;                                 \
        count_leading_zeros (norm, d);                  \
        d <<= norm;                                     \
        invert_limb (dinv, d);                          \
      }                                                 \
                                                        \
    /* starting remainder is reduced below the final d */ \
    r = s->yp[0] % d;                                   \
    q = s->xp[0];                                       \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      {

/* Part B: closes the timed loop, passes q and r to noop_1 so the compiler
   cannot discard the computation, and returns the measured time.  */
#define SPEED_ROUTINE_UDIV_QRNND_B                                      \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    /* keep the results live so the divisions are not optimized away */ \
    noop_1 (q);                                                         \
    noop_1 (r);                                                         \
                                                                        \
    return t;                                                           \
  }
   2473 
   2474 double
   2475 speed_udiv_qrnnd (struct speed_params *s)
   2476 {
   2477   SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
   2478   {
   2479     udiv_qrnnd (q, r, r, q, d);
   2480      udiv_qrnnd (q, r, r, q, d);
   2481      udiv_qrnnd (q, r, r, q, d);
   2482     udiv_qrnnd (q, r, r, q, d);
   2483      udiv_qrnnd (q, r, r, q, d);
   2484      udiv_qrnnd (q, r, r, q, d);
   2485     udiv_qrnnd (q, r, r, q, d);
   2486      udiv_qrnnd (q, r, r, q, d);
   2487      udiv_qrnnd (q, r, r, q, d);
   2488     udiv_qrnnd (q, r, r, q, d);
   2489   }
   2490   SPEED_ROUTINE_UDIV_QRNND_B;
   2491 }
   2492 
   2493 double
   2494 speed_udiv_qrnnd_c (struct speed_params *s)
   2495 {
   2496   SPEED_ROUTINE_UDIV_QRNND_A (1);
   2497   {
   2498     __udiv_qrnnd_c (q, r, r, q, d);
   2499      __udiv_qrnnd_c (q, r, r, q, d);
   2500      __udiv_qrnnd_c (q, r, r, q, d);
   2501     __udiv_qrnnd_c (q, r, r, q, d);
   2502      __udiv_qrnnd_c (q, r, r, q, d);
   2503      __udiv_qrnnd_c (q, r, r, q, d);
   2504     __udiv_qrnnd_c (q, r, r, q, d);
   2505      __udiv_qrnnd_c (q, r, r, q, d);
   2506      __udiv_qrnnd_c (q, r, r, q, d);
   2507     __udiv_qrnnd_c (q, r, r, q, d);
   2508   }
   2509   SPEED_ROUTINE_UDIV_QRNND_B;
   2510 }
   2511 
#if HAVE_NATIVE_mpn_udiv_qrnnd
/* Ten dependent calls to the native mpn_udiv_qrnnd per loop, always with a
   normalized divisor.  The quotient is returned and the remainder stored
   through the first argument.  */
double
speed_mpn_udiv_qrnnd (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif
   2532 
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
/* Ten dependent calls to the native mpn_udiv_qrnnd_r per loop, always with
   a normalized divisor.  The quotient is returned and the remainder stored
   through the last argument.  */
double
speed_mpn_udiv_qrnnd_r (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif
   2553 
   2554 
   2555 double
   2556 speed_invert_limb (struct speed_params *s)
   2557 {
   2558   SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
   2559 }
   2560 
   2561 
   2562 /* xp[0] might not be particularly random, but should give an indication how
   2563    "/" runs.  Same for speed_operator_mod below.  */
   2564 double
   2565 speed_operator_div (struct speed_params *s)
   2566 {
   2567   double     t;
   2568   unsigned   i;
   2569   mp_limb_t  x, q, d;
   2570 
   2571   s->time_divisor = 10;
   2572 
   2573   /* divisor from "r" parameter, or a default */
   2574   d = s->r;
   2575   if (d == 0)
   2576     d = mp_bases[10].big_base;
   2577 
   2578   x = s->xp[0];
   2579   q = 0;
   2580 
   2581   speed_starttime ();
   2582   i = s->reps;
   2583   do
   2584     {
   2585       q ^= x; q /= d;
   2586        q ^= x; q /= d;
   2587        q ^= x; q /= d;
   2588       q ^= x; q /= d;
   2589        q ^= x; q /= d;
   2590        q ^= x; q /= d;
   2591       q ^= x; q /= d;
   2592        q ^= x; q /= d;
   2593        q ^= x; q /= d;
   2594       q ^= x; q /= d;
   2595     }
   2596   while (--i != 0);
   2597   t = speed_endtime ();
   2598 
   2599   /* stop the compiler optimizing away the whole calculation! */
   2600   noop_1 (q);
   2601 
   2602   return t;
   2603 }
   2604 
   2605 double
   2606 speed_operator_mod (struct speed_params *s)
   2607 {
   2608   double     t;
   2609   unsigned   i;
   2610   mp_limb_t  x, r, d;
   2611 
   2612   s->time_divisor = 10;
   2613 
   2614   /* divisor from "r" parameter, or a default */
   2615   d = s->r;
   2616   if (d == 0)
   2617     d = mp_bases[10].big_base;
   2618 
   2619   x = s->xp[0];
   2620   r = 0;
   2621 
   2622   speed_starttime ();
   2623   i = s->reps;
   2624   do
   2625     {
   2626       r ^= x; r %= d;
   2627        r ^= x; r %= d;
   2628        r ^= x; r %= d;
   2629       r ^= x; r %= d;
   2630        r ^= x; r %= d;
   2631        r ^= x; r %= d;
   2632       r ^= x; r %= d;
   2633        r ^= x; r %= d;
   2634        r ^= x; r %= d;
   2635       r ^= x; r %= d;
   2636     }
   2637   while (--i != 0);
   2638   t = speed_endtime ();
   2639 
   2640   /* stop the compiler optimizing away the whole calculation! */
   2641   noop_1 (r);
   2642 
   2643   return t;
   2644 }
   2645 
   2646 
   2647 /* r==0 measures on data with the values uniformly distributed.  This will
   2648    be typical for count_trailing_zeros in a GCD etc.
   2649 
   2650    r==1 measures on data with the resultant count uniformly distributed
   2651    between 0 and GMP_LIMB_BITS-1.  This is probably sensible for
   2652    count_leading_zeros on the high limbs of divisors.  */
   2653 
   2654 int
   2655 speed_routine_count_zeros_setup (struct speed_params *s,
   2656 				 mp_ptr xp, int leading, int zero)
   2657 {
   2658   int        i, c;
   2659   mp_limb_t  n;
   2660 
   2661   if (s->r == 0)
   2662     {
   2663       /* Make uniformly distributed data.  If zero isn't allowed then change
   2664 	 it to 1 for leading, or 0x800..00 for trailing.  */
   2665       MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
   2666       if (! zero)
   2667 	for (i = 0; i < SPEED_BLOCK_SIZE; i++)
   2668 	  if (xp[i] == 0)
   2669 	    xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
   2670     }
   2671   else if (s->r == 1)
   2672     {
   2673       /* Make counts uniformly distributed.  A randomly chosen bit is set, and
   2674 	 for leading the rest above it are cleared, or for trailing then the
   2675 	 rest below.  */
   2676       for (i = 0; i < SPEED_BLOCK_SIZE; i++)
   2677 	{
   2678 	  mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
   2679 	  mp_limb_t  keep_below = set-1;
   2680 	  mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
   2681 	  mp_limb_t  keep = (leading ? keep_below : keep_above);
   2682 	  xp[i] = (s->xp_block[i] & keep) | set;
   2683 	}
   2684     }
   2685   else
   2686     {
   2687       return 0;
   2688     }
   2689 
   2690   /* Account for the effect of n^=c. */
   2691   c = 0;
   2692   for (i = 0; i < SPEED_BLOCK_SIZE; i++)
   2693     {
   2694       n = xp[i];
   2695       xp[i] ^= c;
   2696 
   2697       if (leading)
   2698 	count_leading_zeros (c, n);
   2699       else
   2700 	count_trailing_zeros (c, n);
   2701     }
   2702 
   2703   return 1;
   2704 }
   2705 
   2706 double
   2707 speed_count_leading_zeros (struct speed_params *s)
   2708 {
   2709 #ifdef COUNT_LEADING_ZEROS_0
   2710 #define COUNT_LEADING_ZEROS_0_ALLOWED   1
   2711 #else
   2712 #define COUNT_LEADING_ZEROS_0_ALLOWED   0
   2713 #endif
   2714 
   2715   SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
   2716   count_leading_zeros (c, n);
   2717   SPEED_ROUTINE_COUNT_ZEROS_B ();
   2718 }
   2719 double
   2720 speed_count_trailing_zeros (struct speed_params *s)
   2721 {
   2722   SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
   2723   count_trailing_zeros (c, n);
   2724   SPEED_ROUTINE_COUNT_ZEROS_B ();
   2725 }
   2726 
   2727 
   2728 double
   2729 speed_mpn_get_str (struct speed_params *s)
   2730 {
   2731   SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
   2732 }
   2733 
   2734 double
   2735 speed_mpn_set_str (struct speed_params *s)
   2736 {
   2737   SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
   2738 }
   2739 double
   2740 speed_mpn_bc_set_str (struct speed_params *s)
   2741 {
   2742   SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
   2743 }
   2744 
   2745 double
   2746 speed_MPN_ZERO (struct speed_params *s)
   2747 {
   2748   SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
   2749 }
   2750 
   2751 
   2752 int
   2753 speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
   2754 {
   2755   if (s->r == 0)
   2756     gmp_randinit_default (rstate);
   2757   else if (s->r == 1)
   2758     gmp_randinit_mt (rstate);
   2759   else
   2760     {
   2761       return gmp_randinit_lc_2exp_size (rstate, s->r);
   2762     }
   2763   return 1;
   2764 }
   2765 
   2766 double
   2767 speed_gmp_randseed (struct speed_params *s)
   2768 {
   2769   gmp_randstate_t  rstate;
   2770   unsigned  i;
   2771   double    t;
   2772   mpz_t     x;
   2773 
   2774   SPEED_RESTRICT_COND (s->size >= 1);
   2775   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
   2776 
   2777   /* s->size bits of seed */
   2778   mpz_init_set_n (x, s->xp, s->size);
   2779   mpz_fdiv_r_2exp (x, x, (unsigned long) s->size);
   2780 
   2781   /* cache priming */
   2782   gmp_randseed (rstate, x);
   2783 
   2784   speed_starttime ();
   2785   i = s->reps;
   2786   do
   2787     gmp_randseed (rstate, x);
   2788   while (--i != 0);
   2789   t = speed_endtime ();
   2790 
   2791   gmp_randclear (rstate);
   2792   mpz_clear (x);
   2793   return t;
   2794 }
   2795 
   2796 double
   2797 speed_gmp_randseed_ui (struct speed_params *s)
   2798 {
   2799   gmp_randstate_t  rstate;
   2800   unsigned  i, j;
   2801   double    t;
   2802 
   2803   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
   2804 
   2805   /* cache priming */
   2806   gmp_randseed_ui (rstate, 123L);
   2807 
   2808   speed_starttime ();
   2809   i = s->reps;
   2810   j = 0;
   2811   do
   2812     {
   2813       gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
   2814       j++;
   2815       if (j >= SPEED_BLOCK_SIZE)
   2816 	j = 0;
   2817     }
   2818   while (--i != 0);
   2819   t = speed_endtime ();
   2820 
   2821   gmp_randclear (rstate);
   2822   return t;
   2823 }
   2824 
   2825 double
   2826 speed_mpz_urandomb (struct speed_params *s)
   2827 {
   2828   gmp_randstate_t  rstate;
   2829   mpz_t     z;
   2830   unsigned  i;
   2831   double    t;
   2832 
   2833   SPEED_RESTRICT_COND (s->size >= 0);
   2834   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
   2835 
   2836   mpz_init (z);
   2837 
   2838   /* cache priming */
   2839   mpz_urandomb (z, rstate, (unsigned long) s->size);
   2840   mpz_urandomb (z, rstate, (unsigned long) s->size);
   2841 
   2842   speed_starttime ();
   2843   i = s->reps;
   2844   do
   2845     mpz_urandomb (z, rstate, (unsigned long) s->size);
   2846   while (--i != 0);
   2847   t = speed_endtime ();
   2848 
   2849   mpz_clear (z);
   2850   gmp_randclear (rstate);
   2851   return t;
   2852 }
   2853