Home | History | Annotate | Line # | Download | only in tune
      1 /* Time routines for speed measurements.
      2 
      3 Copyright 1999-2004, 2010-2012 Free Software Foundation, Inc.
      4 
      5 This file is part of the GNU MP Library.
      6 
      7 The GNU MP Library is free software; you can redistribute it and/or modify
      8 it under the terms of either:
      9 
     10   * the GNU Lesser General Public License as published by the Free
     11     Software Foundation; either version 3 of the License, or (at your
     12     option) any later version.
     13 
     14 or
     15 
     16   * the GNU General Public License as published by the Free Software
     17     Foundation; either version 2 of the License, or (at your option) any
     18     later version.
     19 
     20 or both in parallel, as here.
     21 
     22 The GNU MP Library is distributed in the hope that it will be useful, but
     23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 for more details.
     26 
     27 You should have received copies of the GNU General Public License and the
     28 GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 see https://www.gnu.org/licenses/.  */
     30 
     31 
     32 /* Usage:
     33 
     34    The code in this file implements the lowest level of time measuring,
     35    simple one-time measuring of time between two points.
     36 
     37    void speed_starttime (void)
     38    double speed_endtime (void)
     39        Call speed_starttime to start measuring, and then call speed_endtime
     40        when done.
     41 
     42        speed_endtime returns the time taken, in seconds.  Or if the timebase
     43        is in CPU cycles and the CPU frequency is unknown then speed_endtime
     44        returns cycles.  Applications can identify the cycles return by
     45        checking for speed_cycletime (described below) equal to 1.0.
     46 
     47        If some sort of temporary glitch occurs then speed_endtime returns
     48        0.0.  Currently this is for various cases where a negative time has
     49        occurred.  This unfortunately occurs with getrusage on some systems,
     50        and with the hppa cycle counter on hpux.
     51 
     52    double speed_cycletime
     53        The time in seconds for each CPU cycle.  For example on a 100 MHz CPU
     54        this would be 1.0e-8.
     55 
     56        If the CPU frequency is unknown, then speed_cycletime is either 0.0
     57        or 1.0.  It's 0.0 when speed_endtime is returning seconds, or it's
     58        1.0 when speed_endtime is returning cycles.
     59 
     60        It may be noted that "speed_endtime() / speed_cycletime" gives a
     61        measured time in cycles, irrespective of whether speed_endtime is
     62        returning cycles or seconds.  (Assuming cycles can be had, ie. it's
     63        either cycles already or the cpu frequency is known.  See also
     64        speed_cycletime_need_cycles below.)
     65 
     66    double speed_unittime
     67        The unit of time measurement accuracy for the timing method in use.
     68        This is in seconds or cycles, as per speed_endtime.
     69 
     70    char speed_time_string[]
     71        A null-terminated string describing the time method in use.
     72 
     73    void speed_time_init (void)
     74        Initialize time measuring.  speed_starttime() does this
     75        automatically, so it's only needed if an application wants to inspect
     76        the above global variables before making a measurement.
     77 
     78    int speed_precision
     79        The intended accuracy of time measurements.  speed_measure() in
     80        common.c for instance runs target routines with enough repetitions so
     81        it takes at least "speed_unittime * speed_precision" (this expression
     82        works for both cycles or seconds from speed_endtime).
     83 
     84        A program can provide an option so the user can set speed_precision.
     85        If speed_precision is zero when speed_time_init or speed_starttime
     86        first run then it gets a default based on the measuring method
     87        chosen.  (More precision for higher accuracy methods.)
     88 
     89    void speed_cycletime_need_seconds (void)
     90        Call this to demand that speed_endtime will return seconds, and not
     91        cycles.  If only cycles are available then an error is printed and
     92        the program exits.
     93 
     94    void speed_cycletime_need_cycles (void)
     95        Call this to demand that speed_cycletime is non-zero, so that
     96        "speed_endtime() / speed_cycletime" will give times in cycles.
     97 
     98 
     99 
    100    Notes:
    101 
    102    Various combinations of cycle counter, read_real_time(), getrusage(),
    103    gettimeofday() and times() can arise, according to which are available
    104    and their precision.
    105 
    106 
    107    Allowing speed_endtime() to return either seconds or cycles is only a
    108    slight complication and makes it possible for the speed program to do
    109    some sensible things without demanding the CPU frequency.  If seconds are
    110    being measured then it can always print seconds, and if cycles are being
    111    measured then it can always print them without needing to know how long
    112    they are.  Also the tune program doesn't care at all what the units are.
    113 
    114    GMP_CPU_FREQUENCY can always be set when the automated methods in freq.c
    115    fail.  This will be needed if times in seconds are wanted but a cycle
    116    counter is being used, or if times in cycles are wanted but getrusage or
    117    another seconds based timer is in use.
    118 
    119    If the measuring method uses a cycle counter but supplements it with
    120    getrusage or the like, then knowing the CPU frequency is mandatory since
    121    the code compares values from the two.
    122 
    123 
    124    Not done:
    125 
    126    Solaris gethrtime() seems no more than a slow way to access the Sparc V9
    127    cycle counter.  gethrvtime() seems to be relevant only to light weight
    128    processes, it doesn't for instance give nanosecond virtual time.  So
    129    neither of these are used.
    130 
    131 
    132    Bugs:
    133 
    134    getrusage_microseconds_p is fundamentally flawed, getrusage and
    135    gettimeofday can have resolutions other than clock ticks or microseconds,
    136    for instance IRIX 5 has a tick of 10 ms but a getrusage of 1 ms.
    137 
    138 
    139    Enhancements:
    140 
    141    The SGI hardware counter has 64 bits on some machines, which could be
    142    used when available.  But perhaps 32 bits is enough range, and then rely
    143    on the getrusage supplement.
    144 
    145    Maybe getrusage (or times) should be used as a supplement for any
    146    wall-clock measuring method.  Currently a wall clock with a good range
    147    (eg. a 64-bit cycle counter) is used without a supplement.
    148 
    149    On PowerPC the timebase registers could be used, but would have to do
    150    something to find out the speed.  On 6xx chips it's normally 1/4 bus
    151    speed, on 4xx chips it's either that or an external clock.  Measuring
    152    against gettimeofday might be ok.  */
    153 
    154 #include "config.h"
    155 
    156 #include <errno.h>
    157 #include <setjmp.h>
    158 #include <signal.h>
    159 #include <stddef.h>
    160 #include <stdio.h>
    161 #include <string.h>
    162 #include <stdlib.h> /* for getenv() */
    163 
    164 #if HAVE_FCNTL_H
    165 #include <fcntl.h>  /* for open() */
    166 #endif
    167 
    168 #if HAVE_STDINT_H
    169 #include <stdint.h> /* for uint64_t */
    170 #endif
    171 
    172 #if HAVE_UNISTD_H
    173 #include <unistd.h> /* for sysconf() */
    174 #endif
    175 
    176 #include <sys/types.h>
    177 
    178 #if TIME_WITH_SYS_TIME
    179 # include <sys/time.h>  /* for struct timeval */
    180 # include <time.h>
    181 #else
    182 # if HAVE_SYS_TIME_H
    183 #  include <sys/time.h>
    184 # else
    185 #  include <time.h>
    186 # endif
    187 #endif
    188 
    189 #if HAVE_SYS_MMAN_H
    190 #include <sys/mman.h>      /* for mmap() */
    191 #endif
    192 
    193 #if HAVE_SYS_RESOURCE_H
    194 #include <sys/resource.h>  /* for struct rusage */
    195 #endif
    196 
    197 #if HAVE_SYS_SYSSGI_H
    198 #include <sys/syssgi.h>    /* for syssgi() */
    199 #endif
    200 
    201 #if HAVE_SYS_SYSTEMCFG_H
    202 #include <sys/systemcfg.h> /* for RTC_POWER on AIX */
    203 #endif
    204 
    205 #if HAVE_SYS_TIMES_H
    206 #include <sys/times.h>  /* for times() and struct tms */
    207 #endif
    208 
    209 #include "gmp-impl.h"
    210 
    211 #include "speed.h"
    212 
    213 
    214 /* strerror is only used for some stuff on newish systems, no need to have a
    215    proper replacement */
    216 #if ! HAVE_STRERROR
    217 #define strerror(n)  "<strerror not available>"
    218 #endif
    219 
    220 
/* Globals exported to users of the timing code.  Their meaning is set out
   in the usage notes at the top of this file.  */
char    speed_time_string[256];  /* describes the time method in use */
double  speed_unittime;          /* measuring granularity, seconds or cycles */
double  speed_cycletime = 0.0;   /* seconds per cycle; 0.0 or 1.0 if unknown */
int     speed_precision = 0;     /* 0 asks for a default to be chosen */


/* Twice (INT_MAX+1), formed in floating point.  An "unsigned" to "double"
   conversion is deliberately not relied on, since it's broken in the SunOS 4
   native cc.  */
#define M_2POWU   (2.0 * ((double) INT_MAX + 1.0))

/* 2^32 and 2^64 as doubles */
#define M_2POW32  4294967296.0
#define M_2POW64  (M_2POW32 * M_2POW32)
    233 
    234 
    235 /* Conditionals for the time functions available are done with normal C
    236    code, which is a lot easier than wildly nested preprocessor directives.
    237 
    238    The choice of what to use is partly made at run-time, according to
    239    whether the cycle counter works and the measured accuracy of getrusage
    240    and gettimeofday.
    241 
    242    A routine that's not available won't be getting called, but is an abort()
    243    to be sure it isn't called mistakenly.
    244 
    245    It can be assumed that if a function exists then its data type will, but
    246    if the function doesn't then the data type might or might not exist, so
    247    the type can't be used unconditionally.  The "struct_rusage" etc macros
    248    provide dummies when the respective function doesn't exist. */
    249 
    250 
/* have_cycles is the HAVE_SPEED_CYCLECOUNTER value from configure, or 0
   when there's no cycle counter, in which case speed_cyclecounter() is
   turned into an assertion failure so a mistaken call gets noticed.  */
#if ! HAVE_SPEED_CYCLECOUNTER
#define speed_cyclecounter(p)  ASSERT_FAIL (speed_cyclecounter not available)
static const int have_cycles = 0;
#else
static const int have_cycles = HAVE_SPEED_CYCLECOUNTER;
#endif
    257 
    258 /* "stck" returns ticks since 1 Jan 1900 00:00 GMT, where each tick is 2^-12
    259    microseconds.  Same #ifdefs here as in longlong.h.  */
#if defined (__GNUC__) && ! defined (NO_ASM)                            \
  && (defined (__i370__) || defined (__s390__) || defined (__mvs__))

/* gcc for s390 is quite new, always has uint64_t */
typedef uint64_t  stck_t;

static const int  have_stck = 1;
static const int  use_stck = 1;  /* always use when available */

/* Store the clock value into the stck_t lvalue "timestamp". */
#define STCK(timestamp)                 \
  do {                                  \
    asm ("stck %0" : "=Q" (timestamp)); \
  } while (0)

#else

/* No stck instruction, stck_t is merely a placeholder so that variables
   of that type can still be declared.  */
typedef unsigned long  stck_t;

static const int  have_stck = 0;
static const int  use_stck = 0;

#define STCK(timestamp)  ASSERT_FAIL (stck instruction not available)

#endif

/* Seconds per stck tick, a tick being 2^-12 microseconds. */
#define STCK_PERIOD      (1.0 / 4096e6)
    276 
    277 /* mftb
    278    Enhancement: On 64-bit chips mftb gives a 64-bit value, no need for mftbu
    279    and a loop (see powerpc64.asm).  */
/* MFTB(a) stores the timebase into the two element unsigned array "a",
   low word in a[0] and high word in a[1].  */
#if ! HAVE_HOST_CPU_FAMILY_powerpc

static const int  have_mftb = 0;

/* Not a powerpc: zero the result (keeps compilers quiet about unset
   variables) and fail, since this should never be reached.  */
#define MFTB(a)                         \
  do {                                  \
    (a)[0] = 0;                         \
    (a)[1] = 0;                         \
    ASSERT_FAIL (mftb not available);   \
  } while (0)

#else /* powerpc */

static const int  have_mftb = 1;

# if ! (defined (__GNUC__) && ! defined (NO_ASM))

/* no inline asm available, use the out-of-line routine */
#  define MFTB(a)   mftb_function (a)

# else

/* The upper half is read before and after the lower half, and the whole
   thing repeated if the two upper readings differ, ie. if the lower half
   was seen to carry into the upper half part way through.  */
#  define MFTB(a)                               \
  do {                                          \
    unsigned  mftb_hi_1, mftb_lo, mftb_hi_2;    \
    do {                                        \
      asm volatile ("mftbu %0\n"                \
                    "mftb  %1\n"                \
                    "mftbu %2"                  \
                    : "=r" (mftb_hi_1),         \
                      "=r" (mftb_lo),           \
                      "=r" (mftb_hi_2));        \
    } while (mftb_hi_1 != mftb_hi_2);           \
    (a)[0] = mftb_lo;                           \
    (a)[1] = mftb_hi_1;                         \
  } while (0)

# endif

#endif
    309 
    310 /* Unicos 10.X has syssgi(), but not mmap(). */
/* The SGI counter method needs both syssgi() and mmap(). */
#if ! (HAVE_SYSSGI && HAVE_MMAP)
static const int  have_sgi = 0;
#else
static const int  have_sgi = 1;
#endif
    316 
/* read_real_time() and friends.  When they're not available the calls
   become assertion failures, and stand-ins are given for the RTC_
   constants and for timebasestruct_t so code mentioning them still
   compiles.  */
#if ! HAVE_READ_REAL_TIME

static const int have_rrt = 0;

#define read_real_time(t,s)     ASSERT_FAIL (read_real_time not available)
#define time_base_to_time(t,s)  ASSERT_FAIL (time_base_to_time not available)

#define RTC_POWER     1
#define RTC_POWER_PC  2

struct timebasestruct_dummy {
  int             flag;
  unsigned int    tb_high;
  unsigned int    tb_low;
};
#define timebasestruct_t   struct timebasestruct_dummy

#else

static const int have_rrt = 1;

#endif
    332 
/* For each of clock_gettime, getrusage, gettimeofday and times there's a
   have_FOO flag and a struct_FOO macro naming the data type to use.  When
   the function is missing, calls to it are turned into assertion failures
   and struct_FOO refers to one of the dummy structures further down.  */

/* clock_gettime and clock_getres */
#if ! HAVE_CLOCK_GETTIME
static const int have_cgt = 0;
#define struct_timespec       struct timespec_dummy
#define clock_gettime(id,ts)  (ASSERT_FAIL (clock_gettime not available), -1)
#define clock_getres(id,ts)   (ASSERT_FAIL (clock_getres not available), -1)
#else
static const int have_cgt = 1;
#define struct_timespec  struct timespec
#endif

/* getrusage */
#if ! HAVE_GETRUSAGE
static const int have_grus = 0;
#define getrusage(n,ru)  ASSERT_FAIL (getrusage not available)
#define struct_rusage    struct rusage_dummy
#else
static const int have_grus = 1;
#define struct_rusage   struct rusage
#endif

/* gettimeofday */
#if ! HAVE_GETTIMEOFDAY
static const int have_gtod = 0;
#define gettimeofday(tv,tz)  ASSERT_FAIL (gettimeofday not available)
#define struct_timeval   struct timeval_dummy
#else
static const int have_gtod = 1;
#define struct_timeval   struct timeval
#endif

/* times */
#if ! HAVE_TIMES
static const int have_times = 0;
#define times(tms)   ASSERT_FAIL (times not available)
#define struct_tms   struct tms_dummy
#else
static const int have_times = 1;
#define struct_tms   struct tms
#endif

/* The dummies, holding just the fields this file looks at.  rusage_dummy
   is built from struct_timeval, so it comes after timeval_dummy.  */
struct tms_dummy {
  long  tms_utime;
};

struct timeval_dummy {
  long  tv_sec;
  long  tv_usec;
};

struct rusage_dummy {
  struct_timeval ru_utime;
};

struct timespec_dummy {
  long  tv_sec;
  long  tv_nsec;
};
    384 
    385 static int  use_cycles;
    386 static int  use_mftb;
    387 static int  use_sgi;
    388 static int  use_rrt;
    389 static int  use_cgt;
    390 static int  use_gtod;
    391 static int  use_grus;
    392 static int  use_times;
    393 static int  use_tick_boundary;
    394 
    395 static unsigned         start_cycles[2];
    396 static stck_t           start_stck;
    397 static unsigned         start_mftb[2];
    398 static unsigned         start_sgi;
    399 static timebasestruct_t start_rrt;
    400 static struct_timespec  start_cgt;
    401 static struct_rusage    start_grus;
    402 static struct_timeval   start_gtod;
    403 static struct_tms       start_times;
    404 
    405 static double  cycles_limit = 1e100;
    406 static double  mftb_unittime;
    407 static double  sgi_unittime;
    408 static double  cgt_unittime;
    409 static double  grus_unittime;
    410 static double  gtod_unittime;
    411 static double  times_unittime;
    412 
    413 /* for RTC_POWER format, ie. seconds and nanoseconds */
    414 #define TIMEBASESTRUCT_SECS(t)  ((t)->tb_high + (t)->tb_low * 1e-9)
    415 
    416 
    417 /* Return a string representing a time in seconds, nicely formatted.
    418    Eg. "10.25ms".  */
/* Return a string representing a time T in seconds, nicely formatted to
   about 4 significant figures with a unit suffix.  Eg. "10.25ms".

   The unit is "ns", "us", "ms" or "s", whichever suits the size of T.

   The return value points to a static buffer which is overwritten by each
   call, so it must be used or copied before calling again (in particular
   don't pass two unittime_string results to a single printf).

   The output is bounded by snprintf.  A huge T expands under "%f" to
   hundreds of digits, which an unchecked sprintf would write past the end
   of buf; here it's merely truncated.  */
char *
unittime_string (double t)
{
  static char  buf[128];

  const char  *unit;
  int         prec;

  /* choose units and scale */
  if (t < 1e-6)
    {
      t *= 1e9;
      unit = "ns";
    }
  else if (t < 1e-3)
    {
      t *= 1e6;
      unit = "us";
    }
  else if (t < 1.0)
    {
      t *= 1e3;
      unit = "ms";
    }
  else
    unit = "s";

  /* want 4 significant figures */
  if (t < 1.0)
    prec = 4;
  else if (t < 10.0)
    prec = 3;
  else if (t < 100.0)
    prec = 2;
  else
    prec = 1;

  snprintf (buf, sizeof (buf), "%.*f%s", prec, t, unit);
  return buf;
}
    450 
    451 
    452 static jmp_buf  cycles_works_buf;
    453 
    454 static RETSIGTYPE
    455 cycles_works_handler (int sig)
    456 {
    457   longjmp (cycles_works_buf, 1);
    458 }
    459 
    460 int
    461 cycles_works_p (void)
    462 {
    463   static int  result = -1;
    464 
    465   if (result != -1)
    466     goto done;
    467 
    468   /* FIXME: On linux, the cycle counter is not saved and restored over
    469    * context switches, making it almost useless for precise cputime
    470    * measurements. When available, it's better to use clock_gettime,
    471    * which seems to have reasonable accuracy (tested on x86_32,
    472    * linux-2.6.26, glibc-2.7). However, there are also some linux
    473    * systems where clock_gettime is broken in one way or the other,
    474    * like CLOCK_PROCESS_CPUTIME_ID not implemented (easy case) or
    475    * kind-of implemented but broken (needs code to detect that), and
    476    * on those systems a wall-clock cycle counter is the least bad
    477    * fallback.
    478    *
    479    * So we need some code to disable the cycle counter on some but not
    480    * all linux systems. */
    481 #ifdef SIGILL
    482   {
    483     RETSIGTYPE (*old_handler) (int);
    484     unsigned  cycles[2];
    485 
    486     old_handler = signal (SIGILL, cycles_works_handler);
    487     if (old_handler == SIG_ERR)
    488       {
    489 	if (speed_option_verbose)
    490 	  printf ("cycles_works_p(): SIGILL not supported, assuming speed_cyclecounter() works\n");
    491 	goto yes;
    492       }
    493     if (setjmp (cycles_works_buf))
    494       {
    495 	if (speed_option_verbose)
    496 	  printf ("cycles_works_p(): SIGILL during speed_cyclecounter(), so doesn't work\n");
    497 	result = 0;
    498 	goto done;
    499       }
    500     speed_cyclecounter (cycles);
    501     signal (SIGILL, old_handler);
    502     if (speed_option_verbose)
    503       printf ("cycles_works_p(): speed_cyclecounter() works\n");
    504   }
    505 #else
    506 
    507   if (speed_option_verbose)
    508     printf ("cycles_works_p(): SIGILL not defined, assuming speed_cyclecounter() works\n");
    509   goto yes;
    510 #endif
    511 
    512  yes:
    513   result = 1;
    514 
    515  done:
    516   return result;
    517 }
    518 
    519 
    520 /* The number of clock ticks per second, but looking at sysconf rather than
    521    just CLK_TCK, where possible.  */
/* Return the number of clock ticks per second.  sysconf(_SC_CLK_TCK) is
   preferred where sysconf exists, with the CLK_TCK macro as the fallback.
   If neither can supply a value then the program is aborted.  The value
   is remembered, so the enquiry is only made once.  */
long
clk_tck (void)
{
  static long  ticks = -1L;

  if (ticks == -1L)
    {
#if HAVE_SYSCONF
      ticks = sysconf (_SC_CLK_TCK);
      if (ticks == -1L)
        fprintf (stderr,
                 "sysconf(_SC_CLK_TCK) not working, using CLK_TCK instead\n");
      else
        {
          if (speed_option_verbose)
            printf ("sysconf(_SC_CLK_TCK) is %ld per second\n", ticks);
          return ticks;
        }
#endif

#ifdef CLK_TCK
      ticks = CLK_TCK;
      if (speed_option_verbose)
        printf ("CLK_TCK is %ld per second\n", ticks);
#else
      fprintf (stderr, "CLK_TCK not defined, cannot continue\n");
      abort ();
#endif
    }

  return ticks;
}
    552 
    553 
    554 /* If two times can be observed less than half a clock tick apart, then
    555    assume "get" is microsecond accurate.
    556 
    557    Two times only 1 microsecond apart are not believed, since some kernels
    558    take it upon themselves to ensure gettimeofday doesn't return the same
    559    value twice, for the benefit of applications using it for a timestamp.
    560    This is obviously very stupid given the speed of CPUs these days.
    561 
    562    Making "reps" many calls to noop_1() is designed to waste some CPU, with
    563    a view to getting measurements 2 microseconds (or more) apart.  "reps" is
    564    increased progressively until such a period is seen.
    565 
    566    The outer loop "attempts" are just to allow for any random nonsense or
    567    system load upsetting the measurements (ie. making two successive calls
    568    to "get" come out as a longer interval than normal).
    569 
    570    Bugs:
    571 
    572    The assumption that any interval less than a half tick implies
    573    microsecond resolution is obviously fairly rash, the true resolution
    574    could be anything between a microsecond and that half tick.  Perhaps
    575    something special would have to be done on a system where this is the
    576    case, since there's no obvious reliable way to detect it
    577    automatically.  */
    578 
/* The body of a "FOO_microseconds_p" function.  It returns 1 if the timer
   looks to be microsecond accurate, or 0 if it only seems to be accurate
   to a clock tick, with the answer cached after the first call.

   NAME is a string naming the timer in verbose messages, TYPE is the type
   of a timer reading, GET(var) takes a reading into a variable of that
   type, and SEC(var) and USEC(var) give the seconds and microseconds
   fields of a reading.  */
#define MICROSECONDS_P(name, type, get, sec, usec)                      \
  {                                                                     \
    static int  cached = -1;                                            \
    type      t0, t1;                                                   \
    long      elapsed, half_tick;                                       \
    unsigned  attempt, reps, outer, inner;                              \
                                                                        \
    if (cached != -1)                                                   \
      return cached;                                                    \
                                                                        \
    cached = 0;                                                         \
    half_tick = (1000000L / clk_tck ()) / 2;                            \
                                                                        \
    for (attempt = 0; attempt < 5; attempt++)                           \
      {                                                                 \
        /* waste more and more cpu between the two readings, */         \
        /* until they come out 2 or more microseconds apart */          \
        reps = 0;                                                       \
        do                                                              \
          {                                                             \
            get (t0);                                                   \
            for (outer = 0; outer < reps; outer++)                      \
              for (inner = 0; inner < 100; inner++)                     \
                noop_1 (CNST_LIMB(0));                                  \
            get (t1);                                                   \
                                                                        \
            elapsed = (sec(t1)-sec(t0))*1000000L + usec(t1)-usec(t0);   \
                                                                        \
            if (speed_option_verbose >= 2)                              \
              printf ("%s attempt=%u, reps=%u, dt=%ld\n",               \
                      name, attempt, reps, elapsed);                    \
                                                                        \
            if (elapsed >= 2)                                           \
              break;                                                    \
                                                                        \
            reps = (reps == 0 ? 1 : 2*reps);                            \
          }                                                             \
        while (reps != 0);  /* zero is uint overflow, not normal */     \
                                                                        \
        if (elapsed < half_tick)                                        \
          {                                                             \
            cached = 1;                                                 \
            break;                                                      \
          }                                                             \
      }                                                                 \
                                                                        \
    if (speed_option_verbose)                                           \
      {                                                                 \
        if (cached)                                                     \
          printf ("%s is microsecond accurate\n", name);                \
        else                                                            \
          printf ("%s is only %s clock tick accurate\n",                \
                  name, unittime_string (1.0/clk_tck()));               \
      }                                                                 \
    return cached;                                                      \
  }
    634 
    635 
    636 int
    637 gettimeofday_microseconds_p (void)
    638 {
    639 #define call_gettimeofday(t)   gettimeofday (&(t), NULL)
    640 #define timeval_tv_sec(t)      ((t).tv_sec)
    641 #define timeval_tv_usec(t)     ((t).tv_usec)
    642   MICROSECONDS_P ("gettimeofday", struct_timeval,
    643 		  call_gettimeofday, timeval_tv_sec, timeval_tv_usec);
    644 }
    645 
    646 int
    647 getrusage_microseconds_p (void)
    648 {
    649 #define call_getrusage(t)   getrusage (0, &(t))
    650 #define rusage_tv_sec(t)    ((t).ru_utime.tv_sec)
    651 #define rusage_tv_usec(t)   ((t).ru_utime.tv_usec)
    652   MICROSECONDS_P ("getrusage", struct_rusage,
    653 		  call_getrusage, rusage_tv_sec, rusage_tv_usec);
    654 }
    655 
    656 /* Test whether getrusage goes backwards, return non-zero if it does
    657    (suggesting it's flawed).
    658 
    659    On a macintosh m68040-unknown-netbsd1.4.1 getrusage looks like it's
    660    microsecond accurate, but has been seen remaining unchanged after many
    661    microseconds have elapsed.  It also regularly goes backwards by 1000 to
    662    5000 usecs, this has been seen after between 500 and 4000 attempts taking
    663    perhaps 0.03 seconds.  We consider this too broken for good measuring.
    664    We used to have configure pretend getrusage didn't exist on this system,
    665    but a runtime test should be more reliable, since we imagine the problem
    666    is not confined to just this exact system tuple.  */
    667 
    668 int
    669 getrusage_backwards_p (void)
    670 {
    671   static int result = -1;
    672   struct rusage  start, prev, next;
    673   long  d;
    674   int   i;
    675 
    676   if (result != -1)
    677     return result;
    678 
    679   getrusage (0, &start);
    680   memcpy (&next, &start, sizeof (next));
    681 
    682   result = 0;
    683   i = 0;
    684   for (;;)
    685     {
    686       memcpy (&prev, &next, sizeof (prev));
    687       getrusage (0, &next);
    688 
    689       if (next.ru_utime.tv_sec < prev.ru_utime.tv_sec
    690 	  || (next.ru_utime.tv_sec == prev.ru_utime.tv_sec
    691 	      && next.ru_utime.tv_usec < prev.ru_utime.tv_usec))
    692 	{
    693 	  if (speed_option_verbose)
    694 	    printf ("getrusage went backwards (attempt %d: %ld.%06ld -> %ld.%06ld)\n",
    695 		    i,
    696 		    (long) prev.ru_utime.tv_sec, (long) prev.ru_utime.tv_usec,
    697 		    (long) next.ru_utime.tv_sec, (long) next.ru_utime.tv_usec);
    698 	  result = 1;
    699 	  break;
    700 	}
    701 
    702       /* minimum 1000 attempts, then stop after either 0.1 seconds or 50000
    703 	 attempts, whichever comes first */
    704       d = 1000000 * (next.ru_utime.tv_sec - start.ru_utime.tv_sec)
    705 	+ (next.ru_utime.tv_usec - start.ru_utime.tv_usec);
    706       i++;
    707       if (i > 50000 || (i > 1000 && d > 100000))
    708 	break;
    709     }
    710 
    711   return result;
    712 }
    713 
    714 /* CLOCK_PROCESS_CPUTIME_ID looks like it's going to be in a future version
    715    of glibc (some time post 2.2).
    716 
    717    CLOCK_VIRTUAL is process time, available in BSD systems (though sometimes
    718    defined, but returning -1 for an error).  */
    719 
/* CGT_ID is the clock to give clock_gettime, CLOCK_PROCESS_CPUTIME_ID for
   preference, otherwise CLOCK_VIRTUAL.  */
#if defined (CLOCK_PROCESS_CPUTIME_ID)
# define CGT_ID        CLOCK_PROCESS_CPUTIME_ID
#elif defined (CLOCK_VIRTUAL)
# define CGT_ID        CLOCK_VIRTUAL
#endif

/* have_cgt_id says whether a clock was found.  If not then CGT_ID is an
   assertion failure, as a check against its being used by mistake.  */
#ifndef CGT_ID
const int  have_cgt_id = 0;
# define CGT_ID       (ASSERT_FAIL (CGT_ID not determined), -1)
#else
const int  have_cgt_id = 1;
#endif

/* Rounds of the delay loop cgt_works_p uses to check up on the resolution
   claimed by clock_getres.  */
#define CGT_DELAY_COUNT 1000
    735 
    736 int
    737 cgt_works_p (void)
    738 {
    739   static int  result = -1;
    740   struct_timespec  unit;
    741 
    742   if (! have_cgt)
    743     return 0;
    744 
    745   if (! have_cgt_id)
    746     {
    747       if (speed_option_verbose)
    748 	printf ("clock_gettime don't know what ID to use\n");
    749       result = 0;
    750       return result;
    751     }
    752 
    753   if (result != -1)
    754     return result;
    755 
    756   /* trial run to see if it works */
    757   if (clock_gettime (CGT_ID, &unit) != 0)
    758     {
    759       if (speed_option_verbose)
    760 	printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
    761       result = 0;
    762       return result;
    763     }
    764 
    765   /* get the resolution */
    766   if (clock_getres (CGT_ID, &unit) != 0)
    767     {
    768       if (speed_option_verbose)
    769 	printf ("clock_getres id=%d error: %s\n", CGT_ID, strerror (errno));
    770       result = 0;
    771       return result;
    772     }
    773 
    774   cgt_unittime = unit.tv_sec + unit.tv_nsec * 1e-9;
    775   if (speed_option_verbose)
    776     printf ("clock_gettime is %s accurate\n", unittime_string (cgt_unittime));
    777 
    778   if (cgt_unittime < 10e-9)
    779     {
    780       /* Do we believe this? */
    781       struct timespec start, end;
    782       static volatile int counter;
    783       double duration;
    784       if (clock_gettime (CGT_ID, &start))
    785 	{
    786 	  if (speed_option_verbose)
    787 	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
    788 	  result = 0;
    789 	  return result;
    790 	}
    791       /* Loop of at least 1000 memory accesses, ought to take at
    792 	 least 100 ns*/
    793       for (counter = 0; counter < CGT_DELAY_COUNT; counter++)
    794 	;
    795       if (clock_gettime (CGT_ID, &end))
    796 	{
    797 	  if (speed_option_verbose)
    798 	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
    799 	  result = 0;
    800 	  return result;
    801 	}
    802       duration = (end.tv_sec + end.tv_nsec * 1e-9
    803 		  - start.tv_sec - start.tv_nsec * 1e-9);
    804       if (speed_option_verbose)
    805 	printf ("delay loop of %d rounds took %s (according to clock_gettime)\n",
    806 		CGT_DELAY_COUNT, unittime_string (duration));
    807       if (duration < 100e-9)
    808 	{
    809 	  if (speed_option_verbose)
    810 	    printf ("clock_gettime id=%d not believable\n", CGT_ID);
    811 	  result = 0;
    812 	  return result;
    813 	}
    814     }
    815   result = 1;
    816   return result;
    817 }
    818 
    819 
/* Single-measurement routine handed to freq_measure() by mftb_works_p().
   The whole body is one expansion of FREQ_MEASURE_ONE, given the label
   "mftb", gettimeofday() together with accessors for its struct_timeval
   result, and the MFTB counter read.
   NOTE(review): presumably this times MFTB ticks against gettimeofday() and
   yields the tick period in seconds -- confirm against the definition of
   FREQ_MEASURE_ONE.  */
static double
freq_measure_mftb_one (void)
{
  /* Adaptors passed by name to FREQ_MEASURE_ONE; they are not #undef'd
     here.  */
#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
#define timeval_tv_sec(t)      ((t).tv_sec)
#define timeval_tv_usec(t)     ((t).tv_usec)
  FREQ_MEASURE_ONE ("mftb", struct_timeval,
		    call_gettimeofday, MFTB,
		    timeval_tv_sec, timeval_tv_usec);
}
    830 
    831 
/* Jump target set by mftb_works_p() before it tries the MFTB read.  */
static jmp_buf  mftb_works_buf;

/* SIGILL handler installed by mftb_works_p().  It jumps back to the
   setjmp() on mftb_works_buf, which then returns 1.  The signal number
   itself is not looked at.  */
static RETSIGTYPE
mftb_works_handler (int sig)
{
  longjmp (mftb_works_buf, 1);
}
    839 
    840 int
    841 mftb_works_p (void)
    842 {
    843   unsigned   a[2];
    844   RETSIGTYPE (*old_handler) (int);
    845   double     cycletime;
    846 
    847   /* suppress a warning about a[] unused */
    848   a[0] = 0;
    849 
    850   if (! have_mftb)
    851     return 0;
    852 
    853 #ifdef SIGILL
    854   old_handler = signal (SIGILL, mftb_works_handler);
    855   if (old_handler == SIG_ERR)
    856     {
    857       if (speed_option_verbose)
    858 	printf ("mftb_works_p(): SIGILL not supported, assuming mftb works\n");
    859       return 1;
    860     }
    861   if (setjmp (mftb_works_buf))
    862     {
    863       if (speed_option_verbose)
    864 	printf ("mftb_works_p(): SIGILL during mftb, so doesn't work\n");
    865       return 0;
    866     }
    867   MFTB (a);
    868   signal (SIGILL, old_handler);
    869   if (speed_option_verbose)
    870     printf ("mftb_works_p(): mftb works\n");
    871 #else
    872 
    873   if (speed_option_verbose)
    874     printf ("mftb_works_p(): SIGILL not defined, assuming mftb works\n");
    875 #endif
    876 
    877 #if ! HAVE_GETTIMEOFDAY
    878   if (speed_option_verbose)
    879     printf ("mftb_works_p(): no gettimeofday available to measure mftb\n");
    880   return 0;
    881 #endif
    882 
    883   /* The time base is normally 1/4 of the bus speed on 6xx and 7xx chips, on
    884      other chips it can be driven from an external clock. */
    885   cycletime = freq_measure ("mftb", freq_measure_mftb_one);
    886   if (cycletime == -1.0)
    887     {
    888       if (speed_option_verbose)
    889 	printf ("mftb_works_p(): cannot measure mftb period\n");
    890       return 0;
    891     }
    892 
    893   mftb_unittime = cycletime;
    894   return 1;
    895 }
    896 
    897 
/* Address of the least significant 32 bits of the memory mapped cycle
   counter.  Set by sgi_works_p() on success, read by speed_starttime() and
   speed_endtime().  */
volatile unsigned  *sgi_addr;

/* Set up access to the syssgi() hardware cycle counter.

   Returns 1 if the counter was found and mapped, 0 if not.  The answer is
   cached, so the probing is done only once.  On success sgi_unittime holds
   the counter period in seconds and sgi_addr points at the counter's low
   word.  Without both syssgi() and mmap() this always returns 0.  */
int
sgi_works_p (void)
{
#if HAVE_SYSSGI && HAVE_MMAP
  static int  result = -1;

  size_t          pagesize, offset;
  __psunsigned_t  phys, physpage;
  void            *virtpage;
  unsigned        period_picoseconds;
  int             size, fd;

  if (result != -1)
    return result;

  phys = syssgi (SGI_QUERY_CYCLECNTR, &period_picoseconds);
  if (phys == (__psunsigned_t) -1)
    {
      /* ENODEV is the error when a counter is not available */
      if (speed_option_verbose)
        printf ("syssgi SGI_QUERY_CYCLECNTR error: %s\n", strerror (errno));
      result = 0;
      return result;
    }
  sgi_unittime = period_picoseconds * 1e-12;

  /* IRIX 5 doesn't have SGI_CYCLECNTR_SIZE, assume 32 bits in that case.
     Challenge/ONYX hardware has a 64 bit byte counter, but there seems no
     obvious way to identify that without SGI_CYCLECNTR_SIZE.  */
#ifdef SGI_CYCLECNTR_SIZE
  size = syssgi (SGI_CYCLECNTR_SIZE);
  if (size == -1)
    {
      if (speed_option_verbose)
        {
          printf ("syssgi SGI_CYCLECNTR_SIZE error: %s\n", strerror (errno));
          /* size is in bits, so report the value actually assumed */
          printf ("    will assume size==32\n");
        }
      size = 32;
    }
#else
  size = 32;
#endif

  if (size < 32)
    {
      printf ("syssgi SGI_CYCLECNTR_SIZE gives %d, expected 32 or 64\n", size);
      result = 0;
      return result;
    }

  /* Split the counter's physical address into a page and an offset within
     that page, since only whole pages can be mapped.  */
  pagesize = getpagesize();
  offset = (size_t) phys & (pagesize-1);
  physpage = phys - offset;

  /* shouldn't cross over a page boundary */
  ASSERT_ALWAYS (offset + size/8 <= pagesize);

  fd = open("/dev/mmem", O_RDONLY);
  if (fd == -1)
    {
      if (speed_option_verbose)
        printf ("open /dev/mmem: %s\n", strerror (errno));
      result = 0;
      return result;
    }

  virtpage = mmap (0, pagesize, PROT_READ, MAP_PRIVATE, fd, (off_t) physpage);
  if (virtpage == (void *) -1)
    {
      /* report errno before close() gets a chance to change it */
      if (speed_option_verbose)
        printf ("mmap /dev/mmem: %s\n", strerror (errno));
      close (fd);
      result = 0;
      return result;
    }

  /* The mapping remains valid after its descriptor is closed, so there's
     no need to keep the descriptor open for the life of the program.  */
  close (fd);

  /* address of least significant 4 bytes, knowing mips is big endian */
  sgi_addr = (unsigned *) ((char *) virtpage + offset
                           + size/8 - sizeof(unsigned));
  result = 1;
  return result;

#else /* ! (HAVE_SYSSGI && HAVE_MMAP) */
  return 0;
#endif
}
    986 
    987 
/* Assign N to VAR only when VAR is still zero, ie. has not been given a
   value yet.  Expands to a single statement.  */
#define DEFAULT(var,n)          \
  do                            \
    {                           \
      if ((var) == 0)           \
        (var) = (n);            \
    }                           \
  while (0)
    993 
    994 void
    995 speed_time_init (void)
    996 {
    997   double supplement_unittime = 0.0;
    998 
    999   static int  speed_time_initialized = 0;
   1000   if (speed_time_initialized)
   1001     return;
   1002   speed_time_initialized = 1;
   1003 
   1004   speed_cycletime_init ();
   1005 
   1006   if (!speed_option_cycles_broken && have_cycles && cycles_works_p ())
   1007     {
   1008       use_cycles = 1;
   1009       DEFAULT (speed_cycletime, 1.0);
   1010       speed_unittime = speed_cycletime;
   1011       DEFAULT (speed_precision, 10000);
   1012       strcpy (speed_time_string, "CPU cycle counter");
   1013 
   1014       /* only used if a supplementary method is chosen below */
   1015       cycles_limit = (have_cycles == 1 ? M_2POW32 : M_2POW64) / 2.0
   1016 	* speed_cycletime;
   1017 
   1018       if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
   1019 	{
   1020 	  /* this is a good combination */
   1021 	  use_grus = 1;
   1022 	  supplement_unittime = grus_unittime = 1.0e-6;
   1023 	  strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond getrusage()");
   1024 	}
   1025       else if (have_cycles == 1)
   1026 	{
   1027 	  /* When speed_cyclecounter has a limited range, look for something
   1028 	     to supplement it. */
   1029 	  if (have_gtod && gettimeofday_microseconds_p())
   1030 	    {
   1031 	      use_gtod = 1;
   1032 	      supplement_unittime = gtod_unittime = 1.0e-6;
   1033 	      strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond gettimeofday()");
   1034 	    }
   1035 	  else if (have_grus)
   1036 	    {
   1037 	      use_grus = 1;
   1038 	      supplement_unittime = grus_unittime = 1.0 / (double) clk_tck ();
   1039 	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick getrusage()", unittime_string (supplement_unittime));
   1040 	    }
   1041 	  else if (have_times)
   1042 	    {
   1043 	      use_times = 1;
   1044 	      supplement_unittime = times_unittime = 1.0 / (double) clk_tck ();
   1045 	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick times()", unittime_string (supplement_unittime));
   1046 	    }
   1047 	  else if (have_gtod)
   1048 	    {
   1049 	      use_gtod = 1;
   1050 	      supplement_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
   1051 	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick gettimeofday()", unittime_string (supplement_unittime));
   1052 	    }
   1053 	  else
   1054 	    {
   1055 	      fprintf (stderr, "WARNING: cycle counter is 32 bits and there's no other functions.\n");
   1056 	      fprintf (stderr, "    Wraparounds may produce bad results on long measurements.\n");
   1057 	    }
   1058 	}
   1059 
   1060       if (use_grus || use_times || use_gtod)
   1061 	{
   1062 	  /* must know cycle period to compare cycles to other measuring
   1063 	     (via cycles_limit) */
   1064 	  speed_cycletime_need_seconds ();
   1065 
   1066 	  if (speed_precision * supplement_unittime > cycles_limit)
   1067 	    {
   1068 	      fprintf (stderr, "WARNING: requested precision can't always be achieved due to limited range\n");
   1069 	      fprintf (stderr, "    cycle counter and limited precision supplemental method\n");
   1070 	      fprintf (stderr, "    (%s)\n", speed_time_string);
   1071 	    }
   1072 	}
   1073     }
   1074   else if (have_stck)
   1075     {
   1076       strcpy (speed_time_string, "STCK timestamp");
   1077       /* stck is in units of 2^-12 microseconds, which is very likely higher
   1078 	 resolution than a cpu cycle */
   1079       if (speed_cycletime == 0.0)
   1080 	speed_cycletime_fail
   1081 	  ("Need to know CPU frequency for effective stck unit");
   1082       speed_unittime = MAX (speed_cycletime, STCK_PERIOD);
   1083       DEFAULT (speed_precision, 10000);
   1084     }
   1085   else if (have_mftb && mftb_works_p ())
   1086     {
   1087       use_mftb = 1;
   1088       DEFAULT (speed_precision, 10000);
   1089       speed_unittime = mftb_unittime;
   1090       sprintf (speed_time_string, "mftb counter (%s)",
   1091 	       unittime_string (speed_unittime));
   1092     }
   1093   else if (have_sgi && sgi_works_p ())
   1094     {
   1095       use_sgi = 1;
   1096       DEFAULT (speed_precision, 10000);
   1097       speed_unittime = sgi_unittime;
   1098       sprintf (speed_time_string, "syssgi() mmap counter (%s), supplemented by millisecond getrusage()",
   1099 	       unittime_string (speed_unittime));
   1100       /* supplemented with getrusage, which we assume to have 1ms resolution */
   1101       use_grus = 1;
   1102       supplement_unittime = 1e-3;
   1103     }
   1104   else if (have_rrt)
   1105     {
   1106       timebasestruct_t  t;
   1107       use_rrt = 1;
   1108       DEFAULT (speed_precision, 10000);
   1109       read_real_time (&t, sizeof(t));
   1110       switch (t.flag) {
   1111       case RTC_POWER:
   1112 	/* FIXME: What's the actual RTC resolution? */
   1113 	speed_unittime = 1e-7;
   1114 	strcpy (speed_time_string, "read_real_time() power nanoseconds");
   1115 	break;
   1116       case RTC_POWER_PC:
   1117 	t.tb_high = 1;
   1118 	t.tb_low = 0;
   1119 	time_base_to_time (&t, sizeof(t));
   1120 	speed_unittime = TIMEBASESTRUCT_SECS(&t) / M_2POW32;
   1121 	sprintf (speed_time_string, "%s read_real_time() powerpc ticks",
   1122 		 unittime_string (speed_unittime));
   1123 	break;
   1124       default:
   1125 	fprintf (stderr, "ERROR: Unrecognised timebasestruct_t flag=%d\n",
   1126 		 t.flag);
   1127 	abort ();
   1128       }
   1129     }
   1130   else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5e-6)
   1131     {
   1132       /* use clock_gettime if microsecond or better resolution */
   1133     choose_cgt:
   1134       use_cgt = 1;
   1135       speed_unittime = cgt_unittime;
   1136       DEFAULT (speed_precision, (cgt_unittime <= 0.1e-6 ? 10000 : 1000));
   1137       strcpy (speed_time_string, "microsecond accurate clock_gettime()");
   1138     }
   1139   else if (have_times && clk_tck() > 1000000)
   1140     {
   1141       /* Cray vector systems have times() which is clock cycle resolution
   1142 	 (eg. 450 MHz).  */
   1143       DEFAULT (speed_precision, 10000);
   1144       goto choose_times;
   1145     }
   1146   else if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
   1147     {
   1148       use_grus = 1;
   1149       speed_unittime = grus_unittime = 1.0e-6;
   1150       DEFAULT (speed_precision, 1000);
   1151       strcpy (speed_time_string, "microsecond accurate getrusage()");
   1152     }
   1153   else if (have_gtod && gettimeofday_microseconds_p())
   1154     {
   1155       use_gtod = 1;
   1156       speed_unittime = gtod_unittime = 1.0e-6;
   1157       DEFAULT (speed_precision, 1000);
   1158       strcpy (speed_time_string, "microsecond accurate gettimeofday()");
   1159     }
   1160   else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5/clk_tck())
   1161     {
   1162       /* use clock_gettime if 1 tick or better resolution */
   1163       goto choose_cgt;
   1164     }
   1165   else if (have_times)
   1166     {
   1167       use_tick_boundary = 1;
   1168       DEFAULT (speed_precision, 200);
   1169     choose_times:
   1170       use_times = 1;
   1171       speed_unittime = times_unittime = 1.0 / (double) clk_tck ();
   1172       sprintf (speed_time_string, "%s clock tick times()",
   1173 	       unittime_string (speed_unittime));
   1174     }
   1175   else if (have_grus)
   1176     {
   1177       use_grus = 1;
   1178       use_tick_boundary = 1;
   1179       speed_unittime = grus_unittime = 1.0 / (double) clk_tck ();
   1180       DEFAULT (speed_precision, 200);
   1181       sprintf (speed_time_string, "%s clock tick getrusage()\n",
   1182 	       unittime_string (speed_unittime));
   1183     }
   1184   else if (have_gtod)
   1185     {
   1186       use_gtod = 1;
   1187       use_tick_boundary = 1;
   1188       speed_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
   1189       DEFAULT (speed_precision, 200);
   1190       sprintf (speed_time_string, "%s clock tick gettimeofday()",
   1191 	       unittime_string (speed_unittime));
   1192     }
   1193   else
   1194     {
   1195       fprintf (stderr, "No time measuring method available\n");
   1196       fprintf (stderr, "None of: speed_cyclecounter(), STCK(), getrusage(), gettimeofday(), times()\n");
   1197       abort ();
   1198     }
   1199 
   1200   if (speed_option_verbose)
   1201     {
   1202       printf ("speed_time_init: %s\n", speed_time_string);
   1203       printf ("    speed_precision     %d\n", speed_precision);
   1204       printf ("    speed_unittime      %.2g\n", speed_unittime);
   1205       if (supplement_unittime)
   1206 	printf ("    supplement_unittime %.2g\n", supplement_unittime);
   1207       printf ("    use_tick_boundary   %d\n", use_tick_boundary);
   1208       if (have_cycles)
   1209 	printf ("    cycles_limit        %.2g seconds\n", cycles_limit);
   1210     }
   1211 }
   1212 
   1213 
   1214 
   1215 /* Burn up CPU until a clock tick boundary, for greater accuracy.  Set the
   1216    corresponding "start_foo" appropriately too. */
   1217 
   1218 void
   1219 grus_tick_boundary (void)
   1220 {
   1221   struct_rusage  prev;
   1222   getrusage (0, &prev);
   1223   do {
   1224     getrusage (0, &start_grus);
   1225   } while (start_grus.ru_utime.tv_usec == prev.ru_utime.tv_usec);
   1226 }
   1227 
   1228 void
   1229 gtod_tick_boundary (void)
   1230 {
   1231   struct_timeval  prev;
   1232   gettimeofday (&prev, NULL);
   1233   do {
   1234     gettimeofday (&start_gtod, NULL);
   1235   } while (start_gtod.tv_usec == prev.tv_usec);
   1236 }
   1237 
   1238 void
   1239 times_tick_boundary (void)
   1240 {
   1241   struct_tms  prev;
   1242   times (&prev);
   1243   do
   1244     times (&start_times);
   1245   while (start_times.tms_utime == prev.tms_utime);
   1246 }
   1247 
   1248 
   1249 /* "have_" values are tested to let unused code go dead.  */
   1250 
   1251 void
   1252 speed_starttime (void)
   1253 {
   1254   speed_time_init ();
   1255 
   1256   if (have_grus && use_grus)
   1257     {
   1258       if (use_tick_boundary)
   1259 	grus_tick_boundary ();
   1260       else
   1261 	getrusage (0, &start_grus);
   1262     }
   1263 
   1264   if (have_gtod && use_gtod)
   1265     {
   1266       if (use_tick_boundary)
   1267 	gtod_tick_boundary ();
   1268       else
   1269 	gettimeofday (&start_gtod, NULL);
   1270     }
   1271 
   1272   if (have_times && use_times)
   1273     {
   1274       if (use_tick_boundary)
   1275 	times_tick_boundary ();
   1276       else
   1277 	times (&start_times);
   1278     }
   1279 
   1280   if (have_cgt && use_cgt)
   1281     clock_gettime (CGT_ID, &start_cgt);
   1282 
   1283   if (have_rrt && use_rrt)
   1284     read_real_time (&start_rrt, sizeof(start_rrt));
   1285 
   1286   if (have_sgi && use_sgi)
   1287     start_sgi = *sgi_addr;
   1288 
   1289   if (have_mftb && use_mftb)
   1290     MFTB (start_mftb);
   1291 
   1292   if (have_stck && use_stck)
   1293     STCK (start_stck);
   1294 
   1295   /* Cycles sampled last for maximum accuracy. */
   1296   if (have_cycles && use_cycles)
   1297     speed_cyclecounter (start_cycles);
   1298 }
   1299 
   1300 
   1301 /* Calculate the difference between two cycle counter samples, as a "double"
   1302    counter of cycles.
   1303 
   1304    The start and end values are allowed to cancel in integers in case the
   1305    counter values are bigger than the 53 bits that normally fit in a double.
   1306 
   1307    This works even if speed_cyclecounter() puts a value bigger than 32-bits
   1308    in the low word (the high word always gets a 2**32 multiplier though). */
   1309 
   1310 double
   1311 speed_cyclecounter_diff (const unsigned end[2], const unsigned start[2])
   1312 {
   1313   unsigned  d;
   1314   double    t;
   1315 
   1316   if (have_cycles == 1)
   1317     {
   1318       t = (end[0] - start[0]);
   1319     }
   1320   else
   1321     {
   1322       d = end[0] - start[0];
   1323       t = d - (d > end[0] ? M_2POWU : 0.0);
   1324       t += (end[1] - start[1]) * M_2POW32;
   1325     }
   1326   return t;
   1327 }
   1328 
   1329 
   1330 double
   1331 speed_mftb_diff (const unsigned end[2], const unsigned start[2])
   1332 {
   1333   unsigned  d;
   1334   double    t;
   1335 
   1336   d = end[0] - start[0];
   1337   t = (double) d - (d > end[0] ? M_2POW32 : 0.0);
   1338   t += (end[1] - start[1]) * M_2POW32;
   1339   return t;
   1340 }
   1341 
   1342 
/* Function body calculating the difference between "start" and "end" using
   fields "sec" and "psec", where each "psec" is a "punit" of a second.  The
   enclosing function must have pointer parameters called "end" and "start",
   and the expansion returns the difference in seconds as a double.

   The seconds parts are allowed to cancel before being combined with the
   psec parts, in case a simple "sec+psec*punit" exceeds the precision of a
   double.

   Total time is only calculated in a "double" since an integer count of
   psecs might overflow.  2^32 microseconds is only a bit over an hour, or
   2^32 nanoseconds only about 4 seconds.

   The casts to "long" are for the benefit of timebasestruct_t, where the
   fields are only "unsigned int", but we want a signed difference.

   "punit" is parenthesized in the expansion so it can be any expression,
   not just a single constant.  */

#define DIFF_SECS_ROUTINE(sec, psec, punit)                     \
  {                                                             \
    long  sec_diff, psec_diff;                                  \
    sec_diff = (long) end->sec - (long) start->sec;             \
    psec_diff = (long) end->psec - (long) start->psec;          \
    return (double) sec_diff + (punit) * (double) psec_diff;    \
  }
   1364 
   1365 double
   1366 timeval_diff_secs (const struct_timeval *end, const struct_timeval *start)
   1367 {
   1368   DIFF_SECS_ROUTINE (tv_sec, tv_usec, 1e-6);
   1369 }
   1370 
   1371 double
   1372 rusage_diff_secs (const struct_rusage *end, const struct_rusage *start)
   1373 {
   1374   DIFF_SECS_ROUTINE (ru_utime.tv_sec, ru_utime.tv_usec, 1e-6);
   1375 }
   1376 
   1377 double
   1378 timespec_diff_secs (const struct_timespec *end, const struct_timespec *start)
   1379 {
   1380   DIFF_SECS_ROUTINE (tv_sec, tv_nsec, 1e-9);
   1381 }
   1382 
   1383 /* This is for use after time_base_to_time, ie. for seconds and nanoseconds. */
   1384 double
   1385 timebasestruct_diff_secs (const timebasestruct_t *end,
   1386 			  const timebasestruct_t *start)
   1387 {
   1388   DIFF_SECS_ROUTINE (tb_high, tb_low, 1e-9);
   1389 }
   1390 
   1391 
   1392 double
   1393 speed_endtime (void)
   1394 {
   1395 #define END_USE(name,value)                             \
   1396   do {                                                  \
   1397     if (speed_option_verbose >= 3)                      \
   1398       printf ("speed_endtime(): used %s\n", name);      \
   1399     result = value;                                     \
   1400     goto done;                                          \
   1401   } while (0)
   1402 
   1403 #define END_ENOUGH(name,value)                                          \
   1404   do {                                                                  \
   1405     if (speed_option_verbose >= 3)                                      \
   1406       printf ("speed_endtime(): %s gives enough precision\n", name);    \
   1407     result = value;                                                     \
   1408     goto done;                                                          \
   1409   } while (0)
   1410 
   1411 #define END_EXCEED(name,value)                                            \
   1412   do {                                                                    \
   1413     if (speed_option_verbose >= 3)                                        \
   1414       printf ("speed_endtime(): cycle counter limit exceeded, used %s\n", \
   1415 	      name);                                                      \
   1416     result = value;                                                       \
   1417     goto done;                                                            \
   1418   } while (0)
   1419 
   1420   unsigned          end_cycles[2];
   1421   stck_t            end_stck;
   1422   unsigned          end_mftb[2];
   1423   unsigned          end_sgi;
   1424   timebasestruct_t  end_rrt;
   1425   struct_timespec   end_cgt;
   1426   struct_timeval    end_gtod;
   1427   struct_rusage     end_grus;
   1428   struct_tms        end_times;
   1429   double            t_gtod, t_grus, t_times, t_cgt;
   1430   double            t_rrt, t_sgi, t_mftb, t_stck, t_cycles;
   1431   double            result;
   1432 
   1433   /* Cycles sampled first for maximum accuracy.
   1434      "have_" values tested to let unused code go dead.  */
   1435 
   1436   if (have_cycles && use_cycles)  speed_cyclecounter (end_cycles);
   1437   if (have_stck   && use_stck)    STCK (end_stck);
   1438   if (have_mftb   && use_mftb)    MFTB (end_mftb);
   1439   if (have_sgi    && use_sgi)     end_sgi = *sgi_addr;
   1440   if (have_rrt    && use_rrt)     read_real_time (&end_rrt, sizeof(end_rrt));
   1441   if (have_cgt    && use_cgt)     clock_gettime (CGT_ID, &end_cgt);
   1442   if (have_gtod   && use_gtod)    gettimeofday (&end_gtod, NULL);
   1443   if (have_grus   && use_grus)    getrusage (0, &end_grus);
   1444   if (have_times  && use_times)   times (&end_times);
   1445 
   1446   result = -1.0;
   1447 
   1448   if (speed_option_verbose >= 4)
   1449     {
   1450       printf ("speed_endtime():\n");
   1451       if (use_cycles)
   1452 	printf ("   cycles  0x%X,0x%X -> 0x%X,0x%X\n",
   1453 		start_cycles[1], start_cycles[0],
   1454 		end_cycles[1], end_cycles[0]);
   1455 
   1456       if (use_stck)
   1457 	printf ("   stck  0x%lX -> 0x%lX\n", start_stck, end_stck);
   1458 
   1459       if (use_mftb)
   1460 	printf ("   mftb  0x%X,%08X -> 0x%X,%08X\n",
   1461 		start_mftb[1], start_mftb[0],
   1462 		end_mftb[1], end_mftb[0]);
   1463 
   1464       if (use_sgi)
   1465 	printf ("   sgi  0x%X -> 0x%X\n", start_sgi, end_sgi);
   1466 
   1467       if (use_rrt)
   1468 	printf ("   read_real_time  (%d)%u,%u -> (%d)%u,%u\n",
   1469 		start_rrt.flag, start_rrt.tb_high, start_rrt.tb_low,
   1470 		end_rrt.flag, end_rrt.tb_high, end_rrt.tb_low);
   1471 
   1472       if (use_cgt)
   1473 	printf ("   clock_gettime  %ld.%09ld -> %ld.%09ld\n",
   1474 		(long) start_cgt.tv_sec, (long) start_cgt.tv_nsec,
   1475 		(long) end_cgt.tv_sec, (long) end_cgt.tv_nsec);
   1476 
   1477       if (use_gtod)
   1478 	printf ("   gettimeofday  %ld.%06ld -> %ld.%06ld\n",
   1479 		(long) start_gtod.tv_sec,
   1480 		(long) start_gtod.tv_usec,
   1481 		(long) end_gtod.tv_sec,
   1482 		(long) end_gtod.tv_usec);
   1483 
   1484       if (use_grus)
   1485 	printf ("   getrusage  %ld.%06ld -> %ld.%06ld\n",
   1486 		(long) start_grus.ru_utime.tv_sec,
   1487 		(long) start_grus.ru_utime.tv_usec,
   1488 		(long) end_grus.ru_utime.tv_sec,
   1489 		(long) end_grus.ru_utime.tv_usec);
   1490 
   1491       if (use_times)
   1492 	printf ("   times  %ld -> %ld\n",
   1493 		start_times.tms_utime, end_times.tms_utime);
   1494     }
   1495 
   1496   if (use_rrt)
   1497     {
   1498       time_base_to_time (&start_rrt, sizeof(start_rrt));
   1499       time_base_to_time (&end_rrt, sizeof(end_rrt));
   1500       t_rrt = timebasestruct_diff_secs (&end_rrt, &start_rrt);
   1501       END_USE ("read_real_time()", t_rrt);
   1502     }
   1503 
   1504   if (use_cgt)
   1505     {
   1506       t_cgt = timespec_diff_secs (&end_cgt, &start_cgt);
   1507       END_USE ("clock_gettime()", t_cgt);
   1508     }
   1509 
   1510   if (use_grus)
   1511     {
   1512       t_grus = rusage_diff_secs (&end_grus, &start_grus);
   1513 
   1514       /* Use getrusage() if the cycle counter limit would be exceeded, or if
   1515 	 it provides enough accuracy already. */
   1516       if (use_cycles)
   1517 	{
   1518 	  if (t_grus >= speed_precision*grus_unittime)
   1519 	    END_ENOUGH ("getrusage()", t_grus);
   1520 	  if (t_grus >= cycles_limit)
   1521 	    END_EXCEED ("getrusage()", t_grus);
   1522 	}
   1523     }
   1524 
   1525   if (use_times)
   1526     {
   1527       t_times = (end_times.tms_utime - start_times.tms_utime) * times_unittime;
   1528 
   1529       /* Use times() if the cycle counter limit would be exceeded, or if
   1530 	 it provides enough accuracy already. */
   1531       if (use_cycles)
   1532 	{
   1533 	  if (t_times >= speed_precision*times_unittime)
   1534 	    END_ENOUGH ("times()", t_times);
   1535 	  if (t_times >= cycles_limit)
   1536 	    END_EXCEED ("times()", t_times);
   1537 	}
   1538     }
   1539 
   1540   if (use_gtod)
   1541     {
   1542       t_gtod = timeval_diff_secs (&end_gtod, &start_gtod);
   1543 
   1544       /* Use gettimeofday() if it measured a value bigger than the cycle
   1545 	 counter can handle.  */
   1546       if (use_cycles)
   1547 	{
   1548 	  if (t_gtod >= cycles_limit)
   1549 	    END_EXCEED ("gettimeofday()", t_gtod);
   1550 	}
   1551     }
   1552 
   1553   if (use_mftb)
   1554     {
   1555       t_mftb = speed_mftb_diff (end_mftb, start_mftb) * mftb_unittime;
   1556       END_USE ("mftb", t_mftb);
   1557     }
   1558 
   1559   if (use_stck)
   1560     {
   1561       t_stck = (end_stck - start_stck) * STCK_PERIOD;
   1562       END_USE ("stck", t_stck);
   1563     }
   1564 
   1565   if (use_sgi)
   1566     {
   1567       t_sgi = (end_sgi - start_sgi) * sgi_unittime;
   1568       END_USE ("SGI hardware counter", t_sgi);
   1569     }
   1570 
   1571   if (use_cycles)
   1572     {
   1573       t_cycles = speed_cyclecounter_diff (end_cycles, start_cycles)
   1574 	* speed_cycletime;
   1575       END_USE ("cycle counter", t_cycles);
   1576     }
   1577 
   1578   if (use_grus && getrusage_microseconds_p())
   1579     END_USE ("getrusage()", t_grus);
   1580 
   1581   if (use_gtod && gettimeofday_microseconds_p())
   1582     END_USE ("gettimeofday()", t_gtod);
   1583 
   1584   if (use_times)  END_USE ("times()",        t_times);
   1585   if (use_grus)   END_USE ("getrusage()",    t_grus);
   1586   if (use_gtod)   END_USE ("gettimeofday()", t_gtod);
   1587 
   1588   fprintf (stderr, "speed_endtime(): oops, no time method available\n");
   1589   abort ();
   1590 
   1591  done:
   1592   if (result < 0.0)
   1593     {
   1594       if (speed_option_verbose >= 2)
   1595 	fprintf (stderr, "speed_endtime(): warning, treating negative time as zero: %.9f\n", result);
   1596       result = 0.0;
   1597     }
   1598   return result;
   1599 }
   1600