Home | History | Annotate | Line # | Download | only in common
hwcdrv.c revision 1.1.1.1
      1 /* Copyright (C) 2021 Free Software Foundation, Inc.
      2    Contributed by Oracle.
      3 
      4    This file is part of GNU Binutils.
      5 
      6    This program is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 3, or (at your option)
      9    any later version.
     10 
     11    This program is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU General Public License for more details.
     15 
     16    You should have received a copy of the GNU General Public License
     17    along with this program; if not, write to the Free Software
     18    Foundation, 51 Franklin Street - Fifth Floor, Boston,
     19    MA 02110-1301, USA.  */
     20 
     21 #include <errno.h>
     22 #include <unistd.h>
     23 #include <fcntl.h>
     24 #include <sys/mman.h>
     25 #include <sys/ioctl.h>
     26 #include <sys/syscall.h>
     27 #include <linux/perf_event.h>
     28 
     29 #include "hwcdrv.h"
     30 
     31 /*---------------------------------------------------------------------------*/
     32 /* macros */
     33 #define IS_GLOBAL /* Mark global symbols */
     34 
     35 #include "cpuid.c" /* ftns for identifying a chip */
     36 
     37 static hdrv_pcbe_api_t hdrv_pcbe_core_api;
     38 static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
     39 static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
     40   &hdrv_pcbe_core_api,
     41   &hdrv_pcbe_opteron_api,
     42   NULL
     43 };
     44 #include "opteron_pcbe.c" /* CPU-specific code */
     45 #include "core_pcbe.c" /* CPU-specific code  */
     46 
     47 extern hwcdrv_api_t hwcdrv_pcl_api;
     48 IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
     49   &hwcdrv_pcl_api,
     50   NULL
     51 };
     52 
     53 /*---------------------------------------------------------------------------*/
     54 
     55 /* utils for drivers */
     56 IS_GLOBAL int
     57 hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
     58 {
     59   unsigned int pmc_assigned[MAX_PICS];
     60   unsigned idx;
     61   for (int ii = 0; ii < MAX_PICS; ii++)
     62     pmc_assigned[ii] = 0;
     63 
     64   /* assign the HWCs that we already know about */
     65   for (idx = 0; idx < numctrs; idx++)
     66     {
     67       regno_t regno = entries[idx]->reg_num;
     68       if (regno == REGNO_ANY)
     69 	{
     70 	  /* check to see if list of possible registers only contains one entry */
     71 	  regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
     72 	}
     73       if (regno != REGNO_ANY)
     74 	{
     75 	  if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
     76 	    {
     77 	      logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
     78 	      return HWCFUNCS_ERROR_HWCARGS;
     79 	    }
     80 	  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
     81 	  entries[idx]->reg_num = regno; /* assigning back to entries */
     82 	  pmc_assigned[regno] = 1;
     83 	}
     84     }
     85 
     86   /* assign HWCs that are currently REGNO_ANY */
     87   for (idx = 0; idx < numctrs; idx++)
     88     {
     89       if (entries[idx]->reg_num == REGNO_ANY)
     90 	{
     91 	  int assigned = 0;
     92 	  regno_t *reg_list = entries[idx]->reg_list;
     93 	  for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
     94 	    {
     95 	      regno_t regno = *reg_list;
     96 	      if (regno < 0 || regno >= MAX_PICS)
     97 		{
     98 		  logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
     99 		  return HWCFUNCS_ERROR_HWCARGS;
    100 		}
    101 	      if (pmc_assigned[regno] == 0)
    102 		{
    103 		  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned:   idx=%d, regno=%d\n", idx, regno);
    104 		  entries[idx]->reg_num = regno; /* assigning back to entries */
    105 		  pmc_assigned[regno] = 1;
    106 		  assigned = 1;
    107 		  break;
    108 		}
    109 	    }
    110 	  if (!assigned)
    111 	    {
    112 	      logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
    113 		      entries[idx]->name ? entries[idx]->name : "<NULL>");
    114 	      return HWCFUNCS_ERROR_HWCARGS;
    115 	    }
    116 	}
    117     }
    118   return 0;
    119 }
    120 
    121 IS_GLOBAL int
    122 hwcdrv_lookup_cpuver (const char * cpcN_cciname)
    123 {
    124   libcpc2_cpu_lookup_t *plookup;
    125   static libcpc2_cpu_lookup_t cpu_table[] = {
    126     LIBCPC2_CPU_LOOKUP_LIST
    127   };
    128   if (cpcN_cciname == NULL)
    129     return CPUVER_UNDEFINED;
    130 
    131   /* search table for name */
    132   for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
    133     {
    134       int n = strlen (plookup->cpc2_cciname);
    135       if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
    136 	return plookup->cpc2_cpuver;
    137     }
    138   /* unknown, but does have a descriptive string */
    139   TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
    140 	    "could not be determined\n",
    141 	    cpcN_cciname);
    142   return CPUVER_GENERIC;
    143 }
    144 
    145 /*---------------------------------------------------------------------------*/
    146 /* utils to generate x86 register definitions on Linux */
    147 
    148 /*
    149  *  This code is structured as though we're going to initialize the
    150  *  HWC by writing the Intel MSR register directly.  That is, we
    151  *  assume the lowest 16 bits of the event number will have the event
    152  *  and that higher bits will set attributes.
    153  *
    154  *  While SPARC is different, we can nonetheless use basically the
    155  *  same "x86"-named functions:
    156  *
    157  *  - The event code will still be 16 bits.  It will still
    158  *    be in the lowest 16 bits of the event number.  Though
 *    perf_event_code() on SPARC will expect those bits to be
 *    shifted, hwcdrv_pcl.c can easily perform that shift.
    161  *
    162  *  - On SPARC we support only two attributes, "user" and "system",
    163  *    which hwcdrv_pcl.c already converts to the "exclude_user"
    164  *    and "exclude_kernel" fields expected by perf_event_open().
    165  *    "user" and "system" are stored in event bits 16 and 17.
    166  *    For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
    167  */
    168 
    169 IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;
    170 
    171 static const attr_info_t perfctr_sparc_attrs[] = {
    172   {NTXT ("user"),   0, 0x01, 16}, //usr
    173   {NTXT ("system"), 0, 0x01, 17}, //os
    174   {NULL, 0, 0x00, 0},
    175 };
    176 static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
    177   {NTXT ("umask"),  0, 0xff, 8},
    178   {NTXT ("user"),   0, 0x01, 16}, //usr
    179   //{NTXT("nouser"),  1, 0x01, 16}, //usr (inverted)
    180   {NTXT ("system"), 0, 0x01, 17}, //os
    181   {NTXT ("edge"),   0, 0x01, 18},
    182   {NTXT ("pc"),     0, 0x01, 19},
    183   {NTXT ("inv"),    0, 0x01, 23},
    184   {NTXT ("cmask"),  0, 0xff, 24},
    185   {NULL, 0, 0x00, 0},
    186 };
    187 const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;
    188 
    189 static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
    190     // (0xff <<  0) |   /* event*/
    191     // (0xff <<  8) |   /* umask */
    192     // (0x01 << 17) |   /* os */
    193     // (0x01 << 18) |   /* edge */
    194     // (0x01 << 19) |   /* pc */
    195     (0x01 << 20) |      /* int */
    196     // (0x01 << 21) |   /* reserved */
    197     (0x01 << 22) |      /* enable */
    198     // (0x01 << 23) |   /* inv */
    199     // (0xff << 24) |   /* cmask */
    200     0;
    201 
    202 static int
    203 myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
    204 			    eventsel_t *eventsel, eventsel_t *valid_umask,
    205 			    uint_t *pmc_sel)
    206 {
    207   if (hwcdrv_get_x86_eventnum &&
    208       !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
    209     return 0;
    210 
    211   /* check for numerically-specified counters */
    212   char * endptr;
    213   uint64_t num = strtoull (eventname, &endptr, 0);
    214   if (*eventname && !*endptr)
    215     {
    216       *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
    217       *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
    218       *pmc_sel = pmc;
    219       return 0;
    220     }
    221 
    222   /* name does not specify a numeric value */
    223   *eventsel = (eventsel_t) - 1;
    224   *valid_umask = 0x0;
    225   *pmc_sel = pmc;
    226   return -1;
    227 }
    228 
    229 static int
    230 mask_shift_set (eventsel_t *presult, eventsel_t invalue,
    231 		eventsel_t mask, eventsel_t shift)
    232 {
    233   if (invalue & ~mask)
    234     return -1; /* invalue attempts to set bits outside of mask */
    235   *presult &= ~(mask << shift); /* clear all the mask bits */
    236   *presult |= (invalue << shift); /* set bits according to invalue */
    237   return 0;
    238 }
    239 
    240 static int
    241 set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
    242 		   hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
    243 {
    244   eventsel_t evntsel = *result_mask;
    245   for (int ii = 0; ii < (int) nattrs; ii++)
    246     {
    247       const char *attrname = attrs[ii].ca_name;
    248       eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
    249       const char *tmpname;
    250       int attr_found = 0;
    251       for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
    252 	{
    253 	  if (strcmp (attrname, tmpname) == 0)
    254 	    {
    255 	      if (strcmp (attrname, "umask") == 0)
    256 		{
    257 		  if (attrval & ~evnt_valid_umask)
    258 		    {
    259 		      logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
    260 			      nameOnly, (long long) evnt_valid_umask);
    261 		      return -1;
    262 		    }
    263 		}
    264 	      if (mask_shift_set (&evntsel,
    265 				  perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval,
    266 				  perfctr_attrs_table[jj].mask,
    267 				  perfctr_attrs_table[jj].shift))
    268 		{
    269 		  logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
    270 			  nameOnly, attrname, (long long) attrval);
    271 		  return -1;
    272 		}
    273 	      TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
    274 			nameOnly, attrname, (long long) attrval);
    275 	      attr_found = 1;
    276 	      break;
    277 	    }
    278 	}
    279       if (!attr_found)
    280 	{
    281 	  logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
    282 	  return -1;
    283 	}
    284     }
    285   *result_mask = evntsel;
    286   return 0;
    287 }
    288 
    289 IS_GLOBAL int
    290 hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
    291 			   eventsel_t *return_event, uint_t *return_pmc_sel)
    292 {
    293   hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
    294   unsigned nattrs = 0;
    295   char *nameOnly = NULL;
    296   eventsel_t evntsel = 0; // event number
    297   eventsel_t evnt_valid_umask = 0;
    298   uint_t pmc_sel = 0;
    299   int rc = -1;
    300   *return_event = 0;
    301   *return_pmc_sel = 0;
    302   void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
    303 				   &nattrs, NULL);
    304   if (!attr_mem)
    305     {
    306       logerr (GTXT ("out of memory, could not parse attributes\n"));
    307       return -1;
    308     }
    309   hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
    310   if (regno == REGNO_ANY)
    311     {
    312       logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
    313       goto attr_wrapup;
    314     }
    315 
    316   /* look up evntsel */
    317   if (myperfctr_get_x86_eventnum (nameOnly, regno,
    318 				  &evntsel, &evnt_valid_umask, &pmc_sel))
    319     {
    320       logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
    321       goto attr_wrapup;
    322     }
    323   TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
    324 	    (long long) evntsel, pmc_sel, nameOnly, nattrs);
    325 
    326   /* determine event attributes */
    327   eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
    328   if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
    329     goto attr_wrapup;
    330   if (evntsel & evnt_attrs)
    331     TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
    332 	      (long long) evntsel, (long long) evnt_attrs,
    333 	      (long long) (evntsel & evnt_attrs));
    334   *return_event = evntsel | evnt_attrs;
    335   *return_pmc_sel = pmc_sel;
    336   rc = 0;
    337 
    338 attr_wrapup:
    339   free (attr_mem);
    340   free (nameOnly);
    341   return rc;
    342 }
    343 
/* Trap instruction and clobber list used by the inline-assembly system
   call in hwcdrv_gettid().  Defined only for the two x86 targets.  */
#if defined(__x86_64__)
#define syscall_instr          "syscall"
#define syscall_clobber        "rcx", "r11", "memory"
#endif
#if defined(__i386__)
#define syscall_instr          "int $0x80"
#define syscall_clobber        "memory"
#endif
    352 
/* Thin wrapper around the perf_event_open system call.

   The call has been seen to fail spuriously even though an immediate
   retry succeeds, so it is attempted up to five times.  Returns the
   first result that is not -1, or -1 if every attempt fails.  */
static inline int
perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
                 int cpu, int group_fd, unsigned long flags)
{
  int attempts_left = 5;
  int fd;
  do
    fd = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
  while (fd == -1 && --attempts_left > 0);
  return fd;
}
    370 
    371 /*---------------------------------------------------------------------------*/
    372 /* macros & fwd prototypes */
    373 
/* Functions that implement the hwcdrv API are tagged with HWCDRV_API,
   which gives them internal linkage.  */
#define HWCDRV_API      static /* Mark functions used by hwcdrv API */

/* Forward declarations of API functions.  */
HWCDRV_API int hwcdrv_start (void);
HWCDRV_API int hwcdrv_free_counters ();
    378 
/* Return the kernel thread id of the calling thread.

   When built as LIBCOLLECTOR_SRC for "intel" the gettid system call is
   issued directly from inline assembly; every other configuration goes
   through syscall().  */
static pid_t
hwcdrv_gettid (void)
{
#if defined(LIBCOLLECTOR_SRC) && defined(intel)
  pid_t tid;
  __asm__ __volatile__(syscall_instr
                       : "=a" (tid) : "0" (__NR_gettid)
                       : syscall_clobber);
  return tid;
#else
  /* FIXUP_XXX_SPARC_LINUX: for LIBCOLLECTOR_SRC builds on other
     architectures gettid should also be written in asm.  */
  return syscall (__NR_gettid);
#endif
}
    394 
    395 /*---------------------------------------------------------------------------*/
    396 /* types */
    397 
/* Number of data pages in each perf_event sample buffer.
   Must be a power of 2.  */
#define NPAGES_PER_BUF  1
    400 
    401 /*---------------------------------------------------------------------------*/
    402 
    403 /* typedefs */
    404 
    405 typedef struct
    406 { // event (hwc) definition
    407   unsigned int reg_num; // PMC assignment, potentially for detecting conflicts
    408   eventsel_t eventsel;          // raw event bits (Intel/AMD)
    409   uint64_t counter_preload;     // number of HWC events before signal
    410   struct perf_event_attr hw;    // perf_event definition
    411   hrtime_t min_time;            // minimum time we're targeting between events
    412   char *name;
    413 } perf_event_def_t;
    414 
/* Runtime state of one perf_event sample buffer.  */
typedef struct
{
  /* Pointer to the mmapped buffer.  */
  void *buf;
  /* Size of a page.  */
  size_t pagesz;
} buffer_state_t;
    420 
/* Counter readings from the previous sample, kept so that read_sample()
   can compute deltas.  */
typedef struct
{
  /* Previous perf_event "enabled" time.  */
  uint64_t prev_ena_ts;
  /* Previous perf_event "running" time.  */
  uint64_t prev_run_ts;
  /* Previous HWC value.  */
  uint64_t prev_value;
} counter_value_state_t;
    427 
    428 typedef struct
    429 { // per-counter information
    430   perf_event_def_t *ev_def;     // global HWC definition for one counter
    431   int fd;                       // perf_event fd
    432   buffer_state_t buf_state;     // perf_event buffer's state
    433   counter_value_state_t value_state; // counter state
    434   int needs_restart;            // workaround for dbx failure to preserve si_fd
    435   uint64_t last_overflow_period;
    436   hrtime_t last_overflow_time;
    437 } counter_state_t;
    438 
    439 typedef struct
    440 { // per-thread context
    441   counter_state_t *ctr_list;
    442   int signal_fd;                // fd that caused the most recent signal
    443   pthread_t tid;                // for debugging signal delivery problems
    444 } hdrv_pcl_ctx_t;
    445 
    446 /*---------------------------------------------------------------------------*/
    447 
    448 /* static variables */
    449 static struct
    450 {
    451   int library_ok;
    452   int internal_open_called;
    453   hwcfuncs_tsd_get_fn_t find_vpc_ctx;
    454   unsigned hwcdef_cnt;      /* number of *active* hardware counters */
    455   hwcdrv_get_events_fn_t *get_events;
    456 } hdrv_pcl_state;
    457 
    458 static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
    459 static perf_event_def_t global_perf_event_def[MAX_PICS];
    460 
    461 #define COUNTERS_ENABLED()      (hdrv_pcl_state.hwcdef_cnt)
    462 
    463 
    464 /* perf_event buffer formatting and handling */
    465 static void
    466 reset_buf (buffer_state_t *bufstate)
    467 {
    468   TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
    469   struct perf_event_mmap_page *metadata = bufstate->buf;
    470   if (metadata)
    471     metadata->data_tail = metadata->data_head;
    472 }
    473 
    474 static int
    475 skip_buf (buffer_state_t *bufstate, size_t sz)
    476 {
    477   TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
    478   struct perf_event_mmap_page *metadata = bufstate->buf;
    479   if (metadata == NULL)
    480     return -1;
    481   size_t pgsz = bufstate->pagesz;
    482   size_t bufsz = NPAGES_PER_BUF*pgsz;
    483   uint64_t d_tail = metadata->data_tail;
    484   uint64_t d_head = metadata->data_head;
    485 
    486   // validate request size
    487   if (sz > d_head - d_tail || sz >= bufsz)
    488     {
    489       reset_buf (bufstate);
    490       return -1;
    491     }
    492   metadata->data_tail = d_tail + sz; // advance tail
    493   return 0;
    494 }
    495 
    496 static int
    497 read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
    498 {
    499   struct perf_event_mmap_page *metadata = bufstate->buf;
    500   if (metadata == NULL)
    501     return -1;
    502   size_t pgsz = bufstate->pagesz;
    503   size_t bufsz = NPAGES_PER_BUF*pgsz;
    504   uint64_t d_tail = metadata->data_tail;
    505   uint64_t d_head = metadata->data_head;
    506 
    507   // validate request size
    508   if (sz > d_head - d_tail || sz >= bufsz)
    509     {
    510       reset_buf (bufstate);
    511       return -1;
    512     }
    513   char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
    514   uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer
    515   size_t nbytes = sz;
    516   if (start_pos + sz > bufsz)
    517     {
    518       // will wrap past end of buffer
    519       nbytes = bufsz - start_pos;
    520       memcpy (buf, buf_base + start_pos, nbytes);
    521       start_pos = 0; // wrap to start
    522       buf = (void *) (((char *) buf) + nbytes);
    523       nbytes = sz - nbytes;
    524     }
    525   memcpy (buf, buf_base + start_pos, nbytes);
    526   metadata->data_tail += sz;
    527   return 0;
    528 }
    529 
    530 static int
    531 read_u64 (buffer_state_t *bufstate, uint64_t *value)
    532 {
    533   return read_buf (bufstate, value, sizeof (uint64_t));
    534 }
    535 
    536 static int
    537 read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
    538 	     uint64_t *rlost)
    539 {
    540   // returns count of bytes read
    541   buffer_state_t *bufstate = &ctr_state->buf_state;
    542   counter_value_state_t *cntstate = &ctr_state->value_state;
    543   int readsz = 0;
    544 
    545   // PERF_SAMPLE_IP
    546   uint64_t ipc = 0;
    547   int rc = read_u64 (bufstate, &ipc);
    548   if (rc)
    549     return -1;
    550   readsz += sizeof (uint64_t);
    551 
    552   // PERF_SAMPLE_READ: value
    553   uint64_t value = 0;
    554   rc = read_u64 (bufstate, &value);
    555   if (rc)
    556     return -2;
    557   readsz += sizeof (uint64_t);
    558 
    559   /* Bug 20806896
    560    * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
    561    * running times in the sample data that correspond to the metadata times
    562    *     metadata->time_enabled
    563    *     metadata->time_running
    564    * from the PREVIOUS (not current) sample.  Probably just ignore this bug
    565    * since it's on old kernels and we only use the enabled and running times
    566    * to construct loss_estimate.
    567    */
    568   // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
    569   uint64_t enabled_time = 0;
    570   rc = read_u64 (bufstate, &enabled_time);
    571   if (rc)
    572     return -3;
    573   readsz += sizeof (uint64_t);
    574 
    575   // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
    576   uint64_t running_time = 0;
    577   rc = read_u64 (bufstate, &running_time);
    578   if (rc)
    579     return -4;
    580   readsz += sizeof (uint64_t);
    581 
    582   uint64_t value_delta = value - cntstate->prev_value;
    583   uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
    584   uint64_t running_delta = running_time - cntstate->prev_run_ts;
    585   cntstate->prev_value = value;
    586   cntstate->prev_ena_ts = enabled_time;
    587   cntstate->prev_run_ts = running_time;
    588 
    589   // 24830461 need workaround for Linux anomalous HWC skid overrun
    590   int set_error_flag = 0;
    591   if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
    592     set_error_flag = 1;
    593 
    594   uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
    595   if (running_delta == enabled_delta)
    596     {
    597       // counter was running 100% of time, no multiplexing
    598     }
    599   else if (running_delta == 0)
    600     loss_estimate = 1; // token amount to aid in debugging perfctr oddities
    601   else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
    602     {
    603       // running should be smaller than enabled, can't estimate
    604       /*
    605        * 21418391 HWC can have a negative count
    606        *
    607        * We've also seen enabled not only be smaller than running
    608        * but in fact go negative.  Guard against this.
    609        */
    610       loss_estimate = 2; // token amount to aid in debugging perfctr oddities
    611     }
    612   else
    613     {
    614       // counter was running less than 100% of time
    615       // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
    616       uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
    617       value_delta = scaled_delta;
    618 #if 0
    619       // We should perhaps warn the user that multiplexing is going on,
    620       // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
    621       // For now we simply don't report.
    622       // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
    623       // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
    624       collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
    625 				     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
    626 				     ctr_list[idx].last_overflow_period, new_period);
    627 #endif
    628     }
    629   TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
    630 	    "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
    631 	    "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
    632 	    ctr_state->ev_def->name, (long long) ipc,
    633 	    (long long) enabled_delta, (long long) running_delta,
    634 	    (long long) value_delta, (long long) value_delta,
    635 	    (unsigned long long) loss_estimate,
    636 	    loss_estimate ? ", WARNING - SCALED" : "",
    637 	    set_error_flag ? ", ERRORFLAG" : "");
    638   if (set_error_flag == 1)
    639     value_delta |= (1ULL << 63)     /* HWCVAL_ERR_FLAG */;
    640   *rvalue = value_delta;
    641   *rlost = loss_estimate;
    642   if (readsz != msgsz)
    643     {
    644       TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
    645       return -5;
    646     }
    647   return 0;
    648 }
    649 
    650 static void
    651 dump_perf_event_attr (struct perf_event_attr *at)
    652 {
    653   TprintfT (DBG_LT2, "dump_perf_event_attr:  size=%d  type=%d  sample_period=%lld\n"
    654 	    "  config=0x%llx  config1=0x%llx  config2=0x%llx  wakeup_events=%lld __reserved_1=%lld\n",
    655 	    (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
    656 	    (unsigned long long) at->config, (unsigned long long) at->config1,
    657 	    (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
    658 	    (unsigned long long) at->__reserved_1);
    659 #define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, "  %-10s : %lld\n", #fld, (long long) at->fld)
    660   DUMP_F (disabled);
    661   DUMP_F (inherit);
    662   DUMP_F (pinned);
    663   DUMP_F (exclusive);
    664   DUMP_F (exclude_user);
    665   DUMP_F (exclude_kernel);
    666   DUMP_F (exclude_hv);
    667   DUMP_F (exclude_idle);
    668   //    DUMP_F(xmmap);
    669   DUMP_F (comm);
    670   DUMP_F (freq);
    671   DUMP_F (inherit_stat);
    672   DUMP_F (enable_on_exec);
    673   DUMP_F (task);
    674   DUMP_F (watermark);
    675 }
    676 
/* Fill in *HW for a sampling counter described by EVENT that overflows
   every PERIOD events.

   EVENT uses the layout described earlier in this file: bit 16 requests
   user-mode counting and bit 17 kernel-mode counting.  How the rest of
   EVENT maps onto type/config depends on the target architecture.  The
   counter is created disabled, excludes the hypervisor, and wakes up
   after every event.  The result is traced with
   dump_perf_event_attr().  */
static void
init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period)
{
  memset (hw, 0, sizeof (*hw));
  hw->size = sizeof (*hw); /* fwd/bwd compat */

#if defined(__i386__) || defined(__x86_64)
  /* note: Nehalem/Westmere OFFCORE_RESPONSE in upper 32 bits */
  hw->type = PERF_TYPE_RAW;
  hw->config = event;
#elif defined(__aarch64__)
  hw->type = (event >> 24) & 7;
  hw->config = event & 0xff;
#elif defined(sparc)
  /* The SPARC event code is shifted up 16 bits.  For M8 the mask of
     supported PICs (event bits [23:20], see sparc_pcbe.c) goes to
     config bits [7:4].  */
  uint64_t pic_mask = (event >> 20) & 0xf;
  hw->type = PERF_TYPE_RAW;
  hw->config = ((event & 0xFFFF) << 16) | (pic_mask << 4);
#endif

  hw->sample_period = period;

  /* Each sample carries the instruction pointer and the HWC value.
     TID, TIME, ADDR, CALLCHAIN, ID, CPU, PERIOD, STREAM_ID and RAW are
     not requested.  */
  hw->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ;

  /* Enabled/running times let us detect when the HWC was not
     scheduled.  ID and GROUP formats are not requested.  */
  hw->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED
                    | PERF_FORMAT_TOTAL_TIME_RUNNING;

  hw->disabled = 1; /* off by default */

  /* Note: the following override config.priv bits!  */
  hw->exclude_user = !(event & (1 << 16));   /* don't count user */
  hw->exclude_kernel = !(event & (1 << 17)); /* ditto kernel */
  hw->exclude_hv = 1;                        /* ditto hypervisor */
  hw->wakeup_events = 1;                     /* wakeup every n events */

  dump_perf_event_attr (hw);
}
    726 
    727 static int
    728 start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
    729 {
    730   // pe_attr should have been initialized in hwcdrv_create_counters()
    731   struct perf_event_attr pe_attr;
    732   memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));
    733 
    734   // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
    735   pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;
    736 
    737   int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
    738   if (hwc_fd == -1)
    739     {
    740       TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
    741 		error_string, ii, errno);
    742       return 1;
    743     }
    744 
    745   size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
    746   void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
    747 		     PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
    748   if (buf == MAP_FAILED)
    749     {
    750       TprintfT (0, "sz = %ld, pgsz = %ld\n  err=%s idx=%d mmap failed: %s\n",
    751 		(long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
    752       return 1;
    753     }
    754   pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def?  we never seem to use it
    755   pctx->ctr_list[ii].fd = hwc_fd;
    756   pctx->ctr_list[ii].buf_state.buf = buf;
    757   pctx->ctr_list[ii].buf_state.pagesz = pgsz;
    758   pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
    759   pctx->ctr_list[ii].value_state.prev_run_ts = 0;
    760   pctx->ctr_list[ii].value_state.prev_value = 0;
    761   pctx->ctr_list[ii].last_overflow_time = gethrtime ();
    762 
    763   /* set async mode */
    764   long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
    765   int rc = fcntl (hwc_fd, F_SETFL, flags);
    766   if (rc == -1)
    767     {
    768       TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
    769       return 1;
    770     }
    771 
    772   /*
    773    * set lwp ownership of the fd
    774    * See BUGS section of "man perf_event_open":
    775    *     The F_SETOWN_EX option to fcntl(2) is needed to properly get
    776    *     overflow signals in threads.  This was introduced in Linux 2.6.32.
    777    * Legacy references:
    778    *     see http://lkml.org/lkml/2009/8/4/128
    779    *     google man fcntl F_SETOWN_EX -conflict
    780    *       "From Linux 2.6.32 onward, use F_SETOWN_EX to target
    781    *       SIGIO and SIGURG signals at a particular thread."
    782    *     http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
    783    *     See 2010 CSCADS presentation by Eranian
    784    */
    785   struct f_owner_ex fowner_ex;
    786   fowner_ex.type = F_OWNER_TID;
    787   fowner_ex.pid = pctx->tid;
    788   rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
    789   if (rc == -1)
    790     {
    791       TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
    792       return 1;
    793     }
    794 
    795   /* Use sigio so handler can determine FD via siginfo->si_fd. */
    796   rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
    797   if (rc == -1)
    798     {
    799       TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
    800       return 1;
    801     }
    802   return 0;
    803 }
    804 
    805 static int
    806 stop_one_ctr (int ii, counter_state_t *ctr_list)
    807 {
    808   int hwc_rc = 0;
    809   if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
    810     {
    811       TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
    812       hwc_rc = HWCFUNCS_ERROR_GENERIC;
    813     }
    814   void *buf = ctr_list[ii].buf_state.buf;
    815   if (buf)
    816     {
    817       size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
    818       ctr_list[ii].buf_state.buf = NULL;
    819       int tmprc = munmap (buf, bufsz);
    820       if (tmprc)
    821 	{
    822 	  TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
    823 	  hwc_rc = HWCFUNCS_ERROR_GENERIC;
    824 	}
    825     }
    826   if (-1 == close (ctr_list[ii].fd))
    827     {
    828       TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
    829       hwc_rc = HWCFUNCS_ERROR_GENERIC;
    830     }
    831   return hwc_rc;
    832 }
    833 
    834 /* HWCDRV_API for thread-specific actions */
    835 HWCDRV_API int
    836 hwcdrv_lwp_init (void)
    837 {
    838   return hwcdrv_start ();
    839 }
    840 
    841 HWCDRV_API void
    842 hwcdrv_lwp_fini (void)
    843 {
    844   hwcdrv_free_counters ();  /* also sets pctx->ctr_list=NULL; */
    845 }
    846 
    847 /* open */
    848 static int
    849 hdrv_pcl_internal_open ()
    850 {
    851   if (hdrv_pcl_state.internal_open_called)
    852     {
    853       TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
    854       return HWCFUNCS_ERROR_ALREADY_CALLED;
    855     }
    856 
    857   // determine if PCL is available
    858   perf_event_def_t tmp_event_def;
    859   memset (&tmp_event_def, 0, sizeof (tmp_event_def));
    860   struct perf_event_attr *pe_attr = &tmp_event_def.hw;
    861   init_perf_event (pe_attr, 0, 0);
    862   pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event
    863   pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
    864   int hwc_fd = perf_event_open (pe_attr,
    865 				0, // pid/tid, 0 is self
    866 				-1, // cpu, -1 is per-thread mode
    867 				-1, // group_fd, -1 is root
    868 				0); // flags
    869   if (hwc_fd == -1)
    870     {
    871       TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
    872 		" perf_event_open() failed, errno=%d\n", errno);
    873       goto internal_open_error;
    874     }
    875 
    876   /* see if the PCL is new enough to know about F_SETOWN_EX */
    877   struct f_owner_ex fowner_ex;
    878   fowner_ex.type = F_OWNER_TID;
    879   fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
    880   if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
    881     {
    882       TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
    883 		"F_SETOWN failed, errno=%d\n", errno);
    884       close (hwc_fd);
    885       goto internal_open_error;
    886     }
    887   close (hwc_fd);
    888 
    889   hdrv_pcl_state.internal_open_called = 1;
    890   hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
    891   hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
    892   TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
    893   for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
    894     {
    895       hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
    896       if (!ppcbe->hdrv_pcbe_init ())
    897 	{
    898 	  hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
    899 	  hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
    900 	  if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
    901 	    goto internal_open_error;
    902 	  hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
    903 	  hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
    904 	  hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events;
    905 	  hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
    906 	  break;
    907 	}
    908     }
    909   if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
    910     {
    911       TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
    912 		" reducing number of HWCs from %u to %u on processor '%s'\n",
    913 		hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
    914       hdrv_pcl_about.cpcN_npics = MAX_PICS;
    915     }
    916   TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
    917 	    " perf_event cpuver=%d, name='%s'\n",
    918 	    hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
    919   return 0;
    920 
    921 internal_open_error:
    922   hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
    923   hdrv_pcl_about.cpcN_npics = 0;
    924   hdrv_pcl_about.cpcN_docref = NULL;
    925   hdrv_pcl_about.cpcN_cciname = NULL;
    926   return HWCFUNCS_ERROR_NOT_SUPPORTED;
    927 }
    928 
    929 static void *
    930 single_thread_tsd_ftn ()
    931 {
    932   static hdrv_pcl_ctx_t tsd_context;
    933   return &tsd_context;
    934 }
    935 
    936 /* HWCDRV_API */
    937 HWCDRV_API int
    938 hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
    939 {
    940   hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
    941   if (tsd_sz)
    942     *tsd_sz = sizeof (hdrv_pcl_ctx_t);
    943 
    944   if (hdrv_pcl_state.internal_open_called)
    945     return HWCFUNCS_ERROR_ALREADY_CALLED;
    946   return hdrv_pcl_internal_open ();
    947 }
    948 
    949 HWCDRV_API void
    950 hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
    951 		 const char **docref, uint64_t *support)
    952 {
    953   if (cpuver)
    954     *cpuver = hdrv_pcl_about.cpcN_cpuver;
    955   if (cciname)
    956     *cciname = hdrv_pcl_about.cpcN_cciname;
    957   if (npics)
    958     *npics = hdrv_pcl_about.cpcN_npics;
    959   if (docref)
    960     *docref = hdrv_pcl_about.cpcN_docref;
    961   if (support)
    962     *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
    963 }
    964 
    965 HWCDRV_API int
    966 hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
    967 {
    968   if (tsd_ftn)
    969     hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
    970   else
    971     {
    972       TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
    973       return HWCFUNCS_ERROR_UNAVAIL;
    974     }
    975   return 0;
    976 }
    977 
    978 HWCDRV_API int
    979 hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
    980 {
    981   int count = 0;
    982   if (hwc_cb && hdrv_pcl_state.get_events)
    983     count = hdrv_pcl_state.get_events (hwc_cb);
    984   if (attr_cb)
    985     for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
    986       attr_cb (perfctr_attrs_table[ii].attrname);
    987   if (!count)
    988     return -1;
    989   return 0;
    990 }
    991 
    992 HWCDRV_API int
    993 hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
    994 {
    995   return hwcdrv_assign_all_regnos (entries, numctrs);
    996 }
    997 
    998 static int
    999 internal_hwc_start (int fd)
   1000 {
   1001   int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
   1002   if (rc == -1)
   1003     {
   1004       TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
   1005 		" PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
   1006       return HWCFUNCS_ERROR_UNAVAIL;
   1007     }
   1008   TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
   1009   return 0;
   1010 }
   1011 
   1012 HWCDRV_API int
   1013 hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
   1014 {
   1015   /* set expired counters to overflow value and all others to 0 */
   1016   /* return 0: OK, counters should be restarted */
   1017   /* return non-zero: eventp not set, counters should not be restarted */
   1018   /* clear return values */
   1019   int ii;
   1020   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1021     {
   1022       eventp->ce_pic[ii] = 0;
   1023       lost_events->ce_pic[ii] = 0;
   1024     }
   1025   hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
   1026   eventp->ce_hrt = sig_ts;
   1027   lost_events->ce_hrt = sig_ts;
   1028 
   1029   /* determine source signal */
   1030   int signal_fd = -1;
   1031   switch (si->si_code)
   1032     {
   1033     case POLL_HUP: /* expected value from pcl */
   1034       /* According to Stephane Eranian:
   1035        * "expect POLL_HUP instead of POLL_IN because we are
   1036        * in one-shot mode (IOC_REFRESH)"
   1037        */
   1038       signal_fd = si->si_fd;
   1039       break;
   1040     case SI_TKILL: /* event forwarded by tkill */
   1041       /* DBX can only forward SI_TKILL when it detects POLL_HUP
   1042        * unfortunately, this means that si->si_fd has been lost...
   1043        * We need to process the buffers, but we don't know the fd!
   1044        */
   1045       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1046 		" SI_TKILL detected\n", sig_ts);
   1047       break;
   1048     default:
   1049       // "sometimes we see a POLL_IN (1) with very high event rates,"
   1050       // according to eranian(?)
   1051       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1052 		" unexpected si_code 0x%x\n", sig_ts, si->si_code);
   1053       return HWCFUNCS_ERROR_GENERIC;
   1054     }
   1055 
   1056   hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
   1057   if (!pctx)
   1058     {
   1059       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1060 		" tsd context is NULL\n", sig_ts);
   1061       return HWCFUNCS_ERROR_UNEXPECTED;
   1062     }
   1063   counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
   1064   if (!ctr_list)
   1065     {
   1066       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1067 		" ctr_list is NULL\n", sig_ts);
   1068       return HWCFUNCS_ERROR_UNEXPECTED;
   1069     }
   1070 
   1071   /* clear needs_restart flag */
   1072   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1073     ctr_list[ii].needs_restart = 0;
   1074 
   1075   /* attempt to identify the counter to read */
   1076   int signal_idx = -1;
   1077   pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
   1078   if (signal_fd != -1)
   1079     {
   1080       for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1081 	{
   1082 	  if (ctr_list[ii].fd == signal_fd)
   1083 	    {
   1084 	      signal_idx = ii;
   1085 	      break;
   1086 	    }
   1087 	}
   1088     }
   1089 
   1090   if (signal_idx < 0)
   1091     {
   1092       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1093 		" pmc not determined!\n", sig_ts);
   1094       lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
   1095       // note: bogus value may get overwritten in loop below
   1096     }
   1097 
   1098   /* capture sample(s).  In addition to signal_idx, check other counters. */
   1099   struct perf_event_header sheader;
   1100   int idx;
   1101   for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
   1102     {
   1103       int num_recs = 0;
   1104       while (1)
   1105 	{
   1106 	  /* check for samples */
   1107 	  struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
   1108 	  if (metadata == NULL)
   1109 	    break; // empty
   1110 	  if (metadata->data_tail == metadata->data_head)
   1111 	    break; // empty
   1112 
   1113 	  /* read header */
   1114 	  if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
   1115 	    break;
   1116 	  num_recs++;
   1117 
   1118 	  /* check for PERF_RECORD_SAMPLE */
   1119 	  size_t datasz = sheader.size - sizeof (struct perf_event_header);
   1120 	  if (sheader.type != PERF_RECORD_SAMPLE)
   1121 	    {
   1122 	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1123 			" unexpected recd type=%d\n",
   1124 			sig_ts, sheader.type);
   1125 	      if (skip_buf (&ctr_list[idx].buf_state, datasz))
   1126 		{
   1127 		  TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1128 			    " skip recd type=%d failed\n", sig_ts, sheader.type);
   1129 		  lost_events->ce_pic[idx] = 4; /* record a bogus value */
   1130 		  break; // failed to skip buffer??
   1131 		}
   1132 	      lost_events->ce_pic[idx] = 2; /* record a bogus value */
   1133 	      continue; // advance to next record
   1134 	    }
   1135 
   1136 	  /* type is PERF_RECORD_SAMPLE */
   1137 	  uint64_t value, lostv;
   1138 	  if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
   1139 	    {
   1140 	      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1141 			" read_sample() failed\n", sig_ts);
   1142 	      lost_events->ce_pic[idx] = 3; // record a bogus value
   1143 	      break;                        // failed to read sample data??
   1144 	    }
   1145 	  TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
   1146 		    " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
   1147 		    idx, (unsigned long long) value, (unsigned long long) lostv);
   1148 	  if (eventp->ce_pic[idx])
   1149 	    {
   1150 	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1151 			" idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
   1152 	      lost_events->ce_pic[idx] += eventp->ce_pic[idx];
   1153 	    }
   1154 	  eventp->ce_pic[idx] = value;
   1155 	  lost_events->ce_pic[idx] += lostv;
   1156 	}
   1157 
   1158       /* debug output for unexpected (but common) cases */
   1159       if (idx == signal_idx)
   1160 	{
   1161 	  if (num_recs != 1)
   1162 	    TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1163 		      " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
   1164 	}
   1165       else if (num_recs)
   1166 	TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1167 		  " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
   1168 		  sig_ts, num_recs, idx, signal_idx);
   1169 
   1170       /* trigger counter restart whenever records were found */
   1171       if (num_recs)
   1172 	{
   1173 	  /* check whether to adapt the overflow interval */
   1174 	  /* This is the Linux version.
   1175 	   * The Solaris version is in hwprofile.c collector_update_overflow_counters().
   1176 	   */
   1177 	  hrtime_t min_time = global_perf_event_def[idx].min_time;
   1178 	  if (min_time > 0 // overflow interval is adaptive
   1179 	      && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
   1180 	    {
   1181 	      /* pick a new overflow interval */
   1182 	      /* roughly doubled, but add funny numbers */
   1183 	      /* hopefully the result is prime or not a multiple of some # of ops/loop */
   1184 	      uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
   1185 #if 0
   1186 	      // On Solaris, we report the adjustment to the log file.
   1187 	      // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
   1188 	      // For now we simply don't report.
   1189 	      collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
   1190 					     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
   1191 					     ctr_list[idx].last_overflow_period, new_period);
   1192 #endif
   1193 	      /* There are a variety of ways of resetting the period on Linux.
   1194 	       * The most elegant is
   1195 	       *     ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
   1196 	       * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
   1197 	       *     > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
   1198 	       *     > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
   1199 	       *         until after the next overflow.
   1200 	       * So we're kind of stuck shutting the fd down and restarting it with the new period.
   1201 	       */
   1202 	      if (stop_one_ctr (idx, ctr_list))
   1203 		{
   1204 		  // EUGENE figure out what to do on error
   1205 		}
   1206 	      ctr_list[idx].last_overflow_period = new_period;
   1207 	      if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
   1208 		{
   1209 		  // EUGENE figure out what to do on error
   1210 		}
   1211 	    }
   1212 	  ctr_list[idx].last_overflow_time = sig_ts;
   1213 #if 0
   1214 	  ctr_list[idx].needs_restart = 1;
   1215 #else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
   1216 	  internal_hwc_start (ctr_list[idx].fd);
   1217 #endif
   1218 	}
   1219     }
   1220   return 0; // OK to restart counters
   1221 }
   1222 
   1223 HWCDRV_API int
   1224 hwcdrv_sighlr_restart (const hwc_event_t *pp)
   1225 {
   1226 #if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
   1227   hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
   1228   if (!pctx)
   1229     {
   1230       TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
   1231       return -1;
   1232     }
   1233   counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
   1234   if (!ctr_list)
   1235     {
   1236       TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
   1237       return -1;
   1238     }
   1239   int errors = 0;
   1240   for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1241     {
   1242       if (ctr_list[ii].needs_restart)
   1243 	errors |= internal_hwc_start (ctr_list[ii].fd);
   1244       ctr_list[ii].needs_restart = 0;
   1245     }
   1246   return errors;
   1247 #else
   1248   return 0;
   1249 #endif
   1250 }
   1251 
   1252 /* create counters based on hwcdef[] */
   1253 HWCDRV_API int
   1254 hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
   1255 {
   1256   if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
   1257     {
   1258       logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
   1259       return HWCFUNCS_ERROR_HWCARGS;
   1260     }
   1261   if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
   1262     {
   1263       logerr (GTXT ("Processor not supported\n"));
   1264       return HWCFUNCS_ERROR_HWCARGS;
   1265     }
   1266 
   1267   /* add counters */
   1268   for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
   1269     {
   1270       perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
   1271       memset (glb_event_def, 0, sizeof (perf_event_def_t));
   1272       unsigned int pmc_sel;
   1273       eventsel_t evntsel;
   1274       if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
   1275 				     hwcdef[idx].int_name, &evntsel, &pmc_sel))
   1276 	{
   1277 	  TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
   1278 	  return HWCFUNCS_ERROR_HWCARGS;
   1279 	}
   1280       glb_event_def->reg_num = pmc_sel;
   1281       glb_event_def->eventsel = evntsel;
   1282       glb_event_def->counter_preload = hwcdef[idx].val;
   1283       glb_event_def->min_time = hwcdef[idx].min_time;
   1284       glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
   1285       init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
   1286 		       glb_event_def->counter_preload);
   1287       TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
   1288 		"(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
   1289 		idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
   1290 		(long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
   1291 		(long long) glb_event_def->eventsel,
   1292 		(long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
   1293 		(long long) glb_event_def->hw.exclude_user,
   1294 		(long long) glb_event_def->hw.exclude_kernel);
   1295     }
   1296 
   1297   hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
   1298   return 0;
   1299 }
   1300 
   1301 HWCDRV_API int
   1302 hwcdrv_free_counters () // note: only performs shutdown for this thread
   1303 {
   1304   hdrv_pcl_ctx_t * pctx;
   1305   if (!COUNTERS_ENABLED ())
   1306     return 0;
   1307   pctx = hdrv_pcl_state.find_vpc_ctx ();
   1308   if (!pctx)
   1309     {
   1310       TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
   1311       return HWCFUNCS_ERROR_GENERIC;
   1312     }
   1313   counter_state_t *ctr_list = pctx->ctr_list;
   1314   if (!ctr_list)
   1315     {
   1316       // fork child: prolog suspends hwcs, then epilog frees them
   1317       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
   1318       return 0;
   1319     }
   1320   int hwc_rc = 0;
   1321   for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1322     if (stop_one_ctr (ii, ctr_list))
   1323       hwc_rc = HWCFUNCS_ERROR_GENERIC;
   1324   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", pctx->tid);
   1325   pctx->ctr_list = NULL;
   1326   return hwc_rc;
   1327 }
   1328 
   1329 HWCDRV_API int
   1330 hwcdrv_start (void) /* must be called from each thread ? */
   1331 {
   1332   hdrv_pcl_ctx_t *pctx = NULL;
   1333   if (!COUNTERS_ENABLED ())
   1334     {
   1335       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
   1336       return 0;
   1337     }
   1338   if (!hdrv_pcl_state.library_ok)
   1339     {
   1340       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
   1341       return HWCFUNCS_ERROR_NOT_SUPPORTED;
   1342     }
   1343 
   1344   /*
   1345    * set up per-thread context
   1346    */
   1347   pctx = hdrv_pcl_state.find_vpc_ctx ();
   1348   if (!pctx)
   1349     {
   1350       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
   1351       return HWCFUNCS_ERROR_UNEXPECTED;
   1352     }
   1353   pctx->tid = hwcdrv_gettid ();
   1354   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", pctx->tid);
   1355 
   1356   /*
   1357    * create per-thread counter list
   1358    */
   1359   counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
   1360 							  sizeof (counter_state_t));
   1361   if (!ctr_list)
   1362     {
   1363       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
   1364       return HWCFUNCS_ERROR_MEMORY;
   1365     }
   1366   int ii;
   1367   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1368     ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
   1369   pctx->ctr_list = ctr_list;
   1370 
   1371   /*
   1372    * bind the counters
   1373    */
   1374   size_t pgsz = sysconf (_SC_PAGESIZE);
   1375   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1376     {
   1377       ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
   1378       if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup;
   1379     }
   1380 
   1381   /*
   1382    * start the counters
   1383    */
   1384   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1385     {
   1386       int rc = internal_hwc_start (ctr_list[ii].fd);
   1387       if (rc < 0)
   1388 	goto hwcdrv_start_cleanup;
   1389     }
   1390   return 0;
   1391 
   1392 hwcdrv_start_cleanup:
   1393   hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
   1394   return HWCFUNCS_ERROR_UNAVAIL;
   1395 }
   1396 
   1397 HWCDRV_API int
   1398 hwcdrv_lwp_suspend (void) /* must be called from each thread */
   1399 {
   1400   if (!COUNTERS_ENABLED ())
   1401     {
   1402       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
   1403       return 0;
   1404     }
   1405   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
   1406   return hwcdrv_free_counters ();
   1407 }
   1408 
   1409 HWCDRV_API int
   1410 hwcdrv_lwp_resume (void) /* must be called from each thread */
   1411 {
   1412   if (!COUNTERS_ENABLED ())
   1413     {
   1414       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
   1415       return 0;
   1416     }
   1417   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
   1418   return hwcdrv_start ();
   1419 }
   1420 
   1421 HWCDRV_API int
   1422 hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
   1423 {
   1424   overflow_data->ce_hrt = 0;
   1425   for (int i = 0; i < MAX_PICS; i++)
   1426     {
   1427       overflow_data->ce_pic[i] = 0;
   1428       if (sampled_data)
   1429 	HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
   1430     }
   1431   return 0;
   1432 }
   1433 
   1434 /*---------------------------------------------------------------------------*/
   1435 /* HWCDRV_API: entry-point table for this perf_event driver.  The initializer is positional, so the order below must match the field order of hwcdrv_api_t (declared outside this file section).  */
   1436 
   1437 hwcdrv_api_t hwcdrv_pcl_api = {
   1438   hwcdrv_init,			/* probe perf_event support, pick the CPU back end */
   1439   hwcdrv_get_info,		/* report cpuver, name, counter count, doc reference, capabilities */
   1440   hwcdrv_enable_mt,		/* install the per-thread context lookup function */
   1441   hwcdrv_get_descriptions,	/* enumerate events and attribute names through callbacks */
   1442   hwcdrv_assign_regnos,		/* forwards to hwcdrv_assign_all_regnos() */
   1443   hwcdrv_create_counters,	/* fill global_perf_event_def[] from the requested counters */
   1444   hwcdrv_start,			/* open, bind and arm the calling thread's counters */
   1445   hwcdrv_overflow,		/* drain sample buffers when an overflow signal arrives */
   1446   hwcdrv_read_events,		/* only zeroes/resets its output arguments */
   1447   hwcdrv_sighlr_restart,	/* currently a no-op that returns 0 */
   1448   hwcdrv_lwp_suspend,		/* frees the calling thread's counters */
   1449   hwcdrv_lwp_resume,		/* re-runs hwcdrv_start() on the calling thread */
   1450   hwcdrv_free_counters,		/* stop and release the calling thread's counters */
   1451   hwcdrv_lwp_init,		/* per-thread start-up: calls hwcdrv_start() */
   1452   hwcdrv_lwp_fini,		/* per-thread tear-down: calls hwcdrv_free_counters() */
   1453     -1                      // hwcdrv_init_status; NOTE(review): -1 presumably means "not initialized yet" -- confirm against hwcdrv_api_t
   1454 };
   1455