Home | History | Annotate | Line # | Download | only in common
hwcdrv.c revision 1.1.1.3
      1 /* Copyright (C) 2021-2025 Free Software Foundation, Inc.
      2    Contributed by Oracle.
      3 
      4    This file is part of GNU Binutils.
      5 
      6    This program is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 3, or (at your option)
      9    any later version.
     10 
     11    This program is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU General Public License for more details.
     15 
     16    You should have received a copy of the GNU General Public License
     17    along with this program; if not, write to the Free Software
     18    Foundation, 51 Franklin Street - Fifth Floor, Boston,
     19    MA 02110-1301, USA.  */
     20 
     21 #include <errno.h>
     22 #include <unistd.h>
     23 #include <fcntl.h>
     24 #include <sys/mman.h>
     25 #include <sys/ioctl.h>
     26 #include <sys/syscall.h>
     27 #include <linux/perf_event.h>
     28 
     29 #include "hwcdrv.h"
     30 
     31 /*---------------------------------------------------------------------------*/
     32 /* macros */
     33 #define IS_GLOBAL /* Mark global symbols */
     34 
     35 #include "cpuid.c" /* ftns for identifying a chip */
     36 
     37 static hdrv_pcbe_api_t *pcbe_driver = NULL;
     38 static hdrv_pcbe_api_t hdrv_pcbe_core_api;
     39 static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
     40 static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
     41   &hdrv_pcbe_core_api,
     42   &hdrv_pcbe_opteron_api,
     43   NULL
     44 };
     45 #include "opteron_pcbe.c" /* CPU-specific code */
     46 #include "core_pcbe.c" /* CPU-specific code  */
     47 
     48 /*---------------------------------------------------------------------------*/
     49 static int
     50 hwcdrv_lookup_cpuver (const char * cpcN_cciname)
     51 {
     52   /* returns hwc_cpus.h ID for a given string. */
     53   libcpc2_cpu_lookup_t *plookup;
     54   static libcpc2_cpu_lookup_t cpu_table[] = {
     55     LIBCPC2_CPU_LOOKUP_LIST
     56   };
     57   if (cpcN_cciname == NULL)
     58     return CPUVER_UNDEFINED;
     59 
     60   /* search table for name */
     61   for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
     62     {
     63       int n = strlen (plookup->cpc2_cciname);
     64       if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
     65 	return plookup->cpc2_cpuver;
     66     }
     67   /* unknown, but does have a descriptive string */
     68   TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
     69 	    "could not be determined\n",
     70 	    cpcN_cciname);
     71   return CPUVER_GENERIC;
     72 }
     73 
     74 /*---------------------------------------------------------------------------*/
     75 /* utils to generate x86 register definitions on Linux */
     76 
     77 /*
     78  *  This code is structured as though we're going to initialize the
     79  *  HWC by writing the Intel MSR register directly.  That is, we
     80  *  assume the lowest 16 bits of the event number will have the event
     81  *  and that higher bits will set attributes.
     82  *
     83  *  While SPARC is different, we can nonetheless use basically the
     84  *  same "x86"-named functions:
     85  *
     86  *  - The event code will still be 16 bits.  It will still
     87  *    be in the lowest 16 bits of the event number.  Though
     88  *    perf_event_code() on SPARC will expect those bits to be
     89  *    shifted, hwcdrv_pcl.c can easily perform that shift.
     90  *
     91  *  - On SPARC we support only two attributes, "user" and "system",
     92  *    which hwcdrv_pcl.c already converts to the "exclude_user"
     93  *    and "exclude_kernel" fields expected by perf_event_open().
     94  *    "user" and "system" are stored in event bits 16 and 17.
     95  *    For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
     96  */
     97 
     98 static const attr_info_t perfctr_sparc_attrs[] = {
     99   {NTXT ("user"),   0, 0x01, 16}, //usr
    100   {NTXT ("system"), 0, 0x01, 17}, //os
    101   {NULL, 0, 0x00, 0},
    102 };
    103 static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
    104   {NTXT ("umask"),  0, 0xff, 8},
    105   {NTXT ("user"),   0, 0x01, 16}, //usr
    106   //{NTXT("nouser"),  1, 0x01, 16}, //usr (inverted)
    107   {NTXT ("system"), 0, 0x01, 17}, //os
    108   {NTXT ("edge"),   0, 0x01, 18},
    109   {NTXT ("pc"),     0, 0x01, 19},
    110   {NTXT ("inv"),    0, 0x01, 23},
    111   {NTXT ("cmask"),  0, 0xff, 24},
    112   {NULL, 0, 0x00, 0},
    113 };
    114 const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;
    115 
    116 static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
    117     // (0xff <<  0) |   /* event*/
    118     // (0xff <<  8) |   /* umask */
    119     // (0x01 << 17) |   /* os */
    120     // (0x01 << 18) |   /* edge */
    121     // (0x01 << 19) |   /* pc */
    122     (0x01 << 20) |      /* int */
    123     // (0x01 << 21) |   /* reserved */
    124     (0x01 << 22) |      /* enable */
    125     // (0x01 << 23) |   /* inv */
    126     // (0xff << 24) |   /* cmask */
    127     0;
    128 
    129 static int
    130 myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
    131 			    eventsel_t *eventsel, eventsel_t *valid_umask,
    132 			    uint_t *pmc_sel)
    133 {
    134   if (pcbe_driver && pcbe_driver->hdrv_pcbe_get_eventnum &&
    135       !pcbe_driver->hdrv_pcbe_get_eventnum (eventname, pmc, eventsel,
    136 					    valid_umask, pmc_sel))
    137     return 0;
    138 
    139   /* check for numerically-specified counters */
    140   char * endptr;
    141   uint64_t num = strtoull (eventname, &endptr, 0);
    142   if (*eventname && !*endptr)
    143     {
    144       *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
    145       *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
    146       *pmc_sel = pmc;
    147       return 0;
    148     }
    149 
    150   /* name does not specify a numeric value */
    151   *eventsel = (eventsel_t) - 1;
    152   *valid_umask = 0x0;
    153   *pmc_sel = pmc;
    154   return -1;
    155 }
    156 
    157 static int
    158 mask_shift_set (eventsel_t *presult, eventsel_t invalue,
    159 		eventsel_t mask, eventsel_t shift)
    160 {
    161   if (invalue & ~mask)
    162     return -1; /* invalue attempts to set bits outside of mask */
    163   *presult &= ~(mask << shift); /* clear all the mask bits */
    164   *presult |= (invalue << shift); /* set bits according to invalue */
    165   return 0;
    166 }
    167 
    168 static int
    169 set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
    170 		   hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
    171 {
    172   eventsel_t evntsel = *result_mask;
    173   for (int ii = 0; ii < (int) nattrs; ii++)
    174     {
    175       const char *attrname = attrs[ii].ca_name;
    176       eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
    177       const char *tmpname;
    178       int attr_found = 0;
    179       for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
    180 	{
    181 	  if (strcmp (attrname, tmpname) == 0)
    182 	    {
    183 	      if (strcmp (attrname, "umask") == 0)
    184 		{
    185 		  if (attrval & ~evnt_valid_umask)
    186 		    {
    187 		      logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
    188 			      nameOnly, (long long) evnt_valid_umask);
    189 		      return -1;
    190 		    }
    191 		}
    192 	      if (mask_shift_set (&evntsel,
    193 				  perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval,
    194 				  perfctr_attrs_table[jj].mask,
    195 				  perfctr_attrs_table[jj].shift))
    196 		{
    197 		  logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
    198 			  nameOnly, attrname, (long long) attrval);
    199 		  return -1;
    200 		}
    201 	      TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
    202 			nameOnly, attrname, (long long) attrval);
    203 	      attr_found = 1;
    204 	      break;
    205 	    }
    206 	}
    207       if (!attr_found)
    208 	{
    209 	  logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
    210 	  return -1;
    211 	}
    212     }
    213   *result_mask = evntsel;
    214   return 0;
    215 }
    216 
    217 static int
    218 hwcfuncs_get_x86_eventsel (Hwcentry *h,
    219 			   eventsel_t *return_event, uint_t *return_pmc_sel)
    220 {
    221   hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
    222   unsigned nattrs = 0;
    223   char *nameOnly = NULL;
    224   eventsel_t evntsel = h->config;
    225   eventsel_t evnt_valid_umask = 0;
    226   uint_t pmc_sel = 0;
    227   int rc = -1;
    228   *return_event = 0;
    229   *return_pmc_sel = 0;
    230   void *attr_mem = hwcfuncs_parse_attrs (h->int_name, attrs, HWCFUNCS_MAX_ATTRS,
    231 				   &nattrs, NULL);
    232   if (!attr_mem)
    233     {
    234       logerr (GTXT ("out of memory, could not parse attributes\n"));
    235       return -1;
    236     }
    237   hwcfuncs_parse_ctr (h->int_name, NULL, &nameOnly, NULL, NULL, NULL);
    238 
    239   /* look up evntsel */
    240   if (myperfctr_get_x86_eventnum (nameOnly, h->reg_num,
    241 				  &evntsel, &evnt_valid_umask, &pmc_sel))
    242     {
    243       logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
    244       goto attr_wrapup;
    245     }
    246   TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
    247 	    (long long) evntsel, pmc_sel, nameOnly, nattrs);
    248 
    249   /* determine event attributes */
    250   eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
    251   if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
    252     goto attr_wrapup;
    253   if (evntsel & evnt_attrs)
    254     TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
    255 	      (long long) evntsel, (long long) evnt_attrs,
    256 	      (long long) (evntsel & evnt_attrs));
    257   *return_event = evntsel | evnt_attrs;
    258   *return_pmc_sel = pmc_sel;
    259   rc = 0;
    260 
    261 attr_wrapup:
    262   free (attr_mem);
    263   free (nameOnly);
    264   return rc;
    265 }
    266 
/* Instruction and clobber list used by hwcdrv_gettid() when it issues the
   system call through inline assembly.  */
#if defined (__x86_64__)
# define syscall_instr          "syscall"
# define syscall_clobber        "rcx", "r11", "memory"
#endif
#if defined (__i386__)
# define syscall_instr          "int $0x80"
# define syscall_clobber        "memory"
#endif
    275 
/* Thin wrapper around the perf_event_open system call.

   The call has been seen to fail spuriously while an immediate retry
   succeeds, so up to five attempts are made; every failed attempt is traced.
   Returns the new file descriptor, or -1 if all attempts failed.  */
static inline int
perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
                 int cpu, int group_fd, unsigned long flags)
{
  int rc;
  int retry = 0;
  do
    {
      rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
      if (rc != -1)
        break;
      TprintfT (0, "perf_event_open %d: errno=%d %s\n", retry, errno, strerror(errno));
    }
  while (++retry < 5);
  return rc;
}
    294 
    295 /*---------------------------------------------------------------------------*/
    296 /* macros & fwd prototypes */
    297 
    298 #define HWCDRV_API      static /* Mark functions used by hwcdrv API */
    299 
    300 HWCDRV_API int hwcdrv_start (void);
    301 HWCDRV_API int hwcdrv_free_counters ();
    302 
/* Return the kernel thread ID of the calling thread.

   When built as part of libcollector (LIBCOLLECTOR_SRC) on "intel", the
   system call is issued directly through inline assembly; every other
   configuration goes through syscall().  */
static pid_t
hwcdrv_gettid (void)
{
#if defined(LIBCOLLECTOR_SRC) && defined(intel)
  pid_t tid;
  __asm__ __volatile__(syscall_instr
                       : "=a" (tid) : "0" (__NR_gettid)
                       : syscall_clobber);
  return tid;
#else
  /* FIXUP_XXX_SPARC_LINUX: the libcollector build on non-intel targets
     should also use an asm gettid.  */
  return syscall (__NR_gettid);
#endif
}
    318 
    319 /*---------------------------------------------------------------------------*/
    320 /* types */
    321 
/* Number of data pages used for perf_event samples in each buffer (the
   metadata page is extra).  Must be a power of 2.  */
#define NPAGES_PER_BUF  1
    324 
    325 /*---------------------------------------------------------------------------*/
    326 
    327 /* typedefs */
    328 
    329 typedef struct
    330 { // event (hwc) definition
    331   unsigned int reg_num; // PMC assignment, potentially for detecting conflicts
    332   eventsel_t eventsel;          // raw event bits (Intel/AMD)
    333   uint64_t counter_preload;     // number of HWC events before signal
    334   struct perf_event_attr hw;    // perf_event definition
    335   hrtime_t min_time;            // minimum time we're targeting between events
    336   char *name;
    337 } perf_event_def_t;
    338 static perf_event_def_t event_def_0;
    339 
    340 typedef struct
    341 { // runtime state of perf_event buffer
    342   void *buf;                    // pointer to mmapped buffer
    343   size_t pagesz;                // size of pages
    344 } buffer_state_t;
    345 
    346 typedef struct
    347 { // runtime state of counter values
    348   uint64_t prev_ena_ts;         // previous perf_event "enabled" time
    349   uint64_t prev_run_ts;         // previous perf_event "running" time
    350   uint64_t prev_value;          // previous HWC value
    351 } counter_value_state_t;
    352 
    353 typedef struct
    354 { // per-counter information
    355   perf_event_def_t *ev_def;     // global HWC definition for one counter
    356   int fd;                       // perf_event fd
    357   buffer_state_t buf_state;     // perf_event buffer's state
    358   counter_value_state_t value_state; // counter state
    359   int needs_restart;            // workaround for dbx failure to preserve si_fd
    360   uint64_t last_overflow_period;
    361   hrtime_t last_overflow_time;
    362 } counter_state_t;
    363 
    364 typedef struct
    365 { // per-thread context
    366   counter_state_t *ctr_list;
    367   int signal_fd;                // fd that caused the most recent signal
    368   pid_t tid;			// for debugging signal delivery problems
    369 } hdrv_pcl_ctx_t;
    370 
    371 /*---------------------------------------------------------------------------*/
    372 
    373 /* static variables */
    374 static struct
    375 {
    376   int library_ok;
    377   int internal_open_called;
    378   hwcfuncs_tsd_get_fn_t find_vpc_ctx;
    379   unsigned hwcdef_cnt;      /* number of *active* hardware counters */
    380 } hdrv_pcl_state;
    381 
    382 static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
    383 static perf_event_def_t global_perf_event_def[MAX_PICS];
    384 
    385 #define COUNTERS_ENABLED()      (hdrv_pcl_state.hwcdef_cnt)
    386 
    387 
    388 /* perf_event buffer formatting and handling */
    389 static void
    390 reset_buf (buffer_state_t *bufstate)
    391 {
    392   TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
    393   struct perf_event_mmap_page *metadata = bufstate->buf;
    394   if (metadata)
    395     metadata->data_tail = metadata->data_head;
    396 }
    397 
    398 static int
    399 skip_buf (buffer_state_t *bufstate, size_t sz)
    400 {
    401   TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
    402   struct perf_event_mmap_page *metadata = bufstate->buf;
    403   if (metadata == NULL)
    404     return -1;
    405   size_t pgsz = bufstate->pagesz;
    406   size_t bufsz = NPAGES_PER_BUF*pgsz;
    407   uint64_t d_tail = metadata->data_tail;
    408   uint64_t d_head = metadata->data_head;
    409 
    410   // validate request size
    411   if (sz > d_head - d_tail || sz >= bufsz)
    412     {
    413       reset_buf (bufstate);
    414       return -1;
    415     }
    416   metadata->data_tail = d_tail + sz; // advance tail
    417   return 0;
    418 }
    419 
    420 static int
    421 read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
    422 {
    423   struct perf_event_mmap_page *metadata = bufstate->buf;
    424   if (metadata == NULL)
    425     return -1;
    426   size_t pgsz = bufstate->pagesz;
    427   size_t bufsz = NPAGES_PER_BUF*pgsz;
    428   uint64_t d_tail = metadata->data_tail;
    429   uint64_t d_head = metadata->data_head;
    430 
    431   // validate request size
    432   if (sz > d_head - d_tail || sz >= bufsz)
    433     {
    434       reset_buf (bufstate);
    435       return -1;
    436     }
    437   char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
    438   uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer
    439   size_t nbytes = sz;
    440   if (start_pos + sz > bufsz)
    441     {
    442       // will wrap past end of buffer
    443       nbytes = bufsz - start_pos;
    444       memcpy (buf, buf_base + start_pos, nbytes);
    445       start_pos = 0; // wrap to start
    446       buf = (void *) (((char *) buf) + nbytes);
    447       nbytes = sz - nbytes;
    448     }
    449   memcpy (buf, buf_base + start_pos, nbytes);
    450   metadata->data_tail += sz;
    451   return 0;
    452 }
    453 
    454 static int
    455 read_u64 (buffer_state_t *bufstate, uint64_t *value)
    456 {
    457   return read_buf (bufstate, value, sizeof (uint64_t));
    458 }
    459 
    460 static int
    461 read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
    462 	     uint64_t *rlost)
    463 {
    464   // returns count of bytes read
    465   buffer_state_t *bufstate = &ctr_state->buf_state;
    466   counter_value_state_t *cntstate = &ctr_state->value_state;
    467   int readsz = 0;
    468 
    469   // PERF_SAMPLE_IP
    470   uint64_t ipc = 0;
    471   int rc = read_u64 (bufstate, &ipc);
    472   if (rc)
    473     return -1;
    474   readsz += sizeof (uint64_t);
    475 
    476   // PERF_SAMPLE_READ: value
    477   uint64_t value = 0;
    478   rc = read_u64 (bufstate, &value);
    479   if (rc)
    480     return -2;
    481   readsz += sizeof (uint64_t);
    482 
    483   /* Bug 20806896
    484    * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
    485    * running times in the sample data that correspond to the metadata times
    486    *     metadata->time_enabled
    487    *     metadata->time_running
    488    * from the PREVIOUS (not current) sample.  Probably just ignore this bug
    489    * since it's on old kernels and we only use the enabled and running times
    490    * to construct loss_estimate.
    491    */
    492   // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
    493   uint64_t enabled_time = 0;
    494   rc = read_u64 (bufstate, &enabled_time);
    495   if (rc)
    496     return -3;
    497   readsz += sizeof (uint64_t);
    498 
    499   // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
    500   uint64_t running_time = 0;
    501   rc = read_u64 (bufstate, &running_time);
    502   if (rc)
    503     return -4;
    504   readsz += sizeof (uint64_t);
    505 
    506   uint64_t value_delta = value - cntstate->prev_value;
    507   uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
    508   uint64_t running_delta = running_time - cntstate->prev_run_ts;
    509   cntstate->prev_value = value;
    510   cntstate->prev_ena_ts = enabled_time;
    511   cntstate->prev_run_ts = running_time;
    512 
    513   // 24830461 need workaround for Linux anomalous HWC skid overrun
    514   int set_error_flag = 0;
    515   if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
    516     set_error_flag = 1;
    517 
    518   uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
    519   if (running_delta == enabled_delta)
    520     {
    521       // counter was running 100% of time, no multiplexing
    522     }
    523   else if (running_delta == 0)
    524     loss_estimate = 1; // token amount to aid in debugging perfctr oddities
    525   else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
    526     {
    527       // running should be smaller than enabled, can't estimate
    528       /*
    529        * 21418391 HWC can have a negative count
    530        *
    531        * We've also seen enabled not only be smaller than running
    532        * but in fact go negative.  Guard against this.
    533        */
    534       loss_estimate = 2; // token amount to aid in debugging perfctr oddities
    535     }
    536   else
    537     {
    538       // counter was running less than 100% of time
    539       // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
    540       uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
    541       value_delta = scaled_delta;
    542 #if 0
    543       // We should perhaps warn the user that multiplexing is going on,
    544       // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
    545       // For now we simply don't report.
    546       // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
    547       // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
    548       collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
    549 				     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
    550 				     ctr_list[idx].last_overflow_period, new_period);
    551 #endif
    552     }
    553   TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
    554 	    "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
    555 	    "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
    556 	    ctr_state->ev_def->name, (long long) ipc,
    557 	    (long long) enabled_delta, (long long) running_delta,
    558 	    (long long) value_delta, (long long) value_delta,
    559 	    (unsigned long long) loss_estimate,
    560 	    loss_estimate ? ", WARNING - SCALED" : "",
    561 	    set_error_flag ? ", ERRORFLAG" : "");
    562   if (set_error_flag == 1)
    563     value_delta |= (1ULL << 63)     /* HWCVAL_ERR_FLAG */;
    564   *rvalue = value_delta;
    565   *rlost = loss_estimate;
    566   if (readsz != msgsz)
    567     {
    568       TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
    569       return -5;
    570     }
    571   return 0;
    572 }
    573 
/* Trace the contents of AT at level DBG_LT2.  The scalar fields are always
   printed; each flag is printed only when it is non-zero.  Compiles to an
   empty function unless DEBUG is defined.  */
static void
dump_perf_event_attr (struct perf_event_attr *at)
{
#ifdef DEBUG
  TprintfT (DBG_LT2, "dump_perf_event_attr:  size=%d  type=%d  sample_period=%lld\n"
            "  config=0x%llx  config1=0x%llx  config2=0x%llx  wakeup_events=%lld __reserved_1=%lld\n",
            (int) at->size, (int) at->type,
            (unsigned long long) at->sample_period,
            (unsigned long long) at->config,
            (unsigned long long) at->config1,
            (unsigned long long) at->config2,
            (unsigned long long) at->wakeup_events,
            (unsigned long long) at->__reserved_1);

  /* Print one flag, labelled with its field name, if it is set.  */
#define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, "  %-10s : %lld\n", #fld, (long long) at->fld)
  DUMP_F (disabled);
  DUMP_F (inherit);
  DUMP_F (pinned);
  DUMP_F (exclusive);
  DUMP_F (exclude_user);
  DUMP_F (exclude_kernel);
  DUMP_F (exclude_hv);
  DUMP_F (exclude_idle);
  DUMP_F (comm);
  DUMP_F (freq);
  DUMP_F (inherit_stat);
  DUMP_F (enable_on_exec);
  DUMP_F (task);
  DUMP_F (watermark);
#endif
}
    601 
    602 static void
    603 init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period,
    604 		 Hwcentry *hwce)
    605 {
    606   static struct perf_event_attr perf_event_attr_0 = {
    607     .size = sizeof (struct perf_event_attr),
    608     .disabled = 1, /* off by default */
    609     .exclude_hv = 1,
    610     .wakeup_events = 1 /* wakeup every n events */
    611   };
    612   *hw = perf_event_attr_0;
    613   if (hwce && hwce->use_perf_event_type)
    614     {
    615       hw->config = hwce->config;
    616       hw->config1 = hwce->config1;
    617       hw->type = hwce->type;
    618     }
    619   else
    620     { // backward compatibility. The old interface had no 'hwce' argument.
    621       hw->config = event;
    622       hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
    623     }
    624   hw->sample_period = period;
    625   hw->sample_type = PERF_SAMPLE_IP |
    626 	  // PERF_SAMPLE_TID		|
    627 	  // PERF_SAMPLE_TIME		| // possibly interesting
    628 	  // PERF_SAMPLE_ADDR		|
    629 	  PERF_SAMPLE_READ | // HWC value
    630 	  // PERF_SAMPLE_CALLCHAIN	| // interesting
    631 	  // PERF_SAMPLE_ID		|
    632 	  // PERF_SAMPLE_CPU		| // possibly interesting
    633 	  // PERF_SAMPLE_PERIOD		|
    634 	  // PERF_SAMPLE_STREAM_ID	|
    635 	  // PERF_SAMPLE_RAW		|
    636 	  0;
    637   hw->read_format =
    638 	  PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
    639 	  PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
    640 	  // PERF_FORMAT_ID		|
    641 	  // PERF_FORMAT_GROUP		|
    642 	  0;
    643 
    644   // Note: the following override config.priv bits!
    645   hw->exclude_user = (event & (1 << 16)) == 0;      /* don't count user */
    646   hw->exclude_kernel = (event & (1 << 17)) == 0;    /* ditto kernel */
    647   dump_perf_event_attr (hw);
    648 }
    649 
    650 static int
    651 start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
    652 {
    653   // pe_attr should have been initialized in hwcdrv_create_counters()
    654   struct perf_event_attr pe_attr;
    655   memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));
    656 
    657   // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
    658   pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;
    659 
    660   int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
    661   if (hwc_fd == -1)
    662     {
    663       TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
    664 		error_string, ii, errno);
    665       return 1;
    666     }
    667 
    668   size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
    669   void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
    670 		     PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
    671   if (buf == MAP_FAILED)
    672     {
    673       TprintfT (0, "sz = %ld, pgsz = %ld\n  err=%s idx=%d mmap failed: %s\n",
    674 		(long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
    675       return 1;
    676     }
    677   pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def?  we never seem to use it
    678   pctx->ctr_list[ii].fd = hwc_fd;
    679   pctx->ctr_list[ii].buf_state.buf = buf;
    680   pctx->ctr_list[ii].buf_state.pagesz = pgsz;
    681   pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
    682   pctx->ctr_list[ii].value_state.prev_run_ts = 0;
    683   pctx->ctr_list[ii].value_state.prev_value = 0;
    684   pctx->ctr_list[ii].last_overflow_time = gethrtime ();
    685 
    686   /* set async mode */
    687   long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
    688   int rc = fcntl (hwc_fd, F_SETFL, flags);
    689   if (rc == -1)
    690     {
    691       TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
    692       return 1;
    693     }
    694 
    695   /*
    696    * set lwp ownership of the fd
    697    * See BUGS section of "man perf_event_open":
    698    *     The F_SETOWN_EX option to fcntl(2) is needed to properly get
    699    *     overflow signals in threads.  This was introduced in Linux 2.6.32.
    700    * Legacy references:
    701    *     see http://lkml.org/lkml/2009/8/4/128
    702    *     google man fcntl F_SETOWN_EX -conflict
    703    *       "From Linux 2.6.32 onward, use F_SETOWN_EX to target
    704    *       SIGIO and SIGURG signals at a particular thread."
    705    *     http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
    706    *     See 2010 CSCADS presentation by Eranian
    707    */
    708   struct f_owner_ex fowner_ex;
    709   fowner_ex.type = F_OWNER_TID;
    710   fowner_ex.pid = pctx->tid;
    711   rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
    712   if (rc == -1)
    713     {
    714       TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
    715       return 1;
    716     }
    717 
    718   /* Use sigio so handler can determine FD via siginfo->si_fd. */
    719   rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
    720   if (rc == -1)
    721     {
    722       TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
    723       return 1;
    724     }
    725   return 0;
    726 }
    727 
    728 static int
    729 stop_one_ctr (int ii, counter_state_t *ctr_list)
    730 {
    731   int hwc_rc = 0;
    732   if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
    733     {
    734       TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
    735       hwc_rc = HWCFUNCS_ERROR_GENERIC;
    736     }
    737   void *buf = ctr_list[ii].buf_state.buf;
    738   if (buf)
    739     {
    740       size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
    741       ctr_list[ii].buf_state.buf = NULL;
    742       int tmprc = munmap (buf, bufsz);
    743       if (tmprc)
    744 	{
    745 	  TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
    746 	  hwc_rc = HWCFUNCS_ERROR_GENERIC;
    747 	}
    748     }
    749   if (-1 == close (ctr_list[ii].fd))
    750     {
    751       TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
    752       hwc_rc = HWCFUNCS_ERROR_GENERIC;
    753     }
    754   return hwc_rc;
    755 }
    756 
    757 /* HWCDRV_API for thread-specific actions */
    758 HWCDRV_API int
    759 hwcdrv_lwp_init (void)
    760 {
    761   return hwcdrv_start ();
    762 }
    763 
    764 HWCDRV_API void
    765 hwcdrv_lwp_fini (void)
    766 {
    767   hwcdrv_free_counters ();  /* also sets pctx->ctr_list=NULL; */
    768 }
    769 
    770 /* open */
    771 static int
    772 hdrv_pcl_internal_open ()
    773 {
    774   if (hdrv_pcl_state.internal_open_called)
    775     {
    776       TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
    777       return HWCFUNCS_ERROR_ALREADY_CALLED;
    778     }
    779 
    780   // determine if PCL is available
    781   perf_event_def_t tmp_event_def = event_def_0;
    782   struct perf_event_attr *pe_attr = &tmp_event_def.hw;
    783   init_perf_event (pe_attr, 0, 0, NULL);
    784   pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event
    785   pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
    786   int hwc_fd = perf_event_open (pe_attr,
    787 				0, // pid/tid, 0 is self
    788 				-1, // cpu, -1 is per-thread mode
    789 				-1, // group_fd, -1 is root
    790 				0); // flags
    791   if (hwc_fd == -1)
    792     {
    793       TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
    794 		" perf_event_open() failed, errno=%d\n", errno);
    795       goto internal_open_error;
    796     }
    797 
    798   /* see if the PCL is new enough to know about F_SETOWN_EX */
    799   struct f_owner_ex fowner_ex;
    800   fowner_ex.type = F_OWNER_TID;
    801   fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
    802   if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
    803     {
    804       TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
    805 		"F_SETOWN failed, errno=%d\n", errno);
    806       close (hwc_fd);
    807       goto internal_open_error;
    808     }
    809   close (hwc_fd);
    810 
    811   hdrv_pcl_state.internal_open_called = 1;
    812   hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
    813   hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
    814   TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
    815   for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
    816     {
    817       hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
    818       if (!ppcbe->hdrv_pcbe_init ())
    819 	{
    820 	  pcbe_driver = ppcbe;
    821 	  hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
    822 	  hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
    823 	  if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
    824 	    goto internal_open_error;
    825 	  hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
    826 	  hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
    827 	  break;
    828 	}
    829     }
    830   if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
    831     {
    832       TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
    833 		" reducing number of HWCs from %u to %u on processor '%s'\n",
    834 		hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
    835       hdrv_pcl_about.cpcN_npics = MAX_PICS;
    836     }
    837   TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
    838 	    " perf_event cpuver=%d, name='%s'\n",
    839 	    hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
    840   return 0;
    841 
    842 internal_open_error:
    843   hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
    844   hdrv_pcl_about.cpcN_npics = 0;
    845   hdrv_pcl_about.cpcN_docref = NULL;
    846   hdrv_pcl_about.cpcN_cciname = NULL;
    847   return HWCFUNCS_ERROR_NOT_SUPPORTED;
    848 }
    849 
    850 static void *
    851 single_thread_tsd_ftn ()
    852 {
    853   static hdrv_pcl_ctx_t tsd_context;
    854   return &tsd_context;
    855 }
    856 
    857 /* HWCDRV_API */
    858 HWCDRV_API int
    859 hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
    860 {
    861   hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
    862   if (tsd_sz)
    863     *tsd_sz = sizeof (hdrv_pcl_ctx_t);
    864 
    865   if (hdrv_pcl_state.internal_open_called)
    866     return HWCFUNCS_ERROR_ALREADY_CALLED;
    867   return hdrv_pcl_internal_open ();
    868 }
    869 
    870 HWCDRV_API void
    871 hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
    872 		 const char **docref, uint64_t *support)
    873 {
    874   if (cpuver)
    875     *cpuver = hdrv_pcl_about.cpcN_cpuver;
    876   if (cciname)
    877     *cciname = hdrv_pcl_about.cpcN_cciname;
    878   if (npics)
    879     *npics = hdrv_pcl_about.cpcN_npics;
    880   if (docref)
    881     *docref = hdrv_pcl_about.cpcN_docref;
    882   if (support)
    883     *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
    884 }
    885 
    886 HWCDRV_API int
    887 hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
    888 {
    889   if (tsd_ftn)
    890     hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
    891   else
    892     {
    893       TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
    894       return HWCFUNCS_ERROR_UNAVAIL;
    895     }
    896   return 0;
    897 }
    898 
    899 HWCDRV_API int
    900 hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb,
    901 			 Hwcentry *raw_hwc_tbl)
    902 {
    903   int count = 0;
    904   if (hwc_cb && pcbe_driver && pcbe_driver->hdrv_pcbe_get_events)
    905     count = pcbe_driver->hdrv_pcbe_get_events (hwc_cb, raw_hwc_tbl);
    906   if (attr_cb)
    907     for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
    908       attr_cb (perfctr_attrs_table[ii].attrname);
    909   if (!count)
    910     return -1;
    911   return 0;
    912 }
    913 
    914 HWCDRV_API int
    915 hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
    916 {
    917   return 0;
    918 }
    919 
    920 static int
    921 internal_hwc_start (int fd)
    922 {
    923   int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
    924   if (rc == -1)
    925     {
    926       TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
    927 		" PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
    928       return HWCFUNCS_ERROR_UNAVAIL;
    929     }
    930   TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
    931   return 0;
    932 }
    933 
    934 HWCDRV_API int
    935 hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
    936 {
    937   /* set expired counters to overflow value and all others to 0 */
    938   /* return 0: OK, counters should be restarted */
    939   /* return non-zero: eventp not set, counters should not be restarted */
    940   /* clear return values */
    941   int ii;
    942   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    943     {
    944       eventp->ce_pic[ii] = 0;
    945       lost_events->ce_pic[ii] = 0;
    946     }
    947   hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
    948   eventp->ce_hrt = sig_ts;
    949   lost_events->ce_hrt = sig_ts;
    950 
    951   /* determine source signal */
    952   int signal_fd = -1;
    953   switch (si->si_code)
    954     {
    955     case POLL_HUP: /* expected value from pcl */
    956       /* According to Stephane Eranian:
    957        * "expect POLL_HUP instead of POLL_IN because we are
    958        * in one-shot mode (IOC_REFRESH)"
    959        */
    960       signal_fd = si->si_fd;
    961       break;
    962     case SI_TKILL: /* event forwarded by tkill */
    963       /* DBX can only forward SI_TKILL when it detects POLL_HUP
    964        * unfortunately, this means that si->si_fd has been lost...
    965        * We need to process the buffers, but we don't know the fd!
    966        */
    967       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
    968 		" SI_TKILL detected\n", sig_ts);
    969       break;
    970     default:
    971       // "sometimes we see a POLL_IN (1) with very high event rates,"
    972       // according to eranian(?)
    973       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
    974 		" unexpected si_code 0x%x\n", sig_ts, si->si_code);
    975       return HWCFUNCS_ERROR_GENERIC;
    976     }
    977 
    978   hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
    979   if (!pctx)
    980     {
    981       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
    982 		" tsd context is NULL\n", sig_ts);
    983       return HWCFUNCS_ERROR_UNEXPECTED;
    984     }
    985   counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
    986   if (!ctr_list)
    987     {
    988       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
    989 		" ctr_list is NULL\n", sig_ts);
    990       return HWCFUNCS_ERROR_UNEXPECTED;
    991     }
    992 
    993   /* clear needs_restart flag */
    994   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    995     ctr_list[ii].needs_restart = 0;
    996 
    997   /* attempt to identify the counter to read */
    998   int signal_idx = -1;
    999   pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
   1000   if (signal_fd != -1)
   1001     {
   1002       for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1003 	{
   1004 	  if (ctr_list[ii].fd == signal_fd)
   1005 	    {
   1006 	      signal_idx = ii;
   1007 	      break;
   1008 	    }
   1009 	}
   1010     }
   1011 
   1012   if (signal_idx < 0)
   1013     {
   1014       TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1015 		" pmc not determined!\n", sig_ts);
   1016       lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
   1017       // note: bogus value may get overwritten in loop below
   1018     }
   1019 
   1020   /* capture sample(s).  In addition to signal_idx, check other counters. */
   1021   struct perf_event_header sheader;
   1022   int idx;
   1023   for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
   1024     {
   1025       int num_recs = 0;
   1026       while (1)
   1027 	{
   1028 	  /* check for samples */
   1029 	  struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
   1030 	  if (metadata == NULL)
   1031 	    break; // empty
   1032 	  if (metadata->data_tail == metadata->data_head)
   1033 	    break; // empty
   1034 
   1035 	  /* read header */
   1036 	  if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
   1037 	    break;
   1038 	  num_recs++;
   1039 
   1040 	  /* check for PERF_RECORD_SAMPLE */
   1041 	  size_t datasz = sheader.size - sizeof (struct perf_event_header);
   1042 	  if (sheader.type != PERF_RECORD_SAMPLE)
   1043 	    {
   1044 	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1045 			" unexpected recd type=%d\n",
   1046 			sig_ts, sheader.type);
   1047 	      if (skip_buf (&ctr_list[idx].buf_state, datasz))
   1048 		{
   1049 		  TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1050 			    " skip recd type=%d failed\n", sig_ts, sheader.type);
   1051 		  lost_events->ce_pic[idx] = 4; /* record a bogus value */
   1052 		  break; // failed to skip buffer??
   1053 		}
   1054 	      lost_events->ce_pic[idx] = 2; /* record a bogus value */
   1055 	      continue; // advance to next record
   1056 	    }
   1057 
   1058 	  /* type is PERF_RECORD_SAMPLE */
   1059 	  uint64_t value, lostv;
   1060 	  if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
   1061 	    {
   1062 	      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
   1063 			" read_sample() failed\n", sig_ts);
   1064 	      lost_events->ce_pic[idx] = 3; // record a bogus value
   1065 	      break;                        // failed to read sample data??
   1066 	    }
   1067 	  TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
   1068 		    " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
   1069 		    idx, (unsigned long long) value, (unsigned long long) lostv);
   1070 	  if (eventp->ce_pic[idx])
   1071 	    {
   1072 	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1073 			" idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
   1074 	      lost_events->ce_pic[idx] += eventp->ce_pic[idx];
   1075 	    }
   1076 	  eventp->ce_pic[idx] = value;
   1077 	  lost_events->ce_pic[idx] += lostv;
   1078 	}
   1079 
   1080       /* debug output for unexpected (but common) cases */
   1081       if (idx == signal_idx)
   1082 	{
   1083 	  if (num_recs != 1)
   1084 	    TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1085 		      " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
   1086 	}
   1087       else if (num_recs)
   1088 	TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
   1089 		  " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
   1090 		  sig_ts, num_recs, idx, signal_idx);
   1091 
   1092       /* trigger counter restart whenever records were found */
   1093       if (num_recs)
   1094 	{
   1095 	  /* check whether to adapt the overflow interval */
   1096 	  /* This is the Linux version.
   1097 	   * The Solaris version is in hwprofile.c collector_update_overflow_counters().
   1098 	   */
   1099 	  hrtime_t min_time = global_perf_event_def[idx].min_time;
   1100 	  if (min_time > 0 // overflow interval is adaptive
   1101 	      && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
   1102 	    {
   1103 	      /* pick a new overflow interval */
   1104 	      /* roughly doubled, but add funny numbers */
   1105 	      /* hopefully the result is prime or not a multiple of some # of ops/loop */
   1106 	      uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
   1107 #if 0
   1108 	      // On Solaris, we report the adjustment to the log file.
   1109 	      // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
   1110 	      // For now we simply don't report.
   1111 	      collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
   1112 					     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
   1113 					     ctr_list[idx].last_overflow_period, new_period);
   1114 #endif
   1115 	      /* There are a variety of ways of resetting the period on Linux.
   1116 	       * The most elegant is
   1117 	       *     ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
   1118 	       * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
   1119 	       *     > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
   1120 	       *     > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
   1121 	       *         until after the next overflow.
   1122 	       * So we're kind of stuck shutting the fd down and restarting it with the new period.
   1123 	       */
   1124 	      if (stop_one_ctr (idx, ctr_list))
   1125 		{
   1126 		  // EUGENE figure out what to do on error
   1127 		}
   1128 	      ctr_list[idx].last_overflow_period = new_period;
   1129 	      if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
   1130 		{
   1131 		  // EUGENE figure out what to do on error
   1132 		}
   1133 	    }
   1134 	  ctr_list[idx].last_overflow_time = sig_ts;
   1135 #if 0
   1136 	  ctr_list[idx].needs_restart = 1;
   1137 #else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
   1138 	  internal_hwc_start (ctr_list[idx].fd);
   1139 #endif
   1140 	}
   1141     }
   1142   return 0; // OK to restart counters
   1143 }
   1144 
   1145 HWCDRV_API int
   1146 hwcdrv_sighlr_restart (const hwc_event_t *pp)
   1147 {
   1148 #if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
   1149   hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
   1150   if (!pctx)
   1151     {
   1152       TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
   1153       return -1;
   1154     }
   1155   counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
   1156   if (!ctr_list)
   1157     {
   1158       TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
   1159       return -1;
   1160     }
   1161   int errors = 0;
   1162   for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1163     {
   1164       if (ctr_list[ii].needs_restart)
   1165 	errors |= internal_hwc_start (ctr_list[ii].fd);
   1166       ctr_list[ii].needs_restart = 0;
   1167     }
   1168   return errors;
   1169 #else
   1170   return 0;
   1171 #endif
   1172 }
   1173 
   1174 /* create counters based on hwcdef[] */
   1175 HWCDRV_API int
   1176 hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
   1177 {
   1178   if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
   1179     {
   1180       logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
   1181       return HWCFUNCS_ERROR_HWCARGS;
   1182     }
   1183   if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
   1184     {
   1185       logerr (GTXT ("Processor not supported\n"));
   1186       return HWCFUNCS_ERROR_HWCARGS;
   1187     }
   1188 
   1189   /* add counters */
   1190   for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
   1191     {
   1192       perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
   1193       *glb_event_def = event_def_0;
   1194       unsigned int pmc_sel;
   1195       eventsel_t evntsel;
   1196       if (hwcfuncs_get_x86_eventsel (hwcdef + idx, &evntsel, &pmc_sel))
   1197 	{
   1198 	  TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
   1199 	  return HWCFUNCS_ERROR_HWCARGS;
   1200 	}
   1201       glb_event_def->reg_num = pmc_sel;
   1202       glb_event_def->eventsel = evntsel;
   1203       glb_event_def->counter_preload = hwcdef[idx].val;
   1204       glb_event_def->min_time = hwcdef[idx].min_time;
   1205       glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
   1206       init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
   1207 		       glb_event_def->counter_preload, hwcdef + idx);
   1208       TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
   1209 		"(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
   1210 		idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
   1211 		(long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
   1212 		(long long) glb_event_def->eventsel,
   1213 		(long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
   1214 		(long long) glb_event_def->hw.exclude_user,
   1215 		(long long) glb_event_def->hw.exclude_kernel);
   1216     }
   1217 
   1218   hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
   1219   return 0;
   1220 }
   1221 
   1222 HWCDRV_API int
   1223 hwcdrv_free_counters () // note: only performs shutdown for this thread
   1224 {
   1225   hdrv_pcl_ctx_t * pctx;
   1226   if (!COUNTERS_ENABLED ())
   1227     return 0;
   1228   pctx = hdrv_pcl_state.find_vpc_ctx ();
   1229   if (!pctx)
   1230     {
   1231       TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
   1232       return HWCFUNCS_ERROR_GENERIC;
   1233     }
   1234   counter_state_t *ctr_list = pctx->ctr_list;
   1235   if (!ctr_list)
   1236     {
   1237       // fork child: prolog suspends hwcs, then epilog frees them
   1238       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
   1239       return 0;
   1240     }
   1241   int hwc_rc = 0;
   1242   for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1243     if (stop_one_ctr (ii, ctr_list))
   1244       hwc_rc = HWCFUNCS_ERROR_GENERIC;
   1245   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", (long) pctx->tid);
   1246   pctx->ctr_list = NULL;
   1247   return hwc_rc;
   1248 }
   1249 
   1250 HWCDRV_API int
   1251 hwcdrv_start (void) /* must be called from each thread ? */
   1252 {
   1253   hdrv_pcl_ctx_t *pctx = NULL;
   1254   if (!COUNTERS_ENABLED ())
   1255     {
   1256       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
   1257       return 0;
   1258     }
   1259   if (!hdrv_pcl_state.library_ok)
   1260     {
   1261       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
   1262       return HWCFUNCS_ERROR_NOT_SUPPORTED;
   1263     }
   1264 
   1265   /*
   1266    * set up per-thread context
   1267    */
   1268   pctx = hdrv_pcl_state.find_vpc_ctx ();
   1269   if (!pctx)
   1270     {
   1271       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
   1272       return HWCFUNCS_ERROR_UNEXPECTED;
   1273     }
   1274   pctx->tid = hwcdrv_gettid ();
   1275   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", (long) pctx->tid);
   1276 
   1277   /*
   1278    * create per-thread counter list
   1279    */
   1280   counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
   1281 							  sizeof (counter_state_t));
   1282   if (!ctr_list)
   1283     {
   1284       TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
   1285       return HWCFUNCS_ERROR_MEMORY;
   1286     }
   1287   int ii;
   1288   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1289     ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
   1290   pctx->ctr_list = ctr_list;
   1291 
   1292   /*
   1293    * bind the counters
   1294    */
   1295   size_t pgsz = sysconf (_SC_PAGESIZE);
   1296   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1297     {
   1298       ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
   1299       if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup;
   1300     }
   1301 
   1302   /*
   1303    * start the counters
   1304    */
   1305   for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
   1306     {
   1307       int rc = internal_hwc_start (ctr_list[ii].fd);
   1308       if (rc < 0)
   1309 	goto hwcdrv_start_cleanup;
   1310     }
   1311   return 0;
   1312 
   1313 hwcdrv_start_cleanup:
   1314   hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
   1315   return HWCFUNCS_ERROR_UNAVAIL;
   1316 }
   1317 
   1318 HWCDRV_API int
   1319 hwcdrv_lwp_suspend (void) /* must be called from each thread */
   1320 {
   1321   if (!COUNTERS_ENABLED ())
   1322     {
   1323       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
   1324       return 0;
   1325     }
   1326   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
   1327   return hwcdrv_free_counters ();
   1328 }
   1329 
   1330 HWCDRV_API int
   1331 hwcdrv_lwp_resume (void) /* must be called from each thread */
   1332 {
   1333   if (!COUNTERS_ENABLED ())
   1334     {
   1335       TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
   1336       return 0;
   1337     }
   1338   TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
   1339   return hwcdrv_start ();
   1340 }
   1341 
   1342 HWCDRV_API int
   1343 hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
   1344 {
   1345   overflow_data->ce_hrt = 0;
   1346   for (int i = 0; i < MAX_PICS; i++)
   1347     {
   1348       overflow_data->ce_pic[i] = 0;
   1349       if (sampled_data)
   1350 	HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
   1351     }
   1352   return 0;
   1353 }
   1354 
   1355 /*---------------------------------------------------------------------------*/
   1356 /* HWCDRV_API */
   1357 
   1358 hwcdrv_api_t hwcdrv_pcl_api = {
   1359   hwcdrv_init,
   1360   hwcdrv_get_info,
   1361   hwcdrv_enable_mt,
   1362   hwcdrv_get_descriptions,
   1363   hwcdrv_assign_regnos,
   1364   hwcdrv_create_counters,
   1365   hwcdrv_start,
   1366   hwcdrv_overflow,
   1367   hwcdrv_read_events,
   1368   hwcdrv_sighlr_restart,
   1369   hwcdrv_lwp_suspend,
   1370   hwcdrv_lwp_resume,
   1371   hwcdrv_free_counters,
   1372   hwcdrv_lwp_init,
   1373   hwcdrv_lwp_fini,
   1374     -1                      // hwcdrv_init_status
   1375 };
   1376