Home | History | Annotate | Line # | Download | only in tprof
tprof.c revision 1.21.2.1
      1 /*	$NetBSD: tprof.c,v 1.21.2.1 2022/12/23 08:09:48 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.21.2.1 2022/12/23 08:09:48 martin Exp $");
     31 
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/kernel.h>
     35 
     36 #include <sys/callout.h>
     37 #include <sys/conf.h>
     38 #include <sys/cpu.h>
     39 #include <sys/kmem.h>
     40 #include <sys/module.h>
     41 #include <sys/percpu.h>
     42 #include <sys/poll.h>
     43 #include <sys/proc.h>
     44 #include <sys/queue.h>
     45 #include <sys/select.h>
     46 #include <sys/workqueue.h>
     47 #include <sys/xcall.h>
     48 
     49 #include <dev/tprof/tprof.h>
     50 #include <dev/tprof/tprof_ioctl.h>
     51 
     52 #include "ioconf.h"
     53 
     54 #ifndef TPROF_HZ
     55 #define TPROF_HZ	10000
     56 #endif
     57 
     58 /*
     59  * locking order:
     60  *	tprof_reader_lock -> tprof_lock
     61  *	tprof_startstop_lock -> tprof_lock
     62  */
     63 
     64 /*
     65  * protected by:
     66  *	L: tprof_lock
     67  *	R: tprof_reader_lock
     68  *	S: tprof_startstop_lock
     69  *	s: writer should hold tprof_startstop_lock and tprof_lock
     70  *	   reader should hold tprof_startstop_lock or tprof_lock
     71  */
     72 
/*
 * A buffer of samples.  b_data is a flexible array with room for b_size
 * samples, of which the first b_used are valid.
 */
typedef struct tprof_buf {
	u_int b_used;		/* # of samples stored in b_data */
	u_int b_size;		/* capacity of b_data, in samples */
	u_int b_overflow;	/* # of samples dropped because b_data was full */
	u_int b_unused;		/* not referenced by the code visible here */
	STAILQ_ENTRY(tprof_buf) b_list;	/* linkage on tprof_list */
	tprof_sample_t b_data[];
} tprof_buf_t;
/* size in bytes of a tprof_buf_t that can hold sz samples */
#define	TPROF_BUF_BYTESIZE(sz) \
	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
/* value tprof_start() assigns to tprof_samples_per_buf */
#define	TPROF_MAX_SAMPLES_PER_BUF	TPROF_HZ
     84 
/*
 * Per-CPU profiler state, reached through the tprof_cpus percpu object.
 */
typedef struct {
	tprof_buf_t *c_buf;	/* buffer tprof_sample() currently fills */
	uint32_t c_cpuid;	/* copied into s_cpuid of every sample */
	struct work c_work;	/* work item that runs tprof_worker() */
	callout_t c_callout;	/* timer that runs tprof_kick() */
} __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
     91 
/*
 * A registered profiling backend.
 */
typedef struct tprof_backend {
	/*
	 * The interrupt handler needs a tprof_backend_softc_t as its
	 * argument, but passing one is difficult to implement on
	 * armv7/v8.  Therefore tprof_backend itself is exposed, and the
	 * softc must be placed at the beginning of struct tprof_backend.
	 */
	tprof_backend_softc_t tb_softc;

	const char *tb_name;			/* name given at registration */
	const tprof_backend_ops_t *tb_ops;	/* backend entry points */
	LIST_ENTRY(tprof_backend) tb_list;	/* linkage on tprof_backends */
} tprof_backend_t;
    105 
static kmutex_t tprof_lock;
static u_int tprof_nworker;		/* L: # of running worker LWPs */
static lwp_t *tprof_owner;		/* L: LWP that opened the device, or NULL */
static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
static u_int tprof_nbuf_on_list;	/* L: # of buffers on tprof_list */
static struct workqueue *tprof_wq;	/* created by tprof_start(), destroyed by tprof_stop1() */
static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
static u_int tprof_samples_per_buf;	/* buffer capacity; set by tprof_start() */
static u_int tprof_max_buf;		/* limit for tprof_nbuf_on_list; set by tprof_start() */

/* the backend in use; not static, see the comment in struct tprof_backend */
tprof_backend_t *tprof_backend;	/* S: */
static LIST_HEAD(, tprof_backend) tprof_backends =
    LIST_HEAD_INITIALIZER(tprof_backend); /* S: */

static kmutex_t tprof_reader_lock;
static kcondvar_t tprof_reader_cv;	/* L: */
/* bytes of the first buffer on tprof_list already consumed by tprof_read() */
static off_t tprof_reader_offset;	/* R: */

static kmutex_t tprof_startstop_lock;
/* broadcast by tprof_worker() when a worker stops; tprof_stop() waits on it */
static kcondvar_t tprof_cv;		/* L: */
static struct selinfo tprof_selp;	/* L: */

static struct tprof_stat tprof_stat;	/* L: */
    129 
    130 static tprof_cpu_t *
    131 tprof_cpu_direct(struct cpu_info *ci)
    132 {
    133 	tprof_cpu_t **cp;
    134 
    135 	cp = percpu_getptr_remote(tprof_cpus, ci);
    136 	return *cp;
    137 }
    138 
    139 static tprof_cpu_t *
    140 tprof_cpu(struct cpu_info *ci)
    141 {
    142 	tprof_cpu_t *c;
    143 
    144 	/*
    145 	 * As long as xcalls are blocked -- e.g., by kpreempt_disable
    146 	 * -- the percpu object will not be swapped and destroyed.  We
    147 	 * can't write to it, because the data may have already been
    148 	 * moved to a new buffer, but we can safely read from it.
    149 	 */
    150 	kpreempt_disable();
    151 	c = tprof_cpu_direct(ci);
    152 	kpreempt_enable();
    153 
    154 	return c;
    155 }
    156 
    157 static tprof_cpu_t *
    158 tprof_curcpu(void)
    159 {
    160 
    161 	return tprof_cpu(curcpu());
    162 }
    163 
    164 static tprof_buf_t *
    165 tprof_buf_alloc(void)
    166 {
    167 	tprof_buf_t *new;
    168 	u_int size = tprof_samples_per_buf;
    169 
    170 	new = kmem_alloc(TPROF_BUF_BYTESIZE(size), KM_SLEEP);
    171 	new->b_used = 0;
    172 	new->b_size = size;
    173 	new->b_overflow = 0;
    174 	return new;
    175 }
    176 
    177 static void
    178 tprof_buf_free(tprof_buf_t *buf)
    179 {
    180 
    181 	kmem_free(buf, TPROF_BUF_BYTESIZE(buf->b_size));
    182 }
    183 
    184 static tprof_buf_t *
    185 tprof_buf_switch(tprof_cpu_t *c, tprof_buf_t *new)
    186 {
    187 	tprof_buf_t *old;
    188 
    189 	old = c->c_buf;
    190 	c->c_buf = new;
    191 	return old;
    192 }
    193 
    194 static tprof_buf_t *
    195 tprof_buf_refresh(void)
    196 {
    197 	tprof_cpu_t * const c = tprof_curcpu();
    198 	tprof_buf_t *new;
    199 
    200 	new = tprof_buf_alloc();
    201 	return tprof_buf_switch(c, new);
    202 }
    203 
/*
 * tprof_worker: workqueue handler, run for the current CPU.
 *
 * Replaces the CPU's sample buffer with a fresh one and passes the old
 * buffer on to readers through tprof_list.  An empty buffer, or one that
 * would exceed tprof_max_buf buffers on the list, is freed instead and
 * only accounted for in tprof_stat.
 *
 * If there is no backend or no counter is running any more, the worker
 * counts itself out of tprof_nworker and wakes up waiters; otherwise it
 * re-arms the CPU's callout to fire after hz / 8 ticks.
 */
static void
tprof_worker(struct work *wk, void *dummy)
{
	tprof_cpu_t * const c = tprof_curcpu();
	tprof_buf_t *buf;
	tprof_backend_t *tb;
	bool shouldstop;

	KASSERT(wk == &c->c_work);
	KASSERT(dummy == NULL);

	/*
	 * get a per cpu buffer.
	 */
	buf = tprof_buf_refresh();

	/*
	 * and put it on the global list for read(2).
	 */
	mutex_enter(&tprof_lock);
	tb = tprof_backend;
	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
	if (shouldstop) {
		KASSERT(tprof_nworker > 0);
		tprof_nworker--;
		/* tprof_stop() waits on tprof_cv, tprof_read() on tprof_reader_cv */
		cv_broadcast(&tprof_cv);
		cv_broadcast(&tprof_reader_cv);
	}
	if (buf->b_used == 0) {
		tprof_stat.ts_emptybuf++;
	} else if (tprof_nbuf_on_list < tprof_max_buf) {
		tprof_stat.ts_sample += buf->b_used;
		tprof_stat.ts_overflow += buf->b_overflow;
		tprof_stat.ts_buf++;
		STAILQ_INSERT_TAIL(&tprof_list, buf, b_list);
		tprof_nbuf_on_list++;
		/* the list owns the buffer now; do not free it below */
		buf = NULL;
		selnotify(&tprof_selp, 0, NOTE_SUBMIT);
		cv_broadcast(&tprof_reader_cv);
	} else {
		/* too many unread buffers; drop this one */
		tprof_stat.ts_dropbuf_sample += buf->b_used;
		tprof_stat.ts_dropbuf++;
	}
	mutex_exit(&tprof_lock);
	if (buf) {
		tprof_buf_free(buf);
	}
	if (!shouldstop) {
		callout_schedule(&c->c_callout, hz / 8);
	}
}
    255 
    256 static void
    257 tprof_kick(void *vp)
    258 {
    259 	struct cpu_info * const ci = vp;
    260 	tprof_cpu_t * const c = tprof_cpu(ci);
    261 
    262 	workqueue_enqueue(tprof_wq, &c->c_work, ci);
    263 }
    264 
    265 static void
    266 tprof_stop1(void)
    267 {
    268 	CPU_INFO_ITERATOR cii;
    269 	struct cpu_info *ci;
    270 
    271 	KASSERT(mutex_owned(&tprof_startstop_lock));
    272 	KASSERT(tprof_nworker == 0);
    273 
    274 	for (CPU_INFO_FOREACH(cii, ci)) {
    275 		tprof_cpu_t * const c = tprof_cpu(ci);
    276 		tprof_buf_t *old;
    277 
    278 		old = tprof_buf_switch(c, NULL);
    279 		if (old != NULL) {
    280 			tprof_buf_free(old);
    281 		}
    282 		callout_destroy(&c->c_callout);
    283 	}
    284 	workqueue_destroy(tprof_wq);
    285 }
    286 
    287 static void
    288 tprof_getinfo(struct tprof_info *info)
    289 {
    290 	tprof_backend_t *tb;
    291 
    292 	KASSERT(mutex_owned(&tprof_startstop_lock));
    293 
    294 	memset(info, 0, sizeof(*info));
    295 	info->ti_version = TPROF_VERSION;
    296 	if ((tb = tprof_backend) != NULL) {
    297 		info->ti_ident = tb->tb_ops->tbo_ident();
    298 	}
    299 }
    300 
    301 static int
    302 tprof_getncounters(u_int *ncounters)
    303 {
    304 	tprof_backend_t *tb;
    305 
    306 	tb = tprof_backend;
    307 	if (tb == NULL)
    308 		return ENOENT;
    309 
    310 	*ncounters = tb->tb_ops->tbo_ncounters();
    311 	return 0;
    312 }
    313 
    314 static void
    315 tprof_start_cpu(void *arg1, void *arg2)
    316 {
    317 	tprof_backend_t *tb = arg1;
    318 	tprof_countermask_t runmask = (uintptr_t)arg2;
    319 
    320 	tb->tb_ops->tbo_start(runmask);
    321 }
    322 
    323 static void
    324 tprof_stop_cpu(void *arg1, void *arg2)
    325 {
    326 	tprof_backend_t *tb = arg1;
    327 	tprof_countermask_t stopmask = (uintptr_t)arg2;
    328 
    329 	tb->tb_ops->tbo_stop(stopmask);
    330 }
    331 
/*
 * tprof_start: start the counters selected by runmask.
 *
 * Counters that are already running or that have not been configured
 * are silently dropped from runmask.  If no counter was running before
 * ("first run"), this also establishes the backend (when it provides
 * tbo_establish), creates the worker workqueue and gives every CPU a
 * fresh sample buffer and a callout; once the counters have been
 * started on all CPUs, the first worker is enqueued on every CPU.
 *
 * The caller holds tprof_startstop_lock.
 *
 * Returns 0 on success (including when there was nothing to start),
 * ENOENT if no backend is registered, or the error returned by
 * tbo_establish() or workqueue_create().
 */
static int
tprof_start(tprof_countermask_t runmask)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	tprof_backend_t *tb;
	uint64_t xc;
	int error;
	bool firstrun;

	KASSERT(mutex_owned(&tprof_startstop_lock));

	tb = tprof_backend;
	if (tb == NULL) {
		error = ENOENT;
		goto done;
	}

	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
	runmask &= tb->tb_softc.sc_ctr_configured_mask;
	if (runmask == 0) {
		/*
		 * targets are already running.
		 * unconfigured counters are ignored.
		 */
		error = 0;
		goto done;
	}

	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
	if (firstrun) {
		if (tb->tb_ops->tbo_establish != NULL) {
			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
			if (error != 0)
				goto done;
		}

		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
		tprof_max_buf = ncpu * 3;
		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
		if (error != 0) {
			/* undo tbo_establish() */
			if (tb->tb_ops->tbo_disestablish != NULL)
				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
			goto done;
		}

		for (CPU_INFO_FOREACH(cii, ci)) {
			tprof_cpu_t * const c = tprof_cpu(ci);
			tprof_buf_t *new;
			tprof_buf_t *old;

			new = tprof_buf_alloc();
			old = tprof_buf_switch(c, new);
			if (old != NULL) {
				tprof_buf_free(old);
			}
			callout_init(&c->c_callout, CALLOUT_MPSAFE);
			callout_setfunc(&c->c_callout, tprof_kick, ci);
		}
	}

	/* start the counters on every CPU and wait for completion */
	runmask &= tb->tb_softc.sc_ctr_configured_mask;
	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
	xc_wait(xc);
	mutex_enter(&tprof_lock);
	tb->tb_softc.sc_ctr_running_mask |= runmask;
	mutex_exit(&tprof_lock);

	if (firstrun) {
		/* account for and launch one worker per CPU */
		for (CPU_INFO_FOREACH(cii, ci)) {
			tprof_cpu_t * const c = tprof_cpu(ci);

			mutex_enter(&tprof_lock);
			tprof_nworker++;
			mutex_exit(&tprof_lock);
			workqueue_enqueue(tprof_wq, &c->c_work, ci);
		}
	}
	error = 0;

done:
	return error;
}
    416 
/*
 * tprof_stop: stop the counters selected by stopmask.
 *
 * Counters that are not running are ignored; without a registered
 * backend this is a no-op.  When the last running counter has been
 * stopped, wait until every worker has counted itself out of
 * tprof_nworker, then release the per-CPU resources with tprof_stop1()
 * and disestablish the backend (when it provides tbo_disestablish).
 *
 * The caller holds tprof_startstop_lock.
 */
static void
tprof_stop(tprof_countermask_t stopmask)
{
	tprof_backend_t *tb;
	uint64_t xc;

	tb = tprof_backend;
	if (tb == NULL)
		return;

	KASSERT(mutex_owned(&tprof_startstop_lock));
	stopmask &= tb->tb_softc.sc_ctr_running_mask;
	if (stopmask == 0) {
		/* targets are not running */
		goto done;
	}

	/* stop the counters on every CPU and wait for completion */
	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
	xc_wait(xc);
	mutex_enter(&tprof_lock);
	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
	mutex_exit(&tprof_lock);

	/* all counters have stopped? */
	if (tb->tb_softc.sc_ctr_running_mask == 0) {
		mutex_enter(&tprof_lock);
		cv_broadcast(&tprof_reader_cv);
		/* tprof_worker() broadcasts tprof_cv when it decrements */
		while (tprof_nworker > 0) {
			cv_wait(&tprof_cv, &tprof_lock);
		}
		mutex_exit(&tprof_lock);

		tprof_stop1();
		if (tb->tb_ops->tbo_disestablish != NULL)
			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
	}
done:
	;
}
    456 
    457 static void
    458 tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
    459 {
    460 	uint64_t *counters_offset = vp;
    461 	u_int counter = (uintptr_t)vp2;
    462 
    463 	tprof_backend_t *tb = tprof_backend;
    464 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
    465 	counters_offset[counter] = param->p_value;
    466 }
    467 
    468 static void
    469 tprof_configure_event_cpu(void *arg1, void *arg2)
    470 {
    471 	tprof_backend_t *tb = arg1;
    472 	u_int counter = (uintptr_t)arg2;
    473 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
    474 
    475 	tb->tb_ops->tbo_configure_event(counter, param);
    476 }
    477 
    478 static int
    479 tprof_configure_event(const tprof_param_t *param)
    480 {
    481 	tprof_backend_t *tb;
    482 	tprof_backend_softc_t *sc;
    483 	tprof_param_t *sc_param;
    484 	uint64_t xc;
    485 	int c, error;
    486 
    487 	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
    488 		error = EINVAL;
    489 		goto done;
    490 	}
    491 
    492 	tb = tprof_backend;
    493 	if (tb == NULL) {
    494 		error = ENOENT;
    495 		goto done;
    496 	}
    497 	sc = &tb->tb_softc;
    498 
    499 	c = param->p_counter;
    500 	if (c >= tb->tb_softc.sc_ncounters) {
    501 		error = EINVAL;
    502 		goto done;
    503 	}
    504 
    505 	if (tb->tb_ops->tbo_valid_event != NULL) {
    506 		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
    507 		if (error != 0)
    508 			goto done;
    509 	}
    510 
    511 	/* if already running, stop the counter */
    512 	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
    513 		tprof_stop(__BIT(c));
    514 
    515 	sc->sc_count[c].ctr_bitwidth =
    516 	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
    517 
    518 	sc_param = &sc->sc_count[c].ctr_param;
    519 	memcpy(sc_param, param, sizeof(*sc_param));	/* save copy of param */
    520 
    521 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
    522 		uint64_t freq, inum, dnum;
    523 
    524 		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
    525 		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
    526 		if (sc->sc_count[c].ctr_counter_val == 0) {
    527 			printf("%s: counter#%d frequency (%"PRIu64") is"
    528 			    " very low relative to TPROF_HZ (%u)\n", __func__,
    529 			    c, freq, TPROF_HZ);
    530 			sc->sc_count[c].ctr_counter_val =
    531 			    4000000000ULL / TPROF_HZ;
    532 		}
    533 
    534 		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
    535 		case TPROF_PARAM_VALUE2_SCALE:
    536 			if (sc_param->p_value2 == 0)
    537 				break;
    538 			/*
    539 			 * p_value2 is 64-bit fixed-point
    540 			 * upper 32 bits are the integer part
    541 			 * lower 32 bits are the decimal part
    542 			 */
    543 			inum = sc_param->p_value2 >> 32;
    544 			dnum = sc_param->p_value2 & __BITS(31, 0);
    545 			sc->sc_count[c].ctr_counter_val =
    546 			    sc->sc_count[c].ctr_counter_val * inum +
    547 			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
    548 			if (sc->sc_count[c].ctr_counter_val == 0)
    549 				sc->sc_count[c].ctr_counter_val = 1;
    550 			break;
    551 		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
    552 			if (sc_param->p_value2 == 0)
    553 				sc_param->p_value2 = 1;
    554 			if (sc_param->p_value2 >
    555 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
    556 				sc_param->p_value2 =
    557 				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
    558 			}
    559 			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
    560 			break;
    561 		default:
    562 			break;
    563 		}
    564 		sc->sc_count[c].ctr_counter_reset_val =
    565 		    -sc->sc_count[c].ctr_counter_val;
    566 		sc->sc_count[c].ctr_counter_reset_val &=
    567 		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
    568 	} else {
    569 		sc->sc_count[c].ctr_counter_val = 0;
    570 		sc->sc_count[c].ctr_counter_reset_val = 0;
    571 	}
    572 
    573 	/* At this point, p_value is used as an initial value */
    574 	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
    575 	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
    576 	/* On the backend side, p_value is used as the reset value */
    577 	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
    578 
    579 	xc = xc_broadcast(0, tprof_configure_event_cpu,
    580 	    tb, (void *)(uintptr_t)c);
    581 	xc_wait(xc);
    582 
    583 	mutex_enter(&tprof_lock);
    584 	/* update counters bitmasks */
    585 	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
    586 	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
    587 	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    588 	/* profiled counter requires overflow handling */
    589 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
    590 		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
    591 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    592 	}
    593 	/* counters with less than 64bits also require overflow handling */
    594 	if (sc->sc_count[c].ctr_bitwidth != 64)
    595 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    596 	mutex_exit(&tprof_lock);
    597 
    598 	error = 0;
    599 
    600  done:
    601 	return error;
    602 }
    603 
    604 static void
    605 tprof_getcounts_cpu(void *arg1, void *arg2)
    606 {
    607 	tprof_backend_t *tb = arg1;
    608 	tprof_backend_softc_t *sc = &tb->tb_softc;
    609 	uint64_t *counters = arg2;
    610 	uint64_t *counters_offset;
    611 	unsigned int c;
    612 
    613 	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
    614 	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
    615 	for (c = 0; c < sc->sc_ncounters; c++) {
    616 		if (ISSET(configmask, __BIT(c))) {
    617 			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
    618 			counters[c] = counters_offset[c] +
    619 			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
    620 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
    621 		} else {
    622 			counters[c] = 0;
    623 		}
    624 	}
    625 	percpu_putref(sc->sc_ctr_offset_percpu);
    626 }
    627 
    628 static int
    629 tprof_getcounts(tprof_counts_t *counts)
    630 {
    631 	struct cpu_info *ci;
    632 	tprof_backend_t *tb;
    633 	uint64_t xc;
    634 
    635 	tb = tprof_backend;
    636 	if (tb == NULL)
    637 		return ENOENT;
    638 
    639 	if (counts->c_cpu >= ncpu)
    640 		return ESRCH;
    641 	ci = cpu_lookup(counts->c_cpu);
    642 	if (ci == NULL)
    643 		return ESRCH;
    644 
    645 	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
    646 	xc_wait(xc);
    647 
    648 	counts->c_ncounters = tb->tb_softc.sc_ncounters;
    649 	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
    650 	return 0;
    651 }
    652 
    653 /*
    654  * tprof_clear: drain unread samples.
    655  */
    656 
    657 static void
    658 tprof_clear(void)
    659 {
    660 	tprof_buf_t *buf;
    661 
    662 	mutex_enter(&tprof_reader_lock);
    663 	mutex_enter(&tprof_lock);
    664 	while ((buf = STAILQ_FIRST(&tprof_list)) != NULL) {
    665 		if (buf != NULL) {
    666 			STAILQ_REMOVE_HEAD(&tprof_list, b_list);
    667 			KASSERT(tprof_nbuf_on_list > 0);
    668 			tprof_nbuf_on_list--;
    669 			mutex_exit(&tprof_lock);
    670 			tprof_buf_free(buf);
    671 			mutex_enter(&tprof_lock);
    672 		}
    673 	}
    674 	KASSERT(tprof_nbuf_on_list == 0);
    675 	mutex_exit(&tprof_lock);
    676 	tprof_reader_offset = 0;
    677 	mutex_exit(&tprof_reader_lock);
    678 
    679 	memset(&tprof_stat, 0, sizeof(tprof_stat));
    680 }
    681 
    682 static tprof_backend_t *
    683 tprof_backend_lookup(const char *name)
    684 {
    685 	tprof_backend_t *tb;
    686 
    687 	KASSERT(mutex_owned(&tprof_startstop_lock));
    688 
    689 	LIST_FOREACH(tb, &tprof_backends, tb_list) {
    690 		if (!strcmp(tb->tb_name, name)) {
    691 			return tb;
    692 		}
    693 	}
    694 	return NULL;
    695 }
    696 
    697 /* -------------------- backend interfaces */
    698 
/*
 * tprof_sample: record a sample on the per-cpu buffer.
 *
 * The first argument is unused.  tfi supplies the program counter, the
 * in-kernel flag and the counter number of the sample.  If the current
 * CPU's buffer is full, the sample is dropped and only counted in
 * b_overflow.
 *
 * be careful; can be called in NMI context.
 * we are bluntly assuming that the following are safe.
 *	curcpu()
 *	curlwp->l_lid
 *	curlwp->l_proc->p_pid
 */

void
tprof_sample(void *unused, const tprof_frame_info_t *tfi)
{
	/* tprof_cpu_direct(): no kpreempt_disable/enable around the lookup */
	tprof_cpu_t * const c = tprof_cpu_direct(curcpu());
	tprof_buf_t * const buf = c->c_buf;
	tprof_sample_t *sp;
	const uintptr_t pc = tfi->tfi_pc;
	const lwp_t * const l = curlwp;
	u_int idx;

	idx = buf->b_used;
	if (__predict_false(idx >= buf->b_size)) {
		buf->b_overflow++;
		return;
	}
	sp = &buf->b_data[idx];
	sp->s_pid = l->l_proc->p_pid;
	sp->s_lwpid = l->l_lid;
	sp->s_cpuid = c->c_cpuid;
	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
	sp->s_pc = pc;
	/* bump b_used only after the slot has been filled in */
	buf->b_used = idx + 1;
}
    733 
    734 /*
    735  * tprof_backend_register:
    736  */
    737 
    738 int
    739 tprof_backend_register(const char *name, const tprof_backend_ops_t *ops,
    740     int vers)
    741 {
    742 	tprof_backend_t *tb;
    743 
    744 	if (vers != TPROF_BACKEND_VERSION) {
    745 		return EINVAL;
    746 	}
    747 
    748 	mutex_enter(&tprof_startstop_lock);
    749 	tb = tprof_backend_lookup(name);
    750 	if (tb != NULL) {
    751 		mutex_exit(&tprof_startstop_lock);
    752 		return EEXIST;
    753 	}
    754 #if 1 /* XXX for now */
    755 	if (!LIST_EMPTY(&tprof_backends)) {
    756 		mutex_exit(&tprof_startstop_lock);
    757 		return ENOTSUP;
    758 	}
    759 #endif
    760 	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
    761 	tb->tb_name = name;
    762 	tb->tb_ops = ops;
    763 	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
    764 #if 1 /* XXX for now */
    765 	if (tprof_backend == NULL) {
    766 		tprof_backend = tb;
    767 	}
    768 #endif
    769 	mutex_exit(&tprof_startstop_lock);
    770 
    771 	/* init backend softc */
    772 	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
    773 	tb->tb_softc.sc_ctr_offset_percpu_size =
    774 	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
    775 	tb->tb_softc.sc_ctr_offset_percpu =
    776 	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
    777 
    778 	return 0;
    779 }
    780 
    781 /*
    782  * tprof_backend_unregister:
    783  */
    784 
    785 int
    786 tprof_backend_unregister(const char *name)
    787 {
    788 	tprof_backend_t *tb;
    789 
    790 	mutex_enter(&tprof_startstop_lock);
    791 	tb = tprof_backend_lookup(name);
    792 #if defined(DIAGNOSTIC)
    793 	if (tb == NULL) {
    794 		mutex_exit(&tprof_startstop_lock);
    795 		panic("%s: not found '%s'", __func__, name);
    796 	}
    797 #endif /* defined(DIAGNOSTIC) */
    798 	if (tb->tb_softc.sc_ctr_running_mask != 0) {
    799 		mutex_exit(&tprof_startstop_lock);
    800 		return EBUSY;
    801 	}
    802 #if 1 /* XXX for now */
    803 	if (tprof_backend == tb) {
    804 		tprof_backend = NULL;
    805 	}
    806 #endif
    807 	LIST_REMOVE(tb, tb_list);
    808 	mutex_exit(&tprof_startstop_lock);
    809 
    810 	/* fini backend softc */
    811 	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
    812 	    tb->tb_softc.sc_ctr_offset_percpu_size);
    813 
    814 	/* free backend */
    815 	kmem_free(tb, sizeof(*tb));
    816 
    817 	return 0;
    818 }
    819 
    820 /* -------------------- cdevsw interfaces */
    821 
    822 static int
    823 tprof_open(dev_t dev, int flags, int type, struct lwp *l)
    824 {
    825 
    826 	if (minor(dev) != 0) {
    827 		return EXDEV;
    828 	}
    829 	mutex_enter(&tprof_lock);
    830 	if (tprof_owner != NULL) {
    831 		mutex_exit(&tprof_lock);
    832 		return  EBUSY;
    833 	}
    834 	tprof_owner = curlwp;
    835 	mutex_exit(&tprof_lock);
    836 
    837 	return 0;
    838 }
    839 
    840 static int
    841 tprof_close(dev_t dev, int flags, int type, struct lwp *l)
    842 {
    843 
    844 	KASSERT(minor(dev) == 0);
    845 
    846 	mutex_enter(&tprof_startstop_lock);
    847 	mutex_enter(&tprof_lock);
    848 	tprof_owner = NULL;
    849 	mutex_exit(&tprof_lock);
    850 	tprof_stop(TPROF_COUNTERMASK_ALL);
    851 	tprof_clear();
    852 
    853 	tprof_backend_t *tb = tprof_backend;
    854 	if (tb != NULL) {
    855 		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
    856 		tb->tb_softc.sc_ctr_configured_mask = 0;
    857 		tb->tb_softc.sc_ctr_prof_mask = 0;
    858 		tb->tb_softc.sc_ctr_ovf_mask = 0;
    859 	}
    860 
    861 	mutex_exit(&tprof_startstop_lock);
    862 
    863 	return 0;
    864 }
    865 
    866 static int
    867 tprof_poll(dev_t dev, int events, struct lwp *l)
    868 {
    869 	int revents;
    870 
    871 	revents = events & (POLLIN | POLLRDNORM);
    872 	if (revents == 0)
    873 		return 0;
    874 
    875 	mutex_enter(&tprof_lock);
    876 	if (STAILQ_EMPTY(&tprof_list)) {
    877 		revents = 0;
    878 		selrecord(l, &tprof_selp);
    879 	}
    880 	mutex_exit(&tprof_lock);
    881 
    882 	return revents;
    883 }
    884 
/*
 * filt_tprof_read_detach: f_detach hook of tprof_read_filtops.
 * Removes the knote from tprof_selp, under tprof_lock.
 */
static void
filt_tprof_read_detach(struct knote *kn)
{
	mutex_enter(&tprof_lock);
	selremove_knote(&tprof_selp, kn);
	mutex_exit(&tprof_lock);
}
    892 
    893 static int
    894 filt_tprof_read_event(struct knote *kn, long hint)
    895 {
    896 	int rv = 0;
    897 
    898 	if ((hint & NOTE_SUBMIT) == 0)
    899 		mutex_enter(&tprof_lock);
    900 
    901 	if (!STAILQ_EMPTY(&tprof_list)) {
    902 		tprof_buf_t *buf;
    903 		int64_t n = 0;
    904 
    905 		STAILQ_FOREACH(buf, &tprof_list, b_list) {
    906 			n += buf->b_used;
    907 		}
    908 		kn->kn_data = n * sizeof(tprof_sample_t);
    909 
    910 		rv = 1;
    911 	}
    912 
    913 	if ((hint & NOTE_SUBMIT) == 0)
    914 		mutex_exit(&tprof_lock);
    915 
    916 	return rv;
    917 }
    918 
/*
 * Filter operations installed by tprof_kqfilter() for EVFILT_READ.
 * No attach hook is provided.
 */
static const struct filterops tprof_read_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_tprof_read_detach,
	.f_event = filt_tprof_read_event,
};
    925 
    926 static int
    927 tprof_kqfilter(dev_t dev, struct knote *kn)
    928 {
    929 	switch (kn->kn_filter) {
    930 	case EVFILT_READ:
    931 		kn->kn_fop = &tprof_read_filtops;
    932 		mutex_enter(&tprof_lock);
    933 		selrecord_knote(&tprof_selp, kn);
    934 		mutex_exit(&tprof_lock);
    935 		break;
    936 	default:
    937 		return EINVAL;
    938 	}
    939 
    940 	return 0;
    941 }
    942 
/*
 * tprof_read: read(2) support; copy samples out to the caller.
 *
 * Buffers are taken from the head of tprof_list one at a time.  A
 * buffer that was only partially consumed is put back at the head and
 * tprof_reader_offset remembers how many of its bytes have been read.
 *
 * When the list is empty, the call returns if no worker is running or
 * if the previous copy in this call moved any data; otherwise it sleeps
 * (interruptibly) until a worker queues a buffer or stops.
 *
 * Returns 0 or the error from cv_wait_sig() / uiomove().
 */
static int
tprof_read(dev_t dev, struct uio *uio, int flags)
{
	tprof_buf_t *buf;
	size_t bytes;
	size_t resid;
	size_t done = 0;
	int error = 0;

	KASSERT(minor(dev) == 0);
	mutex_enter(&tprof_reader_lock);
	while (uio->uio_resid > 0 && error == 0) {
		/*
		 * take the first buffer from the list.
		 */
		mutex_enter(&tprof_lock);
		buf = STAILQ_FIRST(&tprof_list);
		if (buf == NULL) {
			if (tprof_nworker == 0 || done != 0) {
				mutex_exit(&tprof_lock);
				error = 0;
				break;
			}
			/* drop the reader lock while sleeping on tprof_lock */
			mutex_exit(&tprof_reader_lock);
			error = cv_wait_sig(&tprof_reader_cv, &tprof_lock);
			mutex_exit(&tprof_lock);
			/* retake in the documented order: reader lock first */
			mutex_enter(&tprof_reader_lock);
			continue;
		}
		STAILQ_REMOVE_HEAD(&tprof_list, b_list);
		KASSERT(tprof_nbuf_on_list > 0);
		tprof_nbuf_on_list--;
		mutex_exit(&tprof_lock);

		/*
		 * copy it out.
		 */
		bytes = MIN(buf->b_used * sizeof(tprof_sample_t) -
		    tprof_reader_offset, uio->uio_resid);
		resid = uio->uio_resid;
		error = uiomove((char *)buf->b_data + tprof_reader_offset,
		    bytes, uio);
		/* count what was actually moved, even if uiomove() failed */
		done = resid - uio->uio_resid;
		tprof_reader_offset += done;

		/*
		 * if we didn't consume the whole buffer,
		 * put it back to the list.
		 */
		if (tprof_reader_offset <
		    buf->b_used * sizeof(tprof_sample_t)) {
			mutex_enter(&tprof_lock);
			STAILQ_INSERT_HEAD(&tprof_list, buf, b_list);
			tprof_nbuf_on_list++;
			cv_broadcast(&tprof_reader_cv);
			mutex_exit(&tprof_lock);
		} else {
			tprof_buf_free(buf);
			tprof_reader_offset = 0;
		}
	}
	mutex_exit(&tprof_reader_lock);

	return error;
}
   1008 
   1009 static int
   1010 tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
   1011 {
   1012 	const tprof_param_t *param;
   1013 	tprof_counts_t *counts;
   1014 	int error = 0;
   1015 
   1016 	KASSERT(minor(dev) == 0);
   1017 
   1018 	switch (cmd) {
   1019 	case TPROF_IOC_GETINFO:
   1020 		mutex_enter(&tprof_startstop_lock);
   1021 		tprof_getinfo(data);
   1022 		mutex_exit(&tprof_startstop_lock);
   1023 		break;
   1024 	case TPROF_IOC_GETNCOUNTERS:
   1025 		mutex_enter(&tprof_lock);
   1026 		error = tprof_getncounters((u_int *)data);
   1027 		mutex_exit(&tprof_lock);
   1028 		break;
   1029 	case TPROF_IOC_START:
   1030 		mutex_enter(&tprof_startstop_lock);
   1031 		error = tprof_start(*(tprof_countermask_t *)data);
   1032 		mutex_exit(&tprof_startstop_lock);
   1033 		break;
   1034 	case TPROF_IOC_STOP:
   1035 		mutex_enter(&tprof_startstop_lock);
   1036 		tprof_stop(*(tprof_countermask_t *)data);
   1037 		mutex_exit(&tprof_startstop_lock);
   1038 		break;
   1039 	case TPROF_IOC_GETSTAT:
   1040 		mutex_enter(&tprof_lock);
   1041 		memcpy(data, &tprof_stat, sizeof(tprof_stat));
   1042 		mutex_exit(&tprof_lock);
   1043 		break;
   1044 	case TPROF_IOC_CONFIGURE_EVENT:
   1045 		param = data;
   1046 		mutex_enter(&tprof_startstop_lock);
   1047 		error = tprof_configure_event(param);
   1048 		mutex_exit(&tprof_startstop_lock);
   1049 		break;
   1050 	case TPROF_IOC_GETCOUNTS:
   1051 		counts = data;
   1052 		mutex_enter(&tprof_startstop_lock);
   1053 		error = tprof_getcounts(counts);
   1054 		mutex_exit(&tprof_startstop_lock);
   1055 		break;
   1056 	default:
   1057 		error = EINVAL;
   1058 		break;
   1059 	}
   1060 
   1061 	return error;
   1062 }
   1063 
   1064 const struct cdevsw tprof_cdevsw = {
   1065 	.d_open = tprof_open,
   1066 	.d_close = tprof_close,
   1067 	.d_read = tprof_read,
   1068 	.d_write = nowrite,
   1069 	.d_ioctl = tprof_ioctl,
   1070 	.d_stop = nostop,
   1071 	.d_tty = notty,
   1072 	.d_poll = tprof_poll,
   1073 	.d_mmap = nommap,
   1074 	.d_kqfilter = tprof_kqfilter,
   1075 	.d_discard = nodiscard,
   1076 	.d_flag = D_OTHER | D_MPSAFE
   1077 };
   1078 
/*
 * tprofattach: attach routine of the tprof device.
 *
 * there is nothing to do here; nunits is ignored.
 */
void
tprofattach(int nunits)
{

	(void)nunits;
}
   1085 
   1086 MODULE(MODULE_CLASS_DRIVER, tprof, NULL);
   1087 
   1088 static void
   1089 tprof_cpu_init(void *vcp, void *vcookie, struct cpu_info *ci)
   1090 {
   1091 	tprof_cpu_t **cp = vcp, *c;
   1092 
   1093 	c = kmem_zalloc(sizeof(*c), KM_SLEEP);
   1094 	c->c_buf = NULL;
   1095 	c->c_cpuid = cpu_index(ci);
   1096 	*cp = c;
   1097 }
   1098 
   1099 static void
   1100 tprof_cpu_fini(void *vcp, void *vcookie, struct cpu_info *ci)
   1101 {
   1102 	tprof_cpu_t **cp = vcp, *c;
   1103 
   1104 	c = *cp;
   1105 	KASSERT(c->c_cpuid == cpu_index(ci));
   1106 	KASSERT(c->c_buf == NULL);
   1107 	kmem_free(c, sizeof(*c));
   1108 	*cp = NULL;
   1109 }
   1110 
   1111 static void
   1112 tprof_driver_init(void)
   1113 {
   1114 
   1115 	mutex_init(&tprof_lock, MUTEX_DEFAULT, IPL_NONE);
   1116 	mutex_init(&tprof_reader_lock, MUTEX_DEFAULT, IPL_NONE);
   1117 	mutex_init(&tprof_startstop_lock, MUTEX_DEFAULT, IPL_NONE);
   1118 	selinit(&tprof_selp);
   1119 	cv_init(&tprof_cv, "tprof");
   1120 	cv_init(&tprof_reader_cv, "tprof_rd");
   1121 	STAILQ_INIT(&tprof_list);
   1122 	tprof_cpus = percpu_create(sizeof(tprof_cpu_t *),
   1123 	    tprof_cpu_init, tprof_cpu_fini, NULL);
   1124 }
   1125 
   1126 static void
   1127 tprof_driver_fini(void)
   1128 {
   1129 
   1130 	percpu_free(tprof_cpus, sizeof(tprof_cpu_t *));
   1131 	mutex_destroy(&tprof_lock);
   1132 	mutex_destroy(&tprof_reader_lock);
   1133 	mutex_destroy(&tprof_startstop_lock);
   1134 	seldestroy(&tprof_selp);
   1135 	cv_destroy(&tprof_cv);
   1136 	cv_destroy(&tprof_reader_cv);
   1137 }
   1138 
   1139 static int
   1140 tprof_modcmd(modcmd_t cmd, void *arg)
   1141 {
   1142 
   1143 	switch (cmd) {
   1144 	case MODULE_CMD_INIT:
   1145 		tprof_driver_init();
   1146 #if defined(_MODULE)
   1147 		{
   1148 			devmajor_t bmajor = NODEVMAJOR;
   1149 			devmajor_t cmajor = NODEVMAJOR;
   1150 			int error;
   1151 
   1152 			error = devsw_attach("tprof", NULL, &bmajor,
   1153 			    &tprof_cdevsw, &cmajor);
   1154 			if (error) {
   1155 				tprof_driver_fini();
   1156 				return error;
   1157 			}
   1158 		}
   1159 #endif /* defined(_MODULE) */
   1160 		return 0;
   1161 
   1162 	case MODULE_CMD_FINI:
   1163 #if defined(_MODULE)
   1164 		devsw_detach(NULL, &tprof_cdevsw);
   1165 #endif /* defined(_MODULE) */
   1166 		tprof_driver_fini();
   1167 		return 0;
   1168 
   1169 	default:
   1170 		return ENOTTY;
   1171 	}
   1172 }
   1173