Home | History | Annotate | Line # | Download | only in tprof
tprof.c revision 1.19
      1 /*	$NetBSD: tprof.c,v 1.19 2022/12/01 00:32:52 ryo Exp $	*/
      2 
      3 /*-
      4  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.19 2022/12/01 00:32:52 ryo Exp $");
     31 
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/kernel.h>
     35 
     36 #include <sys/callout.h>
     37 #include <sys/conf.h>
     38 #include <sys/cpu.h>
     39 #include <sys/kmem.h>
     40 #include <sys/module.h>
     41 #include <sys/percpu.h>
     42 #include <sys/proc.h>
     43 #include <sys/queue.h>
     44 #include <sys/workqueue.h>
     45 #include <sys/xcall.h>
     46 
     47 #include <dev/tprof/tprof.h>
     48 #include <dev/tprof/tprof_ioctl.h>
     49 
     50 #include "ioconf.h"
     51 
/*
 * TPROF_HZ: divisor applied to a counter's estimated frequency to derive
 * the default number of events per sample (see tprof_configure_event),
 * and the basis for the per-buffer sample capacity.  May be overridden
 * at build time.
 */
#ifndef TPROF_HZ
#define TPROF_HZ	10000
#endif
     55 
     56 /*
     57  * locking order:
     58  *	tprof_reader_lock -> tprof_lock
     59  *	tprof_startstop_lock -> tprof_lock
     60  */
     61 
     62 /*
     63  * protected by:
     64  *	L: tprof_lock
     65  *	R: tprof_reader_lock
     66  *	S: tprof_startstop_lock
     67  *	s: writer should hold tprof_startstop_lock and tprof_lock
     68  *	   reader should hold tprof_startstop_lock or tprof_lock
     69  */
     70 
/*
 * tprof_buf: a variable-length buffer of samples.
 *
 * One buffer is attached to each CPU while profiling; filled buffers are
 * moved to the global tprof_list by tprof_worker and drained by read(2).
 */
typedef struct tprof_buf {
	u_int b_used;		/* number of samples stored in b_data */
	u_int b_size;		/* capacity of b_data, in samples */
	u_int b_overflow;	/* samples dropped because b_data was full */
	u_int b_unused;		/* not referenced in this file */
	STAILQ_ENTRY(tprof_buf) b_list;	/* linkage on tprof_list */
	tprof_sample_t b_data[];	/* the samples themselves */
} tprof_buf_t;
/* Allocation size in bytes of a tprof_buf_t that holds sz samples. */
#define	TPROF_BUF_BYTESIZE(sz) \
	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
/* Sample capacity given to each buffer (see tprof_start). */
#define	TPROF_MAX_SAMPLES_PER_BUF	(TPROF_HZ * 2)

/* Upper bound on buffers queued on tprof_list; beyond it buffers are dropped. */
#define	TPROF_MAX_BUF			100
     84 
/*
 * tprof_cpu_t: per-CPU profiler state, allocated by tprof_cpu_init and
 * reached through the tprof_cpus percpu object.
 */
typedef struct {
	tprof_buf_t *c_buf;	/* buffer tprof_sample currently writes to */
	uint32_t c_cpuid;	/* cpu_index() of the owning CPU */
	struct work c_work;	/* work item enqueued on tprof_wq */
	callout_t c_callout;	/* fires tprof_kick to re-enqueue c_work */
} __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
     91 
/*
 * tprof_backend: a registered profiling backend (see
 * tprof_backend_register).
 */
typedef struct tprof_backend {
	/*
	 * tprof_backend_softc_t must be passed as an argument to the
	 * interrupt handler, but that is difficult to implement on
	 * armv7/v8, so tprof_backend is exposed instead.  Additionally,
	 * softc must be placed at the beginning of struct tprof_backend.
	 */
	tprof_backend_softc_t tb_softc;

	const char *tb_name;			/* name given at registration */
	const tprof_backend_ops_t *tb_ops;	/* backend operations */
	LIST_ENTRY(tprof_backend) tb_list;	/* linkage on tprof_backends */
} tprof_backend_t;
    105 
/*
 * Driver-wide state.  The letter in each trailing comment refers to the
 * "protected by" legend above.
 */
static kmutex_t tprof_lock;
static u_int tprof_nworker;		/* L: # of running worker LWPs */
/* LWP that opened the device; set and cleared under tprof_lock. */
static lwp_t *tprof_owner;
static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
static u_int tprof_nbuf_on_list;	/* L: # of buffers on tprof_list */
/* Per-CPU workqueue running tprof_worker; created in tprof_start. */
static struct workqueue *tprof_wq;
static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
/* Sample capacity used by tprof_buf_alloc; set in tprof_start. */
static u_int tprof_samples_per_buf;

/* The backend currently in use, or NULL if none is registered. */
tprof_backend_t *tprof_backend;	/* S: */
static LIST_HEAD(, tprof_backend) tprof_backends =
    LIST_HEAD_INITIALIZER(tprof_backend); /* S: */

static kmutex_t tprof_reader_lock;
static kcondvar_t tprof_reader_cv;	/* L: */
/* Byte offset already consumed from the buffer at the head of tprof_list. */
static off_t tprof_reader_offset;	/* R: */

static kmutex_t tprof_startstop_lock;
/* Signalled by tprof_worker when a worker stops; waited on in tprof_stop. */
static kcondvar_t tprof_cv;		/* L: */

static struct tprof_stat tprof_stat;	/* L: */
    127 
    128 static tprof_cpu_t *
    129 tprof_cpu_direct(struct cpu_info *ci)
    130 {
    131 	tprof_cpu_t **cp;
    132 
    133 	cp = percpu_getptr_remote(tprof_cpus, ci);
    134 	return *cp;
    135 }
    136 
    137 static tprof_cpu_t *
    138 tprof_cpu(struct cpu_info *ci)
    139 {
    140 	tprof_cpu_t *c;
    141 
    142 	/*
    143 	 * As long as xcalls are blocked -- e.g., by kpreempt_disable
    144 	 * -- the percpu object will not be swapped and destroyed.  We
    145 	 * can't write to it, because the data may have already been
    146 	 * moved to a new buffer, but we can safely read from it.
    147 	 */
    148 	kpreempt_disable();
    149 	c = tprof_cpu_direct(ci);
    150 	kpreempt_enable();
    151 
    152 	return c;
    153 }
    154 
    155 static tprof_cpu_t *
    156 tprof_curcpu(void)
    157 {
    158 
    159 	return tprof_cpu(curcpu());
    160 }
    161 
    162 static tprof_buf_t *
    163 tprof_buf_alloc(void)
    164 {
    165 	tprof_buf_t *new;
    166 	u_int size = tprof_samples_per_buf;
    167 
    168 	new = kmem_alloc(TPROF_BUF_BYTESIZE(size), KM_SLEEP);
    169 	new->b_used = 0;
    170 	new->b_size = size;
    171 	new->b_overflow = 0;
    172 	return new;
    173 }
    174 
    175 static void
    176 tprof_buf_free(tprof_buf_t *buf)
    177 {
    178 
    179 	kmem_free(buf, TPROF_BUF_BYTESIZE(buf->b_size));
    180 }
    181 
    182 static tprof_buf_t *
    183 tprof_buf_switch(tprof_cpu_t *c, tprof_buf_t *new)
    184 {
    185 	tprof_buf_t *old;
    186 
    187 	old = c->c_buf;
    188 	c->c_buf = new;
    189 	return old;
    190 }
    191 
    192 static tprof_buf_t *
    193 tprof_buf_refresh(void)
    194 {
    195 	tprof_cpu_t * const c = tprof_curcpu();
    196 	tprof_buf_t *new;
    197 
    198 	new = tprof_buf_alloc();
    199 	return tprof_buf_switch(c, new);
    200 }
    201 
    202 static void
    203 tprof_worker(struct work *wk, void *dummy)
    204 {
    205 	tprof_cpu_t * const c = tprof_curcpu();
    206 	tprof_buf_t *buf;
    207 	tprof_backend_t *tb;
    208 	bool shouldstop;
    209 
    210 	KASSERT(wk == &c->c_work);
    211 	KASSERT(dummy == NULL);
    212 
    213 	/*
    214 	 * get a per cpu buffer.
    215 	 */
    216 	buf = tprof_buf_refresh();
    217 
    218 	/*
    219 	 * and put it on the global list for read(2).
    220 	 */
    221 	mutex_enter(&tprof_lock);
    222 	tb = tprof_backend;
    223 	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
    224 	if (shouldstop) {
    225 		KASSERT(tprof_nworker > 0);
    226 		tprof_nworker--;
    227 		cv_broadcast(&tprof_cv);
    228 		cv_broadcast(&tprof_reader_cv);
    229 	}
    230 	if (buf->b_used == 0) {
    231 		tprof_stat.ts_emptybuf++;
    232 	} else if (tprof_nbuf_on_list < TPROF_MAX_BUF) {
    233 		tprof_stat.ts_sample += buf->b_used;
    234 		tprof_stat.ts_overflow += buf->b_overflow;
    235 		tprof_stat.ts_buf++;
    236 		STAILQ_INSERT_TAIL(&tprof_list, buf, b_list);
    237 		tprof_nbuf_on_list++;
    238 		buf = NULL;
    239 		cv_broadcast(&tprof_reader_cv);
    240 	} else {
    241 		tprof_stat.ts_dropbuf_sample += buf->b_used;
    242 		tprof_stat.ts_dropbuf++;
    243 	}
    244 	mutex_exit(&tprof_lock);
    245 	if (buf) {
    246 		tprof_buf_free(buf);
    247 	}
    248 	if (!shouldstop) {
    249 		callout_schedule(&c->c_callout, hz);
    250 	}
    251 }
    252 
    253 static void
    254 tprof_kick(void *vp)
    255 {
    256 	struct cpu_info * const ci = vp;
    257 	tprof_cpu_t * const c = tprof_cpu(ci);
    258 
    259 	workqueue_enqueue(tprof_wq, &c->c_work, ci);
    260 }
    261 
    262 static void
    263 tprof_stop1(void)
    264 {
    265 	CPU_INFO_ITERATOR cii;
    266 	struct cpu_info *ci;
    267 
    268 	KASSERT(mutex_owned(&tprof_startstop_lock));
    269 	KASSERT(tprof_nworker == 0);
    270 
    271 	for (CPU_INFO_FOREACH(cii, ci)) {
    272 		tprof_cpu_t * const c = tprof_cpu(ci);
    273 		tprof_buf_t *old;
    274 
    275 		old = tprof_buf_switch(c, NULL);
    276 		if (old != NULL) {
    277 			tprof_buf_free(old);
    278 		}
    279 		callout_destroy(&c->c_callout);
    280 	}
    281 	workqueue_destroy(tprof_wq);
    282 }
    283 
    284 static void
    285 tprof_getinfo(struct tprof_info *info)
    286 {
    287 	tprof_backend_t *tb;
    288 
    289 	KASSERT(mutex_owned(&tprof_startstop_lock));
    290 
    291 	memset(info, 0, sizeof(*info));
    292 	info->ti_version = TPROF_VERSION;
    293 	if ((tb = tprof_backend) != NULL) {
    294 		info->ti_ident = tb->tb_ops->tbo_ident();
    295 	}
    296 }
    297 
    298 static int
    299 tprof_getncounters(u_int *ncounters)
    300 {
    301 	tprof_backend_t *tb;
    302 
    303 	tb = tprof_backend;
    304 	if (tb == NULL)
    305 		return ENOENT;
    306 
    307 	*ncounters = tb->tb_ops->tbo_ncounters();
    308 	return 0;
    309 }
    310 
    311 static void
    312 tprof_start_cpu(void *arg1, void *arg2)
    313 {
    314 	tprof_backend_t *tb = arg1;
    315 	tprof_countermask_t runmask = (uintptr_t)arg2;
    316 
    317 	tb->tb_ops->tbo_start(runmask);
    318 }
    319 
    320 static void
    321 tprof_stop_cpu(void *arg1, void *arg2)
    322 {
    323 	tprof_backend_t *tb = arg1;
    324 	tprof_countermask_t stopmask = (uintptr_t)arg2;
    325 
    326 	tb->tb_ops->tbo_stop(stopmask);
    327 }
    328 
    329 static int
    330 tprof_start(tprof_countermask_t runmask)
    331 {
    332 	CPU_INFO_ITERATOR cii;
    333 	struct cpu_info *ci;
    334 	tprof_backend_t *tb;
    335 	uint64_t xc;
    336 	int error;
    337 	bool firstrun;
    338 
    339 	KASSERT(mutex_owned(&tprof_startstop_lock));
    340 
    341 	tb = tprof_backend;
    342 	if (tb == NULL) {
    343 		error = ENOENT;
    344 		goto done;
    345 	}
    346 
    347 	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
    348 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
    349 	if (runmask == 0) {
    350 		/*
    351 		 * targets are already running.
    352 		 * unconfigured counters are ignored.
    353 		 */
    354 		error = 0;
    355 		goto done;
    356 	}
    357 
    358 	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
    359 	if (firstrun) {
    360 		if (tb->tb_ops->tbo_establish != NULL) {
    361 			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
    362 			if (error != 0)
    363 				goto done;
    364 		}
    365 
    366 		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
    367 		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
    368 		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
    369 		if (error != 0) {
    370 			if (tb->tb_ops->tbo_disestablish != NULL)
    371 				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
    372 			goto done;
    373 		}
    374 
    375 		for (CPU_INFO_FOREACH(cii, ci)) {
    376 			tprof_cpu_t * const c = tprof_cpu(ci);
    377 			tprof_buf_t *new;
    378 			tprof_buf_t *old;
    379 
    380 			new = tprof_buf_alloc();
    381 			old = tprof_buf_switch(c, new);
    382 			if (old != NULL) {
    383 				tprof_buf_free(old);
    384 			}
    385 			callout_init(&c->c_callout, CALLOUT_MPSAFE);
    386 			callout_setfunc(&c->c_callout, tprof_kick, ci);
    387 		}
    388 	}
    389 
    390 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
    391 	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
    392 	xc_wait(xc);
    393 	mutex_enter(&tprof_lock);
    394 	tb->tb_softc.sc_ctr_running_mask |= runmask;
    395 	mutex_exit(&tprof_lock);
    396 
    397 	if (firstrun) {
    398 		for (CPU_INFO_FOREACH(cii, ci)) {
    399 			tprof_cpu_t * const c = tprof_cpu(ci);
    400 
    401 			mutex_enter(&tprof_lock);
    402 			tprof_nworker++;
    403 			mutex_exit(&tprof_lock);
    404 			workqueue_enqueue(tprof_wq, &c->c_work, ci);
    405 		}
    406 	}
    407 done:
    408 	return error;
    409 }
    410 
    411 static void
    412 tprof_stop(tprof_countermask_t stopmask)
    413 {
    414 	tprof_backend_t *tb;
    415 	uint64_t xc;
    416 
    417 	tb = tprof_backend;
    418 	if (tb == NULL)
    419 		return;
    420 
    421 	KASSERT(mutex_owned(&tprof_startstop_lock));
    422 	stopmask &= tb->tb_softc.sc_ctr_running_mask;
    423 	if (stopmask == 0) {
    424 		/* targets are not running */
    425 		goto done;
    426 	}
    427 
    428 	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
    429 	xc_wait(xc);
    430 	mutex_enter(&tprof_lock);
    431 	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
    432 	mutex_exit(&tprof_lock);
    433 
    434 	/* all counters have stopped? */
    435 	if (tb->tb_softc.sc_ctr_running_mask == 0) {
    436 		mutex_enter(&tprof_lock);
    437 		cv_broadcast(&tprof_reader_cv);
    438 		while (tprof_nworker > 0) {
    439 			cv_wait(&tprof_cv, &tprof_lock);
    440 		}
    441 		mutex_exit(&tprof_lock);
    442 
    443 		tprof_stop1();
    444 		if (tb->tb_ops->tbo_disestablish != NULL)
    445 			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
    446 	}
    447 done:
    448 	;
    449 }
    450 
    451 static void
    452 tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
    453 {
    454 	uint64_t *counters_offset = vp;
    455 	u_int counter = (uintptr_t)vp2;
    456 
    457 	tprof_backend_t *tb = tprof_backend;
    458 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
    459 	counters_offset[counter] = param->p_value;
    460 }
    461 
    462 static void
    463 tprof_configure_event_cpu(void *arg1, void *arg2)
    464 {
    465 	tprof_backend_t *tb = arg1;
    466 	u_int counter = (uintptr_t)arg2;
    467 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
    468 
    469 	tb->tb_ops->tbo_configure_event(counter, param);
    470 }
    471 
    472 static int
    473 tprof_configure_event(const tprof_param_t *param)
    474 {
    475 	tprof_backend_t *tb;
    476 	tprof_backend_softc_t *sc;
    477 	tprof_param_t *sc_param;
    478 	uint64_t xc;
    479 	int c, error;
    480 
    481 	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
    482 		error = EINVAL;
    483 		goto done;
    484 	}
    485 
    486 	tb = tprof_backend;
    487 	if (tb == NULL) {
    488 		error = ENOENT;
    489 		goto done;
    490 	}
    491 	sc = &tb->tb_softc;
    492 
    493 	c = param->p_counter;
    494 	if (c >= tb->tb_softc.sc_ncounters) {
    495 		error = EINVAL;
    496 		goto done;
    497 	}
    498 
    499 	if (tb->tb_ops->tbo_valid_event != NULL) {
    500 		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
    501 		if (error != 0)
    502 			goto done;
    503 	}
    504 
    505 	/* if already running, stop the counter */
    506 	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
    507 		tprof_stop(__BIT(c));
    508 
    509 	sc->sc_count[c].ctr_bitwidth =
    510 	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
    511 
    512 	sc_param = &sc->sc_count[c].ctr_param;
    513 	memcpy(sc_param, param, sizeof(*sc_param));	/* save copy of param */
    514 
    515 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
    516 		uint64_t freq, inum, dnum;
    517 
    518 		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
    519 		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
    520 		if (sc->sc_count[c].ctr_counter_val == 0) {
    521 			printf("%s: counter#%d frequency (%"PRIu64") is"
    522 			    " very low relative to TPROF_HZ (%u)\n", __func__,
    523 			    c, freq, TPROF_HZ);
    524 			sc->sc_count[c].ctr_counter_val =
    525 			    4000000000ULL / TPROF_HZ;
    526 		}
    527 
    528 		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
    529 		case TPROF_PARAM_VALUE2_SCALE:
    530 			if (sc_param->p_value2 == 0)
    531 				break;
    532 			/*
    533 			 * p_value2 is 64-bit fixed-point
    534 			 * upper 32 bits are the integer part
    535 			 * lower 32 bits are the decimal part
    536 			 */
    537 			inum = sc_param->p_value2 >> 32;
    538 			dnum = sc_param->p_value2 & __BITS(31, 0);
    539 			sc->sc_count[c].ctr_counter_val =
    540 			    sc->sc_count[c].ctr_counter_val * inum +
    541 			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
    542 			if (sc->sc_count[c].ctr_counter_val == 0)
    543 				sc->sc_count[c].ctr_counter_val = 1;
    544 			break;
    545 		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
    546 			if (sc_param->p_value2 == 0)
    547 				sc_param->p_value2 = 1;
    548 			if (sc_param->p_value2 >
    549 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
    550 				sc_param->p_value2 =
    551 				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
    552 			}
    553 			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
    554 			break;
    555 		default:
    556 			break;
    557 		}
    558 		sc->sc_count[c].ctr_counter_reset_val =
    559 		    -sc->sc_count[c].ctr_counter_val;
    560 		sc->sc_count[c].ctr_counter_reset_val &=
    561 		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
    562 	} else {
    563 		sc->sc_count[c].ctr_counter_val = 0;
    564 		sc->sc_count[c].ctr_counter_reset_val = 0;
    565 	}
    566 
    567 	/* At this point, p_value is used as an initial value */
    568 	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
    569 	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
    570 	/* On the backend side, p_value is used as the reset value */
    571 	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
    572 
    573 	xc = xc_broadcast(0, tprof_configure_event_cpu,
    574 	    tb, (void *)(uintptr_t)c);
    575 	xc_wait(xc);
    576 
    577 	mutex_enter(&tprof_lock);
    578 	/* update counters bitmasks */
    579 	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
    580 	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
    581 	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    582 	/* profiled counter requires overflow handling */
    583 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
    584 		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
    585 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    586 	}
    587 	/* counters with less than 64bits also require overflow handling */
    588 	if (sc->sc_count[c].ctr_bitwidth != 64)
    589 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
    590 	mutex_exit(&tprof_lock);
    591 
    592 	error = 0;
    593 
    594  done:
    595 	return error;
    596 }
    597 
    598 static void
    599 tprof_getcounts_cpu(void *arg1, void *arg2)
    600 {
    601 	tprof_backend_t *tb = arg1;
    602 	tprof_backend_softc_t *sc = &tb->tb_softc;
    603 	uint64_t *counters = arg2;
    604 	uint64_t *counters_offset;
    605 	unsigned int c;
    606 
    607 	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
    608 	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
    609 	for (c = 0; c < sc->sc_ncounters; c++) {
    610 		if (ISSET(configmask, __BIT(c))) {
    611 			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
    612 			counters[c] = counters_offset[c] +
    613 			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
    614 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
    615 		} else {
    616 			counters[c] = 0;
    617 		}
    618 	}
    619 	percpu_putref(sc->sc_ctr_offset_percpu);
    620 }
    621 
    622 static int
    623 tprof_getcounts(tprof_counts_t *counts)
    624 {
    625 	struct cpu_info *ci;
    626 	tprof_backend_t *tb;
    627 	uint64_t xc;
    628 
    629 	tb = tprof_backend;
    630 	if (tb == NULL)
    631 		return ENOENT;
    632 
    633 	if (counts->c_cpu >= ncpu)
    634 		return ESRCH;
    635 	ci = cpu_lookup(counts->c_cpu);
    636 	if (ci == NULL)
    637 		return ESRCH;
    638 
    639 	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
    640 	xc_wait(xc);
    641 
    642 	counts->c_ncounters = tb->tb_softc.sc_ncounters;
    643 	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
    644 	return 0;
    645 }
    646 
    647 /*
    648  * tprof_clear: drain unread samples.
    649  */
    650 
    651 static void
    652 tprof_clear(void)
    653 {
    654 	tprof_buf_t *buf;
    655 
    656 	mutex_enter(&tprof_reader_lock);
    657 	mutex_enter(&tprof_lock);
    658 	while ((buf = STAILQ_FIRST(&tprof_list)) != NULL) {
    659 		if (buf != NULL) {
    660 			STAILQ_REMOVE_HEAD(&tprof_list, b_list);
    661 			KASSERT(tprof_nbuf_on_list > 0);
    662 			tprof_nbuf_on_list--;
    663 			mutex_exit(&tprof_lock);
    664 			tprof_buf_free(buf);
    665 			mutex_enter(&tprof_lock);
    666 		}
    667 	}
    668 	KASSERT(tprof_nbuf_on_list == 0);
    669 	mutex_exit(&tprof_lock);
    670 	tprof_reader_offset = 0;
    671 	mutex_exit(&tprof_reader_lock);
    672 
    673 	memset(&tprof_stat, 0, sizeof(tprof_stat));
    674 }
    675 
    676 static tprof_backend_t *
    677 tprof_backend_lookup(const char *name)
    678 {
    679 	tprof_backend_t *tb;
    680 
    681 	KASSERT(mutex_owned(&tprof_startstop_lock));
    682 
    683 	LIST_FOREACH(tb, &tprof_backends, tb_list) {
    684 		if (!strcmp(tb->tb_name, name)) {
    685 			return tb;
    686 		}
    687 	}
    688 	return NULL;
    689 }
    690 
    691 /* -------------------- backend interfaces */
    692 
    693 /*
    694  * tprof_sample: record a sample on the per-cpu buffer.
    695  *
    696  * be careful; can be called in NMI context.
    697  * we are bluntly assuming the followings are safe.
    698  *	curcpu()
    699  *	curlwp->l_lid
    700  *	curlwp->l_proc->p_pid
    701  */
    702 
    703 void
    704 tprof_sample(void *unused, const tprof_frame_info_t *tfi)
    705 {
    706 	tprof_cpu_t * const c = tprof_cpu_direct(curcpu());
    707 	tprof_buf_t * const buf = c->c_buf;
    708 	tprof_sample_t *sp;
    709 	const uintptr_t pc = tfi->tfi_pc;
    710 	const lwp_t * const l = curlwp;
    711 	u_int idx;
    712 
    713 	idx = buf->b_used;
    714 	if (__predict_false(idx >= buf->b_size)) {
    715 		buf->b_overflow++;
    716 		return;
    717 	}
    718 	sp = &buf->b_data[idx];
    719 	sp->s_pid = l->l_proc->p_pid;
    720 	sp->s_lwpid = l->l_lid;
    721 	sp->s_cpuid = c->c_cpuid;
    722 	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
    723 	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
    724 	sp->s_pc = pc;
    725 	buf->b_used = idx + 1;
    726 }
    727 
    728 /*
    729  * tprof_backend_register:
    730  */
    731 
    732 int
    733 tprof_backend_register(const char *name, const tprof_backend_ops_t *ops,
    734     int vers)
    735 {
    736 	tprof_backend_t *tb;
    737 
    738 	if (vers != TPROF_BACKEND_VERSION) {
    739 		return EINVAL;
    740 	}
    741 
    742 	mutex_enter(&tprof_startstop_lock);
    743 	tb = tprof_backend_lookup(name);
    744 	if (tb != NULL) {
    745 		mutex_exit(&tprof_startstop_lock);
    746 		return EEXIST;
    747 	}
    748 #if 1 /* XXX for now */
    749 	if (!LIST_EMPTY(&tprof_backends)) {
    750 		mutex_exit(&tprof_startstop_lock);
    751 		return ENOTSUP;
    752 	}
    753 #endif
    754 	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
    755 	tb->tb_name = name;
    756 	tb->tb_ops = ops;
    757 	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
    758 #if 1 /* XXX for now */
    759 	if (tprof_backend == NULL) {
    760 		tprof_backend = tb;
    761 	}
    762 #endif
    763 	mutex_exit(&tprof_startstop_lock);
    764 
    765 	/* init backend softc */
    766 	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
    767 	tb->tb_softc.sc_ctr_offset_percpu_size =
    768 	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
    769 	tb->tb_softc.sc_ctr_offset_percpu =
    770 	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
    771 
    772 	return 0;
    773 }
    774 
    775 /*
    776  * tprof_backend_unregister:
    777  */
    778 
    779 int
    780 tprof_backend_unregister(const char *name)
    781 {
    782 	tprof_backend_t *tb;
    783 
    784 	mutex_enter(&tprof_startstop_lock);
    785 	tb = tprof_backend_lookup(name);
    786 #if defined(DIAGNOSTIC)
    787 	if (tb == NULL) {
    788 		mutex_exit(&tprof_startstop_lock);
    789 		panic("%s: not found '%s'", __func__, name);
    790 	}
    791 #endif /* defined(DIAGNOSTIC) */
    792 	if (tb->tb_softc.sc_ctr_running_mask != 0) {
    793 		mutex_exit(&tprof_startstop_lock);
    794 		return EBUSY;
    795 	}
    796 #if 1 /* XXX for now */
    797 	if (tprof_backend == tb) {
    798 		tprof_backend = NULL;
    799 	}
    800 #endif
    801 	LIST_REMOVE(tb, tb_list);
    802 	mutex_exit(&tprof_startstop_lock);
    803 
    804 	/* fini backend softc */
    805 	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
    806 	    tb->tb_softc.sc_ctr_offset_percpu_size);
    807 
    808 	/* free backend */
    809 	kmem_free(tb, sizeof(*tb));
    810 
    811 	return 0;
    812 }
    813 
    814 /* -------------------- cdevsw interfaces */
    815 
    816 static int
    817 tprof_open(dev_t dev, int flags, int type, struct lwp *l)
    818 {
    819 
    820 	if (minor(dev) != 0) {
    821 		return EXDEV;
    822 	}
    823 	mutex_enter(&tprof_lock);
    824 	if (tprof_owner != NULL) {
    825 		mutex_exit(&tprof_lock);
    826 		return  EBUSY;
    827 	}
    828 	tprof_owner = curlwp;
    829 	mutex_exit(&tprof_lock);
    830 
    831 	return 0;
    832 }
    833 
    834 static int
    835 tprof_close(dev_t dev, int flags, int type, struct lwp *l)
    836 {
    837 
    838 	KASSERT(minor(dev) == 0);
    839 
    840 	mutex_enter(&tprof_startstop_lock);
    841 	mutex_enter(&tprof_lock);
    842 	tprof_owner = NULL;
    843 	mutex_exit(&tprof_lock);
    844 	tprof_stop(TPROF_COUNTERMASK_ALL);
    845 	tprof_clear();
    846 
    847 	tprof_backend_t *tb = tprof_backend;
    848 	if (tb != NULL) {
    849 		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
    850 		tb->tb_softc.sc_ctr_configured_mask = 0;
    851 		tb->tb_softc.sc_ctr_prof_mask = 0;
    852 		tb->tb_softc.sc_ctr_ovf_mask = 0;
    853 	}
    854 
    855 	mutex_exit(&tprof_startstop_lock);
    856 
    857 	return 0;
    858 }
    859 
    860 static int
    861 tprof_read(dev_t dev, struct uio *uio, int flags)
    862 {
    863 	tprof_buf_t *buf;
    864 	size_t bytes;
    865 	size_t resid;
    866 	size_t done;
    867 	int error = 0;
    868 
    869 	KASSERT(minor(dev) == 0);
    870 	mutex_enter(&tprof_reader_lock);
    871 	while (uio->uio_resid > 0 && error == 0) {
    872 		/*
    873 		 * take the first buffer from the list.
    874 		 */
    875 		mutex_enter(&tprof_lock);
    876 		buf = STAILQ_FIRST(&tprof_list);
    877 		if (buf == NULL) {
    878 			if (tprof_nworker == 0) {
    879 				mutex_exit(&tprof_lock);
    880 				error = 0;
    881 				break;
    882 			}
    883 			mutex_exit(&tprof_reader_lock);
    884 			error = cv_wait_sig(&tprof_reader_cv, &tprof_lock);
    885 			mutex_exit(&tprof_lock);
    886 			mutex_enter(&tprof_reader_lock);
    887 			continue;
    888 		}
    889 		STAILQ_REMOVE_HEAD(&tprof_list, b_list);
    890 		KASSERT(tprof_nbuf_on_list > 0);
    891 		tprof_nbuf_on_list--;
    892 		mutex_exit(&tprof_lock);
    893 
    894 		/*
    895 		 * copy it out.
    896 		 */
    897 		bytes = MIN(buf->b_used * sizeof(tprof_sample_t) -
    898 		    tprof_reader_offset, uio->uio_resid);
    899 		resid = uio->uio_resid;
    900 		error = uiomove((char *)buf->b_data + tprof_reader_offset,
    901 		    bytes, uio);
    902 		done = resid - uio->uio_resid;
    903 		tprof_reader_offset += done;
    904 
    905 		/*
    906 		 * if we didn't consume the whole buffer,
    907 		 * put it back to the list.
    908 		 */
    909 		if (tprof_reader_offset <
    910 		    buf->b_used * sizeof(tprof_sample_t)) {
    911 			mutex_enter(&tprof_lock);
    912 			STAILQ_INSERT_HEAD(&tprof_list, buf, b_list);
    913 			tprof_nbuf_on_list++;
    914 			cv_broadcast(&tprof_reader_cv);
    915 			mutex_exit(&tprof_lock);
    916 		} else {
    917 			tprof_buf_free(buf);
    918 			tprof_reader_offset = 0;
    919 		}
    920 	}
    921 	mutex_exit(&tprof_reader_lock);
    922 
    923 	return error;
    924 }
    925 
    926 static int
    927 tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
    928 {
    929 	const tprof_param_t *param;
    930 	tprof_counts_t *counts;
    931 	int error = 0;
    932 
    933 	KASSERT(minor(dev) == 0);
    934 
    935 	switch (cmd) {
    936 	case TPROF_IOC_GETINFO:
    937 		mutex_enter(&tprof_startstop_lock);
    938 		tprof_getinfo(data);
    939 		mutex_exit(&tprof_startstop_lock);
    940 		break;
    941 	case TPROF_IOC_GETNCOUNTERS:
    942 		mutex_enter(&tprof_lock);
    943 		error = tprof_getncounters((u_int *)data);
    944 		mutex_exit(&tprof_lock);
    945 		break;
    946 	case TPROF_IOC_START:
    947 		mutex_enter(&tprof_startstop_lock);
    948 		error = tprof_start(*(tprof_countermask_t *)data);
    949 		mutex_exit(&tprof_startstop_lock);
    950 		break;
    951 	case TPROF_IOC_STOP:
    952 		mutex_enter(&tprof_startstop_lock);
    953 		tprof_stop(*(tprof_countermask_t *)data);
    954 		mutex_exit(&tprof_startstop_lock);
    955 		break;
    956 	case TPROF_IOC_GETSTAT:
    957 		mutex_enter(&tprof_lock);
    958 		memcpy(data, &tprof_stat, sizeof(tprof_stat));
    959 		mutex_exit(&tprof_lock);
    960 		break;
    961 	case TPROF_IOC_CONFIGURE_EVENT:
    962 		param = data;
    963 		mutex_enter(&tprof_startstop_lock);
    964 		error = tprof_configure_event(param);
    965 		mutex_exit(&tprof_startstop_lock);
    966 		break;
    967 	case TPROF_IOC_GETCOUNTS:
    968 		counts = data;
    969 		mutex_enter(&tprof_startstop_lock);
    970 		error = tprof_getcounts(counts);
    971 		mutex_exit(&tprof_startstop_lock);
    972 		break;
    973 	default:
    974 		error = EINVAL;
    975 		break;
    976 	}
    977 
    978 	return error;
    979 }
    980 
/*
 * Character device switch for the tprof device.  Only open, close, read
 * and ioctl are implemented here; every other entry is one of the
 * no* stubs.
 */
const struct cdevsw tprof_cdevsw = {
	.d_open = tprof_open,
	.d_close = tprof_close,
	.d_read = tprof_read,
	.d_write = nowrite,
	.d_ioctl = tprof_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};
    995 
/*
 * tprofattach: attach entry point.  There is nothing to do here; the
 * driver state is set up by tprof_driver_init from the module command
 * handler.
 */
void
tprofattach(int nunits)
{
	/* intentionally empty */
}
   1002 
/*
 * Declare the "tprof" driver-class module.
 * NOTE(review): presumably MODULE() binds the module to tprof_modcmd by
 * name and the NULL argument means no required modules -- confirm
 * against module(9).
 */
MODULE(MODULE_CLASS_DRIVER, tprof, NULL);
   1004 
   1005 static void
   1006 tprof_cpu_init(void *vcp, void *vcookie, struct cpu_info *ci)
   1007 {
   1008 	tprof_cpu_t **cp = vcp, *c;
   1009 
   1010 	c = kmem_zalloc(sizeof(*c), KM_SLEEP);
   1011 	c->c_buf = NULL;
   1012 	c->c_cpuid = cpu_index(ci);
   1013 	*cp = c;
   1014 }
   1015 
   1016 static void
   1017 tprof_cpu_fini(void *vcp, void *vcookie, struct cpu_info *ci)
   1018 {
   1019 	tprof_cpu_t **cp = vcp, *c;
   1020 
   1021 	c = *cp;
   1022 	KASSERT(c->c_cpuid == cpu_index(ci));
   1023 	KASSERT(c->c_buf == NULL);
   1024 	kmem_free(c, sizeof(*c));
   1025 	*cp = NULL;
   1026 }
   1027 
   1028 static void
   1029 tprof_driver_init(void)
   1030 {
   1031 
   1032 	mutex_init(&tprof_lock, MUTEX_DEFAULT, IPL_NONE);
   1033 	mutex_init(&tprof_reader_lock, MUTEX_DEFAULT, IPL_NONE);
   1034 	mutex_init(&tprof_startstop_lock, MUTEX_DEFAULT, IPL_NONE);
   1035 	cv_init(&tprof_cv, "tprof");
   1036 	cv_init(&tprof_reader_cv, "tprof_rd");
   1037 	STAILQ_INIT(&tprof_list);
   1038 	tprof_cpus = percpu_create(sizeof(tprof_cpu_t *),
   1039 	    tprof_cpu_init, tprof_cpu_fini, NULL);
   1040 }
   1041 
   1042 static void
   1043 tprof_driver_fini(void)
   1044 {
   1045 
   1046 	percpu_free(tprof_cpus, sizeof(tprof_cpu_t *));
   1047 	mutex_destroy(&tprof_lock);
   1048 	mutex_destroy(&tprof_reader_lock);
   1049 	mutex_destroy(&tprof_startstop_lock);
   1050 	cv_destroy(&tprof_cv);
   1051 	cv_destroy(&tprof_reader_cv);
   1052 }
   1053 
   1054 static int
   1055 tprof_modcmd(modcmd_t cmd, void *arg)
   1056 {
   1057 
   1058 	switch (cmd) {
   1059 	case MODULE_CMD_INIT:
   1060 		tprof_driver_init();
   1061 #if defined(_MODULE)
   1062 		{
   1063 			devmajor_t bmajor = NODEVMAJOR;
   1064 			devmajor_t cmajor = NODEVMAJOR;
   1065 			int error;
   1066 
   1067 			error = devsw_attach("tprof", NULL, &bmajor,
   1068 			    &tprof_cdevsw, &cmajor);
   1069 			if (error) {
   1070 				tprof_driver_fini();
   1071 				return error;
   1072 			}
   1073 		}
   1074 #endif /* defined(_MODULE) */
   1075 		return 0;
   1076 
   1077 	case MODULE_CMD_FINI:
   1078 #if defined(_MODULE)
   1079 		devsw_detach(NULL, &tprof_cdevsw);
   1080 #endif /* defined(_MODULE) */
   1081 		tprof_driver_fini();
   1082 		return 0;
   1083 
   1084 	default:
   1085 		return ENOTTY;
   1086 	}
   1087 }
   1088