/*	$NetBSD: sys_eventfd.c,v 1.1.2.1 2020/12/14 16:00:51 thorpej Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.1.2.1 2020/12/14 16:00:51 thorpej Exp $");

/*
 * eventfd
 *
 * Eventfd objects present a simple counting object associated with a
 * file descriptor.  A write adds the written value to the count; a
 * read returns the current count and resets it to zero (or, if the
 * eventfd was created with EFD_SEMAPHORE, returns 1 and decrements
 * the count by 1).  When the count is non-zero, the descriptor is
 * considered "readable", and when it is less than the maximum value
 * (EVENTFD_MAXVAL), it is considered "writable".
 *
 * This implementation is API compatible with the Linux eventfd(2)
 * interface.
 */
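
/*
 * A minimal userland usage sketch (assuming the eventfd(2),
 * eventfd_read(), and eventfd_write() interfaces declared in
 * <sys/eventfd.h>):
 *
 *	int fd = eventfd(0, 0);		// counter starts at 0
 *	eventfd_t v;
 *
 *	eventfd_write(fd, 5);		// counter: 0 -> 5; fd now readable
 *	eventfd_read(fd, &v);		// v == 5; counter reset to 0
 *
 * With EFD_SEMAPHORE, each read instead consumes a single count:
 *
 *	int sfd = eventfd(3, EFD_SEMAPHORE);
 *	eventfd_read(sfd, &v);		// v == 1; counter: 3 -> 2
 */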

#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/eventfd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/uio.h>

struct eventfd {
	kmutex_t	efd_lock;
	kcondvar_t	efd_read_wait;
	kcondvar_t	efd_write_wait;
	kcondvar_t	efd_restart_wait;
	struct selinfo	efd_read_sel;
	struct selinfo	efd_write_sel;
	eventfd_t	efd_val;	/* the counter itself */
	int64_t		efd_nwaiters;	/* LWPs blocked in eventfd_wait() */
	bool		efd_restarting;	/* fo_restart drain in progress */
	bool		efd_has_read_waiters;
	bool		efd_has_write_waiters;
	bool		efd_is_semaphore; /* EFD_SEMAPHORE mode */

	/*
	 * Information kept for stat(2).
	 */
	struct timespec efd_btime;	/* time created */
	struct timespec	efd_mtime;	/* last write */
	struct timespec	efd_atime;	/* last read */
};

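/*
 * The largest value the counter can hold; this matches the maximum
 * documented for Linux eventfd(2).  A write that would push the
 * counter past this limit blocks (or fails with EAGAIN): e.g. with
 * the counter at EVENTFD_MAXVAL - 3, a write of 5 sleeps until a
 * reader drains the counter.
 */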
#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)

/*
 * eventfd_create:
 *
 *	Create an eventfd object.
 */
static struct eventfd *
eventfd_create(unsigned int const val, int const flags)
{
	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);

	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&efd->efd_read_wait, "efdread");
	cv_init(&efd->efd_write_wait, "efdwrite");
	cv_init(&efd->efd_restart_wait, "efdrstrt");
	selinit(&efd->efd_read_sel);
	selinit(&efd->efd_write_sel);
	efd->efd_val = val;
	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
	getnanotime(&efd->efd_btime);

	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */

	return efd;
}

/*
 * eventfd_destroy:
 *
 *	Destroy an eventfd object.
 */
static void
eventfd_destroy(struct eventfd * const efd)
{

	KASSERT(efd->efd_nwaiters == 0);
	KASSERT(efd->efd_restarting == false);
	KASSERT(efd->efd_has_read_waiters == false);
	KASSERT(efd->efd_has_write_waiters == false);

	cv_destroy(&efd->efd_read_wait);
	cv_destroy(&efd->efd_write_wait);
	cv_destroy(&efd->efd_restart_wait);

	seldestroy(&efd->efd_read_sel);
	seldestroy(&efd->efd_write_sel);

	mutex_destroy(&efd->efd_lock);

	/* Release the storage allocated by eventfd_create(). */
	kmem_free(efd, sizeof(*efd));
}

/*
 * eventfd_wait:
 *
 *	Block on an eventfd.  Handles the non-blocking case, as well
 *	as restarts.
 */
static int
eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
{
	kcondvar_t *waitcv;
	int error;

	if (fflag & FNONBLOCK) {
		return EAGAIN;
	}

	/*
	 * We're going to block.  If there is a restart in-progress,
	 * wait for that to complete first.
	 */
	while (efd->efd_restarting) {
		cv_wait(&efd->efd_restart_wait, &efd->efd_lock);
	}

	if (is_write) {
		efd->efd_has_write_waiters = true;
		waitcv = &efd->efd_write_wait;
	} else {
		efd->efd_has_read_waiters = true;
		waitcv = &efd->efd_read_wait;
	}

	efd->efd_nwaiters++;
	KASSERT(efd->efd_nwaiters > 0);
	error = cv_wait_sig(waitcv, &efd->efd_lock);
	efd->efd_nwaiters--;
	KASSERT(efd->efd_nwaiters >= 0);

	/*
	 * If a restart was triggered while we were asleep, we need
	 * to return ERESTART if no other error was returned.  If we
	 * are the last waiter coming out of the restart drain, clear
	 * the condition.
	 */
	if (efd->efd_restarting) {
		if (error == 0) {
			error = ERESTART;
		}
		if (efd->efd_nwaiters == 0) {
			efd->efd_restarting = false;
			cv_broadcast(&efd->efd_restart_wait);
		}
	}

	return error;
}

/*
 * eventfd_wake:
 *
 *	Wake LWPs blocked on an eventfd.
 */
static void
eventfd_wake(struct eventfd * const efd, bool const is_write)
{
	kcondvar_t *waitcv = NULL;
	struct selinfo *sel;
	int pollev;

	/* A write makes the eventfd readable, so wake readers (and
	 * vice versa). */
	if (is_write) {
		if (efd->efd_has_read_waiters) {
			waitcv = &efd->efd_read_wait;
			efd->efd_has_read_waiters = false;
		}
		sel = &efd->efd_read_sel;
		pollev = POLLIN | POLLRDNORM;
	} else {
		if (efd->efd_has_write_waiters) {
			waitcv = &efd->efd_write_wait;
			efd->efd_has_write_waiters = false;
		}
		sel = &efd->efd_write_sel;
		pollev = POLLOUT | POLLWRNORM;
	}
	if (waitcv != NULL) {
		cv_broadcast(waitcv);
	}
	selnotify(sel, pollev, NOTE_SUBMIT);
}

/*
 * eventfd file operations
 */

static int
eventfd_fop_read(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t return_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	mutex_enter(&efd->efd_lock);

	while (efd->efd_val == 0) {
		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
			mutex_exit(&efd->efd_lock);
			return error;
		}
	}

	if (efd->efd_is_semaphore) {
		return_value = 1;
		efd->efd_val--;
	} else {
		return_value = efd->efd_val;
		efd->efd_val = 0;
	}

	getnanotime(&efd->efd_atime);
	eventfd_wake(efd, false);

	/* XXX Should we unlock before the uiomove()? */

	error = uiomove(&return_value, sizeof(return_value), uio);

	/* XXX Should we restore eventfd state if uiomove() fails? */

	mutex_exit(&efd->efd_lock);

	return error;
}

static int
eventfd_fop_write(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t write_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
		return error;
	}

	if (write_value > EVENTFD_MAXVAL) {
		error = EINVAL;
		goto out;
	}

	mutex_enter(&efd->efd_lock);

	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
			mutex_exit(&efd->efd_lock);
			goto out;
		}
	}

	efd->efd_val += write_value;
	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);

	getnanotime(&efd->efd_mtime);
	eventfd_wake(efd, true);

	mutex_exit(&efd->efd_lock);

 out:
	if (error) {
		/*
		 * Undo the effect of uiomove() so that the error
		 * gets reported correctly; see dofilewrite().
		 */
		uio->uio_resid += sizeof(write_value);
	}
	return error;
}

static int
eventfd_fop_poll(file_t * const fp, int const events)
{
	struct eventfd * const efd = fp->f_eventfd;
	int revents = 0;

	/*
	 * Note that Linux will return POLLERR if the eventfd count
	 * overflows, but that is not possible in the normal read/write
	 * API, only with Linux kernel-internal interfaces.  So, this
	 * implementation never returns POLLERR.
	 *
	 * Also note that the Linux eventfd(2) man page does not
	 * specifically discuss returning POLLRDNORM, but we check
	 * for that event in addition to POLLIN.
	 */

	mutex_enter(&efd->efd_lock);

	if (events & (POLLIN | POLLRDNORM)) {
		if (efd->efd_val != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &efd->efd_read_sel);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (efd->efd_val < EVENTFD_MAXVAL) {
			revents |= events & (POLLOUT | POLLWRNORM);
		} else {
			selrecord(curlwp, &efd->efd_write_sel);
		}
	}

	mutex_exit(&efd->efd_lock);

	return revents;
}
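
/*
 * For example, a userland sketch of the readiness rules above:
 * polling a freshly created, zero-count eventfd reports only
 * writability.
 *
 *	struct pollfd pfd = {
 *		.fd = eventfd(0, 0),
 *		.events = POLLIN | POLLOUT,
 *	};
 *	poll(&pfd, 1, INFTIM);	// revents == POLLOUT
 *
 * After any successful write, POLLIN is reported as well.
 */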

static int
eventfd_fop_stat(file_t * const fp, struct stat * const st)
{
	struct eventfd * const efd = fp->f_eventfd;

	memset(st, 0, sizeof(*st));

	mutex_enter(&efd->efd_lock);
	st->st_size = (off_t)efd->efd_val;
	st->st_blksize = sizeof(eventfd_t);
	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	st->st_blocks = 1;
	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
	st->st_atimespec = efd->efd_atime;
	st->st_mtimespec = efd->efd_mtime;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static int
eventfd_fop_close(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	fp->f_eventfd = NULL;
	eventfd_destroy(efd);

	return 0;
}

static void
eventfd_filt_read_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_read_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_read(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return (eventfd_t)kn->kn_data > 0;
}

static const struct filterops eventfd_read_filterops = {
	.f_isfd = 1,
	.f_detach = eventfd_filt_read_detach,
	.f_event = eventfd_filt_read,
};

static void
eventfd_filt_write_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_write_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_write(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
}

static const struct filterops eventfd_write_filterops = {
	.f_isfd = 1,
	.f_detach = eventfd_filt_write_detach,
	.f_event = eventfd_filt_write,
};

static int
eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	struct selinfo *sel;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		sel = &efd->efd_read_sel;
		kn->kn_fop = &eventfd_read_filterops;
		break;

	case EVFILT_WRITE:
		sel = &efd->efd_write_sel;
		kn->kn_fop = &eventfd_write_filterops;
		break;

	default:
		return EINVAL;
	}

	kn->kn_hook = efd;

	mutex_enter(&efd->efd_lock);
	selrecord_knote(sel, kn);
	mutex_exit(&efd->efd_lock);

	return 0;
}
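
/*
 * A userland kqueue sketch for the filters above: wait for the
 * eventfd counter to become non-zero (the returned ev.data is the
 * counter value, per eventfd_filt_read() above).
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// register
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// block; ev.data == count
 */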

static void
eventfd_fop_restart(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	/*
	 * Unblock blocked reads/writes in order to allow close() to complete.
	 * System calls return ERESTART so that the fd is revalidated.
	 */

	mutex_enter(&efd->efd_lock);

	if (efd->efd_nwaiters != 0) {
		efd->efd_restarting = true;
		if (efd->efd_has_read_waiters) {
			cv_broadcast(&efd->efd_read_wait);
			efd->efd_has_read_waiters = false;
		}
		if (efd->efd_has_write_waiters) {
			cv_broadcast(&efd->efd_write_wait);
			efd->efd_has_write_waiters = false;
		}
	}

	mutex_exit(&efd->efd_lock);
}

static const struct fileops eventfd_fileops = {
	.fo_name = "eventfd",
	.fo_read = eventfd_fop_read,
	.fo_write = eventfd_fop_write,
	.fo_ioctl = fbadop_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = eventfd_fop_poll,
	.fo_stat = eventfd_fop_stat,
	.fo_close = eventfd_fop_close,
	.fo_kqfilter = eventfd_fop_kqfilter,
	.fo_restart = eventfd_fop_restart,
};

/*
 * eventfd(2) system call
 */
int
do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
    register_t *retval)
{
	file_t *fp;
	int fd, error;

	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
		return EINVAL;
	}

	if ((error = fd_allocfile(&fp, &fd)) != 0) {
		return error;
	}

	fp->f_flag = FREAD | FWRITE;
	if (flags & EFD_NONBLOCK) {
		fp->f_flag |= FNONBLOCK;
	}
	fp->f_type = DTYPE_EVENTFD;
	fp->f_ops = &eventfd_fileops;
	fp->f_eventfd = eventfd_create(val, flags);
	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
	fd_affix(curproc, fp, fd);

	*retval = fd;
	return 0;
}

int
sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(unsigned int) val;
		syscallarg(int) flags;
	} */

	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
}