/* sys_eventfd.c — NetBSD sys/kern, revision 1.8 */
      1 /*	$NetBSD: sys_eventfd.c,v 1.8 2021/11/24 16:35:33 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.8 2021/11/24 16:35:33 thorpej Exp $");
     34 
     35 /*
     36  * eventfd
     37  *
     38  * Eventfd objects present a simple counting object associated with a
     39  * file descriptor.  Writes and reads to this file descriptor increment
     40  * and decrement the count, respectively.  When the count is non-zero,
     41  * the descriptor is considered "readable", and when less than the max
     42  * value (EVENTFD_MAXVAL), is considered "writable".
     43  *
     44  * This implementation is API compatible with the Linux eventfd(2)
     45  * interface.
     46  */
     47 
     48 #include <sys/param.h>
     49 #include <sys/types.h>
     50 #include <sys/condvar.h>
     51 #include <sys/eventfd.h>
     52 #include <sys/file.h>
     53 #include <sys/filedesc.h>
     54 #include <sys/kauth.h>
     55 #include <sys/mutex.h>
     56 #include <sys/poll.h>
     57 #include <sys/proc.h>
     58 #include <sys/select.h>
     59 #include <sys/stat.h>
     60 #include <sys/syscallargs.h>
     61 #include <sys/uio.h>
     62 
     63 struct eventfd {
     64 	kmutex_t	efd_lock;
     65 	kcondvar_t	efd_read_wait;
     66 	kcondvar_t	efd_write_wait;
     67 	struct selinfo	efd_read_sel;
     68 	struct selinfo	efd_write_sel;
     69 	eventfd_t	efd_val;
     70 	int64_t		efd_nwaiters;
     71 	bool		efd_restarting;
     72 	bool		efd_has_read_waiters;
     73 	bool		efd_has_write_waiters;
     74 	bool		efd_is_semaphore;
     75 
     76 	/*
     77 	 * Information kept for stat(2).
     78 	 */
     79 	struct timespec efd_btime;	/* time created */
     80 	struct timespec	efd_mtime;	/* last write */
     81 	struct timespec	efd_atime;	/* last read */
     82 };
     83 
     84 #define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
     85 
     86 /*
     87  * eventfd_create:
     88  *
     89  *	Create an eventfd object.
     90  */
     91 static struct eventfd *
     92 eventfd_create(unsigned int const val, int const flags)
     93 {
     94 	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
     95 
     96 	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
     97 	cv_init(&efd->efd_read_wait, "efdread");
     98 	cv_init(&efd->efd_write_wait, "efdwrite");
     99 	selinit(&efd->efd_read_sel);
    100 	selinit(&efd->efd_write_sel);
    101 	efd->efd_val = val;
    102 	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
    103 	getnanotime(&efd->efd_btime);
    104 
    105 	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
    106 
    107 	return efd;
    108 }
    109 
    110 /*
    111  * eventfd_destroy:
    112  *
    113  *	Destroy an eventfd object.
    114  */
    115 static void
    116 eventfd_destroy(struct eventfd * const efd)
    117 {
    118 
    119 	KASSERT(efd->efd_nwaiters == 0);
    120 	KASSERT(efd->efd_has_read_waiters == false);
    121 	KASSERT(efd->efd_has_write_waiters == false);
    122 
    123 	cv_destroy(&efd->efd_read_wait);
    124 	cv_destroy(&efd->efd_write_wait);
    125 
    126 	seldestroy(&efd->efd_read_sel);
    127 	seldestroy(&efd->efd_write_sel);
    128 
    129 	mutex_destroy(&efd->efd_lock);
    130 
    131 	kmem_free(efd, sizeof(*efd));
    132 }
    133 
    134 /*
    135  * eventfd_wait:
    136  *
    137  *	Block on an eventfd.  Handles non-blocking, as well as
    138  *	the restart cases.
    139  */
    140 static int
    141 eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
    142 {
    143 	kcondvar_t *waitcv;
    144 	int error;
    145 
    146 	if (fflag & FNONBLOCK) {
    147 		return EAGAIN;
    148 	}
    149 
    150 	/*
    151 	 * We're going to block.  Check if we need to return ERESTART.
    152 	 */
    153 	if (efd->efd_restarting) {
    154 		return ERESTART;
    155 	}
    156 
    157 	if (is_write) {
    158 		efd->efd_has_write_waiters = true;
    159 		waitcv = &efd->efd_write_wait;
    160 	} else {
    161 		efd->efd_has_read_waiters = true;
    162 		waitcv = &efd->efd_read_wait;
    163 	}
    164 
    165 	efd->efd_nwaiters++;
    166 	KASSERT(efd->efd_nwaiters > 0);
    167 	error = cv_wait_sig(waitcv, &efd->efd_lock);
    168 	efd->efd_nwaiters--;
    169 	KASSERT(efd->efd_nwaiters >= 0);
    170 
    171 	/*
    172 	 * If a restart was triggered while we were asleep, we need
    173 	 * to return ERESTART if no other error was returned.
    174 	 */
    175 	if (efd->efd_restarting) {
    176 		if (error == 0) {
    177 			error = ERESTART;
    178 		}
    179 	}
    180 
    181 	return error;
    182 }
    183 
    184 /*
    185  * eventfd_wake:
    186  *
 *	Wake LWPs blocked on an eventfd.
    188  */
    189 static void
    190 eventfd_wake(struct eventfd * const efd, bool const is_write)
    191 {
    192 	kcondvar_t *waitcv = NULL;
    193 	struct selinfo *sel;
    194 	int pollev;
    195 
    196 	if (is_write) {
    197 		if (efd->efd_has_read_waiters) {
    198 			waitcv = &efd->efd_read_wait;
    199 			efd->efd_has_read_waiters = false;
    200 		}
    201 		sel = &efd->efd_read_sel;
    202 		pollev = POLLIN | POLLRDNORM;
    203 	} else {
    204 		if (efd->efd_has_write_waiters) {
    205 			waitcv = &efd->efd_write_wait;
    206 			efd->efd_has_write_waiters = false;
    207 		}
    208 		sel = &efd->efd_write_sel;
    209 		pollev = POLLOUT | POLLWRNORM;
    210 	}
    211 	if (waitcv != NULL) {
    212 		cv_broadcast(waitcv);
    213 	}
    214 	selnotify(sel, pollev, NOTE_SUBMIT);
    215 }
    216 
    217 /*
    218  * eventfd file operations
    219  */
    220 
    221 static int
    222 eventfd_fop_read(file_t * const fp, off_t * const offset,
    223     struct uio * const uio, kauth_cred_t const cred, int const flags)
    224 {
    225 	struct eventfd * const efd = fp->f_eventfd;
    226 	int const fflag = fp->f_flag;
    227 	eventfd_t return_value;
    228 	int error;
    229 
    230 	if (uio->uio_resid < sizeof(eventfd_t)) {
    231 		return EINVAL;
    232 	}
    233 
    234 	mutex_enter(&efd->efd_lock);
    235 
    236 	while (efd->efd_val == 0) {
    237 		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
    238 			mutex_exit(&efd->efd_lock);
    239 			return error;
    240 		}
    241 	}
    242 
    243 	if (efd->efd_is_semaphore) {
    244 		return_value = 1;
    245 		efd->efd_val--;
    246 	} else {
    247 		return_value = efd->efd_val;
    248 		efd->efd_val = 0;
    249 	}
    250 
    251 	getnanotime(&efd->efd_atime);
    252 	eventfd_wake(efd, false);
    253 
    254 	mutex_exit(&efd->efd_lock);
    255 
    256 	error = uiomove(&return_value, sizeof(return_value), uio);
    257 
    258 	return error;
    259 }
    260 
    261 static int
    262 eventfd_fop_write(file_t * const fp, off_t * const offset,
    263     struct uio * const uio, kauth_cred_t const cred, int const flags)
    264 {
    265 	struct eventfd * const efd = fp->f_eventfd;
    266 	int const fflag = fp->f_flag;
    267 	eventfd_t write_value;
    268 	int error;
    269 
    270 	if (uio->uio_resid < sizeof(eventfd_t)) {
    271 		return EINVAL;
    272 	}
    273 
    274 	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
    275 		return error;
    276 	}
    277 
    278 	if (write_value > EVENTFD_MAXVAL) {
    279 		error = EINVAL;
    280 		goto out;
    281 	}
    282 
    283 	mutex_enter(&efd->efd_lock);
    284 
    285 	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
    286 	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
    287 		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
    288 			mutex_exit(&efd->efd_lock);
    289 			goto out;
    290 		}
    291 	}
    292 
    293 	efd->efd_val += write_value;
    294 	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
    295 
    296 	getnanotime(&efd->efd_mtime);
    297 	eventfd_wake(efd, true);
    298 
    299 	mutex_exit(&efd->efd_lock);
    300 
    301  out:
    302 	if (error) {
    303 		/*
    304 		 * Undo the effect of uiomove() so that the error
    305 		 * gets reported correctly; see dofilewrite().
    306 		 */
    307 		uio->uio_resid += sizeof(write_value);
    308 	}
    309 	return error;
    310 }
    311 
    312 static int
    313 eventfd_fop_poll(file_t * const fp, int const events)
    314 {
    315 	struct eventfd * const efd = fp->f_eventfd;
    316 	int revents = 0;
    317 
    318 	/*
    319 	 * Note that Linux will return POLLERR if the eventfd count
    320 	 * overflows, but that is not possible in the normal read/write
    321 	 * API, only with Linux kernel-internal interfaces.  So, this
    322 	 * implementation never returns POLLERR.
    323 	 *
    324 	 * Also note that the Linux eventfd(2) man page does not
    325 	 * specifically discuss returning POLLRDNORM, but we check
    326 	 * for that event in addition to POLLIN.
    327 	 */
    328 
    329 	mutex_enter(&efd->efd_lock);
    330 
    331 	if (events & (POLLIN | POLLRDNORM)) {
    332 		if (efd->efd_val != 0) {
    333 			revents |= events & (POLLIN | POLLRDNORM);
    334 		} else {
    335 			selrecord(curlwp, &efd->efd_read_sel);
    336 		}
    337 	}
    338 
    339 	if (events & (POLLOUT | POLLWRNORM)) {
    340 		if (efd->efd_val < EVENTFD_MAXVAL) {
    341 			revents |= events & (POLLOUT | POLLWRNORM);
    342 		} else {
    343 			selrecord(curlwp, &efd->efd_write_sel);
    344 		}
    345 	}
    346 
    347 	mutex_exit(&efd->efd_lock);
    348 
    349 	return revents;
    350 }
    351 
    352 static int
    353 eventfd_fop_stat(file_t * const fp, struct stat * const st)
    354 {
    355 	struct eventfd * const efd = fp->f_eventfd;
    356 
    357 	memset(st, 0, sizeof(*st));
    358 
    359 	mutex_enter(&efd->efd_lock);
    360 	st->st_size = (off_t)efd->efd_val;
    361 	st->st_blksize = sizeof(eventfd_t);
    362 	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
    363 	st->st_blocks = 1;
    364 	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
    365 	st->st_atimespec = efd->efd_atime;
    366 	st->st_mtimespec = efd->efd_mtime;
    367 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
    368 	st->st_gid = kauth_cred_getegid(fp->f_cred);
    369 	mutex_exit(&efd->efd_lock);
    370 
    371 	return 0;
    372 }
    373 
    374 static int
    375 eventfd_fop_close(file_t * const fp)
    376 {
    377 	struct eventfd * const efd = fp->f_eventfd;
    378 
    379 	fp->f_eventfd = NULL;
    380 	eventfd_destroy(efd);
    381 
    382 	return 0;
    383 }
    384 
    385 static void
    386 eventfd_filt_read_detach(struct knote * const kn)
    387 {
    388 	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
    389 
    390 	mutex_enter(&efd->efd_lock);
    391 	KASSERT(kn->kn_hook == efd);
    392 	selremove_knote(&efd->efd_read_sel, kn);
    393 	mutex_exit(&efd->efd_lock);
    394 }
    395 
    396 static int
    397 eventfd_filt_read(struct knote * const kn, long const hint)
    398 {
    399 	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
    400 	int rv;
    401 
    402 	if (hint & NOTE_SUBMIT) {
    403 		KASSERT(mutex_owned(&efd->efd_lock));
    404 	} else {
    405 		mutex_enter(&efd->efd_lock);
    406 	}
    407 
    408 	kn->kn_data = (int64_t)efd->efd_val;
    409 	rv = (eventfd_t)kn->kn_data > 0;
    410 
    411 	if ((hint & NOTE_SUBMIT) == 0) {
    412 		mutex_exit(&efd->efd_lock);
    413 	}
    414 
    415 	return rv;
    416 }
    417 
    418 static const struct filterops eventfd_read_filterops = {
    419 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
    420 	.f_detach = eventfd_filt_read_detach,
    421 	.f_event = eventfd_filt_read,
    422 };
    423 
    424 static void
    425 eventfd_filt_write_detach(struct knote * const kn)
    426 {
    427 	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
    428 
    429 	mutex_enter(&efd->efd_lock);
    430 	KASSERT(kn->kn_hook == efd);
    431 	selremove_knote(&efd->efd_write_sel, kn);
    432 	mutex_exit(&efd->efd_lock);
    433 }
    434 
    435 static int
    436 eventfd_filt_write(struct knote * const kn, long const hint)
    437 {
    438 	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
    439 	int rv;
    440 
    441 	if (hint & NOTE_SUBMIT) {
    442 		KASSERT(mutex_owned(&efd->efd_lock));
    443 	} else {
    444 		mutex_enter(&efd->efd_lock);
    445 	}
    446 
    447 	kn->kn_data = (int64_t)efd->efd_val;
    448 	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
    449 
    450 	if ((hint & NOTE_SUBMIT) == 0) {
    451 		mutex_exit(&efd->efd_lock);
    452 	}
    453 
    454 	return rv;
    455 }
    456 
    457 static const struct filterops eventfd_write_filterops = {
    458 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
    459 	.f_detach = eventfd_filt_write_detach,
    460 	.f_event = eventfd_filt_write,
    461 };
    462 
    463 static int
    464 eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
    465 {
    466 	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
    467 	struct selinfo *sel;
    468 
    469 	switch (kn->kn_filter) {
    470 	case EVFILT_READ:
    471 		sel = &efd->efd_read_sel;
    472 		kn->kn_fop = &eventfd_read_filterops;
    473 		break;
    474 
    475 	case EVFILT_WRITE:
    476 		sel = &efd->efd_write_sel;
    477 		kn->kn_fop = &eventfd_write_filterops;
    478 		break;
    479 
    480 	default:
    481 		return EINVAL;
    482 	}
    483 
    484 	kn->kn_hook = efd;
    485 
    486 	mutex_enter(&efd->efd_lock);
    487 	selrecord_knote(sel, kn);
    488 	mutex_exit(&efd->efd_lock);
    489 
    490 	return 0;
    491 }
    492 
    493 static void
    494 eventfd_fop_restart(file_t * const fp)
    495 {
    496 	struct eventfd * const efd = fp->f_eventfd;
    497 
    498 	/*
    499 	 * Unblock blocked reads/writes in order to allow close() to complete.
    500 	 * System calls return ERESTART so that the fd is revalidated.
    501 	 */
    502 
    503 	mutex_enter(&efd->efd_lock);
    504 
    505 	if (efd->efd_nwaiters != 0) {
    506 		efd->efd_restarting = true;
    507 		if (efd->efd_has_read_waiters) {
    508 			cv_broadcast(&efd->efd_read_wait);
    509 			efd->efd_has_read_waiters = false;
    510 		}
    511 		if (efd->efd_has_write_waiters) {
    512 			cv_broadcast(&efd->efd_write_wait);
    513 			efd->efd_has_write_waiters = false;
    514 		}
    515 	}
    516 
    517 	mutex_exit(&efd->efd_lock);
    518 }
    519 
    520 static const struct fileops eventfd_fileops = {
    521 	.fo_name = "eventfd",
    522 	.fo_read = eventfd_fop_read,
    523 	.fo_write = eventfd_fop_write,
    524 	.fo_ioctl = fbadop_ioctl,
    525 	.fo_fcntl = fnullop_fcntl,
    526 	.fo_poll = eventfd_fop_poll,
    527 	.fo_stat = eventfd_fop_stat,
    528 	.fo_close = eventfd_fop_close,
    529 	.fo_kqfilter = eventfd_fop_kqfilter,
    530 	.fo_restart = eventfd_fop_restart,
    531 };
    532 
    533 /*
    534  * eventfd(2) system call
    535  */
    536 int
    537 do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
    538     register_t *retval)
    539 {
    540 	file_t *fp;
    541 	int fd, error;
    542 
    543 	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
    544 		return EINVAL;
    545 	}
    546 
    547 	if ((error = fd_allocfile(&fp, &fd)) != 0) {
    548 		return error;
    549 	}
    550 
    551 	fp->f_flag = FREAD | FWRITE;
    552 	if (flags & EFD_NONBLOCK) {
    553 		fp->f_flag |= FNONBLOCK;
    554 	}
    555 	fp->f_type = DTYPE_EVENTFD;
    556 	fp->f_ops = &eventfd_fileops;
    557 	fp->f_eventfd = eventfd_create(val, flags);
    558 	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
    559 	fd_affix(curproc, fp, fd);
    560 
    561 	*retval = fd;
    562 	return 0;
    563 }
    564 
    565 int
    566 sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    567     register_t *retval)
    568 {
    569 	/* {
    570 		syscallarg(unsigned int) val;
    571 		syscallarg(int) flags;
    572 	} */
    573 
    574 	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
    575 }
    576