      1 /*	$NetBSD: linux_inotify.c,v 1.8 2025/06/27 21:36:23 andvar Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Theodore Preduta.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 #include <sys/cdefs.h>
     32 __KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.8 2025/06/27 21:36:23 andvar Exp $");
     33 
     34 #include <sys/param.h>
     35 #include <sys/types.h>
     36 #include <sys/bitops.h>
     37 #include <sys/dirent.h>
     38 #include <sys/event.h>
     39 #include <sys/eventvar.h>
     40 #include <sys/errno.h>
     41 #include <sys/file.h>
     42 #include <sys/filedesc.h>
     43 #include <sys/fcntl.h>
     44 #include <sys/poll.h>
     45 #include <sys/proc.h>
     46 #include <sys/selinfo.h>
     47 #include <sys/select.h>
     48 #include <sys/signal.h>
     49 #include <sys/vnode.h>
     50 
     51 #include <sys/syscallargs.h>
     52 
     53 #include <compat/linux/common/linux_machdep.h>
     54 #include <compat/linux/common/linux_fcntl.h>
     55 #include <compat/linux/common/linux_inotify.h>
     56 #include <compat/linux/common/linux_ipc.h>
     57 #include <compat/linux/common/linux_sched.h>
     58 #include <compat/linux/common/linux_sem.h>
     59 #include <compat/linux/common/linux_signal.h>
     60 
     61 #include <compat/linux/linux_syscallargs.h>
     62 
     63 /*
     64  * inotify(2).  This interface allows the user to get file system
     65  * events and (unlike kqueue(2)) their order is strictly preserved.
     66  * While nice, the API has sufficient gotchas that mean we don't want
     67  * to add native entry points for it.  They are:
     68  *
     69  * - Because data is returned via read(2), this API is prone to
     70  *   unaligned memory accesses.  There is a note in the Linux man page
     71  *   that says the name field of struct linux_inotify_event *can* be
     72  *   used for alignment purposes.  In practice, even Linux doesn't
     73  *   always do this, so for simplicity, we don't ever do this.
     74  */
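
/*
 * Illustrative sketch (not part of this file): how a Linux program
 * typically consumes the records produced by inotify_read() below.
 * Each record is a struct inotify_event immediately followed by
 * "len" bytes of name, so a careful reader copies the fixed-size
 * header out of the byte buffer instead of casting into it, which is
 * exactly the unaligned-access problem described above.
 */
#if 0
#include <sys/inotify.h>
#include <string.h>
#include <unistd.h>

static void
example_drain(int ifd)
{
	char buf[4096];
	struct inotify_event ev;
	ssize_t n;
	size_t off;

	n = read(ifd, buf, sizeof(buf));
	for (off = 0; n > 0 && off + sizeof(ev) <= (size_t)n;
	    off += sizeof(ev) + ev.len) {
		/* copy the header to aligned storage before using it */
		memcpy(&ev, buf + off, sizeof(ev));
		/* when ev.len > 0, the name starts at buf + off + sizeof(ev) */
	}
}
#endif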
     75 
     76 #define	LINUX_INOTIFY_MAX_QUEUED	16384
     77 #define	LINUX_INOTIFY_MAX_FROM_KEVENT	3
     78 
     79 #if DEBUG_LINUX
     80 #define	DPRINTF(x) uprintf x
     81 #else
     82 #define	DPRINTF(x) __nothing
     83 #endif
     84 
     85 struct inotify_entry {
     86 	TAILQ_ENTRY(inotify_entry)	ie_entries;
     87 	char				ie_name[NAME_MAX + 1];
     88 	struct linux_inotify_event	ie_event;
     89 };
     90 
     91 struct inotify_dir_entries {
     92 	size_t	ide_count;
     93 	struct inotify_dir_entry {
     94 		char	name[NAME_MAX + 1];
     95 		ino_t	fileno;
     96 	} ide_entries[];
     97 };
     98 #define	INOTIFY_DIR_ENTRIES_SIZE(count)	(sizeof(struct inotify_dir_entries) \
     99     + (count) * sizeof(struct inotify_dir_entry))
    100 
    101 struct inotifyfd {
    102 	int		ifd_kqfd;	/* kqueue fd used by this inotify */
    103 					/* instance */
    104 	struct selinfo	ifd_sel;	/* for EVFILT_READ by epoll */
    105 	kmutex_t	ifd_lock;	/* lock for ifd_sel, ifd_wds and */
    106 					/* ifd_nwds */
    107 
    108 	struct inotify_dir_entries **ifd_wds;
    109 					/* keeps track of watch descriptors */
    110 					/* for directories: snapshot of the */
    111 					/* directory state */
    112 					/* for files: an inotify_dir_entries */
    113 					/* with ide_count == 0 */
    114 	size_t		ifd_nwds;	/* max watch descriptor that can be */
    115 					/* stored in ifd_wds + 1 */
    116 
    117 	TAILQ_HEAD(, inotify_entry) ifd_qhead;	/* queue of pending events */
    118 	size_t		ifd_qcount;	/* number of pending events */
    119 	kcondvar_t	ifd_qcv;	/* condvar for blocking reads */
    120 	kmutex_t	ifd_qlock;	/* lock for ifd_q* and interlock */
    121 					/* for ifd_qcv */
    122 };
    123 
    124 struct inotify_kevent_mask_pair {
    125 	uint32_t inotify;
    126 	uint32_t kevent;
    127 };
    128 
    129 static int	inotify_kev_fetch_changes(void *, const struct kevent *,
    130     struct kevent *, size_t, int);
    131 static int	do_inotify_init(struct lwp *, register_t *, int);
    132 static int	inotify_close_wd(struct inotifyfd *, int);
    133 static uint32_t	inotify_mask_to_kevent_fflags(uint32_t, enum vtype);
    134 static void	do_kevent_to_inotify(int32_t, uint32_t, uint32_t,
    135     struct inotify_entry *, size_t *, char *);
    136 static int	kevent_to_inotify(struct inotifyfd *, int, enum vtype, uint32_t,
    137     uint32_t, struct inotify_entry *, size_t *);
    138 static int	inotify_readdir(file_t *, struct dirent *, int *, bool);
    139 static struct inotify_dir_entries *get_inotify_dir_entries(int, bool);
    140 
    141 static int	inotify_filt_attach(struct knote *);
    142 static void	inotify_filt_detach(struct knote *);
    143 static int	inotify_filt_event(struct knote *, long);
    144 static void	inotify_read_filt_detach(struct knote *);
    145 static int	inotify_read_filt_event(struct knote *, long);
    146 
    147 static int	inotify_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
    148 static int	inotify_close(file_t *);
    149 static int	inotify_poll(file_t *, int);
    150 static int	inotify_kqfilter(file_t *, struct knote *);
    151 static void	inotify_restart(file_t *);
    152 
    153 static const char inotify_filtname[] = "LINUX_INOTIFY";
    154 static int inotify_filtid;
    155 
    156 /* "fake" EVFILT_VNODE filter that gets attached to each watched vnode */
    157 static const struct filterops inotify_filtops = {
    158 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
    159 	.f_attach = inotify_filt_attach,
    160 	.f_detach = inotify_filt_detach,
    161 	.f_event = inotify_filt_event,
    162 	.f_touch = NULL,
    163 };
    164 
    165 /* EVFILT_READ attached to inotifyfd (to support watching via epoll) */
    166 static const struct filterops inotify_read_filtops = {
    167 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
    168 	.f_attach = NULL, /* attached via .fo_kqfilter */
    169 	.f_detach = inotify_read_filt_detach,
    170 	.f_event = inotify_read_filt_event,
    171 	.f_touch = NULL,
    172 };
    173 
    174 static const struct fileops inotify_fileops = {
    175 	.fo_name = "inotify",
    176 	.fo_read = inotify_read,
    177 	.fo_write = fbadop_write,
    178 	.fo_ioctl = fbadop_ioctl,
    179 	.fo_fcntl = fnullop_fcntl,
    180 	.fo_poll = inotify_poll,
    181 	.fo_stat = fbadop_stat,
    182 	.fo_close = inotify_close,
    183 	.fo_kqfilter = inotify_kqfilter,
    184 	.fo_restart = inotify_restart,
    185 	.fo_fpathconf = (void *)eopnotsupp,
    186 };
    187 
    188 /* basic flag translations */
    189 static const struct inotify_kevent_mask_pair common_inotify_to_kevent[] = {
    190 	{ .inotify = LINUX_IN_ATTRIB,		.kevent = NOTE_ATTRIB, },
    191 	{ .inotify = LINUX_IN_CLOSE_NOWRITE,	.kevent = NOTE_CLOSE, },
    192 	{ .inotify = LINUX_IN_OPEN,		.kevent = NOTE_OPEN, },
    193 	{ .inotify = LINUX_IN_MOVE_SELF,	.kevent = NOTE_RENAME, },
    194 };
    195 static const size_t common_inotify_to_kevent_len =
    196     __arraycount(common_inotify_to_kevent);
    197 
    198 static const struct inotify_kevent_mask_pair vreg_inotify_to_kevent[] = {
    199 	{ .inotify = LINUX_IN_ACCESS,		.kevent = NOTE_READ, },
    200 	{ .inotify = LINUX_IN_ATTRIB,		.kevent = NOTE_ATTRIB|NOTE_LINK, },
    201 	{ .inotify = LINUX_IN_CLOSE_WRITE,	.kevent = NOTE_CLOSE_WRITE, },
    202 	{ .inotify = LINUX_IN_MODIFY,		.kevent = NOTE_WRITE, },
    203 };
    204 static const size_t vreg_inotify_to_kevent_len =
    205     __arraycount(vreg_inotify_to_kevent);
    206 
    207 static const struct inotify_kevent_mask_pair vdir_inotify_to_kevent[] = {
    208 	{ .inotify = LINUX_IN_ACCESS,		.kevent = NOTE_READ, },
    209 	{ .inotify = LINUX_IN_CREATE,		.kevent = NOTE_WRITE, },
    210 	{ .inotify = LINUX_IN_DELETE,		.kevent = NOTE_WRITE, },
    211 	{ .inotify = LINUX_IN_MOVED_FROM,	.kevent = NOTE_WRITE, },
    212 	{ .inotify = LINUX_IN_MOVED_TO,		.kevent = NOTE_WRITE, },
    213 };
    214 static const size_t vdir_inotify_to_kevent_len =
    215     __arraycount(vdir_inotify_to_kevent);
    216 
    217 static const struct inotify_kevent_mask_pair common_kevent_to_inotify[] = {
    218 	{ .kevent = NOTE_ATTRIB,	.inotify = LINUX_IN_ATTRIB, },
    219 	{ .kevent = NOTE_CLOSE,		.inotify = LINUX_IN_CLOSE_NOWRITE, },
    220 	{ .kevent = NOTE_CLOSE_WRITE,	.inotify = LINUX_IN_CLOSE_WRITE, },
    221 	{ .kevent = NOTE_OPEN,		.inotify = LINUX_IN_OPEN, },
    222 	{ .kevent = NOTE_READ,		.inotify = LINUX_IN_ACCESS, },
    223 	{ .kevent = NOTE_RENAME,	.inotify = LINUX_IN_MOVE_SELF, },
    224 	{ .kevent = NOTE_REVOKE,	.inotify = LINUX_IN_UNMOUNT, },
    225 };
    226 static const size_t common_kevent_to_inotify_len =
    227     __arraycount(common_kevent_to_inotify);
    228 
    229 static const struct inotify_kevent_mask_pair vreg_kevent_to_inotify[] = {
    230 	{ .kevent = NOTE_DELETE|NOTE_LINK, .inotify = LINUX_IN_ATTRIB, },
    231 	{ .kevent = NOTE_WRITE,		.inotify = LINUX_IN_MODIFY, },
    232 };
    233 static const size_t vreg_kevent_to_inotify_len =
    234     __arraycount(vreg_kevent_to_inotify);
    235 
    236 /*
    237  * Register the custom kfilter for inotify.
    238  */
    239 int
    240 linux_inotify_init(void)
    241 {
    242 	return kfilter_register(inotify_filtname, &inotify_filtops,
    243 	    &inotify_filtid);
    244 }
    245 
    246 /*
    247  * Unregister the custom kfilter for inotify.
    248  */
    249 int
    250 linux_inotify_fini(void)
    251 {
    252 	return kfilter_unregister(inotify_filtname);
    253 }
    254 
    255 /*
    256  * Copyin callback used by kevent.  This copies already converted
    257  * filters from kernel memory to the kevent internal kernel memory.
    258  * Hence the memcpy instead of copyin.
    259  */
    260 static int
    261 inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist,
    262     struct kevent *changes, size_t index, int n)
    263 {
    264 	memcpy(changes, changelist + index, n * sizeof(*changes));
    265 
    266 	return 0;
    267 }
    268 
    269 /*
    270  * Initialize a new inotify fd.
    271  */
    272 static int
    273 do_inotify_init(struct lwp *l, register_t *retval, int flags)
    274 {
    275 	file_t *fp;
    276 	int error, fd;
    277 	struct proc *p = l->l_proc;
    278 	struct inotifyfd *ifd;
    279 	struct sys_kqueue1_args kqa;
    280 
    281 	if (flags & ~(LINUX_IN_ALL_FLAGS))
    282 		return EINVAL;
    283 
    284 	ifd = kmem_zalloc(sizeof(*ifd), KM_SLEEP);
    285 	mutex_init(&ifd->ifd_lock, MUTEX_DEFAULT, IPL_NONE);
    286 	mutex_init(&ifd->ifd_qlock, MUTEX_DEFAULT, IPL_NONE);
    287 	cv_init(&ifd->ifd_qcv, "inotify");
    288 	selinit(&ifd->ifd_sel);
    289 	TAILQ_INIT(&ifd->ifd_qhead);
    290 
    291 	ifd->ifd_nwds = 1;
    292 	ifd->ifd_wds = kmem_zalloc(ifd->ifd_nwds * sizeof(*ifd->ifd_wds),
    293 	    KM_SLEEP);
    294 
    295 	SCARG(&kqa, flags) = 0;
    296 	if (flags & LINUX_IN_NONBLOCK)
    297 		SCARG(&kqa, flags) |= O_NONBLOCK;
    298 	error = sys_kqueue1(l, &kqa, retval);
    299 	if (error != 0)
    300 		goto leave0;
    301 	ifd->ifd_kqfd = *retval;
    302 
    303 	error = fd_allocfile(&fp, &fd);
    304 	if (error != 0)
    305 		goto leave1;
    306 
    307 	fp->f_flag = FREAD;
    308 	if (flags & LINUX_IN_NONBLOCK)
    309 		fp->f_flag |= FNONBLOCK;
    310 	fp->f_type = DTYPE_MISC;
    311 	fp->f_ops = &inotify_fileops;
    312 	fp->f_data = ifd;
    313 	fd_set_exclose(l, fd, (flags & LINUX_IN_CLOEXEC) != 0);
    314 	fd_affix(p, fp, fd);
    315 
    316 	*retval = fd;
    317 	return 0;
    318 
    319 leave1:
    320 	KASSERT(fd_getfile(ifd->ifd_kqfd) != NULL);
    321 	fd_close(ifd->ifd_kqfd);
    322 leave0:
    323 	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
    324 	mutex_destroy(&ifd->ifd_lock);
    325 	mutex_destroy(&ifd->ifd_qlock);
    326 	cv_destroy(&ifd->ifd_qcv);
    327 	seldestroy(&ifd->ifd_sel);
    328 
    329 	kmem_free(ifd, sizeof(*ifd));
    330 
    331 	return error;
    332 }
    333 
    334 #ifndef __aarch64__
    335 /*
    336  * inotify_init(2).  Initialize a new inotify fd with flags=0.
    337  */
    338 int
    339 linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
    340 {
    341 	return do_inotify_init(l, retval, 0);
    342 }
    343 #endif
    344 
    345 /*
    346  * inotify_init1(2).  Initialize a new inotify fd with the given flags.
    347  */
    348 int
    349 linux_sys_inotify_init1(struct lwp *l,
    350     const struct linux_sys_inotify_init1_args *uap, register_t *retval)
    351 {
    352 	/* {
    353 		syscallarg(int) flags;
    354 	} */
    355 
    356 	return do_inotify_init(l, retval, SCARG(uap, flags));
    357 }
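
/*
 * Illustrative sketch (not part of this file): the Linux-side calls
 * serviced by the two entry points above.  IN_NONBLOCK and IN_CLOEXEC
 * are the Linux spellings of LINUX_IN_NONBLOCK/LINUX_IN_CLOEXEC.
 */
#if 0
#include <sys/inotify.h>

static int
example_init(void)
{
	/* ends up in do_inotify_init(l, retval, 0) */
	int fd = inotify_init();

	/* ends up in do_inotify_init(l, retval, flags) */
	int fd1 = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);

	return (fd >= 0 && fd1 >= 0) ? 0 : -1;
}
#endif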
    358 
    359 /*
    360  * Convert inotify mask to the fflags of an equivalent kevent.
    361  */
    362 static uint32_t
    363 inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
    364 {
    365 	const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
    366 	uint32_t fflags;
    367 	size_t i, type_inotify_to_kevent_len;
    368 
    369 	switch (type) {
    370 	case VREG:
    371 	case VDIR:
    372 	case VLNK:
    373 		break;
    374 
    375 	default:
    376 		return 0;
    377 	}
    378 
    379 	/* flags that all watches could have */
    380 	fflags = NOTE_DELETE|NOTE_REVOKE;
    381 	for (i = 0; i < common_inotify_to_kevent_len; i++)
    382 		if (mask & common_inotify_to_kevent[i].inotify)
    383 			fflags |= common_inotify_to_kevent[i].kevent;
    384 
    385 	/* flags that depend on type */
    386 	switch (type) {
    387 	case VREG:
    388 		type_inotify_to_kevent = vreg_inotify_to_kevent;
    389 		type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
    390 		break;
    391 
    392 	case VDIR:
    393 		type_inotify_to_kevent = vdir_inotify_to_kevent;
    394 		type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
    395 		break;
    396 
    397 	default:
    398 		type_inotify_to_kevent_len = 0;
    399 		break;
    400 	}
    401 	for (i = 0; i < type_inotify_to_kevent_len; i++)
    402 		if (mask & type_inotify_to_kevent[i].inotify)
    403 			fflags |= type_inotify_to_kevent[i].kevent;
    404 
    405 	return fflags;
    406 }
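
/*
 * Worked example (illustrative): for a regular file (VREG) watched with
 * mask = LINUX_IN_MODIFY|LINUX_IN_OPEN, the function above returns
 *
 *	NOTE_DELETE|NOTE_REVOKE		(always requested)
 *	| NOTE_OPEN			(common table, from LINUX_IN_OPEN)
 *	| NOTE_WRITE			(VREG table, from LINUX_IN_MODIFY)
 *
 * NOTE_DELETE and NOTE_REVOKE are always included because they are
 * needed later to generate LINUX_IN_IGNORED/LINUX_IN_DELETE_SELF even
 * if the caller did not ask for them.
 */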
    407 
    408 /*
    409  * inotify_add_watch(2).  Open a fd for pathname (if desired by mask),
    410  * track it, and add an equivalent kqueue event for it in
    411  * ifd->ifd_kqfd.
    412  */
    413 int
    414 linux_sys_inotify_add_watch(struct lwp *l,
    415     const struct linux_sys_inotify_add_watch_args *uap, register_t *retval)
    416 {
    417 	/* {
    418 		syscallarg(int) fd;
    419 		syscallarg(const char *) pathname;
    420 		syscallarg(uint32_t) mask;
    421 	} */
    422 	int wd, i, error = 0;
    423 	file_t *fp, *wp, *cur_fp;
    424 	struct inotifyfd *ifd;
    425 	struct inotify_dir_entries **new_wds;
    426 	struct knote *kn, *tmpkn;
    427 	struct sys_open_args oa;
    428 	struct kevent kev;
    429 	struct vnode *wvp;
    430 	namei_simple_flags_t sflags;
    431 	struct kevent_ops k_ops = {
    432 		.keo_private = NULL,
    433 		.keo_fetch_timeout = NULL,
    434 		.keo_fetch_changes = inotify_kev_fetch_changes,
    435 		.keo_put_events = NULL,
    436 	};
    437 	const int fd = SCARG(uap, fd);
    438 	const uint32_t mask = SCARG(uap, mask);
    439 
    440 	if (mask & ~LINUX_IN_ADD_KNOWN)
    441 		return EINVAL;
    442 
    443 	fp = fd_getfile(fd);
    444 	if (fp == NULL)
    445 		return EBADF;
    446 
    447 	if (fp->f_ops != &inotify_fileops) {
    448 		/* not an inotify fd */
    449 		error = EBADF;
    450 		goto leave0;
    451 	}
    452 
    453 	ifd = fp->f_data;
    454 
    455 	mutex_enter(&ifd->ifd_lock);
    456 
    457 	if (mask & LINUX_IN_DONT_FOLLOW)
    458 		sflags = NSM_NOFOLLOW_TRYEMULROOT;
    459 	else
    460 		sflags = NSM_FOLLOW_TRYEMULROOT;
    461 	error = namei_simple_user(SCARG(uap, pathname), sflags, &wvp);
    462 	if (error != 0)
    463 		goto leave1;
    464 
    465 	/* Check to see if we already have a descriptor to wd's file. */
    466 	wd = -1;
    467 	for (i = 0; i < ifd->ifd_nwds; i++) {
    468 		if (ifd->ifd_wds[i] != NULL) {
    469 			cur_fp = fd_getfile(i);
    470 			if (cur_fp == NULL) {
    471 				DPRINTF(("%s: wd=%d was closed externally\n",
    472 				    __func__, i));
    473 				error = EBADF;
    474 				goto leave1;
    475 			}
    476 			if (cur_fp->f_type != DTYPE_VNODE) {
    477 				DPRINTF(("%s: wd=%d was replaced "
    478 				    "with a non-vnode\n", __func__, i));
    479 				error = EBADF;
    480 			}
    481 			if (error == 0 && cur_fp->f_vnode == wvp)
    482 				wd = i;
    483 			fd_putfile(i);
    484 			if (error != 0)
    485 				goto leave1;
    486 
    487 			if (wd != -1)
    488 				break;
    489 		}
    490 	}
    491 
    492 	if (wd == -1) {
    493 		/*
    494 		 * If we do not have a descriptor to wd's file, we
    495 		 * need to open the watch descriptor.
    496 		 */
    497 		SCARG(&oa, path) = SCARG(uap, pathname);
    498 		SCARG(&oa, mode) = 0;
    499 		SCARG(&oa, flags) = O_RDONLY;
    500 		if (mask & LINUX_IN_DONT_FOLLOW)
    501 			SCARG(&oa, flags) |= O_NOFOLLOW;
    502 		if (mask & LINUX_IN_ONLYDIR)
    503 			SCARG(&oa, flags) |= O_DIRECTORY;
    504 
    505 		error = sys_open(l, &oa, retval);
    506 		if (error != 0)
    507 			goto leave1;
    508 		wd = *retval;
    509 		wp = fd_getfile(wd);
    510 		KASSERT(wp != NULL);
    511 		KASSERT(wp->f_type == DTYPE_VNODE);
    512 
    513 		/* translate the flags */
    514 		memset(&kev, 0, sizeof(kev));
    515 		EV_SET(&kev, wd, inotify_filtid, EV_ADD|EV_ENABLE,
    516 		    NOTE_DELETE|NOTE_REVOKE, 0, ifd);
    517 		if (mask & LINUX_IN_ONESHOT)
    518 			kev.flags |= EV_ONESHOT;
    519 		kev.fflags |= inotify_mask_to_kevent_fflags(mask,
    520 		    wp->f_vnode->v_type);
    521 
    522 		error = kevent1(retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL,
    523 		    &k_ops);
    524 		if (error != 0) {
    525 			KASSERT(fd_getfile(wd) != NULL);
    526 			fd_close(wd);
    527 		} else {
    528 			/* Success! */
    529 			*retval = wd;
    530 
    531 			/* Resize ifd_wds to accommodate wd. */
    532 			if (wd+1 > ifd->ifd_nwds) {
    533 				new_wds = kmem_zalloc(
    534 				    (wd+1) * sizeof(*ifd->ifd_wds), KM_SLEEP);
    535 				memcpy(new_wds, ifd->ifd_wds,
    536 				    ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
    537 
    538 				kmem_free(ifd->ifd_wds,
    539 				    ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
    540 
    541 				ifd->ifd_wds = new_wds;
    542 				ifd->ifd_nwds = wd+1;
    543 			}
    544 
    545 			ifd->ifd_wds[wd] = get_inotify_dir_entries(wd, true);
    546 		}
    547 	} else {
    548 		/*
    549 		 * If we do have a descriptor to wd's file, try to edit
    550 		 * the relevant knote.
    551 		 */
    552 		if (mask & LINUX_IN_MASK_CREATE) {
    553 			error = EEXIST;
    554 			goto leave1;
    555 		}
    556 
    557 		wp = fd_getfile(wd);
    558 		if (wp == NULL) {
    559 			DPRINTF(("%s: wd=%d was closed externally "
    560 			    "(race, probably)\n", __func__, wd));
    561 			error = EBADF;
    562 			goto leave1;
    563 		}
    564 		if (wp->f_type != DTYPE_VNODE) {
    565 			DPRINTF(("%s: wd=%d was replaced with a non-vnode "
    566 			    "(race, probably)\n", __func__, wd));
    567 			error = EBADF;
    568 			goto leave2;
    569 		}
    570 
    571 		kev.fflags = NOTE_DELETE | NOTE_REVOKE
    572 		    | inotify_mask_to_kevent_fflags(mask, wp->f_vnode->v_type);
    573 
    574 		mutex_enter(wp->f_vnode->v_interlock);
    575 
    576 		/*
    577 		 * XXX We are forced to find the appropriate knote
    578 		 * manually because we cannot create a custom f_touch
    579 		 * function for inotify_filtops.  See filter_touch()
    580 		 * in kern_event.c for details.
    581 		 */
    582 		SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
    583 		    kn_selnext, tmpkn) {
    584 			if (kn->kn_fop == &inotify_filtops
    585 			    && ifd == kn->kn_kevent.udata) {
    586 				mutex_enter(&kn->kn_kq->kq_lock);
    587 				if (mask & LINUX_IN_MASK_ADD)
    588 					kn->kn_sfflags |= kev.fflags;
    589 				else
    590 					kn->kn_sfflags = kev.fflags;
    591 				wp->f_vnode->v_klist->vk_interest |=
    592 				    kn->kn_sfflags;
    593 				mutex_exit(&kn->kn_kq->kq_lock);
    594 			}
    595 		}
    596 
    597 		mutex_exit(wp->f_vnode->v_interlock);
    598 
    599 		/* Success! */
    600 		*retval = wd;
    601 	}
    602 
    603 leave2:
    604 	fd_putfile(wd);
    605 leave1:
    606 	mutex_exit(&ifd->ifd_lock);
    607 leave0:
    608 	fd_putfile(fd);
    609 	return error;
    610 }
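
/*
 * Illustrative sketch (not part of this file): Linux-side usage that
 * exercises both branches of linux_sys_inotify_add_watch() above.  The
 * second call resolves to the same vnode, so instead of opening a new
 * watch descriptor it edits the existing knote; with IN_MASK_ADD the
 * new events are ORed into kn_sfflags rather than replacing it.
 */
#if 0
#include <sys/inotify.h>

static void
example_add_watch(int ifd)
{
	/* first call: opens the path and registers a new kevent */
	int wd = inotify_add_watch(ifd, "/tmp", IN_CREATE | IN_DELETE);

	/* second call: same vnode, so only the knote's fflags change */
	(void)inotify_add_watch(ifd, "/tmp", IN_MODIFY | IN_MASK_ADD);

	(void)wd;
}
#endif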
    611 
    612 /*
    613  * Remove a wd from ifd and close wd.
    614  */
    615 static int
    616 inotify_close_wd(struct inotifyfd *ifd, int wd)
    617 {
    618 	file_t *wp;
    619 	int error;
    620 	register_t retval;
    621 	struct kevent kev;
    622 	struct kevent_ops k_ops = {
    623 		.keo_private = NULL,
    624 		.keo_fetch_timeout = NULL,
    625 		.keo_fetch_changes = inotify_kev_fetch_changes,
    626 		.keo_put_events = NULL,
    627 	};
    628 
    629 	mutex_enter(&ifd->ifd_lock);
    630 
    631 	KASSERT(0 <= wd && wd < ifd->ifd_nwds && ifd->ifd_wds[wd] != NULL);
    632 
    633 	kmem_free(ifd->ifd_wds[wd],
    634 	    INOTIFY_DIR_ENTRIES_SIZE(ifd->ifd_wds[wd]->ide_count));
    635 	ifd->ifd_wds[wd] = NULL;
    636 
    637 	mutex_exit(&ifd->ifd_lock);
    638 
    639 	wp = fd_getfile(wd);
    640 	if (wp == NULL) {
    641 		DPRINTF(("%s: wd=%d is already closed\n", __func__, wd));
    642 		return 0;
    643 	}
    644 	KASSERT(!mutex_owned(wp->f_vnode->v_interlock));
    645 
    646 	memset(&kev, 0, sizeof(kev));
    647 	EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
    648 	error = kevent1(&retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL, &k_ops);
    649 	if (error != 0)
    650 		DPRINTF(("%s: attempt to disable all events for wd=%d "
    651 		    "had error=%d\n", __func__, wd, error));
    652 
    653 	return fd_close(wd);
    654 }
    655 
    656 /*
    657  * inotify_rm_watch(2).  Close wd and remove it from ifd->ifd_wds.
    658  */
    659 int
    660 linux_sys_inotify_rm_watch(struct lwp *l,
    661     const struct linux_sys_inotify_rm_watch_args *uap, register_t *retval)
    662 {
    663 	/* {
    664 		syscallarg(int) fd;
    665 		syscallarg(int) wd;
    666 	} */
    667 	struct inotifyfd *ifd;
    668 	file_t *fp;
    669 	int error = 0;
    670 	const int fd = SCARG(uap, fd);
    671 	const int wd = SCARG(uap, wd);
    672 
    673 	fp = fd_getfile(fd);
    674 	if (fp == NULL)
    675 		return EBADF;
    676 	if (fp->f_ops != &inotify_fileops) {
    677 		/* not an inotify fd */
    678 		error = EINVAL;
    679 		goto leave;
    680 	}
    681 
    682 	ifd = fp->f_data;
    683 	if (wd < 0 || wd >= ifd->ifd_nwds || ifd->ifd_wds[wd] == NULL) {
    684 		error = EINVAL;
    685 		goto leave;
    686 	}
    687 
    688 	error = inotify_close_wd(ifd, wd);
    689 
    690 leave:
    691 	fd_putfile(fd);
    692 	return error;
    693 }
    694 
    695 /*
    696  * Attach the inotify filter.
    697  */
    698 static int
    699 inotify_filt_attach(struct knote *kn)
    700 {
    701 	file_t *fp = kn->kn_obj;
    702 	struct vnode *vp;
    703 
    704 	KASSERT(fp->f_type == DTYPE_VNODE);
    705 	vp = fp->f_vnode;
    706 
    707 	/*
    708 	 * Needs to be set so that we get the same event handling as
    709 	 * EVFILT_VNODE.  Otherwise we don't get any events.
    710 	 *
    711 	 * A consequence of this is that modifications/removals of
    712 	 * this knote need to specify EVFILT_VNODE rather than
    713 	 * inotify_filtid.
    714 	 */
    715 	kn->kn_filter = EVFILT_VNODE;
    716 
    717 	kn->kn_fop = &inotify_filtops;
    718 	kn->kn_hook = vp;
    719 	vn_knote_attach(vp, kn);
    720 
    721 	return 0;
    722 }
    723 
    724 /*
    725  * Detach the inotify filter.
    726  */
    727 static void
    728 inotify_filt_detach(struct knote *kn)
    729 {
    730 	struct vnode *vp = (struct vnode *)kn->kn_hook;
    731 
    732 	vn_knote_detach(vp, kn);
    733 }
    734 
    735 /*
    736  * Create a single inotify event.
    737  */
    738 static void
    739 do_kevent_to_inotify(int32_t wd, uint32_t mask, uint32_t cookie,
    740     struct inotify_entry *buf, size_t *nbuf, char *name)
    741 {
    742 	KASSERT(*nbuf < LINUX_INOTIFY_MAX_FROM_KEVENT);
    743 
    744 	buf += *nbuf;
    745 
    746 	memset(buf, 0, sizeof(*buf));
    747 
    748 	buf->ie_event.wd = wd;
    749 	buf->ie_event.mask = mask;
    750 	buf->ie_event.cookie = cookie;
    751 
    752 	if (name != NULL) {
    753 		buf->ie_event.len = strlen(name) + 1;
    754 		KASSERT(buf->ie_event.len < sizeof(buf->ie_name));
    755 		strcpy(buf->ie_name, name);
    756 	}
    757 
    758 	++(*nbuf);
    759 }
    760 
    761 /*
    762  * Like vn_readdir(), but with vnode locking only if needs_lock is
    763  * true (to avoid double locking in some situations).
    764  */
    765 static int
    766 inotify_readdir(file_t *fp, struct dirent *dep, int *done, bool needs_lock)
    767 {
    768 	struct vnode *vp;
    769 	struct iovec iov;
    770 	struct uio uio;
    771 	int error, eofflag;
    772 
    773 	KASSERT(fp->f_type == DTYPE_VNODE);
    774 	vp = fp->f_vnode;
    775 	KASSERT(vp->v_type == VDIR);
    776 
    777 	iov.iov_base = dep;
    778 	iov.iov_len = sizeof(*dep);
    779 
    780 	uio.uio_iov = &iov;
    781 	uio.uio_iovcnt = 1;
    782 	uio.uio_rw = UIO_READ;
    783 	uio.uio_resid = sizeof(*dep);
    784 	UIO_SETUP_SYSSPACE(&uio);
    785 
    786 	mutex_enter(&fp->f_lock);
    787 	uio.uio_offset = fp->f_offset;
    788 	mutex_exit(&fp->f_lock);
    789 
    790 	/* XXX: should pass whether to lock or not */
    791 	if (needs_lock)
    792 		vn_lock(vp, LK_SHARED | LK_RETRY);
    793 	else
    794 		/*
    795 		 * XXX We need to temporarily drop v_interlock because
    796 		 * it may be temporarily acquired by biowait().
    797 		 */
    798 		mutex_exit(vp->v_interlock);
    799 	KASSERT(!mutex_owned(vp->v_interlock));
    800 	error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);
    801 	if (needs_lock)
    802 		VOP_UNLOCK(vp);
    803 	else
    804 		mutex_enter(vp->v_interlock);
    805 
    806 	mutex_enter(&fp->f_lock);
    807 	fp->f_offset = uio.uio_offset;
    808 	mutex_exit(&fp->f_lock);
    809 
    810 	*done = sizeof(*dep) - uio.uio_resid;
    811 	return error;
    812 }
    813 
    814 /*
    815  * Create (and allocate) an appropriate inotify_dir_entries struct for wd to be
    816  * used on ifd_wds of inotifyfd.  If the entries on a directory fail to be read,
    817  * NULL is returned.  needs_lock indicates if the vnode's lock is not already
    818  * owned.
    819  */
    820 static struct inotify_dir_entries *
    821 get_inotify_dir_entries(int wd, bool needs_lock)
    822 {
    823 	struct dirent de;
    824 	struct dirent *currdep;
    825 	struct inotify_dir_entries *idep = NULL;
    826 	file_t *wp;
    827 	int done, error;
    828 	size_t i, decount;
    829 
    830 	wp = fd_getfile(wd);
    831 	if (wp == NULL)
    832 		return NULL;
    833 	if (wp->f_type != DTYPE_VNODE)
    834 		goto leave;
    835 
    836 	/* for non-directories, we have 0 entries. */
    837 	if (wp->f_vnode->v_type != VDIR) {
    838 		idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP);
    839 		goto leave;
    840 	}
    841 
    842 	mutex_enter(&wp->f_lock);
    843 	wp->f_offset = 0;
    844 	mutex_exit(&wp->f_lock);
    845 	decount = 0;
    846 	for (;;) {
    847 		error = inotify_readdir(wp, &de, &done, needs_lock);
    848 		if (error != 0)
    849 			goto leave;
    850 		if (done == 0)
    851 			break;
    852 
    853 		currdep = &de;
    854 		while ((char *)currdep < ((char *)&de) + done) {
    855 			decount++;
    856 			currdep = _DIRENT_NEXT(currdep);
    857 		}
    858 	}
    859 
    860 	idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(decount), KM_SLEEP);
    861 	idep->ide_count = decount;
    862 
    863 	mutex_enter(&wp->f_lock);
    864 	wp->f_offset = 0;
    865 	mutex_exit(&wp->f_lock);
    866 	for (i = 0; i < decount;) {
    867 		error = inotify_readdir(wp, &de, &done, needs_lock);
    868 		if (error != 0 || done == 0) {
    869 			kmem_free(idep, INOTIFY_DIR_ENTRIES_SIZE(decount));
    870 			idep = NULL;
    871 			goto leave;
    872 		}
    873 
    874 		currdep = &de;
    875 		while ((char *)currdep < ((char *)&de) + done) {
    876 			idep->ide_entries[i].fileno = currdep->d_fileno;
    877 			strcpy(idep->ide_entries[i].name, currdep->d_name);
    878 
    879 			currdep = _DIRENT_NEXT(currdep);
    880 			i++;
    881 		}
    882 	}
    883 
    884 leave:
    885 	fd_putfile(wd);
    886 	return idep;
    887 }
    888 
    889 static size_t
    890 find_entry(struct inotify_dir_entries *i1, struct inotify_dir_entries *i2)
    891 {
    892 	for (size_t i = 0; i < i2->ide_count; i++)
    893 		if (i2->ide_entries[i].fileno != i1->ide_entries[i].fileno)
    894 			return i;
    895 	KASSERTMSG(0, "Entry not found");
    896 	return -1;
    897 }
    898 
    899 static void
    900 handle_write(struct inotifyfd *ifd, int wd, struct inotify_entry *buf,
    901     size_t *nbuf)
    902 {
    903 	struct inotify_dir_entries *old_idep, *new_idep;
    904 	size_t i;
    905 
    906 	mutex_enter(&ifd->ifd_lock);
    907 
    908 	old_idep = ifd->ifd_wds[wd];
    909 	KASSERT(old_idep != NULL);
    910 	new_idep = get_inotify_dir_entries(wd, false);
    911 	if (new_idep == NULL) {
    912 		DPRINTF(("%s: directory for wd=%d could not be read\n",
    913 		    __func__, wd));
    914 		mutex_exit(&ifd->ifd_lock);
    915 		return;
    916 	}
    917 
    918 
    919 	if (old_idep->ide_count < new_idep->ide_count) {
    920 		KASSERT(old_idep->ide_count + 1 == new_idep->ide_count);
    921 
    922 		/* Find the new entry. */
    923 		i = find_entry(new_idep, old_idep);
    924 		do_kevent_to_inotify(wd, LINUX_IN_CREATE, 0,
    925 		    buf, nbuf, new_idep->ide_entries[i].name);
    926 		goto out;
    927 	}
    928 
    929 	if (old_idep->ide_count > new_idep->ide_count) {
    930 		KASSERT(old_idep->ide_count == new_idep->ide_count + 1);
    931 
    932 		/* Find the deleted entry. */
    933 		i = find_entry(old_idep, new_idep);
    934 
    935 		do_kevent_to_inotify(wd, LINUX_IN_DELETE, 0,
    936 		    buf, nbuf, old_idep->ide_entries[i].name);
    937 		goto out;
    938 	}
    939 
    940 	/*
    941 	 * XXX Because we are not watching the entire
    942 	 * file system, the only time we know for sure
    943 	 * that the event is a LINUX_IN_MOVED_FROM/
    944 	 * LINUX_IN_MOVED_TO is when the move happens
    945 	 * within a single directory...  ie. the number
    946 	 * of directory entries has not changed.
    947 	 *
    948 	 * Otherwise all we can say for sure is that
    949 	 * something was created/deleted.  So we issue a
    950 	 * LINUX_IN_CREATE/LINUX_IN_DELETE.
    951 	 */
    952 	ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno;
    953 
    954 	/* Find the deleted entry. */
    955 	for (i = 0; i < old_idep->ide_count; i++)
    956 		if (old_idep->ide_entries[i].fileno == changed)
    957 			break;
    958 	KASSERT(i != old_idep->ide_count);
    959 
    960 	do_kevent_to_inotify(wd, LINUX_IN_MOVED_FROM, changed, buf, nbuf,
    961 	    old_idep->ide_entries[i].name);
    962 
    963 	do_kevent_to_inotify(wd, LINUX_IN_MOVED_TO, changed, buf, nbuf,
    964 	    new_idep->ide_entries[new_idep->ide_count - 1].name);
    965 
    966 out:
    967 	kmem_free(old_idep, INOTIFY_DIR_ENTRIES_SIZE(old_idep->ide_count));
    968 	ifd->ifd_wds[wd] = new_idep;
    968 	mutex_exit(&ifd->ifd_lock);
    969 }
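
/*
 * Worked example (illustrative): suppose the previous snapshot of a
 * watched directory was { "a" (ino 10), "b" (ino 11) } and a NOTE_WRITE
 * arrives.
 *
 * - If the new snapshot has one more entry, say "c", handle_write()
 *   queues LINUX_IN_CREATE "c".
 * - If it has one fewer entry, say "b" is gone, it queues
 *   LINUX_IN_DELETE "b".
 * - If the count is unchanged but ino 11 now appears under the name
 *   "x", it is treated as a rename within the directory and
 *   LINUX_IN_MOVED_FROM "b" / LINUX_IN_MOVED_TO "x" are queued, with
 *   the inode number used as the shared cookie (the heuristic assumes
 *   the renamed entry sorts last in the new snapshot).
 *
 * A move into or out of the directory changes the entry count, so (as
 * the XXX comment above notes) it can only be reported as a plain
 * create or delete.
 */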
    970 
    971 /*
    972  * Convert a kevent flags and fflags for EVFILT_VNODE to some number
    973  * of inotify events.
    974  */
    975 static int
    976 kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
    977     uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
    978     size_t *nbuf)
    979 {
    980 	struct stat st;
    981 	file_t *wp;
    982 	size_t i;
    983 	int error = 0;
    984 
    985 	for (i = 0; i < common_kevent_to_inotify_len; i++)
    986 		if (fflags & common_kevent_to_inotify[i].kevent)
    987 			do_kevent_to_inotify(wd,
    988 			    common_kevent_to_inotify[i].inotify, 0, buf, nbuf,
    989 			    NULL);
    990 
    991 	if (wtype == VREG) {
    992 		for (i = 0; i < vreg_kevent_to_inotify_len; i++)
    993 			if (fflags & vreg_kevent_to_inotify[i].kevent)
    994 				do_kevent_to_inotify(wd,
    995 				    vreg_kevent_to_inotify[i].inotify, 0,
    996 				    buf, nbuf, NULL);
    997 	} else if (wtype == VDIR) {
    998 		for (i = 0; i < *nbuf; i++)
    999 			if (buf[i].ie_event.mask &
   1000 			    (LINUX_IN_ACCESS|LINUX_IN_ATTRIB
   1001 		            |LINUX_IN_CLOSE|LINUX_IN_OPEN))
   1002 				buf[i].ie_event.mask |= LINUX_IN_ISDIR;
   1003 
   1004 		/* Need to disambiguate the possible NOTE_WRITEs. */
   1005 		if (fflags & NOTE_WRITE)
   1006 			handle_write(ifd, wd, buf, nbuf);
   1007 	}
   1008 
   1009 	/*
   1010 	 * Need to check if wd actually has a link count of 0 to issue a
   1011 	 * LINUX_IN_DELETE_SELF.
   1012 	 */
   1013 	if (fflags & NOTE_DELETE) {
   1014 		wp = fd_getfile(wd);
   1015 		KASSERT(wp != NULL);
   1016 		KASSERT(wp->f_type == DTYPE_VNODE);
   1017 		vn_stat(wp->f_vnode, &st);
   1018 		fd_putfile(wd);
   1019 
   1020 		if (st.st_nlink == 0)
   1021 			do_kevent_to_inotify(wd, LINUX_IN_DELETE_SELF, 0,
   1022 			    buf, nbuf, NULL);
   1023 	}
   1024 
   1025 	/* LINUX_IN_IGNORED must be the last event issued for wd. */
   1026 	if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
   1027 		do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf, NULL);
   1028 		/*
   1029 		 * XXX in theory we could call inotify_close_wd(ifd, wd) but if
   1030 		 * we get here we must already be holding v_interlock for
   1031 		 * wd... so we can't.
   1032 		 *
   1033 		 * For simplicity we do nothing, and so wd will only be closed
   1034 		 * when the inotify fd is closed.
   1035 		 */
   1036 	}
   1037 
   1038 	return error;
   1039 }
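
/*
 * Worked example (illustrative): a NOTE_DELETE kevent for a regular
 * file whose last link was just removed expands to three inotify
 * events, in order: LINUX_IN_ATTRIB (the link count changed),
 * LINUX_IN_DELETE_SELF (the link count is now 0) and LINUX_IN_IGNORED
 * (the watch is effectively dead), which is what
 * LINUX_INOTIFY_MAX_FROM_KEVENT above accounts for.
 */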
   1040 
   1041 /*
   1042  * Handle an event.  Unlike EVFILT_VNODE, we translate the event to a
   1043  * linux_inotify_event and put it in our own custom queue.
   1044  */
   1045 static int
   1046 inotify_filt_event(struct knote *kn, long hint)
   1047 {
   1048 	struct vnode *vp = (struct vnode *)kn->kn_hook;
   1049 	struct inotifyfd *ifd;
   1050 	struct inotify_entry *cur_ie;
   1051 	size_t nbuf, i;
   1052 	uint32_t status;
   1053 	struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];
   1054 
   1055 	/*
   1056 	 * If KN_WILLDETACH is set then
   1057 	 * 1. kn->kn_kevent.udata has already been trashed with a
   1058 	 *    struct lwp *, so we don't have access to a real ifd
   1059 	 *    anymore, and
   1060 	 * 2. we're about to detach anyways, so we don't really care
   1061 	 *    about the events.
   1062 	 * (Also because of this we need to get ifd under the same
   1063 	 * lock as kn->kn_status.)
   1064 	 */
   1065 	mutex_enter(&kn->kn_kq->kq_lock);
   1066 	status = kn->kn_status;
   1067 	ifd = kn->kn_kevent.udata;
   1068 	mutex_exit(&kn->kn_kq->kq_lock);
   1069 	if (status & KN_WILLDETACH)
   1070 		return 0;
   1071 
   1072 	/*
   1073 	 * If we don't care about the NOTEs in hint, we don't generate
   1074 	 * any events.
   1075 	 */
   1076 	hint &= kn->kn_sfflags;
   1077 	if (hint == 0)
   1078 		return 0;
   1079 
   1080 	KASSERT(mutex_owned(vp->v_interlock));
   1081 	KASSERT(!mutex_owned(&ifd->ifd_lock));
   1082 
   1083 	mutex_enter(&ifd->ifd_qlock);
   1084 
   1085 	/*
   1086 	 * early out: there's no point even translating the event if we
   1087 	 * have nowhere to put it (and a LINUX_IN_Q_OVERFLOW has
   1088 	 * already been added).
   1089 	 */
   1090 	if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED)
   1091 		goto leave;
   1092 
   1093 	nbuf = 0;
   1094 	(void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags,
   1095 	    hint, buf, &nbuf);
   1096 	for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1;
   1097 	     i++) {
   1098 		cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
   1099 		memcpy(cur_ie, &buf[i], sizeof(*cur_ie));
   1100 
   1101 		TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
   1102 		ifd->ifd_qcount++;
   1103 	}
   1104 	/* handle early overflow, by adding an overflow event to the end */
   1105 	if (i != nbuf) {
   1106 		nbuf = 0;
   1107 		cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
   1108 		do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0,
   1109 		    cur_ie, &nbuf, NULL);
   1110 
   1111 		TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
   1112 		ifd->ifd_qcount++;
   1113 	}
   1114 
   1115 	if (nbuf > 0) {
   1116 		cv_signal(&ifd->ifd_qcv);
   1117 
   1118 		mutex_enter(&ifd->ifd_lock);
   1119 		selnotify(&ifd->ifd_sel, 0, NOTE_LOWAT);
   1120 		mutex_exit(&ifd->ifd_lock);
   1121 	} else
   1122 		DPRINTF(("%s: hint=%lx resulted in 0 inotify events\n",
   1123 		    __func__, hint));
   1124 
   1125 leave:
   1126 	mutex_exit(&ifd->ifd_qlock);
   1127 	return 0;
   1128 }
   1129 
   1130 /*
   1131  * Read inotify events from the queue.
   1132  */
   1133 static int
   1134 inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
   1135     int flags)
   1136 {
   1137 	struct inotify_entry *cur_iep;
   1138 	size_t cur_size, nread;
   1139 	int error = 0;
   1140 	struct inotifyfd *ifd = fp->f_data;
   1141 
   1142 	mutex_enter(&ifd->ifd_qlock);
   1143 
   1144 	if (ifd->ifd_qcount == 0) {
   1145 		if (fp->f_flag & O_NONBLOCK) {
   1146 			error = EAGAIN;
   1147 			goto leave;
   1148 		}
   1149 
   1150 		while (ifd->ifd_qcount == 0) {
   1151 			/* wait until there is an event to read */
   1152 			error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock);
   1153 			if (error != 0) {
   1154 				error = EINTR;
   1155 				goto leave;
   1156 			}
   1157 		}
   1158 	}
   1159 
   1160 	KASSERT(ifd->ifd_qcount > 0);
   1161 	KASSERT(mutex_owned(&ifd->ifd_qlock));
   1162 
   1163 	nread = 0;
   1164 	while (ifd->ifd_qcount > 0) {
   1165 		cur_iep = TAILQ_FIRST(&ifd->ifd_qhead);
   1166 		KASSERT(cur_iep != NULL);
   1167 
   1168 		cur_size = sizeof(cur_iep->ie_event) + cur_iep->ie_event.len;
   1169 		if (cur_size > uio->uio_resid) {
   1170 			if (nread == 0)
   1171 				error = EINVAL;
   1172 			break;
   1173 		}
   1174 
   1175 		error = uiomove(&cur_iep->ie_event, sizeof(cur_iep->ie_event),
   1176 		    uio);
   1177 		if (error != 0)
   1178 			break;
   1179 		error = uiomove(&cur_iep->ie_name, cur_iep->ie_event.len, uio);
   1180 		if (error != 0)
   1181 			break;
   1182 
   1183 		/* cleanup */
   1184 		TAILQ_REMOVE(&ifd->ifd_qhead, cur_iep, ie_entries);
   1185 		kmem_free(cur_iep, sizeof(*cur_iep));
   1186 
   1187 		nread++;
   1188 		ifd->ifd_qcount--;
   1189 	}
   1190 
   1191 leave:
   1192 	/* Wake up the next reader, if the queue is not empty. */
   1193 	if (ifd->ifd_qcount > 0)
   1194 		cv_signal(&ifd->ifd_qcv);
   1195 
   1196 	mutex_exit(&ifd->ifd_qlock);
   1197 	return error;
   1198 }
   1199 
   1200 /*
   1201  * Close all the file descriptors associated with fp.
   1202  */
   1203 static int
   1204 inotify_close(file_t *fp)
   1205 {
   1206 	int error;
   1207 	size_t i;
   1208 	file_t *kqfp;
   1209 	struct inotifyfd *ifd = fp->f_data;
   1210 
   1211 	for (i = 0; i < ifd->ifd_nwds; i++) {
   1212 		if (ifd->ifd_wds[i] != NULL) {
   1213 			error = inotify_close_wd(ifd, i);
   1214 			if (error != 0)
   1215 				return error;
   1216 		}
   1217 	}
   1218 
   1219 	/* the reference we need to hold is ifd->ifd_kqfd */
   1220 	kqfp = fd_getfile(ifd->ifd_kqfd);
   1221 	if (kqfp == NULL) {
   1222 		DPRINTF(("%s: kqfp=%d is already closed\n", __func__,
   1223 		    ifd->ifd_kqfd));
   1224 	} else {
   1225 		error = fd_close(ifd->ifd_kqfd);
   1226 		if (error != 0)
   1227 			return error;
   1228 	}
   1229 
   1230 	mutex_destroy(&ifd->ifd_lock);
   1231 	mutex_destroy(&ifd->ifd_qlock);
   1232 	cv_destroy(&ifd->ifd_qcv);
   1233 	seldestroy(&ifd->ifd_sel);
   1234 
   1235 	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
   1236 	kmem_free(ifd, sizeof(*ifd));
   1237 	fp->f_data = NULL;
   1238 
   1239 	return 0;
   1240 }
   1241 
   1242 /*
   1243  * Check if there are pending read events.
   1244  */
   1245 static int
   1246 inotify_poll(file_t *fp, int events)
   1247 {
   1248 	int revents;
   1249 	struct inotifyfd *ifd = fp->f_data;
   1250 
   1251 	revents = 0;
   1252 	if (events & (POLLIN|POLLRDNORM)) {
   1253 		mutex_enter(&ifd->ifd_qlock);
   1254 
   1255 		if (ifd->ifd_qcount > 0)
   1256 			revents |= events & (POLLIN|POLLRDNORM);
   1257 
   1258 		mutex_exit(&ifd->ifd_qlock);
   1259 	}
   1260 
   1261 	return revents;
   1262 }
   1263 
   1264 /*
   1265  * Attach EVFILT_READ to the inotify instance in fp.
   1266  *
   1267  * This is so you can watch inotify with epoll.  No other kqueue
   1268  * filter needs to be supported.
   1269  */
   1270 static int
   1271 inotify_kqfilter(file_t *fp, struct knote *kn)
   1272 {
   1273 	struct inotifyfd *ifd = fp->f_data;
   1274 
   1275 	KASSERT(fp == kn->kn_obj);
   1276 
   1277 	if (kn->kn_filter != EVFILT_READ)
   1278 		return EINVAL;
   1279 
   1280 	kn->kn_fop = &inotify_read_filtops;
   1281 	mutex_enter(&ifd->ifd_lock);
   1282 	selrecord_knote(&ifd->ifd_sel, kn);
   1283 	mutex_exit(&ifd->ifd_lock);
   1284 
   1285 	return 0;
   1286 }
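
/*
 * Illustrative sketch (not part of this file): the only reason
 * EVFILT_READ is supported here is so that an emulated Linux process
 * can monitor the inotify fd with epoll:
 */
#if 0
#include <sys/epoll.h>
#include <sys/inotify.h>

static void
example_epoll(int epfd, int ifd)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = ifd };

	/* the EVFILT_READ knote ends up in inotify_kqfilter() above */
	epoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &ev);
}
#endif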
   1287 
   1288 /*
   1289  * Detach a filter from an inotify instance.
   1290  */
   1291 static void
   1292 inotify_read_filt_detach(struct knote *kn)
   1293 {
   1294 	struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
   1295 
   1296 	mutex_enter(&ifd->ifd_lock);
   1297 	selremove_knote(&ifd->ifd_sel, kn);
   1298 	mutex_exit(&ifd->ifd_lock);
   1299 }
   1300 
   1301 /*
   1302  * Handle EVFILT_READ events.  kn_data is set to the number of pending events.
   1303  */
   1304 static int
   1305 inotify_read_filt_event(struct knote *kn, long hint)
   1306 {
   1307 	struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
   1308 
   1309 	if (hint != 0) {
   1310 		KASSERT(mutex_owned(&ifd->ifd_lock));
   1311 		KASSERT(mutex_owned(&ifd->ifd_qlock));
   1312 		KASSERT(hint == NOTE_LOWAT);
   1313 
   1314 		kn->kn_data = ifd->ifd_qcount;
   1315 	}
   1316 
   1317 	return kn->kn_data > 0;
   1318 }
   1319 
   1320 /*
   1321  * Restart the inotify instance.
   1322  */
   1323 static void
   1324 inotify_restart(file_t *fp)
   1325 {
   1326 	struct inotifyfd *ifd = fp->f_data;
   1327 
   1328 	mutex_enter(&ifd->ifd_qlock);
   1329 	cv_broadcast(&ifd->ifd_qcv);
   1330 	mutex_exit(&ifd->ifd_qlock);
   1331 }
   1332