sys_timerfd.c revision 1.3 1 /* $NetBSD: sys_timerfd.c,v 1.3 2021/09/20 11:12:35 skrll Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_timerfd.c,v 1.3 2021/09/20 11:12:35 skrll Exp $");
34
35 /*
36 * timerfd
37 *
38 * Timerfd objects are similar to POSIX timers, except they are associated
39 * with a file descriptor rather than a process. Timerfd objects are
40 * created with the timerfd_create(2) system call, similar to timer_create(2).
41 * The timerfd analogues for timer_gettime(2) and timer_settime(2) are
42 * timerfd_gettime(2) and timerfd_settime(2), respectively.
43 *
44 * When a timerfd object's timer fires, an internal counter is incremented.
45 * When this counter is non-zero, the descriptor associated with the timerfd
46 * object is "readable". Note that this is slightly different than the
47 * POSIX timer "overrun" counter, which only increments if the timer fires
48 * again while the notification signal is already pending. Thus, we are
49 * responsible for incrementing the "overrun" counter each time the timerfd
50 * timer fires.
51 *
52 * This implementation is API compatible with the Linux timerfd interface.
53 */
54
55 #include <sys/param.h>
56 #include <sys/types.h>
57 #include <sys/condvar.h>
58 #include <sys/file.h>
59 #include <sys/filedesc.h>
60 #include <sys/kauth.h>
61 #include <sys/mutex.h>
62 #include <sys/poll.h>
63 #include <sys/proc.h>
64 #include <sys/select.h>
65 #include <sys/stat.h>
66 #include <sys/syscallargs.h>
67 #include <sys/timerfd.h>
68 #include <sys/uio.h>
69
70 /* N.B. all timerfd state is protected by itimer_lock() */
71 struct timerfd {
72 struct itimer tfd_itimer;
73 kcondvar_t tfd_read_wait;
74 kcondvar_t tfd_restart_wait;
75 struct selinfo tfd_read_sel;
76 int64_t tfd_nwaiters;
77 bool tfd_cancel_on_set;
78 bool tfd_cancelled;
79 bool tfd_restarting;
80
81 /*
82 * Information kept for stat(2).
83 */
84 struct timespec tfd_btime; /* time created */
85 struct timespec tfd_mtime; /* last timerfd_settime() */
86 struct timespec tfd_atime; /* last read */
87 };
88
89 static void timerfd_wake(struct timerfd *);
90
91 static inline uint64_t
92 timerfd_fire_count(const struct timerfd * const tfd)
93 {
94 return (unsigned int)tfd->tfd_itimer.it_overruns;
95 }
96
97 static inline bool
98 timerfd_is_readable(const struct timerfd * const tfd)
99 {
100 return tfd->tfd_itimer.it_overruns != 0 || tfd->tfd_cancelled;
101 }
102
103 /*
104 * timerfd_fire:
105 *
106 * Called when the timerfd's timer fires.
107 *
108 * Called from a callout with itimer lock held.
109 */
110 static void
111 timerfd_fire(struct itimer * const it)
112 {
113 struct timerfd * const tfd =
114 container_of(it, struct timerfd, tfd_itimer);
115
116 it->it_overruns++;
117 timerfd_wake(tfd);
118 }
119
120 /*
121 * timerfd_realtime_changed:
122 *
123 * Called when CLOCK_REALTIME is changed with clock_settime()
124 * or settimeofday().
125 *
126 * Called with itimer lock held.
127 */
128 static void
129 timerfd_realtime_changed(struct itimer * const it)
130 {
131 struct timerfd * const tfd =
132 container_of(it, struct timerfd, tfd_itimer);
133
134 /* Should only be called when timer is armed. */
135 KASSERT(timespecisset(&it->it_time.it_value));
136
137 if (tfd->tfd_cancel_on_set) {
138 tfd->tfd_cancelled = true;
139 timerfd_wake(tfd);
140 }
141 }
142
143 static const struct itimer_ops timerfd_itimer_monotonic_ops = {
144 .ito_fire = timerfd_fire,
145 };
146
147 static const struct itimer_ops timerfd_itimer_realtime_ops = {
148 .ito_fire = timerfd_fire,
149 .ito_realtime_changed = timerfd_realtime_changed,
150 };
151
152 /*
153 * timerfd_create:
154 *
155 * Create a timerfd object.
156 */
157 static struct timerfd *
158 timerfd_create(clockid_t const clock_id, int const flags)
159 {
160 struct timerfd * const tfd = kmem_zalloc(sizeof(*tfd), KM_SLEEP);
161
162 KASSERT(clock_id == CLOCK_REALTIME || clock_id == CLOCK_MONOTONIC);
163
164 cv_init(&tfd->tfd_read_wait, "tfdread");
165 cv_init(&tfd->tfd_restart_wait, "tfdrstrt");
166 selinit(&tfd->tfd_read_sel);
167 getnanotime(&tfd->tfd_btime);
168
169 /* Caller deals with TFD_CLOEXEC and TFD_NONBLOCK. */
170
171 itimer_lock();
172 itimer_init(&tfd->tfd_itimer,
173 clock_id == CLOCK_REALTIME ? &timerfd_itimer_realtime_ops
174 : &timerfd_itimer_monotonic_ops,
175 clock_id, NULL);
176 itimer_unlock();
177
178 return tfd;
179 }
180
181 /*
182 * timerfd_destroy:
183 *
184 * Destroy a timerfd object.
185 */
186 static void
187 timerfd_destroy(struct timerfd * const tfd)
188 {
189
190 KASSERT(tfd->tfd_nwaiters == 0);
191 KASSERT(tfd->tfd_restarting == false);
192
193 itimer_lock();
194 itimer_poison(&tfd->tfd_itimer);
195 itimer_fini(&tfd->tfd_itimer); /* drops itimer lock */
196
197 cv_destroy(&tfd->tfd_read_wait);
198 cv_destroy(&tfd->tfd_restart_wait);
199
200 seldestroy(&tfd->tfd_read_sel);
201
202 kmem_free(tfd, sizeof(*tfd));
203 }
204
205 /*
206 * timerfd_wait:
207 *
208 * Block on a timerfd. Handles non-blocking, as well as
209 * the restart cases.
210 */
211 static int
212 timerfd_wait(struct timerfd * const tfd, int const fflag)
213 {
214 extern kmutex_t itimer_mutex; /* XXX */
215 int error;
216
217 if (fflag & FNONBLOCK) {
218 return EAGAIN;
219 }
220
221 /*
222 * We're going to block. If there is a restart in-progress,
223 * wait for that to complete first.
224 */
225 while (tfd->tfd_restarting) {
226 cv_wait(&tfd->tfd_restart_wait, &itimer_mutex);
227 }
228
229 tfd->tfd_nwaiters++;
230 KASSERT(tfd->tfd_nwaiters > 0);
231 error = cv_wait_sig(&tfd->tfd_read_wait, &itimer_mutex);
232 tfd->tfd_nwaiters--;
233 KASSERT(tfd->tfd_nwaiters >= 0);
234
235 /*
236 * If a restart was triggered while we were asleep, we need
237 * to return ERESTART if no other error was returned. If we
238 * are the last waiter coming out of the restart drain, clear
239 * the condition.
240 */
241 if (tfd->tfd_restarting) {
242 if (error == 0) {
243 error = ERESTART;
244 }
245 if (tfd->tfd_nwaiters == 0) {
246 tfd->tfd_restarting = false;
247 cv_broadcast(&tfd->tfd_restart_wait);
248 }
249 }
250
251 return error;
252 }
253
254 /*
255 * timerfd_wake:
256 *
257 * Wake LWPs blocked on a timerfd.
258 */
259 static void
260 timerfd_wake(struct timerfd * const tfd)
261 {
262
263 if (tfd->tfd_nwaiters) {
264 cv_broadcast(&tfd->tfd_read_wait);
265 }
266 selnotify(&tfd->tfd_read_sel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
267 }
268
269 /*
270 * timerfd file operations
271 */
272
273 static int
274 timerfd_fop_read(file_t * const fp, off_t * const offset,
275 struct uio * const uio, kauth_cred_t const cred, int const flags)
276 {
277 struct timerfd * const tfd = fp->f_timerfd;
278 struct itimer * const it = &tfd->tfd_itimer;
279 int const fflag = fp->f_flag;
280 uint64_t return_value;
281 int error;
282
283 if (uio->uio_resid < sizeof(uint64_t)) {
284 return EINVAL;
285 }
286
287 itimer_lock();
288
289 while (!timerfd_is_readable(tfd)) {
290 if ((error = timerfd_wait(tfd, fflag)) != 0) {
291 itimer_unlock();
292 return error;
293 }
294 }
295
296 if (tfd->tfd_cancelled) {
297 itimer_unlock();
298 return ECANCELED;
299 }
300
301 return_value = timerfd_fire_count(tfd);
302 it->it_overruns = 0;
303
304 getnanotime(&tfd->tfd_atime);
305
306 itimer_unlock();
307
308 error = uiomove(&return_value, sizeof(return_value), uio);
309
310 return error;
311 }
312
313 static int
314 timerfd_fop_ioctl(file_t * const fp, unsigned long const cmd, void * const data)
315 {
316 struct timerfd * const tfd = fp->f_timerfd;
317 int error = 0;
318
319 switch (cmd) {
320 case TFD_IOC_SET_TICKS: {
321 const uint64_t * const new_ticksp = data;
322 if (*new_ticksp > INT_MAX) {
323 return EINVAL;
324 }
325 itimer_lock();
326 tfd->tfd_itimer.it_overruns = (int)*new_ticksp;
327 itimer_unlock();
328 break;
329 }
330
331 default:
332 error = EPASSTHROUGH;
333 }
334
335 return error;
336 }
337
338 static int
339 timerfd_fop_poll(file_t * const fp, int const events)
340 {
341 struct timerfd * const tfd = fp->f_timerfd;
342 int revents = events & (POLLOUT | POLLWRNORM);
343
344 if (events & (POLLIN | POLLRDNORM)) {
345 itimer_lock();
346 if (timerfd_is_readable(tfd)) {
347 revents |= events & (POLLIN | POLLRDNORM);
348 } else {
349 selrecord(curlwp, &tfd->tfd_read_sel);
350 }
351 itimer_unlock();
352 }
353
354 return revents;
355 }
356
357 static int
358 timerfd_fop_stat(file_t * const fp, struct stat * const st)
359 {
360 struct timerfd * const tfd = fp->f_timerfd;
361
362 memset(st, 0, sizeof(*st));
363
364 itimer_lock();
365 st->st_size = (off_t)timerfd_fire_count(tfd);
366 st->st_atimespec = tfd->tfd_atime;
367 st->st_mtimespec = tfd->tfd_mtime;
368 itimer_unlock();
369
370 st->st_blksize = sizeof(uint64_t);
371 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
372 st->st_blocks = 1;
373 st->st_birthtimespec = tfd->tfd_btime;
374 st->st_ctimespec = st->st_mtimespec;
375 st->st_uid = kauth_cred_geteuid(fp->f_cred);
376 st->st_gid = kauth_cred_getegid(fp->f_cred);
377
378 return 0;
379 }
380
381 static int
382 timerfd_fop_close(file_t * const fp)
383 {
384 struct timerfd * const tfd = fp->f_timerfd;
385
386 fp->f_timerfd = NULL;
387 timerfd_destroy(tfd);
388
389 return 0;
390 }
391
392 static void
393 timerfd_filt_read_detach(struct knote * const kn)
394 {
395 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd;
396
397 itimer_lock();
398 KASSERT(kn->kn_hook == tfd);
399 selremove_knote(&tfd->tfd_read_sel, kn);
400 itimer_unlock();
401 }
402
403 static int
404 timerfd_filt_read(struct knote * const kn, long const hint)
405 {
406 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd;
407
408 if (hint & NOTE_SUBMIT) {
409 KASSERT(itimer_lock_held());
410 } else {
411 itimer_lock();
412 }
413
414 kn->kn_data = (int64_t)timerfd_fire_count(tfd);
415
416 if ((hint & NOTE_SUBMIT) == 0) {
417 itimer_unlock();
418 }
419
420 return kn->kn_data != 0;
421 }
422
423 static const struct filterops timerfd_read_filterops = {
424 .f_isfd = 1,
425 .f_detach = timerfd_filt_read_detach,
426 .f_event = timerfd_filt_read,
427 };
428
429 static int
430 timerfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
431 {
432 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd;
433 struct selinfo *sel;
434
435 switch (kn->kn_filter) {
436 case EVFILT_READ:
437 sel = &tfd->tfd_read_sel;
438 kn->kn_fop = &timerfd_read_filterops;
439 break;
440
441 default:
442 return EINVAL;
443 }
444
445 kn->kn_hook = tfd;
446
447 itimer_lock();
448 selrecord_knote(sel, kn);
449 itimer_unlock();
450
451 return 0;
452 }
453
454 static void
455 timerfd_fop_restart(file_t * const fp)
456 {
457 struct timerfd * const tfd = fp->f_timerfd;
458
459 /*
460 * Unblock blocked reads in order to allow close() to complete.
461 * System calls return ERESTART so that the fd is revalidated.
462 */
463
464 itimer_lock();
465
466 if (tfd->tfd_nwaiters != 0) {
467 tfd->tfd_restarting = true;
468 cv_broadcast(&tfd->tfd_read_wait);
469 }
470
471 itimer_unlock();
472 }
473
474 static const struct fileops timerfd_fileops = {
475 .fo_name = "timerfd",
476 .fo_read = timerfd_fop_read,
477 .fo_write = fbadop_write,
478 .fo_ioctl = timerfd_fop_ioctl,
479 .fo_fcntl = fnullop_fcntl,
480 .fo_poll = timerfd_fop_poll,
481 .fo_stat = timerfd_fop_stat,
482 .fo_close = timerfd_fop_close,
483 .fo_kqfilter = timerfd_fop_kqfilter,
484 .fo_restart = timerfd_fop_restart,
485 };
486
487 /*
488 * timerfd_create(2) system call
489 */
490 int
491 do_timerfd_create(struct lwp * const l, clockid_t const clock_id,
492 int const flags, register_t *retval)
493 {
494 file_t *fp;
495 int fd, error;
496
497 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) {
498 return EINVAL;
499 }
500
501 switch (clock_id) {
502 case CLOCK_REALTIME:
503 case CLOCK_MONOTONIC:
504 /* allowed */
505 break;
506
507 default:
508 return EINVAL;
509 }
510
511 if ((error = fd_allocfile(&fp, &fd)) != 0) {
512 return error;
513 }
514
515 fp->f_flag = FREAD;
516 if (flags & TFD_NONBLOCK) {
517 fp->f_flag |= FNONBLOCK;
518 }
519 fp->f_type = DTYPE_TIMERFD;
520 fp->f_ops = &timerfd_fileops;
521 fp->f_timerfd = timerfd_create(clock_id, flags);
522 fd_set_exclose(l, fd, !!(flags & TFD_CLOEXEC));
523 fd_affix(curproc, fp, fd);
524
525 *retval = fd;
526 return 0;
527 }
528
529 int
530 sys_timerfd_create(struct lwp *l, const struct sys_timerfd_create_args *uap,
531 register_t *retval)
532 {
533 /* {
534 syscallarg(clockid_t) clock_id;
535 syscallarg(int) flags;
536 } */
537
538 return do_timerfd_create(l, SCARG(uap, clock_id), SCARG(uap, flags),
539 retval);
540 }
541
542 /*
543 * timerfd_gettime(2) system call.
544 */
545 int
546 do_timerfd_gettime(struct lwp *l, int fd, struct itimerspec *curr_value,
547 register_t *retval)
548 {
549 file_t *fp;
550
551 if ((fp = fd_getfile(fd)) == NULL) {
552 return EBADF;
553 }
554
555 if (fp->f_ops != &timerfd_fileops) {
556 fd_putfile(fd);
557 return EINVAL;
558 }
559
560 struct timerfd * const tfd = fp->f_timerfd;
561 itimer_lock();
562 itimer_gettime(&tfd->tfd_itimer, curr_value);
563 itimer_unlock();
564
565 fd_putfile(fd);
566 return 0;
567 }
568
569 int
570 sys_timerfd_gettime(struct lwp *l, const struct sys_timerfd_gettime_args *uap,
571 register_t *retval)
572 {
573 /* {
574 syscallarg(int) fd;
575 syscallarg(struct itimerspec *) curr_value;
576 } */
577
578 struct itimerspec oits;
579 int error;
580
581 error = do_timerfd_gettime(l, SCARG(uap, fd), &oits, retval);
582 if (error == 0) {
583 error = copyout(&oits, SCARG(uap, curr_value), sizeof(oits));
584 }
585 return error;
586 }
587
588 /*
589 * timerfd_settime(2) system call.
590 */
591 int
592 do_timerfd_settime(struct lwp *l, int fd, int flags,
593 const struct itimerspec *new_value, struct itimerspec *old_value,
594 register_t *retval)
595 {
596 file_t *fp;
597 int error;
598
599 if (flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) {
600 return EINVAL;
601 }
602
603 if ((fp = fd_getfile(fd)) == NULL) {
604 return EBADF;
605 }
606
607 if (fp->f_ops != &timerfd_fileops) {
608 fd_putfile(fd);
609 return EINVAL;
610 }
611
612 struct timerfd * const tfd = fp->f_timerfd;
613 struct itimer * const it = &tfd->tfd_itimer;
614
615 itimer_lock();
616
617 restart:
618 if (old_value != NULL) {
619 *old_value = it->it_time;
620 }
621 it->it_time = *new_value;
622
623 /*
624 * If we've been passed a relative value, convert it to an
625 * absolute, as that's what the itimer facility expects for
626 * non-virtual timers. Also ensure that this doesn't set it
627 * to zero or lets it go negative.
628 * XXXJRT re-factor.
629 */
630 if (timespecisset(&it->it_time.it_value) &&
631 (flags & TFD_TIMER_ABSTIME) == 0) {
632 struct timespec now;
633 if (it->it_clockid == CLOCK_REALTIME) {
634 getnanotime(&now);
635 } else { /* CLOCK_MONOTONIC */
636 getnanouptime(&now);
637 }
638 timespecadd(&it->it_time.it_value, &now,
639 &it->it_time.it_value);
640 }
641
642 error = itimer_settime(it);
643 if (error == ERESTART) {
644 goto restart;
645 }
646 KASSERT(error == 0);
647
648 /* Reset the expirations counter. */
649 it->it_overruns = 0;
650
651 if (it->it_clockid == CLOCK_REALTIME) {
652 tfd->tfd_cancelled = false;
653 tfd->tfd_cancel_on_set = !!(flags & TFD_TIMER_CANCEL_ON_SET);
654 }
655
656 getnanotime(&tfd->tfd_mtime);
657 itimer_unlock();
658
659 fd_putfile(fd);
660 return error;
661 }
662
663 int
664 sys_timerfd_settime(struct lwp *l, const struct sys_timerfd_settime_args *uap,
665 register_t *retval)
666 {
667 /* {
668 syscallarg(int) fd;
669 syscallarg(int) flags;
670 syscallarg(const struct itimerspec *) new_value;
671 syscallarg(struct itimerspec *) old_value;
672 } */
673
674 struct itimerspec nits, oits, *oitsp = NULL;
675 int error;
676
677 error = copyin(SCARG(uap, new_value), &nits, sizeof(nits));
678 if (error) {
679 return error;
680 }
681
682 if (SCARG(uap, old_value) != NULL) {
683 oitsp = &oits;
684 }
685
686 error = do_timerfd_settime(l, SCARG(uap, fd), SCARG(uap, flags),
687 &nits, oitsp, retval);
688 if (error == 0 && oitsp != NULL) {
689 error = copyout(oitsp, SCARG(uap, old_value), sizeof(*oitsp));
690 }
691 return error;
692 }
693