/* $NetBSD: kern_tc.c,v 1.59 2020/05/27 09:09:50 rin Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.  Poul-Henning Kamp
 * ---------------------------------------------------------------------------
 */

#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.59 2020/05/27 09:09:50 rin Exp $");

#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/reboot.h>	/* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/xcall.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return ++now;
}

static struct timecounter dummy_timecounter = {
	.tc_get_timecount = dummy_get_timecount,
	.tc_counter_mask = ~0u,
	.tc_frequency = 1000000,
	.tc_name = "dummy",
	.tc_quality = -1000000,
	.tc_priv = NULL,
};

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;	 /* active timecounter */
	int64_t			th_adjustment;	 /* frequency adjustment */
						 /* (NTP/adjtime) */
	uint64_t		th_scale;	 /* scale factor (counter */
						 /* tick->time) */
	uint64_t		th_offset_count; /* offset at last time */
						 /* update (tc_windup()) */
	struct bintime		th_offset;	 /* bin (up)time at windup */
	struct timeval		th_microtime;	 /* cached microtime */
	struct timespec		th_nanotime;	 /* cached nanotime */
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;	 /* current generation */
	struct timehands	*th_next;	 /* next timehand */
};

static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1, .frac = 0 },
	.th_generation = 1,
	.th_next = &th1,
};

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

volatile time_t time_second __cacheline_aligned = 1;
volatile time_t time_uptime __cacheline_aligned = 1;

static struct bintime timebasebin;

static int timestepwarnings;

kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;

/*
 * sysctl helper routine for kern.timecounter.hardware
 */
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error;
	char newname[MAX_TCNAMELEN];
	struct timecounter *newtc, *tc;

	tc = timecounter;

	strlcpy(newname, tc->tc_name, sizeof(newname));

	node = *rnode;
	node.sysctl_data = newname;
	node.sysctl_size = sizeof(newname);

	error = sysctl_lookup(SYSCTLFN_CALL(&node));

	if (error ||
	    newp == NULL ||
	    strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
		return error;

	if (l != NULL && (error = kauth_authorize_system(l->l_cred,
	    KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
	    NULL, NULL)) != 0)
		return error;

	if (!cold)
		mutex_spin_enter(&timecounter_lock);
	error = EINVAL;
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;
		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);
		timecounter = newtc;
		error = 0;
		break;
	}
	if (!cold)
		mutex_spin_exit(&timecounter_lock);
	return error;
}

static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
	char buf[MAX_TCNAMELEN+48];
	char *where;
	const char *spc;
	struct timecounter *tc;
	size_t needed, left, slen;
	int error, mods;

	if (newp != NULL)
		return EPERM;
	if (namelen != 0)
		return EINVAL;

	mutex_spin_enter(&timecounter_lock);
 retry:
	spc = "";
	error = 0;
	needed = 0;
	left = *oldlenp;
	where = oldp;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		if (where == NULL) {
			needed += sizeof(buf);	/* be conservative */
		} else {
			slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
			    " Hz)", spc, tc->tc_name, tc->tc_quality,
			    tc->tc_frequency);
			if (left < slen + 1)
				break;
			mods = timecounter_mods;
			mutex_spin_exit(&timecounter_lock);
			error = copyout(buf, where, slen + 1);
			mutex_spin_enter(&timecounter_lock);
			if (mods != timecounter_mods) {
				goto retry;
			}
			spc = " ";
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_spin_exit(&timecounter_lock);

	*oldlenp = needed;
	return error;
}

SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
	const struct sysctlnode *node;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "timecounter",
		SYSCTL_DESCR("time counter information"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node != NULL) {
		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT,
			CTLTYPE_STRING, "choice",
			SYSCTL_DESCR("available counters"),
			sysctl_kern_timecounter_choice, 0, NULL, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_STRING, "hardware",
			SYSCTL_DESCR("currently active time counter"),
			sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "timestepwarnings",
			SYSCTL_DESCR("log time steps"),
			NULL, 0, &timestepwarnings, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}
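
/*
 * Illustrative userland use of the nodes created above (a sketch only;
 * the counter name and frequency shown are machine-dependent examples,
 * not output from any particular system).  The "choice" handler prints
 * each counter as "name(q=<quality>, f=<frequency> Hz)" and the
 * "hardware" handler accepts one of those names:
 *
 *	$ sysctl kern.timecounter.choice
 *	kern.timecounter.choice = hpet0(q=2000, f=14318180 Hz) ...
 *	# sysctl -w kern.timecounter.hardware=hpet0
 */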

#ifdef TC_COUNTERS
#define TC_STATS(name)							\
static struct evcnt n##name =						\
    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name);	\
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime);    TC_STATS(nanouptime);    TC_STATS(microuptime);
TC_STATS(bintime);      TC_STATS(nanotime);      TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime);   TC_STATS(getnanotime);   TC_STATS(getmicrotime);
TC_STATS(setclock);
#define	TC_COUNT(var)	var.ev_count++
#undef TC_STATS
#else
#define	TC_COUNT(var)	/* nothing */
#endif	/* TC_COUNTERS */

static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * it was when we copied it to the timehands' offset_count.
 */
static inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return (tc->tc_get_timecount(tc) -
	    th->th_offset_count) & tc->tc_counter_mask;
}
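
/*
 * Worked example (illustrative numbers, not tied to any real counter):
 * with a 24-bit counter (tc_counter_mask == 0xffffff), a hardware read
 * of 0x000005 taken after an offset_count of 0xfffff0 gives
 *
 *	(0x000005 - 0xfffff0) & 0xffffff == 0x000015	(21 ticks)
 *
 * i.e. the mask makes the unsigned subtraction come out right even when
 * the counter has wrapped since the last windup.
 */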

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/timevar.h> for a description of these 12 functions.
 */

void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	lwp_t *l;
	u_int lgen, gen;

	TC_COUNT(nbinuptime);

	/*
	 * Provide exclusion against tc_detach().
	 *
	 * We record the number of timecounter removals before accessing
	 * timecounter state.  Note that the LWP can be using multiple
	 * "generations" at once, due to interrupts (interrupted while in
	 * this function).  Hardware interrupts will borrow the interrupted
	 * LWP's l_tcgen value for this purpose, and can themselves be
	 * interrupted by higher priority interrupts.  In this case we need
	 * to ensure that the oldest generation in use is recorded.
	 *
	 * splsched() is too expensive to use, so we take care to structure
	 * this code in such a way that it is not required.  Likewise, we
	 * do not disable preemption.
	 *
	 * Memory barriers are also too expensive to use for such a
	 * performance critical function.  The good news is that we do not
	 * need memory barriers for this type of exclusion, as the thread
	 * updating timecounter_removals will issue a broadcast cross call
	 * before inspecting our l_tcgen value (this elides memory ordering
	 * issues).
	 */
	l = curlwp;
	lgen = l->l_tcgen;
	if (__predict_true(lgen == 0)) {
		l->l_tcgen = timecounter_removals;
	}
	__insn_barrier();

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);

	__insn_barrier();
	l->l_tcgen = lgen;
}
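
/*
 * A minimal sketch of a typical consumer (hypothetical code, not part of
 * this file): measuring an elapsed interval with the monotonic uptime
 * clock.
 *
 *	struct bintime start, end;
 *
 *	binuptime(&start);
 *	... do the work being timed ...
 *	binuptime(&end);
 *	bintime_sub(&end, &start);	// end now holds the elapsed time
 *
 * The get*() variants below trade precision for speed by returning the
 * values cached at the last tc_windup() instead of reading the hardware.
 */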

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanouptime);
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrouptime);
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	TC_COUNT(nbintime);
	binuptime(bt);
	bintime_add(bt, &timebasebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanotime);
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrotime);
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbinuptime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbintime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &timebasebin);
}

static inline void
dogetnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void dtrace_getnanotime(struct timespec *tsp);

void
dtrace_getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanoboottime(struct timespec *tsp)
{
	struct bintime bt;

	getbinboottime(&bt);
	bintime2timespec(&bt, tsp);
}

void
getmicroboottime(struct timeval *tvp)
{
	struct bintime bt;

	getbinboottime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinboottime(struct bintime *bt)
{

	/*
	 * XXX Need lockless read synchronization around timebasebin
	 * (and not just here).
	 */
	*bt = timebasebin;
}

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz",
		    tc->tc_name, (uintmax_t)tc->tc_frequency);
		aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
	} else if (tc->tc_quality >= 0 || bootverbose) {
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz "
		    "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	mutex_spin_enter(&timecounter_lock);
	tc->tc_next = timecounters;
	timecounters = tc;
	timecounter_mods++;
	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
	    (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency > timecounter->tc_frequency))) {
		(void)tc->tc_get_timecount(tc);
		(void)tc->tc_get_timecount(tc);
		timecounter = tc;
		tc_windup();
	}
	mutex_spin_exit(&timecounter_lock);
}
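
/*
 * A sketch of how a hypothetical machine-dependent driver would register
 * its hardware counter (the foo_* names, the register read and the numbers
 * are made up for illustration only):
 *
 *	static u_int
 *	foo_get_timecount(struct timecounter *tc)
 *	{
 *		return FOO_READ_COUNTER_REGISTER();	// driver-specific
 *	}
 *
 *	static struct timecounter foo_timecounter = {
 *		.tc_get_timecount = foo_get_timecount,
 *		.tc_counter_mask = 0xffffffffu,		// counter width
 *		.tc_frequency = 25000000,		// e.g. 25 MHz
 *		.tc_name = "foo",
 *		.tc_quality = 100,
 *	};
 *
 *	tc_init(&foo_timecounter);
 *
 * tc_init() then decides, based on quality and frequency, whether the new
 * counter becomes the active one.
 */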

/*
 * Pick a new timecounter due to the existing counter going bad.
 */
static void
tc_pick(void)
{
	struct timecounter *best, *tc;

	KASSERT(mutex_owned(&timecounter_lock));

	for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc->tc_quality > best->tc_quality)
			best = tc;
		else if (tc->tc_quality < best->tc_quality)
			continue;
		else if (tc->tc_frequency > best->tc_frequency)
			best = tc;
	}
	(void)best->tc_get_timecount(best);
	(void)best->tc_get_timecount(best);
	timecounter = best;
}

/*
 * A timecounter has gone bad, arrange to pick a new one at the next
 * clock tick.
 */
void
tc_gonebad(struct timecounter *tc)
{

	tc->tc_quality = -100;
	membar_producer();
	atomic_inc_uint(&timecounter_bad);
}

/*
 * Stop using a timecounter and remove it from the timecounters list.
 */
int
tc_detach(struct timecounter *target)
{
	struct timecounter *tc;
	struct timecounter **tcp = NULL;
	int removals;
	lwp_t *l;

	/* First, find the timecounter. */
	mutex_spin_enter(&timecounter_lock);
	for (tcp = &timecounters, tc = timecounters;
	     tc != NULL;
	     tcp = &tc->tc_next, tc = tc->tc_next) {
		if (tc == target)
			break;
	}
	if (tc == NULL) {
		mutex_spin_exit(&timecounter_lock);
		return ESRCH;
	}

	/* And now, remove it. */
	*tcp = tc->tc_next;
	if (timecounter == target) {
		tc_pick();
		tc_windup();
	}
	timecounter_mods++;
	removals = timecounter_removals++;
	mutex_spin_exit(&timecounter_lock);

	/*
	 * We now have to determine if any threads in the system are still
	 * making use of this timecounter.
	 *
	 * We issue a broadcast cross call to elide memory ordering issues,
	 * then scan all LWPs in the system looking at each one's timecounter
	 * generation number.  We need to see a value of zero (not actively
	 * using a timecounter) or a value greater than our removal value.
	 *
	 * We may race with threads that read `timecounter_removals' and
	 * then get preempted before updating `l_tcgen'.  This is not a
	 * problem, since it means that these threads have not yet started
	 * accessing timecounter state.  All we need is one clean snapshot
	 * of the system where every thread appears not to be using old
	 * timecounter state.
	 */
	for (;;) {
		xc_barrier(0);

		mutex_enter(&proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_tcgen == 0 || l->l_tcgen > removals) {
				/*
				 * Not using timecounter or old timecounter
				 * state at time of our xcall or later.
				 */
				continue;
			}
			break;
		}
		mutex_exit(&proc_lock);

		/*
		 * If the timecounter is still in use, wait at least 10ms
		 * before retrying.
		 */
		if (l == NULL) {
			return 0;
		}
		(void)kpause("tcdetach", false, mstohz(10), NULL);
	}
}

/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{

	return timehands->th_counter->tc_frequency;
}

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(const struct timespec *ts)
{
	struct timespec ts2;
	struct bintime bt, bt2;

	mutex_spin_enter(&timecounter_lock);
	TC_COUNT(nsetclock);
	binuptime(&bt2);
	timespec2bintime(ts, &bt);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &timebasebin);
	timebasebin = bt;
	tc_windup();
	mutex_spin_exit(&timecounter_lock);

	if (timestepwarnings) {
		bintime2timespec(&bt2, &ts2);
		log(LOG_INFO,
		    "Time stepped from %lld.%09ld to %lld.%09ld\n",
		    (long long)ts2.tv_sec, ts2.tv_nsec,
		    (long long)ts->tv_sec, ts->tv_nsec);
	}
}
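
/*
 * Worked example (illustrative numbers): if the system has been up for
 * 100 seconds (binuptime == 100) and ts asks for a UTC time of 1,000,000
 * seconds, timebasebin becomes 999,900 seconds.  From then on
 * bintime() == binuptime() + timebasebin == the requested UTC time; i.e.
 * stepping the clock only moves our estimate of when we booted, never
 * the monotonic uptime.
 */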

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	uint64_t scale;
	u_int delta, ncount, ogen;
	int i, s_update;
	time_t t;

	KASSERT(mutex_owned(&timecounter_lock));

	s_update = 0;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.  Ensure global
	 * visibility of the generation before proceeding.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	membar_producer();
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 * If NTP is not compiled in, ntp_update_second still calculates
	 * the adjustment resulting from adjtime() calls.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &timebasebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		s_update = 1;
		if (bt.sec != t)
			timebasebin.sec += bt.sec - t;
	}

	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);
	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		s_update = 1;
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, but that
	 * leaves suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 *
	 */
	if (s_update) {
		scale = (uint64_t)1 << 63;
		scale += (th->th_adjustment / 1024) * 2199;
		scale /= th->th_counter->tc_frequency;
		th->th_scale = scale * 2;
	}
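
	/*
	 * Worked example (illustrative frequency): for a 1 GHz counter and
	 * no NTP adjustment, th_scale ends up at roughly 2^64 / 10^9 =~
	 * 18446744074, so each hardware tick adds about 1 ns worth of
	 * 2^-64 second fractions in the bintime_addx() above.
	 */
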
	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.  Ensure
	 * changes are globally visible before changing.
	 */
	if (++ogen == 0)
		ogen = 1;
	membar_producer();
	th->th_generation = ogen;

	/*
	 * Go live with the new struct timehands.  Ensure changes are
	 * globally visible before changing.
	 */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	membar_producer();
	timehands = th;

	/*
	 * Force users of the old timehand to move on.  This is
	 * necessary for MP systems; we need to ensure that the
	 * consumers will move away from the old timehand before
	 * we begin updating it again when we eventually wrap
	 * around.
	 */
	if (++tho->th_generation == 0)
		tho->th_generation = 1;
}

/*
 * RFC 2783 PPS-API implementation.
 */

int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
	pps_params_t *app;
	pps_info_t *pipi;
#ifdef PPS_SYNC
	int *epi;
#endif

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	switch (cmd) {
	case PPS_IOC_CREATE:
		return 0;
	case PPS_IOC_DESTROY:
		return 0;
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return EINVAL;
		pps->ppsparam = *app;
		return 0;
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return 0;
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return 0;
	case PPS_IOC_FETCH:
		pipi = (pps_info_t *)data;
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		*pipi = pps->ppsinfo;
		return 0;
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		epi = (int *)data;
		/* XXX Only root should be able to do this */
		if (*epi & ~pps->ppscap)
			return EINVAL;
		pps->kcmode = *epi;
		return 0;
#else
		return EOPNOTSUPP;
#endif
	default:
		return EPASSTHROUGH;
	}
}

void
pps_init(struct pps_state *pps)
{

	KASSERT(mutex_owned(&timecounter_lock));

	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

/*
 * capture a timestamp in the pps structure
 */
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(mutex_owned(&timecounter_lock));
	KASSERT(pps != NULL);

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count;
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif

/*
 * process a pps_capture()ed event
 */
void
pps_event(struct pps_state *pps, int event)
{
	pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
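
/*
 * A sketch of a hypothetical driver feeding its PPS pulse in from an
 * interrupt handler (sc and sc_pps_state are assumed driver members, not
 * defined here; the locking matches the KASSERTs in pps_capture() and
 * pps_event()):
 *
 *	mutex_spin_enter(&timecounter_lock);
 *	pps_capture(&sc->sc_pps_state);
 *	pps_event(&sc->sc_pps_state, PPS_CAPTUREASSERT);
 *	mutex_spin_exit(&timecounter_lock);
 */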

/*
 * extended pps api / kernel pll/fll entry point
 *
 * feed reference time stamps to PPS engine
 *
 * will simulate a PPS event and feed
 * the NTP PLL/FLL if requested.
 *
 * the ref time stamps should arrive roughly once a second; they do not
 * need to be exactly in phase with the UTC second, but should be close
 * to it.  this relaxation of requirements allows callout driven
 * timestamping mechanisms to feed the pps capture/kernel pll logic.
 *
 * calling pattern is:
 *  pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
 *  read timestamp from reference source
 *  pps_ref_event()
 *
 * supported refmodes:
 *	PPS_REFEVNT_CAPTURE
 *		use system timestamp of pps_capture()
 *	PPS_REFEVNT_CURRENT
 *		use system timestamp of this call
 *	PPS_REFEVNT_CAPCUR
 *		use average of read capture and current system time stamp
 *	PPS_REFEVNT_PPS
 *		assume timestamp on second mark - ref_ts is ignored
 *
 */

void
pps_ref_event(struct pps_state *pps,
	      int event,
	      struct bintime *ref_ts,
	      int refmode
	)
{
	struct bintime bt;	/* current time */
	struct bintime btd;	/* time difference */
	struct bintime bt_ref;	/* reference time */
	struct timespec ts, *tsp, *osp;
	struct timehands *th;
	uint64_t tcount, acount, dcount, *pcount;
	int foff, gen;
#ifdef PPS_SYNC
	int fhard;
#endif
	pps_seq_t *pseq;

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	/* pick up current time stamp if needed */
	if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
		/* pick up current time stamp */
		th = timehands;
		gen = th->th_generation;
		tcount = (uint64_t)tc_delta(th) + th->th_offset_count;
		if (gen != th->th_generation)
			gen = 0;

		/* If the timecounter was wound up underneath us, bail out. */
		if (pps->capgen == 0 ||
		    pps->capgen != pps->capth->th_generation ||
		    gen == 0 ||
		    gen != pps->capgen) {
#ifdef PPS_DEBUG
			if (ppsdebug & 0x1) {
				log(LOG_DEBUG,
				    "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
				    pps, event);
			}
#endif
			return;
		}
	} else {
		tcount = 0;	/* keep GCC happy */
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x1) {
		struct timespec tmsp;

		if (ref_ts == NULL) {
			tmsp.tv_sec = 0;
			tmsp.tv_nsec = 0;
		} else {
			bintime2timespec(ref_ts, &tmsp);
		}

		log(LOG_DEBUG,
		    "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
		    ".%09"PRIi32", refmode=0x%1x)\n",
		    pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
	}
#endif

	/* setup correct event references */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/* determine system time stamp according to refmode */
	dcount = 0;	/* keep GCC happy */
	switch (refmode & PPS_REFEVNT_RMASK) {
	case PPS_REFEVNT_CAPTURE:
		acount = pps->capcount;	/* use capture timestamp */
		break;

	case PPS_REFEVNT_CURRENT:
		acount = tcount;	/* use current timestamp */
		break;

	case PPS_REFEVNT_CAPCUR:
		/*
		 * calculate counter value between pps_capture() and
		 * pps_ref_event()
		 */
		dcount = tcount - pps->capcount;
		acount = (dcount / 2) + pps->capcount;
		break;

	default:		/* ignore call error silently */
		return;
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		pps->capcount = acount;
		*pcount = acount;
		pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
		if (ppsdebug & 0x1) {
			log(LOG_DEBUG,
			    "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
			    pps, event);
		}
#endif
		return;
	}

	pps->capcount = acount;

	/* Convert the count to a bintime. */
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
	bintime_add(&bt, &timebasebin);

	if ((refmode & PPS_REFEVNT_PPS) == 0) {
		/* determine difference to reference time stamp */
		bt_ref = *ref_ts;

		btd = bt;
		bintime_sub(&btd, &bt_ref);

		/*
		 * simulate a PPS timestamp by dropping the fraction
		 * and applying the offset
		 */
		if (bt.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt.sec++;
		bt.frac = 0;
		bintime_add(&bt, &btd);
	} else {
		/*
		 * create ref_ts from current time -
		 * we are supposed to be called on
		 * the second mark
		 */
		bt_ref = bt;
		if (bt_ref.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt_ref.sec++;
		bt_ref.frac = 0;
	}

	/* convert bintime to timestamp */
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	/* store time stamp */
	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	/* add offset correction */
	if (foff) {
		timespecadd(tsp, osp, tsp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x2) {
		struct timespec ts2;
		struct timespec ts3;

		bintime2timespec(&bt_ref, &ts2);

		bt.sec = 0;
		bt.frac = 0;

		if (refmode & PPS_REFEVNT_CAPCUR) {
			bintime_addx(&bt, pps->capth->th_scale * dcount);
		}
		bintime2timespec(&bt, &ts3);

		log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
		    ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
		    ts2.tv_sec, (int32_t)ts2.tv_nsec,
		    tsp->tv_sec, (int32_t)tsp->tv_nsec,
		    timespec2ns(&ts3));
	}
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;
		uint64_t div;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event (mod 1 second) thus
		 * we are actually looking at the frequency difference scaled
		 * in nsec.
		 * As the counter time stamps are not truly at 1Hz
		 * we need to scale the count by the elapsed
		 * reference time.
		 * valid sampling interval: [0.5..2[ sec
		 */

		/* calculate elapsed raw count */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;

		/* calculate elapsed ref time */
		btd = bt_ref;
		bintime_sub(&btd, &pps->ref_time);
		pps->ref_time = bt_ref;

		/* check that we stay below 2 sec */
		if (btd.sec < 0 || btd.sec > 1)
			return;

		/* we want at least 0.5 sec between samples */
		if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
			return;

		/*
		 * calculate cycles per period by multiplying
		 * the frequency with the elapsed period
		 * we pick a fraction of 30 bits
		 * ~1ns resolution for elapsed time
		 */
		div = (uint64_t)btd.sec << 30;
		div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
		div *= pps->capth->th_counter->tc_frequency;
		div >>= 30;

		if (div == 0)	/* safeguard */
			return;

		scale = (uint64_t)1 << 63;
		scale /= div;
		scale *= 2;

		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);

#ifdef PPS_DEBUG
		if (ppsdebug & 0x4) {
			struct timespec ts2;
			int64_t df;

			bintime2timespec(&bt_ref, &ts2);
			df = timespec2ns(&ts);
			if (df > 500000000)
				df -= 1000000000;
			log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
			    ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
			    ", freqdiff=%"PRIi64" ns/s\n",
			    ts2.tv_sec, (int32_t)ts2.tv_nsec,
			    tsp->tv_sec, (int32_t)tsp->tv_nsec,
			    df);
		}
#endif

		hardpps(tsp, timespec2ns(&ts));
	}
#endif
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;

void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	mutex_spin_enter(&timecounter_lock);
	if (__predict_false(timecounter_bad != 0)) {
		/* An existing timecounter has gone bad, pick a new one. */
		(void)atomic_swap_uint(&timecounter_bad, 0);
		if (timecounter->tc_quality < 0) {
			tc_pick();
		}
	}
	tc_windup();
	mutex_spin_exit(&timecounter_lock);
}

void
inittimecounter(void)
{
	u_int p;

	mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
	    p / 1000, p % 1000);
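
	/*
	 * For example (illustrative hz values): with hz = 100 this yields
	 * tc_tick = 1 and "10.000 msec" between windups; with hz = 8000 it
	 * yields tc_tick = 8 and "1.000 msec" between windups.
	 */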

	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}