      1 /*	$NetBSD: linux_work.c,v 1.61 2022/04/09 23:43:31 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2018 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: linux_work.c,v 1.61 2022/04/09 23:43:31 riastradh Exp $");
     34 
     35 #include <sys/types.h>
     36 #include <sys/atomic.h>
     37 #include <sys/callout.h>
     38 #include <sys/condvar.h>
     39 #include <sys/errno.h>
     40 #include <sys/kmem.h>
     41 #include <sys/kthread.h>
     42 #include <sys/lwp.h>
     43 #include <sys/mutex.h>
     44 #ifndef _MODULE
     45 #include <sys/once.h>
     46 #endif
     47 #include <sys/queue.h>
     48 #include <sys/sdt.h>
     49 
     50 #include <linux/workqueue.h>
     51 
     52 TAILQ_HEAD(work_head, work_struct);
     53 TAILQ_HEAD(dwork_head, delayed_work);
     54 
     55 struct workqueue_struct {
     56 	kmutex_t		wq_lock;
     57 	kcondvar_t		wq_cv;
     58 	struct dwork_head	wq_delayed; /* delayed work scheduled */
     59 	struct work_head	wq_rcu;	    /* RCU work scheduled */
     60 	struct work_head	wq_queue;   /* work to run */
     61 	struct work_head	wq_dqueue;  /* delayed work to run now */
     62 	struct work_struct	*wq_current_work;
     63 	int			wq_flags;
     64 	bool			wq_dying;
     65 	uint64_t		wq_gen;
     66 	struct lwp		*wq_lwp;
     67 	const char		*wq_name;
     68 };
     69 
     70 static void __dead	linux_workqueue_thread(void *);
     71 static void		linux_workqueue_timeout(void *);
     72 static bool		work_claimed(struct work_struct *,
     73 			    struct workqueue_struct *);
     74 static struct workqueue_struct *
     75 			work_queue(struct work_struct *);
     76 static bool		acquire_work(struct work_struct *,
     77 			    struct workqueue_struct *);
     78 static void		release_work(struct work_struct *,
     79 			    struct workqueue_struct *);
     80 static void		wait_for_current_work(struct work_struct *,
     81 			    struct workqueue_struct *);
     82 static void		dw_callout_init(struct workqueue_struct *,
     83 			    struct delayed_work *);
     84 static void		dw_callout_destroy(struct workqueue_struct *,
     85 			    struct delayed_work *);
     86 static void		cancel_delayed_work_done(struct workqueue_struct *,
     87 			    struct delayed_work *);
     88 
     89 SDT_PROBE_DEFINE2(sdt, linux, work, acquire,
     90     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
     91 SDT_PROBE_DEFINE2(sdt, linux, work, release,
     92     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
     93 SDT_PROBE_DEFINE2(sdt, linux, work, queue,
     94     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
     95 SDT_PROBE_DEFINE2(sdt, linux, work, rcu,
     96     "struct rcu_work *"/*work*/, "struct workqueue_struct *"/*wq*/);
     97 SDT_PROBE_DEFINE2(sdt, linux, work, cancel,
     98     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
     99 SDT_PROBE_DEFINE3(sdt, linux, work, schedule,
    100     "struct delayed_work *"/*dw*/, "struct workqueue_struct *"/*wq*/,
    101     "unsigned long"/*ticks*/);
    102 SDT_PROBE_DEFINE2(sdt, linux, work, timer,
    103     "struct delayed_work *"/*dw*/, "struct workqueue_struct *"/*wq*/);
    104 SDT_PROBE_DEFINE2(sdt, linux, work, wait__start,
    105     "struct delayed_work *"/*dw*/, "struct workqueue_struct *"/*wq*/);
    106 SDT_PROBE_DEFINE2(sdt, linux, work, wait__done,
    107     "struct delayed_work *"/*dw*/, "struct workqueue_struct *"/*wq*/);
    108 SDT_PROBE_DEFINE2(sdt, linux, work, run,
    109     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
    110 SDT_PROBE_DEFINE2(sdt, linux, work, done,
    111     "struct work_struct *"/*work*/, "struct workqueue_struct *"/*wq*/);
    112 SDT_PROBE_DEFINE1(sdt, linux, work, batch__start,
    113     "struct workqueue_struct *"/*wq*/);
    114 SDT_PROBE_DEFINE1(sdt, linux, work, batch__done,
    115     "struct workqueue_struct *"/*wq*/);
    116 SDT_PROBE_DEFINE1(sdt, linux, work, flush__self,
    117     "struct workqueue_struct *"/*wq*/);
    118 SDT_PROBE_DEFINE1(sdt, linux, work, flush__start,
    119     "struct workqueue_struct *"/*wq*/);
    120 SDT_PROBE_DEFINE1(sdt, linux, work, flush__done,
    121     "struct workqueue_struct *"/*wq*/);
    122 
    123 static specificdata_key_t workqueue_key __read_mostly;
    124 
    125 struct workqueue_struct	*system_highpri_wq __read_mostly;
    126 struct workqueue_struct	*system_long_wq __read_mostly;
    127 struct workqueue_struct	*system_power_efficient_wq __read_mostly;
    128 struct workqueue_struct	*system_unbound_wq __read_mostly;
    129 struct workqueue_struct	*system_wq __read_mostly;
    130 
    131 static inline uintptr_t
    132 atomic_cas_uintptr(volatile uintptr_t *p, uintptr_t old, uintptr_t new)
    133 {
    134 
    135 	return (uintptr_t)atomic_cas_ptr(p, (void *)old, (void *)new);
    136 }
    137 
    138 /*
    139  * linux_workqueue_init()
    140  *
    141  *	Initialize the Linux workqueue subsystem.  Return 0 on success,
    142  *	NetBSD error on failure.
    143  */
    144 static int
    145 linux_workqueue_init0(void)
    146 {
    147 	int error;
    148 
    149 	error = lwp_specific_key_create(&workqueue_key, NULL);
    150 	if (error)
    151 		goto out;
    152 
    153 	system_highpri_wq = alloc_ordered_workqueue("lnxhipwq", 0);
    154 	if (system_highpri_wq == NULL) {
    155 		error = ENOMEM;
    156 		goto out;
    157 	}
    158 
    159 	system_long_wq = alloc_ordered_workqueue("lnxlngwq", 0);
    160 	if (system_long_wq == NULL) {
    161 		error = ENOMEM;
    162 		goto out;
    163 	}
    164 
    165 	system_power_efficient_wq = alloc_ordered_workqueue("lnxpwrwq", 0);
    166 	if (system_power_efficient_wq == NULL) {
    167 		error = ENOMEM;
    168 		goto out;
    169 	}
    170 
    171 	system_unbound_wq = alloc_ordered_workqueue("lnxubdwq", 0);
    172 	if (system_unbound_wq == NULL) {
    173 		error = ENOMEM;
    174 		goto out;
    175 	}
    176 
    177 	system_wq = alloc_ordered_workqueue("lnxsyswq", 0);
    178 	if (system_wq == NULL) {
    179 		error = ENOMEM;
    180 		goto out;
    181 	}
    182 
    183 	/* Success!  */
    184 	error = 0;
    185 
    186 out:	if (error) {
    187 		if (system_highpri_wq)
    188 			destroy_workqueue(system_highpri_wq);
    189 		if (system_long_wq)
    190 			destroy_workqueue(system_long_wq);
    191 		if (system_power_efficient_wq)
    192 			destroy_workqueue(system_power_efficient_wq);
    193 		if (system_unbound_wq)
    194 			destroy_workqueue(system_unbound_wq);
    195 		if (system_wq)
    196 			destroy_workqueue(system_wq);
    197 		if (workqueue_key)
    198 			lwp_specific_key_delete(workqueue_key);
    199 	}
    200 
    201 	return error;
    202 }
    203 
    204 /*
    205  * linux_workqueue_fini()
    206  *
    207  *	Destroy the Linux workqueue subsystem.  Never fails.
    208  */
    209 static void
    210 linux_workqueue_fini0(void)
    211 {
    212 
	destroy_workqueue(system_highpri_wq);
	destroy_workqueue(system_long_wq);
	destroy_workqueue(system_power_efficient_wq);
	destroy_workqueue(system_unbound_wq);
	destroy_workqueue(system_wq);
    216 	lwp_specific_key_delete(workqueue_key);
    217 }
    218 
    219 #ifndef _MODULE
    220 static ONCE_DECL(linux_workqueue_init_once);
    221 #endif
    222 
    223 int
    224 linux_workqueue_init(void)
    225 {
    226 #ifdef _MODULE
    227 	return linux_workqueue_init0();
    228 #else
    229 	return INIT_ONCE(&linux_workqueue_init_once, &linux_workqueue_init0);
    230 #endif
    231 }
    232 
    233 void
    234 linux_workqueue_fini(void)
    235 {
    236 #ifdef _MODULE
    237 	return linux_workqueue_fini0();
    238 #else
    239 	return FINI_ONCE(&linux_workqueue_init_once, &linux_workqueue_fini0);
    240 #endif
    241 }
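
/*
 * Example (illustrative only, not part of this file): a hypothetical
 * caller that brings the workqueue subsystem up and tears it down
 * again.  The example_* names are assumptions, not real NetBSD code.
 *
 *	static int
 *	example_compat_init(void)
 *	{
 *
 *		return linux_workqueue_init();
 *	}
 *
 *	static void
 *	example_compat_fini(void)
 *	{
 *
 *		linux_workqueue_fini();
 *	}
 */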
    242 
    243 /*
    245  * Workqueues
    246  */
    247 
    248 /*
    249  * alloc_workqueue(name, flags, max_active)
    250  *
 *	Create a workqueue of the given name.  max_active is the
 *	maximum number of work items in flight, or 0 for the default;
 *	this implementation supports only 0 or 1.  Return NULL on
 *	failure, pointer to struct workqueue_struct object on success.
    255  */
    256 struct workqueue_struct *
    257 alloc_workqueue(const char *name, int flags, unsigned max_active)
    258 {
    259 	struct workqueue_struct *wq;
    260 	int error;
    261 
    262 	KASSERT(max_active == 0 || max_active == 1);
    263 
    264 	wq = kmem_zalloc(sizeof(*wq), KM_SLEEP);
    265 
    266 	mutex_init(&wq->wq_lock, MUTEX_DEFAULT, IPL_VM);
    267 	cv_init(&wq->wq_cv, name);
    268 	TAILQ_INIT(&wq->wq_delayed);
    269 	TAILQ_INIT(&wq->wq_rcu);
    270 	TAILQ_INIT(&wq->wq_queue);
    271 	TAILQ_INIT(&wq->wq_dqueue);
    272 	wq->wq_current_work = NULL;
    273 	wq->wq_flags = 0;
    274 	wq->wq_dying = false;
    275 	wq->wq_gen = 0;
    276 	wq->wq_lwp = NULL;
    277 	wq->wq_name = name;
    278 
    279 	error = kthread_create(PRI_NONE,
    280 	    KTHREAD_MPSAFE|KTHREAD_TS|KTHREAD_MUSTJOIN, NULL,
    281 	    &linux_workqueue_thread, wq, &wq->wq_lwp, "%s", name);
    282 	if (error)
    283 		goto fail0;
    284 
    285 	return wq;
    286 
    287 fail0:	KASSERT(TAILQ_EMPTY(&wq->wq_dqueue));
    288 	KASSERT(TAILQ_EMPTY(&wq->wq_queue));
    289 	KASSERT(TAILQ_EMPTY(&wq->wq_rcu));
    290 	KASSERT(TAILQ_EMPTY(&wq->wq_delayed));
    291 	cv_destroy(&wq->wq_cv);
    292 	mutex_destroy(&wq->wq_lock);
    293 	kmem_free(wq, sizeof(*wq));
    294 	return NULL;
    295 }
    296 
    297 /*
    298  * alloc_ordered_workqueue(name, flags)
    299  *
    300  *	Same as alloc_workqueue(name, flags, 1).
    301  */
    302 struct workqueue_struct *
    303 alloc_ordered_workqueue(const char *name, int flags)
    304 {
    305 
    306 	return alloc_workqueue(name, flags, 1);
    307 }
    308 
    309 /*
    310  * destroy_workqueue(wq)
    311  *
 *	Destroy the workqueue wq.  Cancel any pending delayed work and
 *	wait for all queued work to complete before freeing it.
    314  *
    315  *	May sleep.
    316  */
    317 void
    318 destroy_workqueue(struct workqueue_struct *wq)
    319 {
    320 
    321 	/*
    322 	 * Cancel all delayed work.  We do this first because any
	 * delayed work that has already timed out, which we can't
    324 	 * cancel, may have queued new work.
    325 	 */
    326 	mutex_enter(&wq->wq_lock);
    327 	while (!TAILQ_EMPTY(&wq->wq_delayed)) {
    328 		struct delayed_work *const dw = TAILQ_FIRST(&wq->wq_delayed);
    329 
    330 		KASSERT(work_queue(&dw->work) == wq);
    331 		KASSERTMSG((dw->dw_state == DELAYED_WORK_SCHEDULED ||
    332 			dw->dw_state == DELAYED_WORK_RESCHEDULED ||
    333 			dw->dw_state == DELAYED_WORK_CANCELLED),
    334 		    "delayed work %p in bad state: %d",
    335 		    dw, dw->dw_state);
    336 
    337 		/*
    338 		 * Mark it cancelled and try to stop the callout before
    339 		 * it starts.
    340 		 *
    341 		 * If it's too late and the callout has already begun
    342 		 * to execute, then it will notice that we asked to
    343 		 * cancel it and remove itself from the queue before
    344 		 * returning.
    345 		 *
    346 		 * If we stopped the callout before it started,
    347 		 * however, then we can safely destroy the callout and
    348 		 * dissociate it from the workqueue ourselves.
    349 		 */
    350 		SDT_PROBE2(sdt, linux, work, cancel,  &dw->work, wq);
    351 		dw->dw_state = DELAYED_WORK_CANCELLED;
    352 		if (!callout_halt(&dw->dw_callout, &wq->wq_lock))
    353 			cancel_delayed_work_done(wq, dw);
    354 	}
    355 	mutex_exit(&wq->wq_lock);
    356 
    357 	/* Wait for all scheduled RCU work to complete.  */
    358 	mutex_enter(&wq->wq_lock);
    359 	while (!TAILQ_EMPTY(&wq->wq_rcu))
    360 		cv_wait(&wq->wq_cv, &wq->wq_lock);
    361 	mutex_exit(&wq->wq_lock);
    362 
    363 	/*
    364 	 * At this point, no new work can be put on the queue.
    365 	 */
    366 
    367 	/* Tell the thread to exit.  */
    368 	mutex_enter(&wq->wq_lock);
    369 	wq->wq_dying = true;
    370 	cv_broadcast(&wq->wq_cv);
    371 	mutex_exit(&wq->wq_lock);
    372 
    373 	/* Wait for it to exit.  */
    374 	(void)kthread_join(wq->wq_lwp);
    375 
    376 	KASSERT(wq->wq_dying);
    377 	KASSERT(wq->wq_flags == 0);
    378 	KASSERT(wq->wq_current_work == NULL);
    379 	KASSERT(TAILQ_EMPTY(&wq->wq_dqueue));
    380 	KASSERT(TAILQ_EMPTY(&wq->wq_queue));
    381 	KASSERT(TAILQ_EMPTY(&wq->wq_rcu));
    382 	KASSERT(TAILQ_EMPTY(&wq->wq_delayed));
    383 	cv_destroy(&wq->wq_cv);
    384 	mutex_destroy(&wq->wq_lock);
    385 
    386 	kmem_free(wq, sizeof(*wq));
    387 }
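
/*
 * Example (illustrative only, not part of this file): a hypothetical
 * driver that wants its own serialized worker thread can pair
 * alloc_ordered_workqueue with destroy_workqueue.  The example_* names
 * are assumptions.
 *
 *	struct example_softc {
 *		struct workqueue_struct	*sc_wq;
 *	};
 *
 *	static int
 *	example_attach(struct example_softc *sc)
 *	{
 *
 *		sc->sc_wq = alloc_ordered_workqueue("examplewq", 0);
 *		if (sc->sc_wq == NULL)
 *			return ENOMEM;
 *		return 0;
 *	}
 *
 *	static void
 *	example_detach(struct example_softc *sc)
 *	{
 *
 *		destroy_workqueue(sc->sc_wq);
 *		sc->sc_wq = NULL;
 *	}
 *
 * destroy_workqueue cancels pending delayed work and waits for queued
 * work, so after example_detach no callbacks can still be running.
 */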
    388 
    389 /*
    391  * Work thread and callout
    392  */
    393 
    394 /*
    395  * linux_workqueue_thread(cookie)
    396  *
    397  *	Main function for a workqueue's worker thread.  Waits until
    398  *	there is work queued, grabs a batch of work off the queue,
    399  *	executes it all, bumps the generation number, and repeats,
    400  *	until dying.
    401  */
    402 static void __dead
    403 linux_workqueue_thread(void *cookie)
    404 {
    405 	struct workqueue_struct *const wq = cookie;
    406 	struct work_head *const q[2] = { &wq->wq_queue, &wq->wq_dqueue };
    407 	struct work_struct marker, *work;
    408 	unsigned i;
    409 
    410 	lwp_setspecific(workqueue_key, wq);
    411 
    412 	mutex_enter(&wq->wq_lock);
    413 	for (;;) {
    414 		/*
    415 		 * Wait until there's activity.  If there's no work and
    416 		 * we're dying, stop here.
    417 		 */
    418 		if (TAILQ_EMPTY(&wq->wq_queue) &&
    419 		    TAILQ_EMPTY(&wq->wq_dqueue)) {
    420 			if (wq->wq_dying)
    421 				break;
    422 			cv_wait(&wq->wq_cv, &wq->wq_lock);
    423 			continue;
    424 		}
    425 
    426 		/*
    427 		 * Start a batch of work.  Use a marker to delimit when
    428 		 * the batch ends so we can advance the generation
    429 		 * after the batch.
    430 		 */
    431 		SDT_PROBE1(sdt, linux, work, batch__start,  wq);
    432 		for (i = 0; i < 2; i++) {
    433 			if (TAILQ_EMPTY(q[i]))
    434 				continue;
    435 			TAILQ_INSERT_TAIL(q[i], &marker, work_entry);
    436 			while ((work = TAILQ_FIRST(q[i])) != &marker) {
    437 				void (*func)(struct work_struct *);
    438 
    439 				KASSERT(work_queue(work) == wq);
    440 				KASSERT(work_claimed(work, wq));
    441 				KASSERTMSG((q[i] != &wq->wq_dqueue ||
    442 					container_of(work, struct delayed_work,
    443 					    work)->dw_state ==
    444 					DELAYED_WORK_IDLE),
    445 				    "delayed work %p queued and scheduled",
    446 				    work);
    447 
    448 				TAILQ_REMOVE(q[i], work, work_entry);
    449 				KASSERT(wq->wq_current_work == NULL);
    450 				wq->wq_current_work = work;
    451 				func = work->func;
    452 				release_work(work, wq);
    453 				/* Can't dereference work after this point.  */
    454 
    455 				mutex_exit(&wq->wq_lock);
    456 				SDT_PROBE2(sdt, linux, work, run,  work, wq);
    457 				(*func)(work);
    458 				SDT_PROBE2(sdt, linux, work, done,  work, wq);
    459 				mutex_enter(&wq->wq_lock);
    460 
    461 				KASSERT(wq->wq_current_work == work);
    462 				wq->wq_current_work = NULL;
    463 				cv_broadcast(&wq->wq_cv);
    464 			}
    465 			TAILQ_REMOVE(q[i], &marker, work_entry);
    466 		}
    467 
    468 		/* Notify cancel that we've completed a batch of work.  */
    469 		wq->wq_gen++;
    470 		cv_broadcast(&wq->wq_cv);
    471 		SDT_PROBE1(sdt, linux, work, batch__done,  wq);
    472 	}
    473 	mutex_exit(&wq->wq_lock);
    474 
    475 	kthread_exit(0);
    476 }
    477 
    478 /*
    479  * linux_workqueue_timeout(cookie)
    480  *
    481  *	Delayed work timeout callback.
    482  *
    483  *	- If scheduled, queue it.
 *	- If rescheduled, schedule the callout again for the new delay.
 *	- If cancelled, destroy the callout and release the work from
 *	  the workqueue.
    487  */
    488 static void
    489 linux_workqueue_timeout(void *cookie)
    490 {
    491 	struct delayed_work *const dw = cookie;
    492 	struct workqueue_struct *const wq = work_queue(&dw->work);
    493 
    494 	KASSERTMSG(wq != NULL,
    495 	    "delayed work %p state %d resched %d",
    496 	    dw, dw->dw_state, dw->dw_resched);
    497 
    498 	SDT_PROBE2(sdt, linux, work, timer,  dw, wq);
    499 
    500 	mutex_enter(&wq->wq_lock);
    501 	KASSERT(work_queue(&dw->work) == wq);
    502 	switch (dw->dw_state) {
    503 	case DELAYED_WORK_IDLE:
    504 		panic("delayed work callout uninitialized: %p", dw);
    505 	case DELAYED_WORK_SCHEDULED:
    506 		dw_callout_destroy(wq, dw);
    507 		TAILQ_INSERT_TAIL(&wq->wq_dqueue, &dw->work, work_entry);
    508 		cv_broadcast(&wq->wq_cv);
    509 		SDT_PROBE2(sdt, linux, work, queue,  &dw->work, wq);
    510 		break;
    511 	case DELAYED_WORK_RESCHEDULED:
    512 		KASSERT(dw->dw_resched >= 0);
    513 		callout_schedule(&dw->dw_callout, dw->dw_resched);
    514 		dw->dw_state = DELAYED_WORK_SCHEDULED;
    515 		dw->dw_resched = -1;
    516 		break;
    517 	case DELAYED_WORK_CANCELLED:
    518 		cancel_delayed_work_done(wq, dw);
    519 		/* Can't dereference dw after this point.  */
    520 		goto out;
    521 	default:
    522 		panic("delayed work callout in bad state: %p", dw);
    523 	}
    524 	KASSERT(dw->dw_state == DELAYED_WORK_IDLE ||
    525 	    dw->dw_state == DELAYED_WORK_SCHEDULED);
    526 out:	mutex_exit(&wq->wq_lock);
    527 }
    528 
    529 /*
    530  * current_work()
    531  *
    532  *	If in a workqueue worker thread, return the work it is
    533  *	currently executing.  Otherwise return NULL.
    534  */
    535 struct work_struct *
    536 current_work(void)
    537 {
    538 	struct workqueue_struct *wq = lwp_getspecific(workqueue_key);
    539 
    540 	/* If we're not a workqueue thread, then there's no work.  */
    541 	if (wq == NULL)
    542 		return NULL;
    543 
    544 	/*
    545 	 * Otherwise, this should be possible only while work is in
    546 	 * progress.  Return the current work item.
    547 	 */
    548 	KASSERT(wq->wq_current_work != NULL);
    549 	return wq->wq_current_work;
    550 }
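
/*
 * Example (illustrative only, not part of this file): a work callback
 * can use current_work() to assert that it is running in its worker
 * thread; example_task is a hypothetical callback.
 *
 *	static void
 *	example_task(struct work_struct *work)
 *	{
 *
 *		KASSERT(current_work() == work);
 *	}
 */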
    551 
    552 /*
    554  * Work
    555  */
    556 
    557 /*
    558  * INIT_WORK(work, fn)
    559  *
    560  *	Initialize work for use with a workqueue to call fn in a worker
    561  *	thread.  There is no corresponding destruction operation.
    562  */
    563 void
    564 INIT_WORK(struct work_struct *work, void (*fn)(struct work_struct *))
    565 {
    566 
    567 	work->work_owner = 0;
    568 	work->func = fn;
    569 }
    570 
    571 /*
    572  * work_claimed(work, wq)
    573  *
    574  *	True if work is currently claimed by a workqueue, meaning it is
    575  *	either on the queue or scheduled in a callout.  The workqueue
    576  *	must be wq, and caller must hold wq's lock.
    577  */
    578 static bool
    579 work_claimed(struct work_struct *work, struct workqueue_struct *wq)
    580 {
    581 
    582 	KASSERT(work_queue(work) == wq);
    583 	KASSERT(mutex_owned(&wq->wq_lock));
    584 
    585 	return atomic_load_relaxed(&work->work_owner) & 1;
    586 }
    587 
    588 /*
    589  * work_pending(work)
    590  *
    591  *	True if work is currently claimed by any workqueue, scheduled
    592  *	to run on that workqueue.
    593  */
    594 bool
    595 work_pending(const struct work_struct *work)
    596 {
    597 
    598 	return atomic_load_relaxed(&work->work_owner) & 1;
    599 }
    600 
    601 /*
    602  * work_queue(work)
    603  *
    604  *	Return the last queue that work was queued on, or NULL if it
    605  *	was never queued.
    606  */
    607 static struct workqueue_struct *
    608 work_queue(struct work_struct *work)
    609 {
    610 
    611 	return (struct workqueue_struct *)
    612 	    (atomic_load_relaxed(&work->work_owner) & ~(uintptr_t)1);
    613 }
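
/*
 * For illustration: work_owner packs the owning workqueue pointer and
 * a `claimed' bit into a single word, relying on struct
 * workqueue_struct being at least 2-byte aligned so bit 0 of the
 * pointer is free:
 *
 *	work_owner == 0			never queued anywhere
 *	work_owner == (uintptr_t)wq	associated with wq, not claimed
 *	work_owner == (uintptr_t)wq | 1	claimed: queued or scheduled on wq
 *
 * acquire_work below sets the pointer and the claimed bit together
 * with a CAS; release_work clears only the bit.
 */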
    614 
    615 /*
    616  * acquire_work(work, wq)
    617  *
    618  *	Try to claim work for wq.  If work is already claimed, it must
    619  *	be claimed by wq; return false.  If work is not already
    620  *	claimed, claim it, issue a memory barrier to match any prior
    621  *	release_work, and return true.
    622  *
    623  *	Caller must hold wq's lock.
    624  */
    625 static bool
    626 acquire_work(struct work_struct *work, struct workqueue_struct *wq)
    627 {
    628 	uintptr_t owner0, owner;
    629 
    630 	KASSERT(mutex_owned(&wq->wq_lock));
    631 	KASSERT(((uintptr_t)wq & 1) == 0);
    632 
    633 	owner = (uintptr_t)wq | 1;
    634 	do {
    635 		owner0 = atomic_load_relaxed(&work->work_owner);
    636 		if (owner0 & 1) {
    637 			KASSERT((owner0 & ~(uintptr_t)1) == (uintptr_t)wq);
    638 			return false;
    639 		}
    640 		KASSERT(owner0 == (uintptr_t)NULL || owner0 == (uintptr_t)wq);
    641 	} while (atomic_cas_uintptr(&work->work_owner, owner0, owner) !=
    642 	    owner0);
    643 
    644 	KASSERT(work_queue(work) == wq);
    645 	membar_acquire();
    646 	SDT_PROBE2(sdt, linux, work, acquire,  work, wq);
    647 	return true;
    648 }
    649 
    650 /*
    651  * release_work(work, wq)
    652  *
    653  *	Issue a memory barrier to match any subsequent acquire_work and
    654  *	dissociate work from wq.
    655  *
    656  *	Caller must hold wq's lock and work must be associated with wq.
    657  */
    658 static void
    659 release_work(struct work_struct *work, struct workqueue_struct *wq)
    660 {
    661 
    662 	KASSERT(work_queue(work) == wq);
    663 	KASSERT(mutex_owned(&wq->wq_lock));
    664 
    665 	SDT_PROBE2(sdt, linux, work, release,  work, wq);
    666 	membar_release();
    667 
    668 	/*
    669 	 * Non-interlocked r/m/w is safe here because nobody else can
    670 	 * write to this while the claimed bit is set and the workqueue
    671 	 * lock is held.
    672 	 */
    673 	atomic_store_relaxed(&work->work_owner,
    674 	    atomic_load_relaxed(&work->work_owner) & ~(uintptr_t)1);
    675 }
    676 
    677 /*
    678  * schedule_work(work)
    679  *
    680  *	If work is not already queued on system_wq, queue it to be run
    681  *	by system_wq's worker thread when it next can.  True if it was
    682  *	newly queued, false if it was already queued.  If the work was
    683  *	already running, queue it to run again.
    684  *
    685  *	Caller must ensure work is not queued to run on a different
    686  *	workqueue.
    687  */
    688 bool
    689 schedule_work(struct work_struct *work)
    690 {
    691 
    692 	return queue_work(system_wq, work);
    693 }
    694 
    695 /*
    696  * queue_work(wq, work)
    697  *
    698  *	If work is not already queued on wq, queue it to be run by wq's
    699  *	worker thread when it next can.  True if it was newly queued,
    700  *	false if it was already queued.  If the work was already
    701  *	running, queue it to run again.
    702  *
    703  *	Caller must ensure work is not queued to run on a different
    704  *	workqueue.
    705  */
    706 bool
    707 queue_work(struct workqueue_struct *wq, struct work_struct *work)
    708 {
    709 	bool newly_queued;
    710 
    711 	KASSERT(wq != NULL);
    712 
    713 	mutex_enter(&wq->wq_lock);
    714 	if (__predict_true(acquire_work(work, wq))) {
    715 		/*
    716 		 * It wasn't on any workqueue at all.  Put it on this
    717 		 * one, and signal the worker thread that there is work
    718 		 * to do.
    719 		 */
    720 		TAILQ_INSERT_TAIL(&wq->wq_queue, work, work_entry);
    721 		cv_broadcast(&wq->wq_cv);
    722 		SDT_PROBE2(sdt, linux, work, queue,  work, wq);
    723 		newly_queued = true;
    724 	} else {
    725 		/*
    726 		 * It was already on this workqueue.  Nothing to do
    727 		 * since it is already queued.
    728 		 */
    729 		newly_queued = false;
    730 	}
    731 	mutex_exit(&wq->wq_lock);
    732 
    733 	return newly_queued;
    734 }
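
/*
 * Example (illustrative only, not part of this file): the usual
 * pattern is to embed a work_struct in a driver structure, initialize
 * it once with INIT_WORK, and queue it from interrupt or thread
 * context.  The example_* names, including example_handle_events, are
 * assumptions.
 *
 *	struct example_softc {
 *		struct workqueue_struct	*sc_wq;
 *		struct work_struct	sc_work;
 *	};
 *
 *	static void
 *	example_task(struct work_struct *work)
 *	{
 *		struct example_softc *sc =
 *		    container_of(work, struct example_softc, sc_work);
 *
 *		example_handle_events(sc);
 *	}
 *
 *	static void
 *	example_init(struct example_softc *sc)
 *	{
 *
 *		INIT_WORK(&sc->sc_work, example_task);
 *	}
 *
 *	static void
 *	example_intr(struct example_softc *sc)
 *	{
 *
 *		(void)queue_work(sc->sc_wq, &sc->sc_work);
 *	}
 */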
    735 
    736 /*
    737  * cancel_work(work)
    738  *
    739  *	If work was queued, remove it from the queue and return true.
    740  *	If work was not queued, return false.  Work may still be
    741  *	running when this returns.
    742  */
    743 bool
    744 cancel_work(struct work_struct *work)
    745 {
    746 	struct workqueue_struct *wq;
    747 	bool cancelled_p = false;
    748 
    749 	/* If there's no workqueue, nothing to cancel.   */
    750 	if ((wq = work_queue(work)) == NULL)
    751 		goto out;
    752 
    753 	mutex_enter(&wq->wq_lock);
    754 	if (__predict_false(work_queue(work) != wq)) {
    755 		/*
    756 		 * It has finished execution or been cancelled by
    757 		 * another thread, and has been moved off the
		 * workqueue, so it's too late to cancel.
    759 		 */
    760 		cancelled_p = false;
    761 	} else {
    762 		/* Check whether it's on the queue.  */
    763 		if (work_claimed(work, wq)) {
    764 			/*
    765 			 * It is still on the queue.  Take it off the
    766 			 * queue and report successful cancellation.
    767 			 */
    768 			TAILQ_REMOVE(&wq->wq_queue, work, work_entry);
    769 			SDT_PROBE2(sdt, linux, work, cancel,  work, wq);
    770 			release_work(work, wq);
    771 			/* Can't dereference work after this point.  */
    772 			cancelled_p = true;
    773 		} else {
    774 			/* Not on the queue.  Couldn't cancel it.  */
    775 			cancelled_p = false;
    776 		}
    777 	}
    778 	mutex_exit(&wq->wq_lock);
    779 
    780 out:	return cancelled_p;
    781 }
    782 
    783 /*
    784  * cancel_work_sync(work)
    785  *
    786  *	If work was queued, remove it from the queue and return true.
    787  *	If work was not queued, return false.  Either way, if work is
    788  *	currently running, wait for it to complete.
    789  *
    790  *	May sleep.
    791  */
    792 bool
    793 cancel_work_sync(struct work_struct *work)
    794 {
    795 	struct workqueue_struct *wq;
    796 	bool cancelled_p = false;
    797 
    798 	/* If there's no workqueue, nothing to cancel.   */
    799 	if ((wq = work_queue(work)) == NULL)
    800 		goto out;
    801 
    802 	mutex_enter(&wq->wq_lock);
    803 	if (__predict_false(work_queue(work) != wq)) {
    804 		/*
    805 		 * It has finished execution or been cancelled by
    806 		 * another thread, and has been moved off the
    807 		 * workqueue, so it's too late to cancel.
    808 		 */
    809 		cancelled_p = false;
    810 	} else {
    811 		/* Check whether it's on the queue.  */
    812 		if (work_claimed(work, wq)) {
    813 			/*
    814 			 * It is still on the queue.  Take it off the
    815 			 * queue and report successful cancellation.
    816 			 */
    817 			TAILQ_REMOVE(&wq->wq_queue, work, work_entry);
    818 			SDT_PROBE2(sdt, linux, work, cancel,  work, wq);
    819 			release_work(work, wq);
    820 			/* Can't dereference work after this point.  */
    821 			cancelled_p = true;
    822 		} else {
    823 			/* Not on the queue.  Couldn't cancel it.  */
    824 			cancelled_p = false;
    825 		}
    826 		/* If it's still running, wait for it to complete.  */
    827 		if (wq->wq_current_work == work)
    828 			wait_for_current_work(work, wq);
    829 	}
    830 	mutex_exit(&wq->wq_lock);
    831 
    832 out:	return cancelled_p;
    833 }
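
/*
 * Example (illustrative only, not part of this file): on a
 * hypothetical teardown path, cancel the work synchronously before
 * freeing the structure the work_struct is embedded in, so the
 * callback cannot run against freed memory.
 *
 *	static void
 *	example_free(struct example_softc *sc)
 *	{
 *
 *		(void)cancel_work_sync(&sc->sc_work);
 *		kmem_free(sc, sizeof(*sc));
 *	}
 */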
    834 
    835 /*
    836  * wait_for_current_work(work, wq)
    837  *
    838  *	wq must be currently executing work.  Wait for it to finish.
    839  *
    840  *	Does not dereference work.
    841  */
    842 static void
    843 wait_for_current_work(struct work_struct *work, struct workqueue_struct *wq)
    844 {
    845 	uint64_t gen;
    846 
    847 	KASSERT(mutex_owned(&wq->wq_lock));
    848 	KASSERT(wq->wq_current_work == work);
    849 
    850 	/* Wait only one generation in case it gets requeued quickly.  */
    851 	SDT_PROBE2(sdt, linux, work, wait__start,  work, wq);
    852 	gen = wq->wq_gen;
    853 	do {
    854 		cv_wait(&wq->wq_cv, &wq->wq_lock);
    855 	} while (wq->wq_current_work == work && wq->wq_gen == gen);
    856 	SDT_PROBE2(sdt, linux, work, wait__done,  work, wq);
    857 }
    858 
    859 /*
    861  * Delayed work
    862  */
    863 
    864 /*
    865  * INIT_DELAYED_WORK(dw, fn)
    866  *
    867  *	Initialize dw for use with a workqueue to call fn in a worker
    868  *	thread after a delay.  There is no corresponding destruction
    869  *	operation.
    870  */
    871 void
    872 INIT_DELAYED_WORK(struct delayed_work *dw, void (*fn)(struct work_struct *))
    873 {
    874 
    875 	INIT_WORK(&dw->work, fn);
    876 	dw->dw_state = DELAYED_WORK_IDLE;
    877 	dw->dw_resched = -1;
    878 
    879 	/*
	 * Defer callout_init until we actually schedule the callout;
	 * the code that later completes or cancels the work can then
	 * callout_destroy it.  Since there is no DESTROY_DELAYED_WORK
	 * or similar, we would otherwise have no opportunity to call
	 * callout_destroy at all.
    884 	 */
    885 }
    886 
    887 /*
    888  * schedule_delayed_work(dw, ticks)
    889  *
    890  *	If it is not currently scheduled, schedule dw to run after
    891  *	ticks on system_wq.  If currently executing and not already
    892  *	rescheduled, reschedule it.  True if it was newly scheduled,
    893  *	false if it was already scheduled.
    894  *
    895  *	If ticks == 0, queue it to run as soon as the worker can,
    896  *	without waiting for the next callout tick to run.
    897  */
    898 bool
    899 schedule_delayed_work(struct delayed_work *dw, unsigned long ticks)
    900 {
    901 
    902 	return queue_delayed_work(system_wq, dw, ticks);
    903 }
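
/*
 * Example (illustrative only, not part of this file): a hypothetical
 * periodic housekeeping task that reschedules itself once a second.
 * The example_* names and the sc_tick member are assumptions.
 *
 *	static void
 *	example_tick(struct work_struct *work)
 *	{
 *		struct example_softc *sc = container_of(work,
 *		    struct example_softc, sc_tick.work);
 *
 *		example_poll_hardware(sc);
 *		(void)schedule_delayed_work(&sc->sc_tick, hz);
 *	}
 *
 *	static void
 *	example_start(struct example_softc *sc)
 *	{
 *
 *		INIT_DELAYED_WORK(&sc->sc_tick, example_tick);
 *		(void)schedule_delayed_work(&sc->sc_tick, hz);
 *	}
 */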
    904 
    905 /*
    906  * dw_callout_init(wq, dw)
    907  *
    908  *	Initialize the callout of dw and transition to
    909  *	DELAYED_WORK_SCHEDULED.  Caller must use callout_schedule.
    910  */
    911 static void
    912 dw_callout_init(struct workqueue_struct *wq, struct delayed_work *dw)
    913 {
    914 
    915 	KASSERT(mutex_owned(&wq->wq_lock));
    916 	KASSERT(work_queue(&dw->work) == wq);
    917 	KASSERT(dw->dw_state == DELAYED_WORK_IDLE);
    918 
    919 	callout_init(&dw->dw_callout, CALLOUT_MPSAFE);
    920 	callout_setfunc(&dw->dw_callout, &linux_workqueue_timeout, dw);
    921 	TAILQ_INSERT_HEAD(&wq->wq_delayed, dw, dw_entry);
    922 	dw->dw_state = DELAYED_WORK_SCHEDULED;
    923 }
    924 
    925 /*
    926  * dw_callout_destroy(wq, dw)
    927  *
    928  *	Destroy the callout of dw and transition to DELAYED_WORK_IDLE.
    929  */
    930 static void
    931 dw_callout_destroy(struct workqueue_struct *wq, struct delayed_work *dw)
    932 {
    933 
    934 	KASSERT(mutex_owned(&wq->wq_lock));
    935 	KASSERT(work_queue(&dw->work) == wq);
    936 	KASSERT(dw->dw_state == DELAYED_WORK_SCHEDULED ||
    937 	    dw->dw_state == DELAYED_WORK_RESCHEDULED ||
    938 	    dw->dw_state == DELAYED_WORK_CANCELLED);
    939 
    940 	TAILQ_REMOVE(&wq->wq_delayed, dw, dw_entry);
    941 	callout_destroy(&dw->dw_callout);
    942 	dw->dw_resched = -1;
    943 	dw->dw_state = DELAYED_WORK_IDLE;
    944 }
    945 
    946 /*
    947  * cancel_delayed_work_done(wq, dw)
    948  *
    949  *	Complete cancellation of a delayed work: transition from
    950  *	DELAYED_WORK_CANCELLED to DELAYED_WORK_IDLE and off the
    951  *	workqueue.  Caller must not dereference dw after this returns.
    952  */
    953 static void
    954 cancel_delayed_work_done(struct workqueue_struct *wq, struct delayed_work *dw)
    955 {
    956 
    957 	KASSERT(mutex_owned(&wq->wq_lock));
    958 	KASSERT(work_queue(&dw->work) == wq);
    959 	KASSERT(dw->dw_state == DELAYED_WORK_CANCELLED);
    960 
    961 	dw_callout_destroy(wq, dw);
    962 	release_work(&dw->work, wq);
    963 	/* Can't dereference dw after this point.  */
    964 }
    965 
    966 /*
    967  * queue_delayed_work(wq, dw, ticks)
    968  *
 *	If dw is not currently scheduled or queued on wq, schedule it
 *	to run after ticks on wq.  Return true if it was newly
 *	scheduled (including rescheduling a cancellation that had not
 *	yet completed), false if it was already scheduled or queued.
    972  *
    973  *	If ticks == 0, queue it to run as soon as the worker can,
    974  *	without waiting for the next callout tick to run.
    975  */
    976 bool
    977 queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dw,
    978     unsigned long ticks)
    979 {
    980 	bool newly_queued;
    981 
    982 	mutex_enter(&wq->wq_lock);
    983 	if (__predict_true(acquire_work(&dw->work, wq))) {
    984 		/*
    985 		 * It wasn't on any workqueue at all.  Schedule it to
    986 		 * run on this one.
    987 		 */
    988 		KASSERT(dw->dw_state == DELAYED_WORK_IDLE);
    989 		if (ticks == 0) {
    990 			TAILQ_INSERT_TAIL(&wq->wq_dqueue, &dw->work,
    991 			    work_entry);
    992 			cv_broadcast(&wq->wq_cv);
    993 			SDT_PROBE2(sdt, linux, work, queue,  &dw->work, wq);
    994 		} else {
    995 			/*
    996 			 * Initialize a callout and schedule to run
    997 			 * after a delay.
    998 			 */
    999 			dw_callout_init(wq, dw);
   1000 			callout_schedule(&dw->dw_callout, MIN(INT_MAX, ticks));
   1001 			SDT_PROBE3(sdt, linux, work, schedule,  dw, wq, ticks);
   1002 		}
   1003 		newly_queued = true;
   1004 	} else {
   1005 		/* It was already on this workqueue.  */
   1006 		switch (dw->dw_state) {
   1007 		case DELAYED_WORK_IDLE:
   1008 		case DELAYED_WORK_SCHEDULED:
   1009 		case DELAYED_WORK_RESCHEDULED:
   1010 			/* On the queue or already scheduled.  Leave it.  */
   1011 			newly_queued = false;
   1012 			break;
   1013 		case DELAYED_WORK_CANCELLED:
   1014 			/*
   1015 			 * Scheduled and the callout began, but it was
   1016 			 * cancelled.  Reschedule it.
   1017 			 */
   1018 			if (ticks == 0) {
   1019 				dw->dw_state = DELAYED_WORK_SCHEDULED;
   1020 				SDT_PROBE2(sdt, linux, work, queue,
   1021 				    &dw->work, wq);
   1022 			} else {
   1023 				dw->dw_state = DELAYED_WORK_RESCHEDULED;
   1024 				dw->dw_resched = MIN(INT_MAX, ticks);
   1025 				SDT_PROBE3(sdt, linux, work, schedule,
   1026 				    dw, wq, ticks);
   1027 			}
   1028 			newly_queued = true;
   1029 			break;
   1030 		default:
   1031 			panic("invalid delayed work state: %d",
   1032 			    dw->dw_state);
   1033 		}
   1034 	}
   1035 	mutex_exit(&wq->wq_lock);
   1036 
   1037 	return newly_queued;
   1038 }
   1039 
   1040 /*
   1041  * mod_delayed_work(wq, dw, ticks)
   1042  *
   1043  *	Schedule dw to run after ticks.  If scheduled or queued,
   1044  *	reschedule.  If ticks == 0, run without delay.
   1045  *
   1046  *	True if it modified the timer of an already scheduled work,
   1047  *	false if it newly scheduled the work.
   1048  */
   1049 bool
   1050 mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dw,
   1051     unsigned long ticks)
   1052 {
   1053 	bool timer_modified;
   1054 
   1055 	mutex_enter(&wq->wq_lock);
   1056 	if (acquire_work(&dw->work, wq)) {
   1057 		/*
   1058 		 * It wasn't on any workqueue at all.  Schedule it to
   1059 		 * run on this one.
   1060 		 */
   1061 		KASSERT(dw->dw_state == DELAYED_WORK_IDLE);
   1062 		if (ticks == 0) {
   1063 			/*
   1064 			 * Run immediately: put it on the queue and
   1065 			 * signal the worker thread.
   1066 			 */
   1067 			TAILQ_INSERT_TAIL(&wq->wq_dqueue, &dw->work,
   1068 			    work_entry);
   1069 			cv_broadcast(&wq->wq_cv);
   1070 			SDT_PROBE2(sdt, linux, work, queue,  &dw->work, wq);
   1071 		} else {
   1072 			/*
   1073 			 * Initialize a callout and schedule to run
   1074 			 * after a delay.
   1075 			 */
   1076 			dw_callout_init(wq, dw);
   1077 			callout_schedule(&dw->dw_callout, MIN(INT_MAX, ticks));
   1078 			SDT_PROBE3(sdt, linux, work, schedule,  dw, wq, ticks);
   1079 		}
   1080 		timer_modified = false;
   1081 	} else {
   1082 		/* It was already on this workqueue.  */
   1083 		switch (dw->dw_state) {
   1084 		case DELAYED_WORK_IDLE:
   1085 			/* On the queue.  */
   1086 			if (ticks == 0) {
   1087 				/* Leave it be.  */
   1088 				SDT_PROBE2(sdt, linux, work, cancel,
   1089 				    &dw->work, wq);
   1090 				SDT_PROBE2(sdt, linux, work, queue,
   1091 				    &dw->work, wq);
   1092 			} else {
   1093 				/* Remove from the queue and schedule.  */
   1094 				TAILQ_REMOVE(&wq->wq_dqueue, &dw->work,
   1095 				    work_entry);
   1096 				dw_callout_init(wq, dw);
   1097 				callout_schedule(&dw->dw_callout,
   1098 				    MIN(INT_MAX, ticks));
   1099 				SDT_PROBE2(sdt, linux, work, cancel,
   1100 				    &dw->work, wq);
   1101 				SDT_PROBE3(sdt, linux, work, schedule,
   1102 				    dw, wq, ticks);
   1103 			}
   1104 			timer_modified = true;
   1105 			break;
   1106 		case DELAYED_WORK_SCHEDULED:
   1107 			/*
   1108 			 * It is scheduled to run after a delay.  Try
   1109 			 * to stop it and reschedule it; if we can't,
   1110 			 * either reschedule it or cancel it to put it
   1111 			 * on the queue, and inform the callout.
   1112 			 */
   1113 			if (callout_stop(&dw->dw_callout)) {
   1114 				/* Can't stop, callout has begun.  */
   1115 				if (ticks == 0) {
   1116 					/*
   1117 					 * We don't actually need to do
   1118 					 * anything.  The callout will
   1119 					 * queue it as soon as it gets
   1120 					 * the lock.
   1121 					 */
   1122 					SDT_PROBE2(sdt, linux, work, cancel,
   1123 					    &dw->work, wq);
   1124 					SDT_PROBE2(sdt, linux, work, queue,
   1125 					    &dw->work, wq);
   1126 				} else {
   1127 					/* Ask the callout to reschedule.  */
   1128 					dw->dw_state = DELAYED_WORK_RESCHEDULED;
   1129 					dw->dw_resched = MIN(INT_MAX, ticks);
   1130 					SDT_PROBE2(sdt, linux, work, cancel,
   1131 					    &dw->work, wq);
   1132 					SDT_PROBE3(sdt, linux, work, schedule,
   1133 					    dw, wq, ticks);
   1134 				}
   1135 			} else {
   1136 				/* We stopped the callout before it began.  */
   1137 				if (ticks == 0) {
   1138 					/*
   1139 					 * Run immediately: destroy the
   1140 					 * callout, put it on the
   1141 					 * queue, and signal the worker
   1142 					 * thread.
   1143 					 */
   1144 					dw_callout_destroy(wq, dw);
   1145 					TAILQ_INSERT_TAIL(&wq->wq_dqueue,
   1146 					    &dw->work, work_entry);
   1147 					cv_broadcast(&wq->wq_cv);
   1148 					SDT_PROBE2(sdt, linux, work, cancel,
   1149 					    &dw->work, wq);
   1150 					SDT_PROBE2(sdt, linux, work, queue,
   1151 					    &dw->work, wq);
   1152 				} else {
   1153 					/*
   1154 					 * Reschedule the callout.  No
   1155 					 * state change.
   1156 					 */
   1157 					callout_schedule(&dw->dw_callout,
   1158 					    MIN(INT_MAX, ticks));
   1159 					SDT_PROBE2(sdt, linux, work, cancel,
   1160 					    &dw->work, wq);
   1161 					SDT_PROBE3(sdt, linux, work, schedule,
   1162 					    dw, wq, ticks);
   1163 				}
   1164 			}
   1165 			timer_modified = true;
   1166 			break;
   1167 		case DELAYED_WORK_RESCHEDULED:
   1168 			/*
   1169 			 * Someone rescheduled it after the callout
   1170 			 * started but before the poor thing even had a
   1171 			 * chance to acquire the lock.
   1172 			 */
   1173 			if (ticks == 0) {
   1174 				/*
   1175 				 * We can just switch back to
   1176 				 * DELAYED_WORK_SCHEDULED so that the
   1177 				 * callout will queue the work as soon
   1178 				 * as it gets the lock.
   1179 				 */
   1180 				dw->dw_state = DELAYED_WORK_SCHEDULED;
   1181 				dw->dw_resched = -1;
   1182 				SDT_PROBE2(sdt, linux, work, cancel,
   1183 				    &dw->work, wq);
   1184 				SDT_PROBE2(sdt, linux, work, queue,
   1185 				    &dw->work, wq);
   1186 			} else {
   1187 				/* Change the rescheduled time.  */
				dw->dw_resched = MIN(INT_MAX, ticks);
   1189 				SDT_PROBE2(sdt, linux, work, cancel,
   1190 				    &dw->work, wq);
   1191 				SDT_PROBE3(sdt, linux, work, schedule,
   1192 				    dw, wq, ticks);
   1193 			}
   1194 			timer_modified = true;
   1195 			break;
   1196 		case DELAYED_WORK_CANCELLED:
   1197 			/*
   1198 			 * Someone cancelled it after the callout
   1199 			 * started but before the poor thing even had a
   1200 			 * chance to acquire the lock.
   1201 			 */
   1202 			if (ticks == 0) {
   1203 				/*
   1204 				 * We can just switch back to
   1205 				 * DELAYED_WORK_SCHEDULED so that the
   1206 				 * callout will queue the work as soon
   1207 				 * as it gets the lock.
   1208 				 */
   1209 				dw->dw_state = DELAYED_WORK_SCHEDULED;
   1210 				SDT_PROBE2(sdt, linux, work, queue,
   1211 				    &dw->work, wq);
   1212 			} else {
   1213 				/* Ask it to reschedule.  */
   1214 				dw->dw_state = DELAYED_WORK_RESCHEDULED;
   1215 				dw->dw_resched = MIN(INT_MAX, ticks);
   1216 				SDT_PROBE3(sdt, linux, work, schedule,
   1217 				    dw, wq, ticks);
   1218 			}
   1219 			timer_modified = false;
   1220 			break;
   1221 		default:
   1222 			panic("invalid delayed work state: %d", dw->dw_state);
   1223 		}
   1224 	}
   1225 	mutex_exit(&wq->wq_lock);
   1226 
   1227 	return timer_modified;
   1228 }
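
/*
 * Example (illustrative only, not part of this file): mod_delayed_work
 * suits a watchdog-style timeout that is pushed further into the
 * future on every bit of activity; sc_timeout and example_activity
 * are hypothetical.
 *
 *	static void
 *	example_activity(struct example_softc *sc)
 *	{
 *
 *		(void)mod_delayed_work(system_wq, &sc->sc_timeout, 5*hz);
 *	}
 */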
   1229 
   1230 /*
   1231  * cancel_delayed_work(dw)
   1232  *
   1233  *	If work was scheduled or queued, remove it from the schedule or
   1234  *	queue and return true.  If work was not scheduled or queued,
   1235  *	return false.  Note that work may already be running; if it
   1236  *	hasn't been rescheduled or requeued, then cancel_delayed_work
   1237  *	will return false, and either way, cancel_delayed_work will NOT
   1238  *	wait for the work to complete.
   1239  */
   1240 bool
   1241 cancel_delayed_work(struct delayed_work *dw)
   1242 {
   1243 	struct workqueue_struct *wq;
   1244 	bool cancelled_p;
   1245 
   1246 	/* If there's no workqueue, nothing to cancel.   */
   1247 	if ((wq = work_queue(&dw->work)) == NULL)
   1248 		return false;
   1249 
   1250 	mutex_enter(&wq->wq_lock);
   1251 	if (__predict_false(work_queue(&dw->work) != wq)) {
   1252 		cancelled_p = false;
   1253 	} else {
   1254 		switch (dw->dw_state) {
   1255 		case DELAYED_WORK_IDLE:
   1256 			/*
   1257 			 * It is either on the queue or already running
   1258 			 * or both.
   1259 			 */
   1260 			if (work_claimed(&dw->work, wq)) {
   1261 				/* On the queue.  Remove and release.  */
   1262 				TAILQ_REMOVE(&wq->wq_dqueue, &dw->work,
   1263 				    work_entry);
   1264 				SDT_PROBE2(sdt, linux, work, cancel,
   1265 				    &dw->work, wq);
   1266 				release_work(&dw->work, wq);
   1267 				/* Can't dereference dw after this point.  */
   1268 				cancelled_p = true;
   1269 			} else {
   1270 				/* Not on the queue, so didn't cancel.  */
   1271 				cancelled_p = false;
   1272 			}
   1273 			break;
   1274 		case DELAYED_WORK_SCHEDULED:
   1275 			/*
   1276 			 * If it is scheduled, mark it cancelled and
   1277 			 * try to stop the callout before it starts.
   1278 			 *
   1279 			 * If it's too late and the callout has already
   1280 			 * begun to execute, tough.
   1281 			 *
   1282 			 * If we stopped the callout before it started,
   1283 			 * however, then destroy the callout and
   1284 			 * dissociate it from the workqueue ourselves.
   1285 			 */
   1286 			dw->dw_state = DELAYED_WORK_CANCELLED;
   1287 			cancelled_p = true;
   1288 			SDT_PROBE2(sdt, linux, work, cancel,  &dw->work, wq);
   1289 			if (!callout_stop(&dw->dw_callout))
   1290 				cancel_delayed_work_done(wq, dw);
   1291 			break;
   1292 		case DELAYED_WORK_RESCHEDULED:
   1293 			/*
   1294 			 * If it is being rescheduled, the callout has
   1295 			 * already fired.  We must ask it to cancel.
   1296 			 */
   1297 			dw->dw_state = DELAYED_WORK_CANCELLED;
   1298 			dw->dw_resched = -1;
   1299 			cancelled_p = true;
   1300 			SDT_PROBE2(sdt, linux, work, cancel,  &dw->work, wq);
   1301 			break;
   1302 		case DELAYED_WORK_CANCELLED:
   1303 			/*
   1304 			 * If it is being cancelled, the callout has
   1305 			 * already fired.  There is nothing more for us
   1306 			 * to do.  Someone else claims credit for
   1307 			 * cancelling it.
   1308 			 */
   1309 			cancelled_p = false;
   1310 			break;
   1311 		default:
   1312 			panic("invalid delayed work state: %d",
   1313 			    dw->dw_state);
   1314 		}
   1315 	}
   1316 	mutex_exit(&wq->wq_lock);
   1317 
   1318 	return cancelled_p;
   1319 }
   1320 
   1321 /*
   1322  * cancel_delayed_work_sync(dw)
   1323  *
   1324  *	If work was scheduled or queued, remove it from the schedule or
   1325  *	queue and return true.  If work was not scheduled or queued,
   1326  *	return false.  Note that work may already be running; if it
 *	hasn't been rescheduled or requeued, then cancel_delayed_work_sync
 *	will return false; either way, wait for it to complete.
   1329  */
   1330 bool
   1331 cancel_delayed_work_sync(struct delayed_work *dw)
   1332 {
   1333 	struct workqueue_struct *wq;
   1334 	bool cancelled_p;
   1335 
   1336 	/* If there's no workqueue, nothing to cancel.  */
   1337 	if ((wq = work_queue(&dw->work)) == NULL)
   1338 		return false;
   1339 
   1340 	mutex_enter(&wq->wq_lock);
   1341 	if (__predict_false(work_queue(&dw->work) != wq)) {
   1342 		cancelled_p = false;
   1343 	} else {
   1344 		switch (dw->dw_state) {
   1345 		case DELAYED_WORK_IDLE:
   1346 			/*
   1347 			 * It is either on the queue or already running
   1348 			 * or both.
   1349 			 */
   1350 			if (work_claimed(&dw->work, wq)) {
   1351 				/* On the queue.  Remove and release.  */
   1352 				TAILQ_REMOVE(&wq->wq_dqueue, &dw->work,
   1353 				    work_entry);
   1354 				SDT_PROBE2(sdt, linux, work, cancel,
   1355 				    &dw->work, wq);
   1356 				release_work(&dw->work, wq);
   1357 				/* Can't dereference dw after this point.  */
   1358 				cancelled_p = true;
   1359 			} else {
   1360 				/* Not on the queue, so didn't cancel. */
   1361 				cancelled_p = false;
   1362 			}
   1363 			/* If it's still running, wait for it to complete.  */
   1364 			if (wq->wq_current_work == &dw->work)
   1365 				wait_for_current_work(&dw->work, wq);
   1366 			break;
   1367 		case DELAYED_WORK_SCHEDULED:
   1368 			/*
   1369 			 * If it is scheduled, mark it cancelled and
   1370 			 * try to stop the callout before it starts.
   1371 			 *
   1372 			 * If it's too late and the callout has already
   1373 			 * begun to execute, we must wait for it to
   1374 			 * complete.  But we got in soon enough to ask
   1375 			 * the callout not to run, so we successfully
   1376 			 * cancelled it in that case.
   1377 			 *
   1378 			 * If we stopped the callout before it started,
   1379 			 * then we must destroy the callout and
   1380 			 * dissociate it from the workqueue ourselves.
   1381 			 */
   1382 			dw->dw_state = DELAYED_WORK_CANCELLED;
   1383 			SDT_PROBE2(sdt, linux, work, cancel,  &dw->work, wq);
   1384 			if (!callout_halt(&dw->dw_callout, &wq->wq_lock))
   1385 				cancel_delayed_work_done(wq, dw);
   1386 			cancelled_p = true;
   1387 			break;
   1388 		case DELAYED_WORK_RESCHEDULED:
   1389 			/*
   1390 			 * If it is being rescheduled, the callout has
   1391 			 * already fired.  We must ask it to cancel and
   1392 			 * wait for it to complete.
   1393 			 */
   1394 			dw->dw_state = DELAYED_WORK_CANCELLED;
   1395 			dw->dw_resched = -1;
   1396 			SDT_PROBE2(sdt, linux, work, cancel,  &dw->work, wq);
   1397 			(void)callout_halt(&dw->dw_callout, &wq->wq_lock);
   1398 			cancelled_p = true;
   1399 			break;
   1400 		case DELAYED_WORK_CANCELLED:
   1401 			/*
   1402 			 * If it is being cancelled, the callout has
   1403 			 * already fired.  We need only wait for it to
   1404 			 * complete.  Someone else, however, claims
   1405 			 * credit for cancelling it.
   1406 			 */
   1407 			(void)callout_halt(&dw->dw_callout, &wq->wq_lock);
   1408 			cancelled_p = false;
   1409 			break;
   1410 		default:
   1411 			panic("invalid delayed work state: %d",
   1412 			    dw->dw_state);
   1413 		}
   1414 	}
   1415 	mutex_exit(&wq->wq_lock);
   1416 
   1417 	return cancelled_p;
   1418 }
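
/*
 * Example (illustrative only, not part of this file): stopping a
 * delayed task for good requires the synchronous cancel, so that
 * neither the callout nor the worker thread is still using sc when
 * this returns.
 *
 *	static void
 *	example_stop(struct example_softc *sc)
 *	{
 *
 *		(void)cancel_delayed_work_sync(&sc->sc_tick);
 *	}
 *
 * If the callback reschedules itself, the caller must also arrange
 * (e.g. with a flag) for it to stop requeueing before relying on a
 * single cancel.
 */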
   1419 
   1420 /*
   1422  * Flush
   1423  */
   1424 
   1425 /*
   1426  * flush_scheduled_work()
   1427  *
   1428  *	Wait for all work queued on system_wq to complete.  This does
   1429  *	not include delayed work.
   1430  */
   1431 void
   1432 flush_scheduled_work(void)
   1433 {
   1434 
   1435 	flush_workqueue(system_wq);
   1436 }
   1437 
   1438 struct flush_work {
   1439 	kmutex_t		fw_lock;
   1440 	kcondvar_t		fw_cv;
   1441 	struct work_struct	fw_work;
   1442 	bool			fw_done;
   1443 };
   1444 
   1445 static void
   1446 flush_work_cb(struct work_struct *work)
   1447 {
   1448 	struct flush_work *fw = container_of(work, struct flush_work, fw_work);
   1449 
   1450 	mutex_enter(&fw->fw_lock);
   1451 	fw->fw_done = true;
   1452 	cv_broadcast(&fw->fw_cv);
   1453 	mutex_exit(&fw->fw_lock);
   1454 }
   1455 
   1456 /*
   1457  * flush_workqueue(wq)
   1458  *
   1459  *	Wait for all work queued on wq to complete.  This does not
   1460  *	include delayed work.
   1461  */
   1462 void
   1463 flush_workqueue(struct workqueue_struct *wq)
   1464 {
   1465 	struct flush_work fw;
   1466 
   1467 	if (lwp_getspecific(workqueue_key) == wq) {
   1468 		SDT_PROBE1(sdt, linux, work, flush__self,  wq);
   1469 		return;
   1470 	}
   1471 
   1472 	mutex_init(&fw.fw_lock, MUTEX_DEFAULT, IPL_VM);
   1473 	cv_init(&fw.fw_cv, "lxwqflsh");
   1474 	INIT_WORK(&fw.fw_work, &flush_work_cb);
   1475 	fw.fw_done = false;
   1476 
   1477 	SDT_PROBE1(sdt, linux, work, flush__start,  wq);
   1478 	queue_work(wq, &fw.fw_work);
   1479 
   1480 	mutex_enter(&fw.fw_lock);
   1481 	while (!fw.fw_done)
   1482 		cv_wait(&fw.fw_cv, &fw.fw_lock);
   1483 	mutex_exit(&fw.fw_lock);
   1484 	SDT_PROBE1(sdt, linux, work, flush__done,  wq);
   1485 
   1486 	KASSERT(fw.fw_done);
   1487 	/* no DESTROY_WORK */
   1488 	cv_destroy(&fw.fw_cv);
   1489 	mutex_destroy(&fw.fw_lock);
   1490 }
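
/*
 * Example (illustrative only, not part of this file): once no new work
 * can be submitted, flush_workqueue serves as a barrier before
 * touching state the callbacks may still be using.  sc_stopping is a
 * hypothetical flag checked by the submission paths before they call
 * queue_work.
 *
 *	static void
 *	example_quiesce(struct example_softc *sc)
 *	{
 *
 *		atomic_store_relaxed(&sc->sc_stopping, true);
 *		flush_workqueue(sc->sc_wq);
 *	}
 */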
   1491 
   1492 /*
   1493  * drain_workqueue(wq)
   1494  *
   1495  *	Repeatedly flush wq until there is no more work.
   1496  */
   1497 void
   1498 drain_workqueue(struct workqueue_struct *wq)
   1499 {
   1500 	unsigned ntries = 0;
   1501 	bool done;
   1502 
   1503 	do {
   1504 		if (ntries++ == 10 || (ntries % 100) == 0)
			printf("linux workqueue %s"
			    ": still clogged after %u flushes\n",
			    wq->wq_name, ntries);
   1508 		flush_workqueue(wq);
   1509 		mutex_enter(&wq->wq_lock);
   1510 		done = wq->wq_current_work == NULL;
   1511 		done &= TAILQ_EMPTY(&wq->wq_queue);
   1512 		done &= TAILQ_EMPTY(&wq->wq_dqueue);
   1513 		mutex_exit(&wq->wq_lock);
   1514 	} while (!done);
   1515 }
   1516 
   1517 /*
   1518  * flush_work(work)
   1519  *
   1520  *	If work is queued or currently executing, wait for it to
   1521  *	complete.
   1522  *
   1523  *	Return true if we waited to flush it, false if it was already
   1524  *	idle.
   1525  */
   1526 bool
   1527 flush_work(struct work_struct *work)
   1528 {
   1529 	struct workqueue_struct *wq;
   1530 
   1531 	/* If there's no workqueue, nothing to flush.  */
   1532 	if ((wq = work_queue(work)) == NULL)
   1533 		return false;
   1534 
   1535 	flush_workqueue(wq);
   1536 	return true;
   1537 }
   1538 
   1539 /*
   1540  * flush_delayed_work(dw)
   1541  *
   1542  *	If dw is scheduled to run after a delay, queue it immediately
   1543  *	instead.  Then, if dw is queued or currently executing, wait
   1544  *	for it to complete.
   1545  */
   1546 bool
   1547 flush_delayed_work(struct delayed_work *dw)
   1548 {
   1549 	struct workqueue_struct *wq;
   1550 	bool waited = false;
   1551 
   1552 	/* If there's no workqueue, nothing to flush.  */
   1553 	if ((wq = work_queue(&dw->work)) == NULL)
   1554 		return false;
   1555 
   1556 	mutex_enter(&wq->wq_lock);
   1557 	if (__predict_false(work_queue(&dw->work) != wq)) {
   1558 		/*
   1559 		 * Moved off the queue already (and possibly to another
   1560 		 * queue, though that would be ill-advised), so it must
   1561 		 * have completed, and we have nothing more to do.
   1562 		 */
   1563 		waited = false;
   1564 	} else {
   1565 		switch (dw->dw_state) {
   1566 		case DELAYED_WORK_IDLE:
   1567 			/*
   1568 			 * It has a workqueue assigned and the callout
   1569 			 * is idle, so it must be in progress or on the
   1570 			 * queue.  In that case, we'll wait for it to
   1571 			 * complete.
   1572 			 */
   1573 			break;
   1574 		case DELAYED_WORK_SCHEDULED:
   1575 		case DELAYED_WORK_RESCHEDULED:
   1576 		case DELAYED_WORK_CANCELLED:
   1577 			/*
   1578 			 * The callout is scheduled, and may have even
   1579 			 * started.  Mark it as scheduled so that if
   1580 			 * the callout has fired it will queue the work
   1581 			 * itself.  Try to stop the callout -- if we
   1582 			 * can, queue the work now; if we can't, wait
   1583 			 * for the callout to complete, which entails
   1584 			 * queueing it.
   1585 			 */
   1586 			dw->dw_state = DELAYED_WORK_SCHEDULED;
   1587 			if (!callout_halt(&dw->dw_callout, &wq->wq_lock)) {
   1588 				/*
   1589 				 * We stopped it before it ran.  No
   1590 				 * state change in the interim is
   1591 				 * possible.  Destroy the callout and
   1592 				 * queue it ourselves.
   1593 				 */
   1594 				KASSERT(dw->dw_state ==
   1595 				    DELAYED_WORK_SCHEDULED);
   1596 				dw_callout_destroy(wq, dw);
   1597 				TAILQ_INSERT_TAIL(&wq->wq_dqueue, &dw->work,
   1598 				    work_entry);
   1599 				cv_broadcast(&wq->wq_cv);
   1600 				SDT_PROBE2(sdt, linux, work, queue,
   1601 				    &dw->work, wq);
   1602 			}
   1603 			break;
   1604 		default:
   1605 			panic("invalid delayed work state: %d", dw->dw_state);
   1606 		}
   1607 		/*
   1608 		 * Waiting for the whole queue to flush is overkill,
   1609 		 * but doesn't hurt.
   1610 		 */
   1611 		mutex_exit(&wq->wq_lock);
   1612 		flush_workqueue(wq);
   1613 		mutex_enter(&wq->wq_lock);
   1614 		waited = true;
   1615 	}
   1616 	mutex_exit(&wq->wq_lock);
   1617 
   1618 	return waited;
   1619 }
   1620 
   1621 /*
   1622  * delayed_work_pending(dw)
   1623  *
   1624  *	True if dw is currently scheduled to execute, false if not.
   1625  */
   1626 bool
   1627 delayed_work_pending(const struct delayed_work *dw)
   1628 {
   1629 
   1630 	return work_pending(&dw->work);
   1631 }
   1632 
   1633 /*
   1634  * INIT_RCU_WORK(rw, fn)
   1635  *
   1636  *	Initialize rw for use with a workqueue to call fn in a worker
   1637  *	thread after an RCU grace period.  There is no corresponding
   1638  *	destruction operation.
   1639  */
   1640 void
   1641 INIT_RCU_WORK(struct rcu_work *rw, void (*fn)(struct work_struct *))
   1642 {
   1643 
   1644 	INIT_WORK(&rw->work, fn);
   1645 }
   1646 
   1647 static void
   1648 queue_rcu_work_cb(struct rcu_head *r)
   1649 {
   1650 	struct rcu_work *rw = container_of(r, struct rcu_work, rw_rcu);
   1651 	struct workqueue_struct *wq = work_queue(&rw->work);
   1652 
   1653 	mutex_enter(&wq->wq_lock);
   1654 	KASSERT(work_pending(&rw->work));
   1655 	KASSERT(work_queue(&rw->work) == wq);
   1656 	destroy_rcu_head(&rw->rw_rcu);
   1657 	TAILQ_REMOVE(&wq->wq_rcu, &rw->work, work_entry);
   1658 	TAILQ_INSERT_TAIL(&wq->wq_queue, &rw->work, work_entry);
   1659 	cv_broadcast(&wq->wq_cv);
   1660 	SDT_PROBE2(sdt, linux, work, queue,  &rw->work, wq);
   1661 	mutex_exit(&wq->wq_lock);
   1662 }
   1663 
   1664 /*
   1665  * queue_rcu_work(wq, rw)
   1666  *
   1667  *	Schedule rw to run on wq after an RCU grace period.
   1668  */
   1669 void
   1670 queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rw)
   1671 {
   1672 
   1673 	mutex_enter(&wq->wq_lock);
   1674 	if (acquire_work(&rw->work, wq)) {
   1675 		init_rcu_head(&rw->rw_rcu);
   1676 		SDT_PROBE2(sdt, linux, work, rcu,  rw, wq);
   1677 		TAILQ_INSERT_TAIL(&wq->wq_rcu, &rw->work, work_entry);
   1678 		call_rcu(&rw->rw_rcu, &queue_rcu_work_cb);
   1679 	}
   1680 	mutex_exit(&wq->wq_lock);
   1681 }
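
/*
 * Example (illustrative only, not part of this file): rcu_work is
 * typically used to free an object only after both an RCU grace
 * period and a pass through the worker thread.  The example_node
 * names are assumptions.
 *
 *	struct example_node {
 *		struct rcu_work		en_rcu_work;
 *	};
 *
 *	static void
 *	example_free_cb(struct work_struct *work)
 *	{
 *		struct example_node *en = container_of(work,
 *		    struct example_node, en_rcu_work.work);
 *
 *		kmem_free(en, sizeof(*en));
 *	}
 *
 *	static void
 *	example_retire(struct example_node *en)
 *	{
 *
 *		INIT_RCU_WORK(&en->en_rcu_work, example_free_cb);
 *		queue_rcu_work(system_wq, &en->en_rcu_work);
 *	}
 */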
   1682