1 /*	$NetBSD: uvm_swap.c,v 1.175.2.3 2018/03/15 09:12:07 pgoyette Exp $	*/
2
3 /*
4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.175.2.3 2018/03/15 09:12:07 pgoyette Exp $");
34
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/buf.h>
42 #include <sys/bufq.h>
43 #include <sys/conf.h>
44 #include <sys/proc.h>
45 #include <sys/namei.h>
46 #include <sys/disklabel.h>
47 #include <sys/errno.h>
48 #include <sys/kernel.h>
49 #include <sys/vnode.h>
50 #include <sys/file.h>
51 #include <sys/vmem.h>
52 #include <sys/blist.h>
53 #include <sys/mount.h>
54 #include <sys/pool.h>
55 #include <sys/kmem.h>
56 #include <sys/syscallargs.h>
57 #include <sys/swap.h>
58 #include <sys/kauth.h>
59 #include <sys/sysctl.h>
60 #include <sys/workqueue.h>
61
62 #include <uvm/uvm.h>
63
64 #include <miscfs/specfs/specdev.h>
65
66 /*
67 * uvm_swap.c: manage configuration and i/o to swap space.
68 */
69
70 /*
71 * swap space is managed in the following way:
72 *
73 * each swap partition or file is described by a "swapdev" structure.
74 * each "swapdev" structure contains a "swapent" structure which contains
75 * information that is passed up to the user (via system calls).
76 *
77 * each swap partition is assigned a "priority" (int) which controls
78  * swap partition usage.
79 *
80 * the system maintains a global data structure describing all swap
81 * partitions/files. there is a sorted LIST of "swappri" structures
82 * which describe "swapdev"'s at that priority. this LIST is headed
83 * by the "swap_priority" global var. each "swappri" contains a
84 * TAILQ of "swapdev" structures at that priority.
85 *
86 * locking:
87 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
88 * system call and prevents the swap priority list from changing
89 * while we are in the middle of a system call (e.g. SWAP_STATS).
90 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
91 * structures including the priority list, the swapdev structures,
92 * and the swapmap arena.
93 *
94 * each swap device has the following info:
95 * - swap device in use (could be disabled, preventing future use)
96 * - swap enabled (allows new allocations on swap)
97 * - map info in /dev/drum
98 * - vnode pointer
99 * for swap files only:
100 * - block size
101 * - max byte count in buffer
102 * - buffer
103 *
104 * userland controls and configures swap with the swapctl(2) system call.
105 * the sys_swapctl performs the following operations:
106 * [1] SWAP_NSWAP: returns the number of swap devices currently configured
107 * [2] SWAP_STATS: given a pointer to an array of swapent structures
108 * (passed in via "arg") of a size passed in via "misc" ... we load
109 * the current swap config into the array. The actual work is done
110 * in the uvm_swap_stats() function.
111 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a
112 * priority in "misc", start swapping on it.
113 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
114 * [5] SWAP_CTL: changes the priority of a swap device (new priority in
115 * "misc")
116 */
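
/*
 * illustrative userland view of the interface described above; a sketch
 * only (not part of this file), assuming the usual <unistd.h> and
 * <sys/swap.h> declarations and a hypothetical /dev/wd0b partition:
 *
 *	swapctl(SWAP_ON, "/dev/wd0b", 5);	 enable it at priority 5
 *	n = swapctl(SWAP_NSWAP, NULL, 0);	 number of configured devices
 *	sep = calloc(n, sizeof(*sep));
 *	swapctl(SWAP_STATS, sep, n);		 fill in up to n swapents
 *	swapctl(SWAP_OFF, "/dev/wd0b", 0);	 stop swapping to it
 */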
117
118 /*
119 * swap device priority entry; the list is kept sorted on `spi_priority'.
120 */
121 struct swappri {
122 int spi_priority; /* priority */
123 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
124 /* tailq of swapdevs at this priority */
125 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
126 };
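
/*
 * rough picture of the layout described above: one sorted LIST of
 * swappri entries, each holding a TAILQ of swapdevs at that priority.
 *
 *	swap_priority -> [pri 0] -> [pri 5] -> [pri 10] -> ...
 *	                   |          |           |
 *	                 swapdev    swapdev     swapdev
 *	                 swapdev                swapdev
 */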
127
128 /*
129 * The following two structures are used to keep track of data transfers
130 * on swap devices associated with regular files.
131 * NOTE: this code is more or less a copy of vnd.c; we use the same
132  * structure names here to ease porting.
133 */
134 struct vndxfer {
135 struct buf *vx_bp; /* Pointer to parent buffer */
136 struct swapdev *vx_sdp;
137 int vx_error;
138 int vx_pending; /* # of pending aux buffers */
139 int vx_flags;
140 #define VX_BUSY 1
141 #define VX_DEAD 2
142 };
143
144 struct vndbuf {
145 struct buf vb_buf;
146 struct vndxfer *vb_xfer;
147 };
148
149 /*
150  * We keep a pool of vndbuf's and vndxfer structures.
151 */
152 static struct pool vndxfer_pool, vndbuf_pool;
153
154 /*
155 * local variables
156 */
157 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
158
159 /* list of all active swap devices [by priority] */
160 LIST_HEAD(swap_priority, swappri);
161 static struct swap_priority swap_priority;
162
163 /* locks */
164 static krwlock_t swap_syscall_lock;
165
166 /* workqueue and use counter for swap to regular files */
167 static int sw_reg_count = 0;
168 static struct workqueue *sw_reg_workqueue;
169
170 /* tuneables */
171 u_int uvm_swapisfull_factor = 99;
172
173 /*
174 * prototypes
175 */
176 static struct swapdev *swapdrum_getsdp(int);
177
178 static struct swapdev *swaplist_find(struct vnode *, bool);
179 static void swaplist_insert(struct swapdev *,
180 struct swappri *, int);
181 static void swaplist_trim(void);
182
183 static int swap_on(struct lwp *, struct swapdev *);
184 static int swap_off(struct lwp *, struct swapdev *);
185
186 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
187 static void sw_reg_biodone(struct buf *);
188 static void sw_reg_iodone(struct work *wk, void *dummy);
189 static void sw_reg_start(struct swapdev *);
190
191 static int uvm_swap_io(struct vm_page **, int, int, int);
192
193 /*
194 * vectored routines for COMPAT_13 and COMPAT_50
195 */
196
197 size_t swapstats_len_13 = 0;
198
199 static void stub_swapstats13_copy(int, int, struct swapdev *,
200 struct swapent13 *);
201 void (*vec_swapstats_copy_13)(int, int, struct swapdev *, struct swapent13 *) =
202 stub_swapstats13_copy;
203
204 size_t swapstats_len_50 = 0;
205
206 static void stub_swapstats50_copy(int, int, struct swapdev *,
207 struct swapent50 *);
208 void (*vec_swapstats_copy_50)(int, int, struct swapdev *, struct swapent50 *) =
209 stub_swapstats50_copy;
210
211 static void
212 stub_swapstats13_copy(int cmd, int inuse, struct swapdev *sdp,
213 struct swapent13 *sep13)
214 {
215
216 /* nothing */
217 }
218
219 static void
220 stub_swapstats50_copy(int cmd, int inuse, struct swapdev *sdp,
221 struct swapent50 *sep50)
222 {
223
224 /* nothing */
225 }
226
227 /*
228 * uvm_swap_init: init the swap system data structures and locks
229 *
230 * => called at boot time from init_main.c after the filesystems
231 * are brought up (which happens after uvm_init())
232 */
233 void
234 uvm_swap_init(void)
235 {
236 UVMHIST_FUNC("uvm_swap_init");
237
238 UVMHIST_CALLED(pdhist);
239 /*
240 * first, init the swap list, its counter, and its lock.
241 * then get a handle on the vnode for /dev/drum by using
242  * its dev_t number ("swapdev", from MD conf.c).
243 */
244
245 LIST_INIT(&swap_priority);
246 uvmexp.nswapdev = 0;
247 rw_init(&swap_syscall_lock);
248 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
249
250 if (bdevvp(swapdev, &swapdev_vp))
251 panic("%s: can't get vnode for swap device", __func__);
252 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
253 panic("%s: can't lock swap device", __func__);
254 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
255 panic("%s: can't open swap device", __func__);
256 VOP_UNLOCK(swapdev_vp);
257
258 /*
259 * create swap block resource map to map /dev/drum. the range
260 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
261 * that block 0 is reserved (used to indicate an allocation
262 * failure, or no allocation).
263 */
264 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
265 VM_NOSLEEP, IPL_NONE);
266 if (swapmap == 0) {
267 panic("%s: vmem_create failed", __func__);
268 }
269
270 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
271 NULL, IPL_BIO);
272 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
273 NULL, IPL_BIO);
274
275 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
276 }
277
278 /*
279 * swaplist functions: functions that operate on the list of swap
280 * devices on the system.
281 */
282
283 /*
284 * swaplist_insert: insert swap device "sdp" into the global list
285 *
286 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
287 * => caller must provide a newly allocated swappri structure (we will
288  *	FREE it if we don't need it... this is to prevent allocation
289 * blocking here while adding swap)
290 */
291 static void
292 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
293 {
294 struct swappri *spp, *pspp;
295 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
296
297 /*
298 * find entry at or after which to insert the new device.
299 */
300 pspp = NULL;
301 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
302 if (priority <= spp->spi_priority)
303 break;
304 pspp = spp;
305 }
306
307 /*
308 * new priority?
309 */
310 if (spp == NULL || spp->spi_priority != priority) {
311 spp = newspp; /* use newspp! */
312 UVMHIST_LOG(pdhist, "created new swappri = %jd",
313 priority, 0, 0, 0);
314
315 spp->spi_priority = priority;
316 TAILQ_INIT(&spp->spi_swapdev);
317
318 if (pspp)
319 LIST_INSERT_AFTER(pspp, spp, spi_swappri);
320 else
321 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
322 } else {
323 /* we don't need a new priority structure, free it */
324 kmem_free(newspp, sizeof(*newspp));
325 }
326
327 /*
328 * priority found (or created). now insert on the priority's
329 * tailq list and bump the total number of swapdevs.
330 */
331 sdp->swd_priority = priority;
332 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
333 uvmexp.nswapdev++;
334 }
335
336 /*
337 * swaplist_find: find and optionally remove a swap device from the
338 * global list.
339 *
340 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
341 * => we return the swapdev we found (and removed)
342 */
343 static struct swapdev *
344 swaplist_find(struct vnode *vp, bool remove)
345 {
346 struct swapdev *sdp;
347 struct swappri *spp;
348
349 /*
350 * search the lists for the requested vp
351 */
352
353 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
354 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
355 if (sdp->swd_vp == vp) {
356 if (remove) {
357 TAILQ_REMOVE(&spp->spi_swapdev,
358 sdp, swd_next);
359 uvmexp.nswapdev--;
360 }
361 return(sdp);
362 }
363 }
364 }
365 return (NULL);
366 }
367
368 /*
369 * swaplist_trim: scan priority list for empty priority entries and kill
370 * them.
371 *
372 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
373 */
374 static void
375 swaplist_trim(void)
376 {
377 struct swappri *spp, *nextspp;
378
379 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
380 if (!TAILQ_EMPTY(&spp->spi_swapdev))
381 continue;
382 LIST_REMOVE(spp, spi_swappri);
383 kmem_free(spp, sizeof(*spp));
384 }
385 }
386
387 /*
388 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
389 * to the "swapdev" that maps that section of the drum.
390 *
391 * => each swapdev takes one big contig chunk of the drum
392 * => caller must hold uvm_swap_data_lock
393 */
394 static struct swapdev *
395 swapdrum_getsdp(int pgno)
396 {
397 struct swapdev *sdp;
398 struct swappri *spp;
399
400 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
401 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
402 if (sdp->swd_flags & SWF_FAKE)
403 continue;
404 if (pgno >= sdp->swd_drumoffset &&
405 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
406 return sdp;
407 }
408 }
409 }
410 return NULL;
411 }
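
/*
 * worked example of the lookup above (hypothetical numbers): a swapdev
 * holding the drum range [swd_drumoffset = 1, swd_drumsize = 1024]
 * matches drum pages 1..1024; page 0 never matches any swapdev since
 * it is reserved to mean "no allocation".
 */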
412
413 void swapsys_lock(krw_t op)
414 {
415 rw_enter(&swap_syscall_lock, op);
416 }
417
418 void swapsys_unlock(void)
419 {
420 rw_exit(&swap_syscall_lock);
421 }
422
423 static void
424 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
425 {
426 se->se_dev = sdp->swd_dev;
427 se->se_flags = sdp->swd_flags;
428 se->se_nblks = sdp->swd_nblks;
429 se->se_inuse = inuse;
430 se->se_priority = sdp->swd_priority;
431 KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
432 strcpy(se->se_path, sdp->swd_path);
433 }
434
435 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
436 (void *)enosys;
437 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
438 (void *)enosys;
439
440 /*
441 * sys_swapctl: main entry point for swapctl(2) system call
442 * [with two helper functions: swap_on and swap_off]
443 */
444 int
445 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
446 {
447 /* {
448 syscallarg(int) cmd;
449 syscallarg(void *) arg;
450 syscallarg(int) misc;
451 } */
452 struct vnode *vp;
453 struct nameidata nd;
454 struct swappri *spp;
455 struct swapdev *sdp;
456 #define SWAP_PATH_MAX (PATH_MAX + 1)
457 char *userpath;
458 size_t len = 0;
459 int error;
460 int priority;
461 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
462
463 /*
464 	 * we handle the non-priv NSWAP and STATS requests first.
465 *
466 * SWAP_NSWAP: return number of config'd swap devices
467 * [can also be obtained with uvmexp sysctl]
468 */
469 if (SCARG(uap, cmd) == SWAP_NSWAP) {
470 const int nswapdev = uvmexp.nswapdev;
471 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
472 0, 0, 0);
473 *retval = nswapdev;
474 return 0;
475 }
476
477 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
478
479 /*
480 * ensure serialized syscall access by grabbing the swap_syscall_lock
481 */
482 rw_enter(&swap_syscall_lock, RW_WRITER);
483
484 /*
485 * SWAP_STATS: get stats on current # of configured swap devs
486 *
487 * note that the swap_priority list can't change as long
488 * as we are holding the swap_syscall_lock. we don't want
489 * to grab the uvm_swap_data_lock because we may fault&sleep during
490 * copyout() and we don't want to be holding that lock then!
491 */
492 switch (SCARG(uap, cmd)) {
493 case SWAP_STATS13:
494 error = (*uvm_swap_stats13)(uap, retval);
495 goto out;
496 case SWAP_STATS50:
497 error = (*uvm_swap_stats50)(uap, retval);
498 goto out;
499 case SWAP_STATS:
500 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
501 NULL, sizeof(struct swapent), retval);
502 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
503 goto out;
504
505 case SWAP_GETDUMPDEV:
506 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
507 goto out;
508 default:
509 break;
510 }
511
512 /*
513 * all other requests require superuser privs. verify.
514 */
515 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
516 0, NULL, NULL, NULL)))
517 goto out;
518
519 if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
520 /* drop the current dump device */
521 dumpdev = NODEV;
522 dumpcdev = NODEV;
523 cpu_dumpconf();
524 goto out;
525 }
526
527 /*
528 * at this point we expect a path name in arg. we will
529 * use namei() to gain a vnode reference (vref), and lock
530 * the vnode (VOP_LOCK).
531 *
532 * XXX: a NULL arg means use the root vnode pointer (e.g. for
533 * miniroot)
534 */
535 if (SCARG(uap, arg) == NULL) {
536 vp = rootvp; /* miniroot */
537 vref(vp);
538 if (vn_lock(vp, LK_EXCLUSIVE)) {
539 vrele(vp);
540 error = EBUSY;
541 goto out;
542 }
543 if (SCARG(uap, cmd) == SWAP_ON &&
544 copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
545 panic("swapctl: miniroot copy failed");
546 } else {
547 struct pathbuf *pb;
548
549 /*
550 * This used to allow copying in one extra byte
551 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
552 * This was completely pointless because if anyone
553 * used that extra byte namei would fail with
554 * ENAMETOOLONG anyway, so I've removed the excess
555 * logic. - dholland 20100215
556 */
557
558 error = pathbuf_copyin(SCARG(uap, arg), &pb);
559 if (error) {
560 goto out;
561 }
562 if (SCARG(uap, cmd) == SWAP_ON) {
563 /* get a copy of the string */
564 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
565 len = strlen(userpath) + 1;
566 }
567 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
568 if ((error = namei(&nd))) {
569 pathbuf_destroy(pb);
570 goto out;
571 }
572 vp = nd.ni_vp;
573 pathbuf_destroy(pb);
574 }
575 /* note: "vp" is referenced and locked */
576
577 error = 0; /* assume no error */
578 switch(SCARG(uap, cmd)) {
579
580 case SWAP_DUMPDEV:
581 if (vp->v_type != VBLK) {
582 error = ENOTBLK;
583 break;
584 }
585 if (bdevsw_lookup(vp->v_rdev)) {
586 dumpdev = vp->v_rdev;
587 dumpcdev = devsw_blk2chr(dumpdev);
588 } else
589 dumpdev = NODEV;
590 cpu_dumpconf();
591 break;
592
593 case SWAP_CTL:
594 /*
595 * get new priority, remove old entry (if any) and then
596 * reinsert it in the correct place. finally, prune out
597 * any empty priority structures.
598 */
599 priority = SCARG(uap, misc);
600 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
601 mutex_enter(&uvm_swap_data_lock);
602 if ((sdp = swaplist_find(vp, true)) == NULL) {
603 error = ENOENT;
604 } else {
605 swaplist_insert(sdp, spp, priority);
606 swaplist_trim();
607 }
608 mutex_exit(&uvm_swap_data_lock);
609 if (error)
610 kmem_free(spp, sizeof(*spp));
611 break;
612
613 case SWAP_ON:
614
615 /*
616 * check for duplicates. if none found, then insert a
617 * dummy entry on the list to prevent someone else from
618 * trying to enable this device while we are working on
619 * it.
620 */
621
622 priority = SCARG(uap, misc);
623 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
624 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
625 sdp->swd_flags = SWF_FAKE;
626 sdp->swd_vp = vp;
627 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
628 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
629 mutex_enter(&uvm_swap_data_lock);
630 if (swaplist_find(vp, false) != NULL) {
631 error = EBUSY;
632 mutex_exit(&uvm_swap_data_lock);
633 bufq_free(sdp->swd_tab);
634 kmem_free(sdp, sizeof(*sdp));
635 kmem_free(spp, sizeof(*spp));
636 break;
637 }
638 swaplist_insert(sdp, spp, priority);
639 mutex_exit(&uvm_swap_data_lock);
640
641 KASSERT(len > 0);
642 sdp->swd_pathlen = len;
643 sdp->swd_path = kmem_alloc(len, KM_SLEEP);
644 if (copystr(userpath, sdp->swd_path, len, 0) != 0)
645 panic("swapctl: copystr");
646
647 /*
648 * we've now got a FAKE placeholder in the swap list.
649 * now attempt to enable swap on it. if we fail, undo
650 * what we've done and kill the fake entry we just inserted.
651 * if swap_on is a success, it will clear the SWF_FAKE flag
652 */
653
654 if ((error = swap_on(l, sdp)) != 0) {
655 mutex_enter(&uvm_swap_data_lock);
656 (void) swaplist_find(vp, true); /* kill fake entry */
657 swaplist_trim();
658 mutex_exit(&uvm_swap_data_lock);
659 bufq_free(sdp->swd_tab);
660 kmem_free(sdp->swd_path, sdp->swd_pathlen);
661 kmem_free(sdp, sizeof(*sdp));
662 break;
663 }
664 break;
665
666 case SWAP_OFF:
667 mutex_enter(&uvm_swap_data_lock);
668 if ((sdp = swaplist_find(vp, false)) == NULL) {
669 mutex_exit(&uvm_swap_data_lock);
670 error = ENXIO;
671 break;
672 }
673
674 /*
675 * If a device isn't in use or enabled, we
676 * can't stop swapping from it (again).
677 */
678 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
679 mutex_exit(&uvm_swap_data_lock);
680 error = EBUSY;
681 break;
682 }
683
684 /*
685 * do the real work.
686 */
687 error = swap_off(l, sdp);
688 break;
689
690 default:
691 error = EINVAL;
692 }
693
694 /*
695 * done! release the ref gained by namei() and unlock.
696 */
697 vput(vp);
698 out:
699 rw_exit(&swap_syscall_lock);
700 kmem_free(userpath, SWAP_PATH_MAX);
701
702 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
703 return (error);
704 }
705
706 /*
707 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
708 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
709 * emulation to use it directly without going through sys_swapctl().
710 * The problem with using sys_swapctl() there is that it involves
711 * copying the swapent array to the stackgap, and this array's size
712 * is not known at build time. Hence it would not be possible to
713 * ensure it would fit in the stackgap in any case.
714 */
715 int
716 uvm_swap_stats(char *ptr, int misc,
717 void (*f)(void *, const struct swapent *), size_t len,
718 register_t *retval)
719 {
720 struct swappri *spp;
721 struct swapdev *sdp;
722 struct swapent sep;
723 int count = 0;
724 int error;
725
726 KASSERT(len <= sizeof(sep));
727 if (len == 0)
728 return ENOSYS;
729
730 if (misc < 0)
731 return EINVAL;
732
733 if (misc == 0 || uvmexp.nswapdev == 0)
734 return 0;
735
736 /* Make sure userland cannot exhaust kernel memory */
737 if ((size_t)misc > (size_t)uvmexp.nswapdev)
738 misc = uvmexp.nswapdev;
739
740 KASSERT(rw_lock_held(&swap_syscall_lock));
741
742 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
743 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
744 int inuse;
745
746 if (misc-- <= 0)
747 break;
748
749 inuse = btodb((uint64_t)sdp->swd_npginuse <<
750 PAGE_SHIFT);
751
752 swapent_cvt(&sep, sdp, inuse);
753 if (f)
754 (*f)(&sep, &sep);
755 if ((error = copyout(&sep, ptr, len)) != 0)
756 return error;
757 ptr += len;
758 count++;
759 }
760 }
761 *retval = count;
762 return 0;
763 }
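
/*
 * note on the "inuse" figure computed above: swd_npginuse counts pages,
 * while se_inuse is reported in DEV_BSIZE disk blocks, hence the
 * << PAGE_SHIFT / btodb() conversion (with 4096-byte pages and 512-byte
 * blocks that is 8 blocks reported per page in use).
 */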
764
765 /*
766 * swap_on: attempt to enable a swapdev for swapping. note that the
767 * swapdev is already on the global list, but disabled (marked
768 * SWF_FAKE).
769 *
770 * => we avoid the start of the disk (to protect disk labels)
771 * => we also avoid the miniroot, if we are swapping to root.
772 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
773 * if needed.
774 */
775 static int
776 swap_on(struct lwp *l, struct swapdev *sdp)
777 {
778 struct vnode *vp;
779 int error, npages, nblocks, size;
780 long addr;
781 vmem_addr_t result;
782 struct vattr va;
783 dev_t dev;
784 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
785
786 /*
787 * we want to enable swapping on sdp. the swd_vp contains
788 * the vnode we want (locked and ref'd), and the swd_dev
789 	 * contains the dev_t of the file, if it is a block device.
790 */
791
792 vp = sdp->swd_vp;
793 dev = sdp->swd_dev;
794
795 /*
796 * open the swap file (mostly useful for block device files to
797 * let device driver know what is up).
798 *
799 * we skip the open/close for root on swap because the root
800 * has already been opened when root was mounted (mountroot).
801 */
802 if (vp != rootvp) {
803 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
804 return (error);
805 }
806
807 /* XXX this only works for block devices */
808 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
809
810 /*
811 * we now need to determine the size of the swap area. for
812 * block specials we can call the d_psize function.
813 * for normal files, we must stat [get attrs].
814 *
815 * we put the result in nblks.
816 * for normal files, we also want the filesystem block size
817 * (which we get with statfs).
818 */
819 switch (vp->v_type) {
820 case VBLK:
821 if ((nblocks = bdev_size(dev)) == -1) {
822 error = ENXIO;
823 goto bad;
824 }
825 break;
826
827 case VREG:
828 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
829 goto bad;
830 nblocks = (int)btodb(va.va_size);
831 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
832 /*
833 * limit the max # of outstanding I/O requests we issue
834 * at any one time. take it easy on NFS servers.
835 */
836 if (vp->v_tag == VT_NFS)
837 sdp->swd_maxactive = 2; /* XXX */
838 else
839 sdp->swd_maxactive = 8; /* XXX */
840 break;
841
842 default:
843 error = ENXIO;
844 goto bad;
845 }
846
847 /*
848 * save nblocks in a safe place and convert to pages.
849 */
850
851 sdp->swd_nblks = nblocks;
852 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
853
854 /*
855 	 * for block special files, we want to make sure that we leave
856 * the disklabel and bootblocks alone, so we arrange to skip
857 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
858 * note that because of this the "size" can be less than the
859 * actual number of blocks on the device.
860 */
861 if (vp->v_type == VBLK) {
862 /* we use pages 1 to (size - 1) [inclusive] */
863 size = npages - 1;
864 addr = 1;
865 } else {
866 /* we use pages 0 to (size - 1) [inclusive] */
867 size = npages;
868 addr = 0;
869 }
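	/*
	 * e.g. (hypothetical numbers) a block device with npages = 1024
	 * ends up with addr = 1, size = 1023: drum pages map onto device
	 * pages 1..1023 and the first PAGE_SIZE bytes stay untouched.
	 */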
870
871 /*
872 * make sure we have enough blocks for a reasonable sized swap
873 * area. we want at least one page.
874 */
875
876 if (size < 1) {
877 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
878 error = EINVAL;
879 goto bad;
880 }
881
882 UVMHIST_LOG(pdhist, " dev=%jx: size=%jd addr=%jd", dev, size, addr, 0);
883
884 /*
885 * now we need to allocate an extent to manage this swap device
886 */
887
888 sdp->swd_blist = blist_create(npages);
889 	/* mark all except the `saved' region free. */
890 blist_free(sdp->swd_blist, addr, size);
891
892 /*
893 * if the vnode we are swapping to is the root vnode
894 * (i.e. we are swapping to the miniroot) then we want
895 * to make sure we don't overwrite it. do a statfs to
896 * find its size and skip over it.
897 */
898 if (vp == rootvp) {
899 struct mount *mp;
900 struct statvfs *sp;
901 int rootblocks, rootpages;
902
903 mp = rootvnode->v_mount;
904 sp = &mp->mnt_stat;
905 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
906 /*
907 * XXX: sp->f_blocks isn't the total number of
908 * blocks in the filesystem, it's the number of
909 * data blocks. so, our rootblocks almost
910 * definitely underestimates the total size
911 * of the filesystem - how badly depends on the
912 * details of the filesystem type. there isn't
913 * an obvious way to deal with this cleanly
914 * and perfectly, so for now we just pad our
915 * rootblocks estimate with an extra 5 percent.
916 */
917 rootblocks += (rootblocks >> 5) +
918 (rootblocks >> 6) +
919 (rootblocks >> 7);
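		/*
		 * the three shifts add roughly 1/32 + 1/64 + 1/128 (about
		 * 5.5%); e.g. (hypothetical) rootblocks = 100000 becomes
		 * 105468 before being rounded up to whole pages below.
		 */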
920 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
921 if (rootpages > size)
922 panic("swap_on: miniroot larger than swap?");
923
924 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
925 panic("swap_on: unable to preserve miniroot");
926 }
927
928 size -= rootpages;
929 printf("Preserved %d pages of miniroot ", rootpages);
930 printf("leaving %d pages of swap\n", size);
931 }
932
933 /*
934 * add a ref to vp to reflect usage as a swap device.
935 */
936 vref(vp);
937
938 /*
939 * now add the new swapdev to the drum and enable.
940 */
941 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
942 if (error != 0)
943 panic("swapdrum_add");
944 /*
945 * If this is the first regular swap create the workqueue.
946 * => Protected by swap_syscall_lock.
947 */
948 if (vp->v_type != VBLK) {
949 if (sw_reg_count++ == 0) {
950 KASSERT(sw_reg_workqueue == NULL);
951 if (workqueue_create(&sw_reg_workqueue, "swapiod",
952 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
953 panic("%s: workqueue_create failed", __func__);
954 }
955 }
956
957 sdp->swd_drumoffset = (int)result;
958 sdp->swd_drumsize = npages;
959 sdp->swd_npages = size;
960 mutex_enter(&uvm_swap_data_lock);
961 sdp->swd_flags &= ~SWF_FAKE; /* going live */
962 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
963 uvmexp.swpages += size;
964 uvmexp.swpgavail += size;
965 mutex_exit(&uvm_swap_data_lock);
966 return (0);
967
968 /*
969 * failure: clean up and return error.
970 */
971
972 bad:
973 if (sdp->swd_blist) {
974 blist_destroy(sdp->swd_blist);
975 }
976 if (vp != rootvp) {
977 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
978 }
979 return (error);
980 }
981
982 /*
983 * swap_off: stop swapping on swapdev
984 *
985 * => swap data should be locked, we will unlock.
986 */
987 static int
988 swap_off(struct lwp *l, struct swapdev *sdp)
989 {
990 int npages = sdp->swd_npages;
991 int error = 0;
992
993 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
994 UVMHIST_LOG(pdhist, " dev=%jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
995
996 /* disable the swap area being removed */
997 sdp->swd_flags &= ~SWF_ENABLE;
998 uvmexp.swpgavail -= npages;
999 mutex_exit(&uvm_swap_data_lock);
1000
1001 /*
1002 * the idea is to find all the pages that are paged out to this
1003 * device, and page them all in. in uvm, swap-backed pageable
1004 * memory can take two forms: aobjs and anons. call the
1005 * swapoff hook for each subsystem to bring in pages.
1006 */
1007
1008 if (uao_swap_off(sdp->swd_drumoffset,
1009 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1010 amap_swap_off(sdp->swd_drumoffset,
1011 sdp->swd_drumoffset + sdp->swd_drumsize)) {
1012 error = ENOMEM;
1013 } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1014 error = EBUSY;
1015 }
1016
1017 if (error) {
1018 mutex_enter(&uvm_swap_data_lock);
1019 sdp->swd_flags |= SWF_ENABLE;
1020 uvmexp.swpgavail += npages;
1021 mutex_exit(&uvm_swap_data_lock);
1022
1023 return error;
1024 }
1025
1026 /*
1027 * If this is the last regular swap destroy the workqueue.
1028 * => Protected by swap_syscall_lock.
1029 */
1030 if (sdp->swd_vp->v_type != VBLK) {
1031 KASSERT(sw_reg_count > 0);
1032 KASSERT(sw_reg_workqueue != NULL);
1033 if (--sw_reg_count == 0) {
1034 workqueue_destroy(sw_reg_workqueue);
1035 sw_reg_workqueue = NULL;
1036 }
1037 }
1038
1039 /*
1040 * done with the vnode.
1041 * drop our ref on the vnode before calling VOP_CLOSE()
1042 * so that spec_close() can tell if this is the last close.
1043 */
1044 vrele(sdp->swd_vp);
1045 if (sdp->swd_vp != rootvp) {
1046 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1047 }
1048
1049 mutex_enter(&uvm_swap_data_lock);
1050 uvmexp.swpages -= npages;
1051 uvmexp.swpginuse -= sdp->swd_npgbad;
1052
1053 if (swaplist_find(sdp->swd_vp, true) == NULL)
1054 panic("%s: swapdev not in list", __func__);
1055 swaplist_trim();
1056 mutex_exit(&uvm_swap_data_lock);
1057
1058 /*
1059 * free all resources!
1060 */
1061 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1062 blist_destroy(sdp->swd_blist);
1063 bufq_free(sdp->swd_tab);
1064 kmem_free(sdp, sizeof(*sdp));
1065 return (0);
1066 }
1067
1068 void
1069 uvm_swap_shutdown(struct lwp *l)
1070 {
1071 struct swapdev *sdp;
1072 struct swappri *spp;
1073 struct vnode *vp;
1074 int error;
1075
1076 printf("turning of swap...");
1077 rw_enter(&swap_syscall_lock, RW_WRITER);
1078 mutex_enter(&uvm_swap_data_lock);
1079 again:
1080 LIST_FOREACH(spp, &swap_priority, spi_swappri)
1081 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1082 if (sdp->swd_flags & SWF_FAKE)
1083 continue;
1084 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
1085 continue;
1086 #ifdef DEBUG
1087 printf("\nturning off swap on %s...",
1088 sdp->swd_path);
1089 #endif
1090 if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) {
1091 error = EBUSY;
1092 vp = NULL;
1093 } else
1094 error = 0;
1095 if (!error) {
1096 error = swap_off(l, sdp);
1097 mutex_enter(&uvm_swap_data_lock);
1098 }
1099 if (error) {
1100 printf("stopping swap on %s failed "
1101 "with error %d\n", sdp->swd_path, error);
1102 TAILQ_REMOVE(&spp->spi_swapdev, sdp,
1103 swd_next);
1104 uvmexp.nswapdev--;
1105 swaplist_trim();
1106 if (vp)
1107 vput(vp);
1108 }
1109 goto again;
1110 }
1111 printf(" done\n");
1112 mutex_exit(&uvm_swap_data_lock);
1113 rw_exit(&swap_syscall_lock);
1114 }
1115
1116
1117 /*
1118 * /dev/drum interface and i/o functions
1119 */
1120
1121 /*
1122 * swstrategy: perform I/O on the drum
1123 *
1124 * => we must map the i/o request from the drum to the correct swapdev.
1125 */
1126 static void
1127 swstrategy(struct buf *bp)
1128 {
1129 struct swapdev *sdp;
1130 struct vnode *vp;
1131 int pageno, bn;
1132 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1133
1134 /*
1135 * convert block number to swapdev. note that swapdev can't
1136 * be yanked out from under us because we are holding resources
1137 * in it (i.e. the blocks we are doing I/O on).
1138 */
1139 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1140 mutex_enter(&uvm_swap_data_lock);
1141 sdp = swapdrum_getsdp(pageno);
1142 mutex_exit(&uvm_swap_data_lock);
1143 if (sdp == NULL) {
1144 bp->b_error = EINVAL;
1145 bp->b_resid = bp->b_bcount;
1146 biodone(bp);
1147 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
1148 return;
1149 }
1150
1151 /*
1152 * convert drum page number to block number on this swapdev.
1153 */
1154
1155 pageno -= sdp->swd_drumoffset; /* page # on swapdev */
1156 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
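	/*
	 * e.g. with 4096-byte pages and 512-byte disk blocks (typical but
	 * hypothetical here), swapdev page 10 becomes bn = 80.
	 */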
1157
1158 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd",
1159 ((bp->b_flags & B_READ) == 0) ? 1 : 0,
1160 sdp->swd_drumoffset, bn, bp->b_bcount);
1161
1162 /*
1163 * for block devices we finish up here.
1164 * for regular files we have to do more work which we delegate
1165 * to sw_reg_strategy().
1166 */
1167
1168 vp = sdp->swd_vp; /* swapdev vnode pointer */
1169 switch (vp->v_type) {
1170 default:
1171 panic("%s: vnode type 0x%x", __func__, vp->v_type);
1172
1173 case VBLK:
1174
1175 /*
1176 * must convert "bp" from an I/O on /dev/drum to an I/O
1177 * on the swapdev (sdp).
1178 */
1179 bp->b_blkno = bn; /* swapdev block number */
1180 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
1181
1182 /*
1183 * if we are doing a write, we have to redirect the i/o on
1184 * drum's v_numoutput counter to the swapdevs.
1185 */
1186 if ((bp->b_flags & B_READ) == 0) {
1187 mutex_enter(bp->b_objlock);
1188 vwakeup(bp); /* kills one 'v_numoutput' on drum */
1189 mutex_exit(bp->b_objlock);
1190 mutex_enter(vp->v_interlock);
1191 vp->v_numoutput++; /* put it on swapdev */
1192 mutex_exit(vp->v_interlock);
1193 }
1194
1195 /*
1196 * finally plug in swapdev vnode and start I/O
1197 */
1198 bp->b_vp = vp;
1199 bp->b_objlock = vp->v_interlock;
1200 VOP_STRATEGY(vp, bp);
1201 return;
1202
1203 case VREG:
1204 /*
1205 * delegate to sw_reg_strategy function.
1206 */
1207 sw_reg_strategy(sdp, bp, bn);
1208 return;
1209 }
1210 /* NOTREACHED */
1211 }
1212
1213 /*
1214 * swread: the read function for the drum (just a call to physio)
1215 */
1216 /*ARGSUSED*/
1217 static int
1218 swread(dev_t dev, struct uio *uio, int ioflag)
1219 {
1220 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1221
1222 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
1223 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1224 }
1225
1226 /*
1227 * swwrite: the write function for the drum (just a call to physio)
1228 */
1229 /*ARGSUSED*/
1230 static int
1231 swwrite(dev_t dev, struct uio *uio, int ioflag)
1232 {
1233 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1234
1235 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
1236 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1237 }
1238
1239 const struct bdevsw swap_bdevsw = {
1240 .d_open = nullopen,
1241 .d_close = nullclose,
1242 .d_strategy = swstrategy,
1243 .d_ioctl = noioctl,
1244 .d_dump = nodump,
1245 .d_psize = nosize,
1246 .d_discard = nodiscard,
1247 .d_flag = D_OTHER
1248 };
1249
1250 const struct cdevsw swap_cdevsw = {
1251 .d_open = nullopen,
1252 .d_close = nullclose,
1253 .d_read = swread,
1254 .d_write = swwrite,
1255 .d_ioctl = noioctl,
1256 .d_stop = nostop,
1257 .d_tty = notty,
1258 .d_poll = nopoll,
1259 .d_mmap = nommap,
1260 .d_kqfilter = nokqfilter,
1261 .d_discard = nodiscard,
1262 .d_flag = D_OTHER,
1263 };
1264
1265 /*
1266 * sw_reg_strategy: handle swap i/o to regular files
1267 */
1268 static void
1269 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1270 {
1271 struct vnode *vp;
1272 struct vndxfer *vnx;
1273 daddr_t nbn;
1274 char *addr;
1275 off_t byteoff;
1276 int s, off, nra, error, sz, resid;
1277 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1278
1279 /*
1280 * allocate a vndxfer head for this transfer and point it to
1281 * our buffer.
1282 */
1283 vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1284 vnx->vx_flags = VX_BUSY;
1285 vnx->vx_error = 0;
1286 vnx->vx_pending = 0;
1287 vnx->vx_bp = bp;
1288 vnx->vx_sdp = sdp;
1289
1290 /*
1291 * setup for main loop where we read filesystem blocks into
1292 * our buffer.
1293 */
1294 error = 0;
1295 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1296 addr = bp->b_data; /* current position in buffer */
1297 byteoff = dbtob((uint64_t)bn);
1298
1299 for (resid = bp->b_resid; resid; resid -= sz) {
1300 struct vndbuf *nbp;
1301
1302 /*
1303 * translate byteoffset into block number. return values:
1304 * vp = vnode of underlying device
1305 * nbn = new block number (on underlying vnode dev)
1306 * nra = num blocks we can read-ahead (excludes requested
1307 * block)
1308 */
1309 nra = 0;
1310 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1311 &vp, &nbn, &nra);
1312
1313 if (error == 0 && nbn == (daddr_t)-1) {
1314 /*
1315 * this used to just set error, but that doesn't
1316 * do the right thing. Instead, it causes random
1317 * memory errors. The panic() should remain until
1318 * this condition doesn't destabilize the system.
1319 */
1320 #if 1
1321 panic("%s: swap to sparse file", __func__);
1322 #else
1323 error = EIO; /* failure */
1324 #endif
1325 }
1326
1327 /*
1328 * punt if there was an error or a hole in the file.
1329 * we must wait for any i/o ops we have already started
1330 * to finish before returning.
1331 *
1332 * XXX we could deal with holes here but it would be
1333 * a hassle (in the write case).
1334 */
1335 if (error) {
1336 s = splbio();
1337 vnx->vx_error = error; /* pass error up */
1338 goto out;
1339 }
1340
1341 /*
1342 * compute the size ("sz") of this transfer (in bytes).
1343 */
1344 off = byteoff % sdp->swd_bsize;
1345 sz = (1 + nra) * sdp->swd_bsize - off;
1346 if (sz > resid)
1347 sz = resid;
1348
1349 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1350 "vp %#jx/%#jx offset 0x%jx/0x%jx",
1351 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1352
1353 /*
1354 * now get a buf structure. note that the vb_buf is
1355 * at the front of the nbp structure so that you can
1356 		 * cast pointers between the two structures easily.
1357 */
1358 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1359 buf_init(&nbp->vb_buf);
1360 nbp->vb_buf.b_flags = bp->b_flags;
1361 nbp->vb_buf.b_cflags = bp->b_cflags;
1362 nbp->vb_buf.b_oflags = bp->b_oflags;
1363 nbp->vb_buf.b_bcount = sz;
1364 nbp->vb_buf.b_bufsize = sz;
1365 nbp->vb_buf.b_error = 0;
1366 nbp->vb_buf.b_data = addr;
1367 nbp->vb_buf.b_lblkno = 0;
1368 nbp->vb_buf.b_blkno = nbn + btodb(off);
1369 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1370 nbp->vb_buf.b_iodone = sw_reg_biodone;
1371 nbp->vb_buf.b_vp = vp;
1372 nbp->vb_buf.b_objlock = vp->v_interlock;
1373 if (vp->v_type == VBLK) {
1374 nbp->vb_buf.b_dev = vp->v_rdev;
1375 }
1376
1377 nbp->vb_xfer = vnx; /* patch it back in to vnx */
1378
1379 /*
1380 * Just sort by block number
1381 */
1382 s = splbio();
1383 if (vnx->vx_error != 0) {
1384 buf_destroy(&nbp->vb_buf);
1385 pool_put(&vndbuf_pool, nbp);
1386 goto out;
1387 }
1388 vnx->vx_pending++;
1389
1390 /* sort it in and start I/O if we are not over our limit */
1391 /* XXXAD locking */
1392 bufq_put(sdp->swd_tab, &nbp->vb_buf);
1393 sw_reg_start(sdp);
1394 splx(s);
1395
1396 /*
1397 * advance to the next I/O
1398 */
1399 byteoff += sz;
1400 addr += sz;
1401 }
1402
1403 s = splbio();
1404
1405 out: /* Arrive here at splbio */
1406 vnx->vx_flags &= ~VX_BUSY;
1407 if (vnx->vx_pending == 0) {
1408 error = vnx->vx_error;
1409 pool_put(&vndxfer_pool, vnx);
1410 bp->b_error = error;
1411 biodone(bp);
1412 }
1413 splx(s);
1414 }
1415
1416 /*
1417 * sw_reg_start: start an I/O request on the requested swapdev
1418 *
1419 * => reqs are sorted by b_rawblkno (above)
1420 */
1421 static void
1422 sw_reg_start(struct swapdev *sdp)
1423 {
1424 struct buf *bp;
1425 struct vnode *vp;
1426 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1427
1428 /* recursion control */
1429 if ((sdp->swd_flags & SWF_BUSY) != 0)
1430 return;
1431
1432 sdp->swd_flags |= SWF_BUSY;
1433
1434 while (sdp->swd_active < sdp->swd_maxactive) {
1435 bp = bufq_get(sdp->swd_tab);
1436 if (bp == NULL)
1437 break;
1438 sdp->swd_active++;
1439
1440 UVMHIST_LOG(pdhist,
1441 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %jx",
1442 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1443 bp->b_bcount);
1444 vp = bp->b_vp;
1445 KASSERT(bp->b_objlock == vp->v_interlock);
1446 if ((bp->b_flags & B_READ) == 0) {
1447 mutex_enter(vp->v_interlock);
1448 vp->v_numoutput++;
1449 mutex_exit(vp->v_interlock);
1450 }
1451 VOP_STRATEGY(vp, bp);
1452 }
1453 sdp->swd_flags &= ~SWF_BUSY;
1454 }
1455
1456 /*
1457 * sw_reg_biodone: one of our i/o's has completed
1458 */
1459 static void
1460 sw_reg_biodone(struct buf *bp)
1461 {
1462 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1463 }
1464
1465 /*
1466 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1467 *
1468 * => note that we can recover the vndbuf struct by casting the buf ptr
1469 */
1470 static void
1471 sw_reg_iodone(struct work *wk, void *dummy)
1472 {
1473 struct vndbuf *vbp = (void *)wk;
1474 struct vndxfer *vnx = vbp->vb_xfer;
1475 struct buf *pbp = vnx->vx_bp; /* parent buffer */
1476 struct swapdev *sdp = vnx->vx_sdp;
1477 int s, resid, error;
1478 KASSERT(&vbp->vb_buf.b_work == wk);
1479 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1480
1481 UVMHIST_LOG(pdhist, " vbp=%#jx vp=%#jx blkno=%jx addr=%#jx",
1482 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1483 (uintptr_t)vbp->vb_buf.b_data);
1484 UVMHIST_LOG(pdhist, " cnt=%jx resid=%jx",
1485 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1486
1487 /*
1488 * protect vbp at splbio and update.
1489 */
1490
1491 s = splbio();
1492 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1493 pbp->b_resid -= resid;
1494 vnx->vx_pending--;
1495
1496 if (vbp->vb_buf.b_error != 0) {
1497 /* pass error upward */
1498 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1499 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
1500 vnx->vx_error = error;
1501 }
1502
1503 /*
1504 * kill vbp structure
1505 */
1506 buf_destroy(&vbp->vb_buf);
1507 pool_put(&vndbuf_pool, vbp);
1508
1509 /*
1510 * wrap up this transaction if it has run to completion or, in
1511 * case of an error, when all auxiliary buffers have returned.
1512 */
1513 if (vnx->vx_error != 0) {
1514 /* pass error upward */
1515 error = vnx->vx_error;
1516 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1517 pbp->b_error = error;
1518 biodone(pbp);
1519 pool_put(&vndxfer_pool, vnx);
1520 }
1521 } else if (pbp->b_resid == 0) {
1522 KASSERT(vnx->vx_pending == 0);
1523 if ((vnx->vx_flags & VX_BUSY) == 0) {
1524 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
1525 (uintptr_t)pbp, vnx->vx_error, 0, 0);
1526 biodone(pbp);
1527 pool_put(&vndxfer_pool, vnx);
1528 }
1529 }
1530
1531 /*
1532 * done! start next swapdev I/O if one is pending
1533 */
1534 sdp->swd_active--;
1535 sw_reg_start(sdp);
1536 splx(s);
1537 }
1538
1539
1540 /*
1541 * uvm_swap_alloc: allocate space on swap
1542 *
1543 * => allocation is done "round robin" down the priority list, as we
1544 * allocate in a priority we "rotate" the circle queue.
1545 * => space can be freed with uvm_swap_free
1546 * => we return the page slot number in /dev/drum (0 == invalid slot)
1547 * => we lock uvm_swap_data_lock
1548 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1549 */
1550 int
1551 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1552 {
1553 struct swapdev *sdp;
1554 struct swappri *spp;
1555 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1556
1557 /*
1558 * no swap devices configured yet? definite failure.
1559 */
1560 if (uvmexp.nswapdev < 1)
1561 return 0;
1562
1563 /*
1564 * XXXJAK: BEGIN HACK
1565 *
1566 * blist_alloc() in subr_blist.c will panic if we try to allocate
1567 * too many slots.
1568 */
1569 if (*nslots > BLIST_MAX_ALLOC) {
1570 if (__predict_false(lessok == false))
1571 return 0;
1572 *nslots = BLIST_MAX_ALLOC;
1573 }
1574 /* XXXJAK: END HACK */
1575
1576 /*
1577 * lock data lock, convert slots into blocks, and enter loop
1578 */
1579 mutex_enter(&uvm_swap_data_lock);
1580
1581 ReTry: /* XXXMRG */
1582 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1583 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1584 uint64_t result;
1585
1586 /* if it's not enabled, then we can't swap from it */
1587 if ((sdp->swd_flags & SWF_ENABLE) == 0)
1588 continue;
1589 if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1590 continue;
1591 result = blist_alloc(sdp->swd_blist, *nslots);
1592 if (result == BLIST_NONE) {
1593 continue;
1594 }
1595 KASSERT(result < sdp->swd_drumsize);
1596
1597 /*
1598 * successful allocation! now rotate the tailq.
1599 */
1600 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1601 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1602 sdp->swd_npginuse += *nslots;
1603 uvmexp.swpginuse += *nslots;
1604 mutex_exit(&uvm_swap_data_lock);
1605 /* done! return drum slot number */
1606 UVMHIST_LOG(pdhist,
1607 "success! returning %jd slots starting at %jd",
1608 *nslots, result + sdp->swd_drumoffset, 0, 0);
1609 return (result + sdp->swd_drumoffset);
1610 }
1611 }
1612
1613 /* XXXMRG: BEGIN HACK */
1614 if (*nslots > 1 && lessok) {
1615 *nslots = 1;
1616 /* XXXMRG: ugh! blist should support this for us */
1617 goto ReTry;
1618 }
1619 /* XXXMRG: END HACK */
1620
1621 mutex_exit(&uvm_swap_data_lock);
1622 return 0;
1623 }
1624
1625 /*
1626 * uvm_swapisfull: return true if most of available swap is allocated
1627 * and in use. we don't count some small portion as it may be inaccessible
1628 * to us at any given moment, for example if there is lock contention or if
1629 * pages are busy.
1630 */
1631 bool
1632 uvm_swapisfull(void)
1633 {
1634 int swpgonly;
1635 bool rv;
1636
1637 mutex_enter(&uvm_swap_data_lock);
1638 KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1639 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1640 uvm_swapisfull_factor);
1641 rv = (swpgonly >= uvmexp.swpgavail);
1642 mutex_exit(&uvm_swap_data_lock);
1643
1644 return (rv);
1645 }
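
/*
 * e.g. with the default uvm_swapisfull_factor of 99 and hypothetical
 * counts swpgavail = 1000: swpgonly = 991 gives 991 * 100 / 99 = 1001,
 * which is >= 1000, so swap is considered full; swpgonly = 989 is not.
 */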
1646
1647 /*
1648 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1649 *
1650 * => we lock uvm_swap_data_lock
1651 */
1652 void
1653 uvm_swap_markbad(int startslot, int nslots)
1654 {
1655 struct swapdev *sdp;
1656 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1657
1658 mutex_enter(&uvm_swap_data_lock);
1659 sdp = swapdrum_getsdp(startslot);
1660 KASSERT(sdp != NULL);
1661
1662 /*
1663 * we just keep track of how many pages have been marked bad
1664 * in this device, to make everything add up in swap_off().
1665 * we assume here that the range of slots will all be within
1666 * one swap device.
1667 */
1668
1669 KASSERT(uvmexp.swpgonly >= nslots);
1670 uvmexp.swpgonly -= nslots;
1671 sdp->swd_npgbad += nslots;
1672 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1673 mutex_exit(&uvm_swap_data_lock);
1674 }
1675
1676 /*
1677 * uvm_swap_free: free swap slots
1678 *
1679 * => this can be all or part of an allocation made by uvm_swap_alloc
1680 * => we lock uvm_swap_data_lock
1681 */
1682 void
1683 uvm_swap_free(int startslot, int nslots)
1684 {
1685 struct swapdev *sdp;
1686 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1687
1688 UVMHIST_LOG(pdhist, "freeing %jd slots starting at %jd", nslots,
1689 startslot, 0, 0);
1690
1691 /*
1692 * ignore attempts to free the "bad" slot.
1693 */
1694
1695 if (startslot == SWSLOT_BAD) {
1696 return;
1697 }
1698
1699 /*
1700 * convert drum slot offset back to sdp, free the blocks
1701 * in the extent, and return. must hold pri lock to do
1702 * lookup and access the extent.
1703 */
1704
1705 mutex_enter(&uvm_swap_data_lock);
1706 sdp = swapdrum_getsdp(startslot);
1707 KASSERT(uvmexp.nswapdev >= 1);
1708 KASSERT(sdp != NULL);
1709 KASSERT(sdp->swd_npginuse >= nslots);
1710 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1711 sdp->swd_npginuse -= nslots;
1712 uvmexp.swpginuse -= nslots;
1713 mutex_exit(&uvm_swap_data_lock);
1714 }
1715
1716 /*
1717 * uvm_swap_put: put any number of pages into a contig place on swap
1718 *
1719 * => can be sync or async
1720 */
1721
1722 int
1723 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1724 {
1725 int error;
1726
1727 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1728 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1729 return error;
1730 }
1731
1732 /*
1733 * uvm_swap_get: get a single page from swap
1734 *
1735 * => usually a sync op (from fault)
1736 */
1737
1738 int
1739 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1740 {
1741 int error;
1742
1743 uvmexp.nswget++;
1744 KASSERT(flags & PGO_SYNCIO);
1745 if (swslot == SWSLOT_BAD) {
1746 return EIO;
1747 }
1748
1749 error = uvm_swap_io(&page, swslot, 1, B_READ |
1750 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1751 if (error == 0) {
1752
1753 /*
1754 * this page is no longer only in swap.
1755 */
1756
1757 mutex_enter(&uvm_swap_data_lock);
1758 KASSERT(uvmexp.swpgonly > 0);
1759 uvmexp.swpgonly--;
1760 mutex_exit(&uvm_swap_data_lock);
1761 }
1762 return error;
1763 }
1764
1765 /*
1766 * uvm_swap_io: do an i/o operation to swap
1767 */
1768
1769 static int
1770 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1771 {
1772 daddr_t startblk;
1773 struct buf *bp;
1774 vaddr_t kva;
1775 int error, mapinflags;
1776 bool write, async;
1777 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1778
1779 UVMHIST_LOG(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd",
1780 startslot, npages, flags, 0);
1781
1782 write = (flags & B_READ) == 0;
1783 async = (flags & B_ASYNC) != 0;
1784
1785 /*
1786 * allocate a buf for the i/o.
1787 */
1788
1789 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1790 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1791 if (bp == NULL) {
1792 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1793 return ENOMEM;
1794 }
1795
1796 /*
1797 * convert starting drum slot to block number
1798 */
1799
1800 startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1801
1802 /*
1803 * first, map the pages into the kernel.
1804 */
1805
1806 mapinflags = !write ?
1807 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1808 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1809 kva = uvm_pagermapin(pps, npages, mapinflags);
1810
1811 /*
1812 * fill in the bp/sbp. we currently route our i/o through
1813 * /dev/drum's vnode [swapdev_vp].
1814 */
1815
1816 bp->b_cflags = BC_BUSY | BC_NOCACHE;
1817 bp->b_flags = (flags & (B_READ|B_ASYNC));
1818 bp->b_proc = &proc0; /* XXX */
1819 bp->b_vnbufs.le_next = NOLIST;
1820 bp->b_data = (void *)kva;
1821 bp->b_blkno = startblk;
1822 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1823
1824 /*
1825 * bump v_numoutput (counter of number of active outputs).
1826 */
1827
1828 if (write) {
1829 mutex_enter(swapdev_vp->v_interlock);
1830 swapdev_vp->v_numoutput++;
1831 mutex_exit(swapdev_vp->v_interlock);
1832 }
1833
1834 /*
1835 * for async ops we must set up the iodone handler.
1836 */
1837
1838 if (async) {
1839 bp->b_iodone = uvm_aio_biodone;
1840 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1841 if (curlwp == uvm.pagedaemon_lwp)
1842 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1843 else
1844 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1845 } else {
1846 bp->b_iodone = NULL;
1847 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1848 }
1849 UVMHIST_LOG(pdhist,
1850 "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd",
1851 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1852
1853 /*
1854 * now we start the I/O, and if async, return.
1855 */
1856
1857 VOP_STRATEGY(swapdev_vp, bp);
1858 if (async)
1859 return 0;
1860
1861 /*
1862 * must be sync i/o. wait for it to finish
1863 */
1864
1865 error = biowait(bp);
1866
1867 /*
1868 * kill the pager mapping
1869 */
1870
1871 uvm_pagermapout(kva, npages);
1872
1873 /*
1874 * now dispose of the buf and we're done.
1875 */
1876
1877 if (write) {
1878 mutex_enter(swapdev_vp->v_interlock);
1879 vwakeup(bp);
1880 mutex_exit(swapdev_vp->v_interlock);
1881 }
1882 putiobuf(bp);
1883 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
1884
1885 return (error);
1886 }
1887