1 /*	$NetBSD: uvm_swap.c,v 1.161.6.1 2013/02/25 00:30:19 tls Exp $	*/
2
3 /*
4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.161.6.1 2013/02/25 00:30:19 tls Exp $");
34
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/buf.h>
42 #include <sys/bufq.h>
43 #include <sys/conf.h>
44 #include <sys/proc.h>
45 #include <sys/namei.h>
46 #include <sys/disklabel.h>
47 #include <sys/errno.h>
48 #include <sys/kernel.h>
49 #include <sys/vnode.h>
50 #include <sys/file.h>
51 #include <sys/vmem.h>
52 #include <sys/blist.h>
53 #include <sys/mount.h>
54 #include <sys/pool.h>
55 #include <sys/kmem.h>
56 #include <sys/syscallargs.h>
57 #include <sys/swap.h>
58 #include <sys/kauth.h>
59 #include <sys/sysctl.h>
60 #include <sys/workqueue.h>
61
62 #include <uvm/uvm.h>
63
64 #include <miscfs/specfs/specdev.h>
65
66 /*
67 * uvm_swap.c: manage configuration and i/o to swap space.
68 */
69
70 /*
71 * swap space is managed in the following way:
72 *
73 * each swap partition or file is described by a "swapdev" structure.
74 * each "swapdev" structure contains a "swapent" structure which contains
75 * information that is passed up to the user (via system calls).
76 *
77 * each swap partition is assigned a "priority" (int) which controls
78 * swap partition usage.
79 *
80 * the system maintains a global data structure describing all swap
81 * partitions/files. there is a sorted LIST of "swappri" structures
82 * which describe "swapdev"'s at that priority. this LIST is headed
83 * by the "swap_priority" global var. each "swappri" contains a
84 * CIRCLEQ of "swapdev" structures at that priority.
85 *
86 * locking:
87 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
88 * system call and prevents the swap priority list from changing
89 * while we are in the middle of a system call (e.g. SWAP_STATS).
90 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
91 * structures including the priority list, the swapdev structures,
92 * and the swapmap arena.
93 *
94 * each swap device has the following info:
95 * - swap device in use (could be disabled, preventing future use)
96 * - swap enabled (allows new allocations on swap)
97 * - map info in /dev/drum
98 * - vnode pointer
99 * for swap files only:
100 * - block size
101 * - max byte count in buffer
102 * - buffer
103 *
104 * userland controls and configures swap with the swapctl(2) system call.
105 * the sys_swapctl performs the following operations:
106 * [1] SWAP_NSWAP: returns the number of swap devices currently configured
107 * [2] SWAP_STATS: given a pointer to an array of swapent structures
108 * (passed in via "arg") of a size passed in via "misc" ... we load
109 * the current swap config into the array. The actual work is done
110 * in the uvm_swap_stats() function.
111 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a
112 * priority in "misc", start swapping on it.
113 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
114 * [5] SWAP_CTL: changes the priority of a swap device (new priority in
115 * "misc")
116 */
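/*
 * Example (illustrative sketch, not part of this file): the two
 * non-privileged commands above, SWAP_NSWAP and SWAP_STATS, are all a
 * userland program needs to list the current swap configuration, much
 * as swapctl(8) -l does.  Error handling is omitted for brevity.
 *
 *	#include <unistd.h>
 *	#include <sys/swap.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct swapent *sep;
 *		int i, n;
 *
 *		n = swapctl(SWAP_NSWAP, NULL, 0);
 *		if (n <= 0)
 *			return 0;
 *		sep = calloc(n, sizeof(*sep));
 *		n = swapctl(SWAP_STATS, sep, n);
 *		for (i = 0; i < n; i++)
 *			printf("%s priority %d: %d of %d blocks in use\n",
 *			    sep[i].se_path, sep[i].se_priority,
 *			    sep[i].se_inuse, sep[i].se_nblks);
 *		free(sep);
 *		return 0;
 *	}
 */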
117
118 /*
119 * swapdev: describes a single swap partition/file
120 *
121 * note the following should be true:
122 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
123 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
124 */
125 struct swapdev {
126 dev_t swd_dev; /* device id */
127 int swd_flags; /* flags:inuse/enable/fake */
128 int swd_priority; /* our priority */
129 int swd_nblks; /* blocks in this device */
130 char *swd_path; /* saved pathname of device */
131 int swd_pathlen; /* length of pathname */
132 int swd_npages; /* #pages we can use */
133 int swd_npginuse; /* #pages in use */
134 int swd_npgbad; /* #pages bad */
135 int swd_drumoffset; /* page0 offset in drum */
136 int swd_drumsize; /* #pages in drum */
137 blist_t swd_blist; /* blist for this swapdev */
138 struct vnode *swd_vp; /* backing vnode */
139 CIRCLEQ_ENTRY(swapdev) swd_next; /* priority circleq */
140
141 int swd_bsize; /* blocksize (bytes) */
142 int swd_maxactive; /* max active i/o reqs */
143 struct bufq_state *swd_tab; /* buffer list */
144 int swd_active; /* number of active buffers */
145 };
146
147 /*
148 * swap device priority entry; the list is kept sorted on `spi_priority'.
149 */
150 struct swappri {
151 int spi_priority; /* priority */
152 CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
153 /* circleq of swapdevs at this priority */
154 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
155 };
156
157 /*
158 * The following two structures are used to keep track of data transfers
159 * on swap devices associated with regular files.
160 * NOTE: this code is more or less a copy of vnd.c; we use the same
161 * structure names here to ease porting..
162 */
163 struct vndxfer {
164 struct buf *vx_bp; /* Pointer to parent buffer */
165 struct swapdev *vx_sdp;
166 int vx_error;
167 int vx_pending; /* # of pending aux buffers */
168 int vx_flags;
169 #define VX_BUSY 1
170 #define VX_DEAD 2
171 };
172
173 struct vndbuf {
174 struct buf vb_buf;
175 struct vndxfer *vb_xfer;
176 };
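/*
 * Lifecycle sketch (illustrative summary of sw_reg_strategy() and
 * sw_reg_iodone() below): one vndxfer is allocated per request on
 * /dev/drum and one vndbuf per filesystem-block-sized chunk of that
 * request.  Completion of the parent buffer is driven by simple
 * reference counting:
 *
 *	vx_pending++        for every child vndbuf issued
 *	vx_pending--        as each child's i/o completes
 *	biodone(parent)     once vx_pending reaches 0 and VX_BUSY
 *	                    (the "still issuing children" flag) is clear
 */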
177
178 /*
179 * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
180 * dev_t and has no se_path[] member.
181 */
182 struct swapent13 {
183 int32_t se13_dev; /* device id */
184 int se13_flags; /* flags */
185 int se13_nblks; /* total blocks */
186 int se13_inuse; /* blocks in use */
187 int se13_priority; /* priority of this device */
188 };
189
190 /*
191 * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
192 * dev_t.
193 */
194 struct swapent50 {
195 int32_t se50_dev; /* device id */
196 int se50_flags; /* flags */
197 int se50_nblks; /* total blocks */
198 int se50_inuse; /* blocks in use */
199 int se50_priority; /* priority of this device */
200 char se50_path[PATH_MAX+1]; /* path name */
201 };
202
203 /*
204 * We keep a pool of vndbuf's and vndxfer structures.
205 */
206 static struct pool vndxfer_pool, vndbuf_pool;
207
208 /*
209 * local variables
210 */
211 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
212
213 /* list of all active swap devices [by priority] */
214 LIST_HEAD(swap_priority, swappri);
215 static struct swap_priority swap_priority;
216
217 /* locks */
218 static krwlock_t swap_syscall_lock;
219
220 /* workqueue and use counter for swap to regular files */
221 static int sw_reg_count = 0;
222 static struct workqueue *sw_reg_workqueue;
223
224 /* tuneables */
225 u_int uvm_swapisfull_factor = 99;
226
227 /*
228 * prototypes
229 */
230 static struct swapdev *swapdrum_getsdp(int);
231
232 static struct swapdev *swaplist_find(struct vnode *, bool);
233 static void swaplist_insert(struct swapdev *,
234 struct swappri *, int);
235 static void swaplist_trim(void);
236
237 static int swap_on(struct lwp *, struct swapdev *);
238 static int swap_off(struct lwp *, struct swapdev *);
239
240 static void uvm_swap_stats(int, struct swapent *, int, register_t *);
241
242 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
243 static void sw_reg_biodone(struct buf *);
244 static void sw_reg_iodone(struct work *wk, void *dummy);
245 static void sw_reg_start(struct swapdev *);
246
247 static int uvm_swap_io(struct vm_page **, int, int, int);
248
249 /*
250 * uvm_swap_init: init the swap system data structures and locks
251 *
252 * => called at boot time from init_main.c after the filesystems
253 * are brought up (which happens after uvm_init())
254 */
255 void
256 uvm_swap_init(void)
257 {
258 UVMHIST_FUNC("uvm_swap_init");
259
260 UVMHIST_CALLED(pdhist);
261 /*
262 * first, init the swap list, its counter, and its lock.
263 * then get a handle on the vnode for /dev/drum by using
264 * its dev_t number ("swapdev", from MD conf.c).
265 */
266
267 LIST_INIT(&swap_priority);
268 uvmexp.nswapdev = 0;
269 rw_init(&swap_syscall_lock);
270 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
271
272 if (bdevvp(swapdev, &swapdev_vp))
273 panic("%s: can't get vnode for swap device", __func__);
274 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
275 panic("%s: can't lock swap device", __func__);
276 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
277 panic("%s: can't open swap device", __func__);
278 VOP_UNLOCK(swapdev_vp);
279
280 /*
281 * create swap block resource map to map /dev/drum. the range
282 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
283 * that block 0 is reserved (used to indicate an allocation
284 * failure, or no allocation).
285 */
286 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
287 VM_NOSLEEP, IPL_NONE);
288 if (swapmap == 0) {
289 panic("%s: vmem_create failed", __func__);
290 }
291
292 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
293 NULL, IPL_BIO);
294 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
295 NULL, IPL_BIO);
296
297 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
298 }
299
300 /*
301 * swaplist functions: functions that operate on the list of swap
302 * devices on the system.
303 */
304
305 /*
306 * swaplist_insert: insert swap device "sdp" into the global list
307 *
308 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
309 * => caller must provide a newly allocated swappri structure (we will
310 * FREE it if we don't need it... this is to prevent allocation
311 * blocking here while adding swap)
312 */
313 static void
314 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
315 {
316 struct swappri *spp, *pspp;
317 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
318
319 /*
320 * find entry at or after which to insert the new device.
321 */
322 pspp = NULL;
323 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
324 if (priority <= spp->spi_priority)
325 break;
326 pspp = spp;
327 }
328
329 /*
330 * new priority?
331 */
332 if (spp == NULL || spp->spi_priority != priority) {
333 spp = newspp; /* use newspp! */
334 UVMHIST_LOG(pdhist, "created new swappri = %d",
335 priority, 0, 0, 0);
336
337 spp->spi_priority = priority;
338 CIRCLEQ_INIT(&spp->spi_swapdev);
339
340 if (pspp)
341 LIST_INSERT_AFTER(pspp, spp, spi_swappri);
342 else
343 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
344 } else {
345 /* we don't need a new priority structure, free it */
346 kmem_free(newspp, sizeof(*newspp));
347 }
348
349 /*
350 * priority found (or created). now insert on the priority's
351 * circleq list and bump the total number of swapdevs.
352 */
353 sdp->swd_priority = priority;
354 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
355 uvmexp.nswapdev++;
356 }
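/*
 * Example of the resulting ordering (illustrative, device names are
 * made up): enabling /dev/wd0b at priority 0, /dev/wd1b at priority 0
 * and then a swap file at priority 1 yields
 *
 *	swap_priority -> [pri 0: wd0b, wd1b] -> [pri 1: swapfile]
 *
 * i.e. the swappri LIST is kept sorted by increasing priority and
 * devices of equal priority sit on that priority's CIRCLEQ in the
 * order they were added (CIRCLEQ_INSERT_TAIL above).  uvm_swap_alloc()
 * walks the structure in exactly this order.
 */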
357
358 /*
359 * swaplist_find: find and optionally remove a swap device from the
360 * global list.
361 *
362 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
363 * => we return the swapdev we found (and removed)
364 */
365 static struct swapdev *
366 swaplist_find(struct vnode *vp, bool remove)
367 {
368 struct swapdev *sdp;
369 struct swappri *spp;
370
371 /*
372 * search the lists for the requested vp
373 */
374
375 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
376 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
377 if (sdp->swd_vp == vp) {
378 if (remove) {
379 CIRCLEQ_REMOVE(&spp->spi_swapdev,
380 sdp, swd_next);
381 uvmexp.nswapdev--;
382 }
383 return(sdp);
384 }
385 }
386 }
387 return (NULL);
388 }
389
390 /*
391 * swaplist_trim: scan priority list for empty priority entries and kill
392 * them.
393 *
394 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
395 */
396 static void
397 swaplist_trim(void)
398 {
399 struct swappri *spp, *nextspp;
400
401 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
402 if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
403 (void *)&spp->spi_swapdev)
404 continue;
405 LIST_REMOVE(spp, spi_swappri);
406 kmem_free(spp, sizeof(*spp));
407 }
408 }
409
410 /*
411 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
412 * to the "swapdev" that maps that section of the drum.
413 *
414 * => each swapdev takes one big contig chunk of the drum
415 * => caller must hold uvm_swap_data_lock
416 */
417 static struct swapdev *
418 swapdrum_getsdp(int pgno)
419 {
420 struct swapdev *sdp;
421 struct swappri *spp;
422
423 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
424 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
425 if (sdp->swd_flags & SWF_FAKE)
426 continue;
427 if (pgno >= sdp->swd_drumoffset &&
428 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
429 return sdp;
430 }
431 }
432 }
433 return NULL;
434 }
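/*
 * Worked example (illustrative; the offsets are assigned by the
 * swapmap vmem arena and need not be contiguous): with two configured
 * swapdevs of 1024 and 2048 drum pages, the drum might look like
 *
 *	swapdev A: swd_drumoffset = 1,    swd_drumsize = 1024
 *	swapdev B: swd_drumoffset = 1025, swd_drumsize = 2048
 *
 * so drum page 1500 belongs to B, and swstrategy() below turns it into
 * device block btodb((1500 - 1025) << PAGE_SHIFT) on B's vnode.  Drum
 * page 0 is never handed out (see uvm_swap_init()).
 */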
435
436
437 /*
438 * sys_swapctl: main entry point for swapctl(2) system call
439 * [with two helper functions: swap_on and swap_off]
440 */
441 int
442 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
443 {
444 /* {
445 syscallarg(int) cmd;
446 syscallarg(void *) arg;
447 syscallarg(int) misc;
448 } */
449 struct vnode *vp;
450 struct nameidata nd;
451 struct swappri *spp;
452 struct swapdev *sdp;
453 struct swapent *sep;
454 #define SWAP_PATH_MAX (PATH_MAX + 1)
455 char *userpath;
456 size_t len = 0;
457 int error, misc;
458 int priority;
459 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
460
461 /*
462 * we handle the non-priv NSWAP and STATS requests first.
463 *
464 * SWAP_NSWAP: return number of config'd swap devices
465 * [can also be obtained with uvmexp sysctl]
466 */
467 if (SCARG(uap, cmd) == SWAP_NSWAP) {
468 const int nswapdev = uvmexp.nswapdev;
469 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", nswapdev, 0, 0, 0);
470 *retval = nswapdev;
471 return 0;
472 }
473
474 misc = SCARG(uap, misc);
475 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
476
477 /*
478 * ensure serialized syscall access by grabbing the swap_syscall_lock
479 */
480 rw_enter(&swap_syscall_lock, RW_WRITER);
481
482 /*
483 * SWAP_STATS: get stats on current # of configured swap devs
484 *
485 * note that the swap_priority list can't change as long
486 * as we are holding the swap_syscall_lock. we don't want
487 * to grab the uvm_swap_data_lock because we may fault&sleep during
488 * copyout() and we don't want to be holding that lock then!
489 */
490 if (SCARG(uap, cmd) == SWAP_STATS
491 #if defined(COMPAT_50)
492 || SCARG(uap, cmd) == SWAP_STATS50
493 #endif
494 #if defined(COMPAT_13)
495 || SCARG(uap, cmd) == SWAP_STATS13
496 #endif
497 ) {
498 if ((size_t)misc > (size_t)uvmexp.nswapdev)
499 misc = uvmexp.nswapdev;
500
501 if (misc == 0) {
502 error = EINVAL;
503 goto out;
504 }
505 KASSERT(misc > 0);
506 #if defined(COMPAT_13)
507 if (SCARG(uap, cmd) == SWAP_STATS13)
508 len = sizeof(struct swapent13) * misc;
509 else
510 #endif
511 #if defined(COMPAT_50)
512 if (SCARG(uap, cmd) == SWAP_STATS50)
513 len = sizeof(struct swapent50) * misc;
514 else
515 #endif
516 len = sizeof(struct swapent) * misc;
517 sep = (struct swapent *)kmem_alloc(len, KM_SLEEP);
518
519 uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval);
520 error = copyout(sep, SCARG(uap, arg), len);
521
522 kmem_free(sep, len);
523 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
524 goto out;
525 }
526 if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
527 dev_t *devp = (dev_t *)SCARG(uap, arg);
528
529 error = copyout(&dumpdev, devp, sizeof(dumpdev));
530 goto out;
531 }
532
533 /*
534 * all other requests require superuser privs. verify.
535 */
536 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
537 0, NULL, NULL, NULL)))
538 goto out;
539
540 if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
541 /* drop the current dump device */
542 dumpdev = NODEV;
543 dumpcdev = NODEV;
544 cpu_dumpconf();
545 goto out;
546 }
547
548 /*
549 * at this point we expect a path name in arg. we will
550 * use namei() to gain a vnode reference (vref), and lock
551 * the vnode (VOP_LOCK).
552 *
553 * XXX: a NULL arg means use the root vnode pointer (e.g. for
554 * miniroot)
555 */
556 if (SCARG(uap, arg) == NULL) {
557 vp = rootvp; /* miniroot */
558 vref(vp);
559 if (vn_lock(vp, LK_EXCLUSIVE)) {
560 vrele(vp);
561 error = EBUSY;
562 goto out;
563 }
564 if (SCARG(uap, cmd) == SWAP_ON &&
565 copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
566 panic("swapctl: miniroot copy failed");
567 KASSERT(len > 0);
568 } else {
569 struct pathbuf *pb;
570
571 /*
572 * This used to allow copying in one extra byte
573 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
574 * This was completely pointless because if anyone
575 * used that extra byte namei would fail with
576 * ENAMETOOLONG anyway, so I've removed the excess
577 * logic. - dholland 20100215
578 */
579
580 error = pathbuf_copyin(SCARG(uap, arg), &pb);
581 if (error) {
582 goto out;
583 }
584 if (SCARG(uap, cmd) == SWAP_ON) {
585 /* get a copy of the string */
586 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
587 len = strlen(userpath) + 1;
588 }
589 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
590 if ((error = namei(&nd))) {
591 pathbuf_destroy(pb);
592 goto out;
593 }
594 vp = nd.ni_vp;
595 pathbuf_destroy(pb);
596 }
597 /* note: "vp" is referenced and locked */
598
599 error = 0; /* assume no error */
600 switch(SCARG(uap, cmd)) {
601
602 case SWAP_DUMPDEV:
603 if (vp->v_type != VBLK) {
604 error = ENOTBLK;
605 break;
606 }
607 if (bdevsw_lookup(vp->v_rdev)) {
608 dumpdev = vp->v_rdev;
609 dumpcdev = devsw_blk2chr(dumpdev);
610 } else
611 dumpdev = NODEV;
612 cpu_dumpconf();
613 break;
614
615 case SWAP_CTL:
616 /*
617 * get new priority, remove old entry (if any) and then
618 * reinsert it in the correct place. finally, prune out
619 * any empty priority structures.
620 */
621 priority = SCARG(uap, misc);
622 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
623 mutex_enter(&uvm_swap_data_lock);
624 if ((sdp = swaplist_find(vp, true)) == NULL) {
625 error = ENOENT;
626 } else {
627 swaplist_insert(sdp, spp, priority);
628 swaplist_trim();
629 }
630 mutex_exit(&uvm_swap_data_lock);
631 if (error)
632 kmem_free(spp, sizeof(*spp));
633 break;
634
635 case SWAP_ON:
636
637 /*
638 * check for duplicates. if none found, then insert a
639 * dummy entry on the list to prevent someone else from
640 * trying to enable this device while we are working on
641 * it.
642 */
643
644 priority = SCARG(uap, misc);
645 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
646 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
647 sdp->swd_flags = SWF_FAKE;
648 sdp->swd_vp = vp;
649 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
650 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
651 mutex_enter(&uvm_swap_data_lock);
652 if (swaplist_find(vp, false) != NULL) {
653 error = EBUSY;
654 mutex_exit(&uvm_swap_data_lock);
655 bufq_free(sdp->swd_tab);
656 kmem_free(sdp, sizeof(*sdp));
657 kmem_free(spp, sizeof(*spp));
658 break;
659 }
660 swaplist_insert(sdp, spp, priority);
661 mutex_exit(&uvm_swap_data_lock);
662
663 KASSERT(len > 0);
664 sdp->swd_pathlen = len;
665 sdp->swd_path = kmem_alloc(len, KM_SLEEP);
666 if (copystr(userpath, sdp->swd_path, len, 0) != 0)
667 panic("swapctl: copystr");
668
669 /*
670 * we've now got a FAKE placeholder in the swap list.
671 * now attempt to enable swap on it. if we fail, undo
672 * what we've done and kill the fake entry we just inserted.
673 * if swap_on is a success, it will clear the SWF_FAKE flag
674 */
675
676 if ((error = swap_on(l, sdp)) != 0) {
677 mutex_enter(&uvm_swap_data_lock);
678 (void) swaplist_find(vp, true); /* kill fake entry */
679 swaplist_trim();
680 mutex_exit(&uvm_swap_data_lock);
681 bufq_free(sdp->swd_tab);
682 kmem_free(sdp->swd_path, sdp->swd_pathlen);
683 kmem_free(sdp, sizeof(*sdp));
684 break;
685 }
686 break;
687
688 case SWAP_OFF:
689 mutex_enter(&uvm_swap_data_lock);
690 if ((sdp = swaplist_find(vp, false)) == NULL) {
691 mutex_exit(&uvm_swap_data_lock);
692 error = ENXIO;
693 break;
694 }
695
696 /*
697 * If a device isn't in use or enabled, we
698 * can't stop swapping from it (again).
699 */
700 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
701 mutex_exit(&uvm_swap_data_lock);
702 error = EBUSY;
703 break;
704 }
705
706 /*
707 * do the real work.
708 */
709 error = swap_off(l, sdp);
710 break;
711
712 default:
713 error = EINVAL;
714 }
715
716 /*
717 * done! release the ref gained by namei() and unlock.
718 */
719 vput(vp);
720 out:
721 rw_exit(&swap_syscall_lock);
722 kmem_free(userpath, SWAP_PATH_MAX);
723
724 UVMHIST_LOG(pdhist, "<- done! error=%d", error, 0, 0, 0);
725 return (error);
726 }
727
728 /*
729 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
730 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
731 * emulation to use it directly without going through sys_swapctl().
732 * The problem with using sys_swapctl() there is that it involves
733 * copying the swapent array to the stackgap, and this array's size
734 * is not known at build time. Hence it would not be possible to
735 * ensure it would fit in the stackgap in any case.
736 */
737 static void
738 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
739 {
740 struct swappri *spp;
741 struct swapdev *sdp;
742 int count = 0;
743
744 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
745 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
746 int inuse;
747
748 if (sec-- <= 0)
749 break;
750
751 /*
752 * backwards compatibility for system call.
753 * For NetBSD 1.3 and 5.0, we have to use
754 * the 32 bit dev_t. For 5.0 and -current
755 * we have to add the path.
756 */
757 inuse = btodb((uint64_t)sdp->swd_npginuse <<
758 PAGE_SHIFT);
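/*
 * e.g. (assuming 4KB pages) swd_npginuse = 100 pages is
 * reported as btodb(100 << PAGE_SHIFT) = 800 DEV_BSIZE
 * blocks, matching the units of se_nblks/se_inuse.
 */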
759
760 #if defined(COMPAT_13) || defined(COMPAT_50)
761 if (cmd == SWAP_STATS) {
762 #endif
763 sep->se_dev = sdp->swd_dev;
764 sep->se_flags = sdp->swd_flags;
765 sep->se_nblks = sdp->swd_nblks;
766 sep->se_inuse = inuse;
767 sep->se_priority = sdp->swd_priority;
768 KASSERT(sdp->swd_pathlen <
769 sizeof(sep->se_path));
770 strcpy(sep->se_path, sdp->swd_path);
771 sep++;
772 #if defined(COMPAT_13)
773 } else if (cmd == SWAP_STATS13) {
774 struct swapent13 *sep13 =
775 (struct swapent13 *)sep;
776
777 sep13->se13_dev = sdp->swd_dev;
778 sep13->se13_flags = sdp->swd_flags;
779 sep13->se13_nblks = sdp->swd_nblks;
780 sep13->se13_inuse = inuse;
781 sep13->se13_priority = sdp->swd_priority;
782 sep = (struct swapent *)(sep13 + 1);
783 #endif
784 #if defined(COMPAT_50)
785 } else if (cmd == SWAP_STATS50) {
786 struct swapent50 *sep50 =
787 (struct swapent50 *)sep;
788
789 sep50->se50_dev = sdp->swd_dev;
790 sep50->se50_flags = sdp->swd_flags;
791 sep50->se50_nblks = sdp->swd_nblks;
792 sep50->se50_inuse = inuse;
793 sep50->se50_priority = sdp->swd_priority;
794 KASSERT(sdp->swd_pathlen <
795 sizeof(sep50->se50_path));
796 strcpy(sep50->se50_path, sdp->swd_path);
797 sep = (struct swapent *)(sep50 + 1);
798 #endif
799 #if defined(COMPAT_13) || defined(COMPAT_50)
800 }
801 #endif
802 count++;
803 }
804 }
805 *retval = count;
806 }
807
808 /*
809 * swap_on: attempt to enable a swapdev for swapping. note that the
810 * swapdev is already on the global list, but disabled (marked
811 * SWF_FAKE).
812 *
813 * => we avoid the start of the disk (to protect disk labels)
814 * => we also avoid the miniroot, if we are swapping to root.
815 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
816 * if needed.
817 */
818 static int
819 swap_on(struct lwp *l, struct swapdev *sdp)
820 {
821 struct vnode *vp;
822 int error, npages, nblocks, size;
823 long addr;
824 vmem_addr_t result;
825 struct vattr va;
826 dev_t dev;
827 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
828
829 /*
830 * we want to enable swapping on sdp. the swd_vp contains
831 * the vnode we want (locked and ref'd), and the swd_dev
832 * contains the dev_t of the file, if it is a block device.
833 */
834
835 vp = sdp->swd_vp;
836 dev = sdp->swd_dev;
837
838 /*
839 * open the swap file (mostly useful for block device files to
840 * let the device driver know what is up).
841 *
842 * we skip the open/close for root on swap because the root
843 * has already been opened when root was mounted (mountroot).
844 */
845 if (vp != rootvp) {
846 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
847 return (error);
848 }
849
850 /* XXX this only works for block devices */
851 UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0);
852
853 /*
854 * we now need to determine the size of the swap area. for
855 * block specials we can call the d_psize function.
856 * for normal files, we must stat [get attrs].
857 *
858 * we put the result in nblks.
859 * for normal files, we also want the filesystem block size
860 * (which we get with statfs).
861 */
862 switch (vp->v_type) {
863 case VBLK:
864 if ((nblocks = bdev_size(dev)) == -1) {
865 error = ENXIO;
866 goto bad;
867 }
868 break;
869
870 case VREG:
871 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
872 goto bad;
873 nblocks = (int)btodb(va.va_size);
874 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
875 /*
876 * limit the max # of outstanding I/O requests we issue
877 * at any one time. take it easy on NFS servers.
878 */
879 if (vp->v_tag == VT_NFS)
880 sdp->swd_maxactive = 2; /* XXX */
881 else
882 sdp->swd_maxactive = 8; /* XXX */
883 break;
884
885 default:
886 error = ENXIO;
887 goto bad;
888 }
889
890 /*
891 * save nblocks in a safe place and convert to pages.
892 */
893
894 sdp->swd_nblks = nblocks;
895 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
896
897 /*
898 * for block special files, we want to make sure that we leave
899 * the disklabel and bootblocks alone, so we arrange to skip
900 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
901 * note that because of this the "size" can be less than the
902 * actual number of blocks on the device.
903 */
904 if (vp->v_type == VBLK) {
905 /* we use pages 1 to (size - 1) [inclusive] */
906 size = npages - 1;
907 addr = 1;
908 } else {
909 /* we use pages 0 to (size - 1) [inclusive] */
910 size = npages;
911 addr = 0;
912 }
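/*
 * Worked example (illustrative, assuming 4KB pages): a 1GB block
 * device reports nblocks = 2097152 DEV_BSIZE blocks, which converts
 * to npages = dbtob(2097152) >> PAGE_SHIFT = 262144 pages.  Being a
 * VBLK, page 0 is held back to protect the disklabel, so the usable
 * region is pages 1..262143 (addr = 1, size = 262143).
 */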
913
914 /*
915 * make sure we have enough blocks for a reasonable sized swap
916 * area. we want at least one page.
917 */
918
919 if (size < 1) {
920 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
921 error = EINVAL;
922 goto bad;
923 }
924
925 UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
926
927 /*
928 * now we need to allocate a blist to manage this swap device
929 */
930
931 sdp->swd_blist = blist_create(npages);
932 /* mark all except the `saved' region free. */
933 blist_free(sdp->swd_blist, addr, size);
934
935 /*
936 * if the vnode we are swapping to is the root vnode
937 * (i.e. we are swapping to the miniroot) then we want
938 * to make sure we don't overwrite it. do a statfs to
939 * find its size and skip over it.
940 */
941 if (vp == rootvp) {
942 struct mount *mp;
943 struct statvfs *sp;
944 int rootblocks, rootpages;
945
946 mp = rootvnode->v_mount;
947 sp = &mp->mnt_stat;
948 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
949 /*
950 * XXX: sp->f_blocks isn't the total number of
951 * blocks in the filesystem, it's the number of
952 * data blocks. so, our rootblocks almost
953 * definitely underestimates the total size
954 * of the filesystem - how badly depends on the
955 * details of the filesystem type. there isn't
956 * an obvious way to deal with this cleanly
957 * and perfectly, so for now we just pad our
958 * rootblocks estimate with an extra 5 percent.
959 */
960 rootblocks += (rootblocks >> 5) +
961 (rootblocks >> 6) +
962 (rootblocks >> 7);
963 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
964 if (rootpages > size)
965 panic("swap_on: miniroot larger than swap?");
966
967 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
968 panic("swap_on: unable to preserve miniroot");
969 }
970
971 size -= rootpages;
972 printf("Preserved %d pages of miniroot ", rootpages);
973 printf("leaving %d pages of swap\n", size);
974 }
975
976 /*
977 * add a ref to vp to reflect usage as a swap device.
978 */
979 vref(vp);
980
981 /*
982 * now add the new swapdev to the drum and enable.
983 */
984 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
985 if (error != 0)
986 panic("swapdrum_add");
987 /*
988 * If this is the first regular swap create the workqueue.
989 * => Protected by swap_syscall_lock.
990 */
991 if (vp->v_type != VBLK) {
992 if (sw_reg_count++ == 0) {
993 KASSERT(sw_reg_workqueue == NULL);
994 if (workqueue_create(&sw_reg_workqueue, "swapiod",
995 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
996 panic("%s: workqueue_create failed", __func__);
997 }
998 }
999
1000 sdp->swd_drumoffset = (int)result;
1001 sdp->swd_drumsize = npages;
1002 sdp->swd_npages = size;
1003 mutex_enter(&uvm_swap_data_lock);
1004 sdp->swd_flags &= ~SWF_FAKE; /* going live */
1005 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1006 uvmexp.swpages += size;
1007 uvmexp.swpgavail += size;
1008 mutex_exit(&uvm_swap_data_lock);
1009 return (0);
1010
1011 /*
1012 * failure: clean up and return error.
1013 */
1014
1015 bad:
1016 if (sdp->swd_blist) {
1017 blist_destroy(sdp->swd_blist);
1018 }
1019 if (vp != rootvp) {
1020 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1021 }
1022 return (error);
1023 }
1024
1025 /*
1026 * swap_off: stop swapping on swapdev
1027 *
1028 * => swap data should be locked, we will unlock.
1029 */
1030 static int
1031 swap_off(struct lwp *l, struct swapdev *sdp)
1032 {
1033 int npages = sdp->swd_npages;
1034 int error = 0;
1035
1036 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1037 UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,0,0);
1038
1039 /* disable the swap area being removed */
1040 sdp->swd_flags &= ~SWF_ENABLE;
1041 uvmexp.swpgavail -= npages;
1042 mutex_exit(&uvm_swap_data_lock);
1043
1044 /*
1045 * the idea is to find all the pages that are paged out to this
1046 * device, and page them all in. in uvm, swap-backed pageable
1047 * memory can take two forms: aobjs and anons. call the
1048 * swapoff hook for each subsystem to bring in pages.
1049 */
1050
1051 if (uao_swap_off(sdp->swd_drumoffset,
1052 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1053 amap_swap_off(sdp->swd_drumoffset,
1054 sdp->swd_drumoffset + sdp->swd_drumsize)) {
1055 error = ENOMEM;
1056 } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1057 error = EBUSY;
1058 }
1059
1060 if (error) {
1061 mutex_enter(&uvm_swap_data_lock);
1062 sdp->swd_flags |= SWF_ENABLE;
1063 uvmexp.swpgavail += npages;
1064 mutex_exit(&uvm_swap_data_lock);
1065
1066 return error;
1067 }
1068
1069 /*
1070 * If this is the last regular swap destroy the workqueue.
1071 * => Protected by swap_syscall_lock.
1072 */
1073 if (sdp->swd_vp->v_type != VBLK) {
1074 KASSERT(sw_reg_count > 0);
1075 KASSERT(sw_reg_workqueue != NULL);
1076 if (--sw_reg_count == 0) {
1077 workqueue_destroy(sw_reg_workqueue);
1078 sw_reg_workqueue = NULL;
1079 }
1080 }
1081
1082 /*
1083 * done with the vnode.
1084 * drop our ref on the vnode before calling VOP_CLOSE()
1085 * so that spec_close() can tell if this is the last close.
1086 */
1087 vrele(sdp->swd_vp);
1088 if (sdp->swd_vp != rootvp) {
1089 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1090 }
1091
1092 mutex_enter(&uvm_swap_data_lock);
1093 uvmexp.swpages -= npages;
1094 uvmexp.swpginuse -= sdp->swd_npgbad;
1095
1096 if (swaplist_find(sdp->swd_vp, true) == NULL)
1097 panic("%s: swapdev not in list", __func__);
1098 swaplist_trim();
1099 mutex_exit(&uvm_swap_data_lock);
1100
1101 /*
1102 * free all resources!
1103 */
1104 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1105 blist_destroy(sdp->swd_blist);
1106 bufq_free(sdp->swd_tab);
1107 kmem_free(sdp, sizeof(*sdp));
1108 return (0);
1109 }
1110
1111 /*
1112 * /dev/drum interface and i/o functions
1113 */
1114
1115 /*
1116 * swstrategy: perform I/O on the drum
1117 *
1118 * => we must map the i/o request from the drum to the correct swapdev.
1119 */
1120 static void
1121 swstrategy(struct buf *bp)
1122 {
1123 struct swapdev *sdp;
1124 struct vnode *vp;
1125 int pageno, bn;
1126 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1127
1128 /*
1129 * convert block number to swapdev. note that swapdev can't
1130 * be yanked out from under us because we are holding resources
1131 * in it (i.e. the blocks we are doing I/O on).
1132 */
1133 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1134 mutex_enter(&uvm_swap_data_lock);
1135 sdp = swapdrum_getsdp(pageno);
1136 mutex_exit(&uvm_swap_data_lock);
1137 if (sdp == NULL) {
1138 bp->b_error = EINVAL;
1139 biodone(bp);
1140 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
1141 return;
1142 }
1143
1144 /*
1145 * convert drum page number to block number on this swapdev.
1146 */
1147
1148 pageno -= sdp->swd_drumoffset; /* page # on swapdev */
1149 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1150
1151 UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld",
1152 ((bp->b_flags & B_READ) == 0) ? "write" : "read",
1153 sdp->swd_drumoffset, bn, bp->b_bcount);
1154
1155 /*
1156 * for block devices we finish up here.
1157 * for regular files we have to do more work which we delegate
1158 * to sw_reg_strategy().
1159 */
1160
1161 vp = sdp->swd_vp; /* swapdev vnode pointer */
1162 switch (vp->v_type) {
1163 default:
1164 panic("%s: vnode type 0x%x", __func__, vp->v_type);
1165
1166 case VBLK:
1167
1168 /*
1169 * must convert "bp" from an I/O on /dev/drum to an I/O
1170 * on the swapdev (sdp).
1171 */
1172 bp->b_blkno = bn; /* swapdev block number */
1173 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
1174
1175 /*
1176 * if we are doing a write, we have to redirect the i/o on
1177 * drum's v_numoutput counter to the swapdevs.
1178 */
1179 if ((bp->b_flags & B_READ) == 0) {
1180 mutex_enter(bp->b_objlock);
1181 vwakeup(bp); /* kills one 'v_numoutput' on drum */
1182 mutex_exit(bp->b_objlock);
1183 mutex_enter(vp->v_interlock);
1184 vp->v_numoutput++; /* put it on swapdev */
1185 mutex_exit(vp->v_interlock);
1186 }
1187
1188 /*
1189 * finally plug in swapdev vnode and start I/O
1190 */
1191 bp->b_vp = vp;
1192 bp->b_objlock = vp->v_interlock;
1193 VOP_STRATEGY(vp, bp);
1194 return;
1195
1196 case VREG:
1197 /*
1198 * delegate to sw_reg_strategy function.
1199 */
1200 sw_reg_strategy(sdp, bp, bn);
1201 return;
1202 }
1203 /* NOTREACHED */
1204 }
1205
1206 /*
1207 * swread: the read function for the drum (just a call to physio)
1208 */
1209 /*ARGSUSED*/
1210 static int
1211 swread(dev_t dev, struct uio *uio, int ioflag)
1212 {
1213 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1214
1215 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1216 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1217 }
1218
1219 /*
1220 * swwrite: the write function for the drum (just a call to physio)
1221 */
1222 /*ARGSUSED*/
1223 static int
1224 swwrite(dev_t dev, struct uio *uio, int ioflag)
1225 {
1226 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1227
1228 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1229 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1230 }
1231
1232 const struct bdevsw swap_bdevsw = {
1233 nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
1234 };
1235
1236 const struct cdevsw swap_cdevsw = {
1237 nullopen, nullclose, swread, swwrite, noioctl,
1238 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
1239 };
1240
1241 /*
1242 * sw_reg_strategy: handle swap i/o to regular files
1243 */
1244 static void
1245 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1246 {
1247 struct vnode *vp;
1248 struct vndxfer *vnx;
1249 daddr_t nbn;
1250 char *addr;
1251 off_t byteoff;
1252 int s, off, nra, error, sz, resid;
1253 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1254
1255 /*
1256 * allocate a vndxfer head for this transfer and point it to
1257 * our buffer.
1258 */
1259 vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1260 vnx->vx_flags = VX_BUSY;
1261 vnx->vx_error = 0;
1262 vnx->vx_pending = 0;
1263 vnx->vx_bp = bp;
1264 vnx->vx_sdp = sdp;
1265
1266 /*
1267 * setup for main loop where we read filesystem blocks into
1268 * our buffer.
1269 */
1270 error = 0;
1271 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
1272 addr = bp->b_data; /* current position in buffer */
1273 byteoff = dbtob((uint64_t)bn);
1274
1275 for (resid = bp->b_resid; resid; resid -= sz) {
1276 struct vndbuf *nbp;
1277
1278 /*
1279 * translate byteoffset into block number. return values:
1280 * vp = vnode of underlying device
1281 * nbn = new block number (on underlying vnode dev)
1282 * nra = num blocks we can read-ahead (excludes requested
1283 * block)
1284 */
1285 nra = 0;
1286 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1287 &vp, &nbn, &nra);
1288
1289 if (error == 0 && nbn == (daddr_t)-1) {
1290 /*
1291 * this used to just set error, but that doesn't
1292 * do the right thing. Instead, it causes random
1293 * memory errors. The panic() should remain until
1294 * this condition doesn't destabilize the system.
1295 */
1296 #if 1
1297 panic("%s: swap to sparse file", __func__);
1298 #else
1299 error = EIO; /* failure */
1300 #endif
1301 }
1302
1303 /*
1304 * punt if there was an error or a hole in the file.
1305 * we must wait for any i/o ops we have already started
1306 * to finish before returning.
1307 *
1308 * XXX we could deal with holes here but it would be
1309 * a hassle (in the write case).
1310 */
1311 if (error) {
1312 s = splbio();
1313 vnx->vx_error = error; /* pass error up */
1314 goto out;
1315 }
1316
1317 /*
1318 * compute the size ("sz") of this transfer (in bytes).
1319 */
1320 off = byteoff % sdp->swd_bsize;
1321 sz = (1 + nra) * sdp->swd_bsize - off;
1322 if (sz > resid)
1323 sz = resid;
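/*
 * Worked example (illustrative): with swd_bsize = 8192 and
 * byteoff = 20480, off = 20480 % 8192 = 4096.  If VOP_BMAP
 * reported nra = 2 blocks of read-ahead, then
 * sz = (1 + 2) * 8192 - 4096 = 20480 bytes: the rest of the
 * current filesystem block plus the two contiguous blocks that
 * follow, clipped above to the bytes still outstanding (resid).
 */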
1324
1325 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1326 "vp %p/%p offset 0x%x/0x%x",
1327 sdp->swd_vp, vp, byteoff, nbn);
1328
1329 /*
1330 * now get a buf structure. note that the vb_buf is
1331 * at the front of the nbp structure so that you can
1332 * cast pointers between the two structures easily.
1333 */
1334 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1335 buf_init(&nbp->vb_buf);
1336 nbp->vb_buf.b_flags = bp->b_flags;
1337 nbp->vb_buf.b_cflags = bp->b_cflags;
1338 nbp->vb_buf.b_oflags = bp->b_oflags;
1339 nbp->vb_buf.b_bcount = sz;
1340 nbp->vb_buf.b_bufsize = sz;
1341 nbp->vb_buf.b_error = 0;
1342 nbp->vb_buf.b_data = addr;
1343 nbp->vb_buf.b_lblkno = 0;
1344 nbp->vb_buf.b_blkno = nbn + btodb(off);
1345 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1346 nbp->vb_buf.b_iodone = sw_reg_biodone;
1347 nbp->vb_buf.b_vp = vp;
1348 nbp->vb_buf.b_objlock = vp->v_interlock;
1349 if (vp->v_type == VBLK) {
1350 nbp->vb_buf.b_dev = vp->v_rdev;
1351 }
1352
1353 nbp->vb_xfer = vnx; /* patch it back in to vnx */
1354
1355 /*
1356 * Just sort by block number
1357 */
1358 s = splbio();
1359 if (vnx->vx_error != 0) {
1360 buf_destroy(&nbp->vb_buf);
1361 pool_put(&vndbuf_pool, nbp);
1362 goto out;
1363 }
1364 vnx->vx_pending++;
1365
1366 /* sort it in and start I/O if we are not over our limit */
1367 /* XXXAD locking */
1368 bufq_put(sdp->swd_tab, &nbp->vb_buf);
1369 sw_reg_start(sdp);
1370 splx(s);
1371
1372 /*
1373 * advance to the next I/O
1374 */
1375 byteoff += sz;
1376 addr += sz;
1377 }
1378
1379 s = splbio();
1380
1381 out: /* Arrive here at splbio */
1382 vnx->vx_flags &= ~VX_BUSY;
1383 if (vnx->vx_pending == 0) {
1384 error = vnx->vx_error;
1385 pool_put(&vndxfer_pool, vnx);
1386 bp->b_error = error;
1387 biodone(bp);
1388 }
1389 splx(s);
1390 }
1391
1392 /*
1393 * sw_reg_start: start an I/O request on the requested swapdev
1394 *
1395 * => reqs are sorted by b_rawblkno (above)
1396 */
1397 static void
1398 sw_reg_start(struct swapdev *sdp)
1399 {
1400 struct buf *bp;
1401 struct vnode *vp;
1402 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1403
1404 /* recursion control */
1405 if ((sdp->swd_flags & SWF_BUSY) != 0)
1406 return;
1407
1408 sdp->swd_flags |= SWF_BUSY;
1409
1410 while (sdp->swd_active < sdp->swd_maxactive) {
1411 bp = bufq_get(sdp->swd_tab);
1412 if (bp == NULL)
1413 break;
1414 sdp->swd_active++;
1415
1416 UVMHIST_LOG(pdhist,
1417 "sw_reg_start: bp %p vp %p blkno %p cnt %lx",
1418 bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1419 vp = bp->b_vp;
1420 KASSERT(bp->b_objlock == vp->v_interlock);
1421 if ((bp->b_flags & B_READ) == 0) {
1422 mutex_enter(vp->v_interlock);
1423 vp->v_numoutput++;
1424 mutex_exit(vp->v_interlock);
1425 }
1426 VOP_STRATEGY(vp, bp);
1427 }
1428 sdp->swd_flags &= ~SWF_BUSY;
1429 }
1430
1431 /*
1432 * sw_reg_biodone: one of our i/o's has completed
1433 */
1434 static void
1435 sw_reg_biodone(struct buf *bp)
1436 {
1437 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1438 }
1439
1440 /*
1441 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1442 *
1443 * => note that we can recover the vndbuf struct by casting the buf ptr
1444 */
1445 static void
1446 sw_reg_iodone(struct work *wk, void *dummy)
1447 {
1448 struct vndbuf *vbp = (void *)wk;
1449 struct vndxfer *vnx = vbp->vb_xfer;
1450 struct buf *pbp = vnx->vx_bp; /* parent buffer */
1451 struct swapdev *sdp = vnx->vx_sdp;
1452 int s, resid, error;
1453 KASSERT(&vbp->vb_buf.b_work == wk);
1454 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1455
1456 UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p",
1457 vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1458 UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx",
1459 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1460
1461 /*
1462 * protect vbp at splbio and update.
1463 */
1464
1465 s = splbio();
1466 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1467 pbp->b_resid -= resid;
1468 vnx->vx_pending--;
1469
1470 if (vbp->vb_buf.b_error != 0) {
1471 /* pass error upward */
1472 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1473 UVMHIST_LOG(pdhist, " got error=%d !", error, 0, 0, 0);
1474 vnx->vx_error = error;
1475 }
1476
1477 /*
1478 * kill vbp structure
1479 */
1480 buf_destroy(&vbp->vb_buf);
1481 pool_put(&vndbuf_pool, vbp);
1482
1483 /*
1484 * wrap up this transaction if it has run to completion or, in
1485 * case of an error, when all auxiliary buffers have returned.
1486 */
1487 if (vnx->vx_error != 0) {
1488 /* pass error upward */
1489 error = vnx->vx_error;
1490 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1491 pbp->b_error = error;
1492 biodone(pbp);
1493 pool_put(&vndxfer_pool, vnx);
1494 }
1495 } else if (pbp->b_resid == 0) {
1496 KASSERT(vnx->vx_pending == 0);
1497 if ((vnx->vx_flags & VX_BUSY) == 0) {
1498 UVMHIST_LOG(pdhist, " iodone error=%d !",
1499 pbp, vnx->vx_error, 0, 0);
1500 biodone(pbp);
1501 pool_put(&vndxfer_pool, vnx);
1502 }
1503 }
1504
1505 /*
1506 * done! start next swapdev I/O if one is pending
1507 */
1508 sdp->swd_active--;
1509 sw_reg_start(sdp);
1510 splx(s);
1511 }
1512
1513
1514 /*
1515 * uvm_swap_alloc: allocate space on swap
1516 *
1517 * => allocation is done "round robin" down the priority list, as we
1518 * allocate in a priority we "rotate" the circle queue.
1519 * => space can be freed with uvm_swap_free
1520 * => we return the page slot number in /dev/drum (0 == invalid slot)
1521 * => we lock uvm_swap_data_lock
1522 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1523 */
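/*
 * Illustrative caller sketch (not from the original source): a return
 * value of 0 means failure (drum slot 0 is reserved), and with
 * lessok = true the caller must re-read *nslots because the request
 * may have been trimmed.
 *
 *	nslots = want;
 *	slot = uvm_swap_alloc(&nslots, true);
 *	if (slot == 0)
 *		return ENOMEM;			(no swap space right now)
 *	... use slots [slot, slot + nslots) ...
 *	uvm_swap_free(slot, nslots);
 */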
1524 int
1525 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1526 {
1527 struct swapdev *sdp;
1528 struct swappri *spp;
1529 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1530
1531 /*
1532 * no swap devices configured yet? definite failure.
1533 */
1534 if (uvmexp.nswapdev < 1)
1535 return 0;
1536
1537 /*
1538 * XXXJAK: BEGIN HACK
1539 *
1540 * blist_alloc() in subr_blist.c will panic if we try to allocate
1541 * too many slots.
1542 */
1543 if (*nslots > BLIST_MAX_ALLOC) {
1544 if (__predict_false(lessok == false))
1545 return 0;
1546 *nslots = BLIST_MAX_ALLOC;
1547 }
1548 /* XXXJAK: END HACK */
1549
1550 /*
1551 * lock data lock, convert slots into blocks, and enter loop
1552 */
1553 mutex_enter(&uvm_swap_data_lock);
1554
1555 ReTry: /* XXXMRG */
1556 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1557 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1558 uint64_t result;
1559
1560 /* if it's not enabled, then we can't swap from it */
1561 if ((sdp->swd_flags & SWF_ENABLE) == 0)
1562 continue;
1563 if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1564 continue;
1565 result = blist_alloc(sdp->swd_blist, *nslots);
1566 if (result == BLIST_NONE) {
1567 continue;
1568 }
1569 KASSERT(result < sdp->swd_drumsize);
1570
1571 /*
1572 * successful allocation! now rotate the circleq.
1573 */
1574 CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1575 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1576 sdp->swd_npginuse += *nslots;
1577 uvmexp.swpginuse += *nslots;
1578 mutex_exit(&uvm_swap_data_lock);
1579 /* done! return drum slot number */
1580 UVMHIST_LOG(pdhist,
1581 "success! returning %d slots starting at %d",
1582 *nslots, result + sdp->swd_drumoffset, 0, 0);
1583 return (result + sdp->swd_drumoffset);
1584 }
1585 }
1586
1587 /* XXXMRG: BEGIN HACK */
1588 if (*nslots > 1 && lessok) {
1589 *nslots = 1;
1590 /* XXXMRG: ugh! blist should support this for us */
1591 goto ReTry;
1592 }
1593 /* XXXMRG: END HACK */
1594
1595 mutex_exit(&uvm_swap_data_lock);
1596 return 0;
1597 }
1598
1599 /*
1600 * uvm_swapisfull: return true if most of available swap is allocated
1601 * and in use. we don't count some small portion as it may be inaccessible
1602 * to us at any given moment, for example if there is lock contention or if
1603 * pages are busy.
1604 */
1605 bool
1606 uvm_swapisfull(void)
1607 {
1608 int swpgonly;
1609 bool rv;
1610
1611 mutex_enter(&uvm_swap_data_lock);
1612 KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1613 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1614 uvm_swapisfull_factor);
1615 rv = (swpgonly >= uvmexp.swpgavail);
1616 mutex_exit(&uvm_swap_data_lock);
1617
1618 return (rv);
1619 }
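/*
 * Worked example (illustrative): with uvm_swapisfull_factor = 99 and
 * swpgavail = 1000 pages, the scaled value swpgonly * 100 / 99 reaches
 * swpgavail once swpgonly >= 990, i.e. swap is reported "full" when
 * roughly 99% of the available slots hold the only copy of a page.
 */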
1620
1621 /*
1622 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1623 *
1624 * => we lock uvm_swap_data_lock
1625 */
1626 void
1627 uvm_swap_markbad(int startslot, int nslots)
1628 {
1629 struct swapdev *sdp;
1630 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1631
1632 mutex_enter(&uvm_swap_data_lock);
1633 sdp = swapdrum_getsdp(startslot);
1634 KASSERT(sdp != NULL);
1635
1636 /*
1637 * we just keep track of how many pages have been marked bad
1638 * in this device, to make everything add up in swap_off().
1639 * we assume here that the range of slots will all be within
1640 * one swap device.
1641 */
1642
1643 KASSERT(uvmexp.swpgonly >= nslots);
1644 uvmexp.swpgonly -= nslots;
1645 sdp->swd_npgbad += nslots;
1646 UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
1647 mutex_exit(&uvm_swap_data_lock);
1648 }
1649
1650 /*
1651 * uvm_swap_free: free swap slots
1652 *
1653 * => this can be all or part of an allocation made by uvm_swap_alloc
1654 * => we lock uvm_swap_data_lock
1655 */
1656 void
1657 uvm_swap_free(int startslot, int nslots)
1658 {
1659 struct swapdev *sdp;
1660 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1661
1662 UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1663 startslot, 0, 0);
1664
1665 /*
1666 * ignore attempts to free the "bad" slot.
1667 */
1668
1669 if (startslot == SWSLOT_BAD) {
1670 return;
1671 }
1672
1673 /*
1674 * convert drum slot offset back to sdp, free the blocks
1675 * in the blist, and return. must hold uvm_swap_data_lock to do
1676 * the lookup and access the blist.
1677 */
1678
1679 mutex_enter(&uvm_swap_data_lock);
1680 sdp = swapdrum_getsdp(startslot);
1681 KASSERT(uvmexp.nswapdev >= 1);
1682 KASSERT(sdp != NULL);
1683 KASSERT(sdp->swd_npginuse >= nslots);
1684 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1685 sdp->swd_npginuse -= nslots;
1686 uvmexp.swpginuse -= nslots;
1687 mutex_exit(&uvm_swap_data_lock);
1688 }
1689
1690 /*
1691 * uvm_swap_put: put any number of pages into a contig place on swap
1692 *
1693 * => can be sync or async
1694 */
1695
1696 int
1697 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1698 {
1699 int error;
1700
1701 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1702 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1703 return error;
1704 }
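/*
 * Illustrative pairing sketch (not from the original source): given a
 * drum slot previously obtained from uvm_swap_alloc(), a pager pushes
 * pages out here and reads them back one at a time on fault:
 *
 *	error = uvm_swap_put(slot, pps, npages, 0);		async put
 *	...
 *	error = uvm_swap_get(pg, slot, PGO_SYNCIO);		sync get
 *
 * An async put completes through the b_iodone handler that
 * uvm_swap_io() installs below (uvm_aio_biodone).
 */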
1705
1706 /*
1707 * uvm_swap_get: get a single page from swap
1708 *
1709 * => usually a sync op (from fault)
1710 */
1711
1712 int
1713 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1714 {
1715 int error;
1716
1717 uvmexp.nswget++;
1718 KASSERT(flags & PGO_SYNCIO);
1719 if (swslot == SWSLOT_BAD) {
1720 return EIO;
1721 }
1722
1723 error = uvm_swap_io(&page, swslot, 1, B_READ |
1724 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1725 if (error == 0) {
1726
1727 /*
1728 * this page is no longer only in swap.
1729 */
1730
1731 mutex_enter(&uvm_swap_data_lock);
1732 KASSERT(uvmexp.swpgonly > 0);
1733 uvmexp.swpgonly--;
1734 mutex_exit(&uvm_swap_data_lock);
1735 }
1736 return error;
1737 }
1738
1739 /*
1740 * uvm_swap_io: do an i/o operation to swap
1741 */
1742
1743 static int
1744 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1745 {
1746 daddr_t startblk;
1747 struct buf *bp;
1748 vaddr_t kva;
1749 int error, mapinflags;
1750 bool write, async;
1751 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1752
1753 UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1754 startslot, npages, flags, 0);
1755
1756 write = (flags & B_READ) == 0;
1757 async = (flags & B_ASYNC) != 0;
1758
1759 /*
1760 * allocate a buf for the i/o.
1761 */
1762
1763 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1764 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1765 if (bp == NULL) {
1766 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1767 return ENOMEM;
1768 }
1769
1770 /*
1771 * convert starting drum slot to block number
1772 */
1773
1774 startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1775
1776 /*
1777 * first, map the pages into the kernel.
1778 */
1779
1780 mapinflags = !write ?
1781 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1782 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1783 kva = uvm_pagermapin(pps, npages, mapinflags);
1784
1785 /*
1786 * fill in the bp. we currently route our i/o through
1787 * /dev/drum's vnode [swapdev_vp].
1788 */
1789
1790 bp->b_cflags = BC_BUSY | BC_NOCACHE;
1791 bp->b_flags = (flags & (B_READ|B_ASYNC));
1792 bp->b_proc = &proc0; /* XXX */
1793 bp->b_vnbufs.le_next = NOLIST;
1794 bp->b_data = (void *)kva;
1795 bp->b_blkno = startblk;
1796 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1797
1798 /*
1799 * bump v_numoutput (counter of number of active outputs).
1800 */
1801
1802 if (write) {
1803 mutex_enter(swapdev_vp->v_interlock);
1804 swapdev_vp->v_numoutput++;
1805 mutex_exit(swapdev_vp->v_interlock);
1806 }
1807
1808 /*
1809 * for async ops we must set up the iodone handler.
1810 */
1811
1812 if (async) {
1813 bp->b_iodone = uvm_aio_biodone;
1814 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1815 if (curlwp == uvm.pagedaemon_lwp)
1816 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1817 else
1818 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1819 } else {
1820 bp->b_iodone = NULL;
1821 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1822 }
1823 UVMHIST_LOG(pdhist,
1824 "about to start io: data = %p blkno = 0x%x, bcount = %ld",
1825 bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1826
1827 /*
1828 * now we start the I/O, and if async, return.
1829 */
1830
1831 VOP_STRATEGY(swapdev_vp, bp);
1832 if (async)
1833 return 0;
1834
1835 /*
1836 * must be sync i/o. wait for it to finish
1837 */
1838
1839 error = biowait(bp);
1840
1841 /*
1842 * kill the pager mapping
1843 */
1844
1845 uvm_pagermapout(kva, npages);
1846
1847 /*
1848 * now dispose of the buf and we're done.
1849 */
1850
1851 if (write) {
1852 mutex_enter(swapdev_vp->v_interlock);
1853 vwakeup(bp);
1854 mutex_exit(swapdev_vp->v_interlock);
1855 }
1856 putiobuf(bp);
1857 UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, 0, 0, 0);
1858
1859 return (error);
1860 }
1861