1 /*	$NetBSD: uvm_swap.c,v 1.137.6.4 2008/10/10 22:37:10 skrll Exp $	*/
2
3 /*
4 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.137.6.4 2008/10/10 22:37:10 skrll Exp $");
34
35 #include "fs_nfs.h"
36 #include "opt_uvmhist.h"
37 #include "opt_compat_netbsd.h"
38 #include "opt_ddb.h"
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/buf.h>
43 #include <sys/bufq.h>
44 #include <sys/conf.h>
45 #include <sys/proc.h>
46 #include <sys/namei.h>
47 #include <sys/disklabel.h>
48 #include <sys/errno.h>
49 #include <sys/kernel.h>
50 #include <sys/malloc.h>
51 #include <sys/vnode.h>
52 #include <sys/file.h>
53 #include <sys/vmem.h>
54 #include <sys/blist.h>
55 #include <sys/mount.h>
56 #include <sys/pool.h>
57 #include <sys/syscallargs.h>
58 #include <sys/swap.h>
59 #include <sys/kauth.h>
60 #include <sys/sysctl.h>
61 #include <sys/workqueue.h>
62
63 #include <uvm/uvm.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 /*
68 * uvm_swap.c: manage configuration and i/o to swap space.
69 */
70
71 /*
72 * swap space is managed in the following way:
73 *
74 * each swap partition or file is described by a "swapdev" structure.
75 * each "swapdev" structure contains a "swapent" structure which contains
76 * information that is passed up to the user (via system calls).
77 *
78 * each swap partition is assigned a "priority" (int) which controls
79  *	swap partition usage.
80 *
81 * the system maintains a global data structure describing all swap
82 * partitions/files. there is a sorted LIST of "swappri" structures
83 * which describe "swapdev"'s at that priority. this LIST is headed
84 * by the "swap_priority" global var. each "swappri" contains a
85 * CIRCLEQ of "swapdev" structures at that priority.
86 *
87 * locking:
88 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
89 * system call and prevents the swap priority list from changing
90 * while we are in the middle of a system call (e.g. SWAP_STATS).
91 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
92 * structures including the priority list, the swapdev structures,
93 * and the swapmap arena.
94 *
95 * each swap device has the following info:
96 * - swap device in use (could be disabled, preventing future use)
97 * - swap enabled (allows new allocations on swap)
98 * - map info in /dev/drum
99 * - vnode pointer
100 * for swap files only:
101 * - block size
102 * - max byte count in buffer
103 * - buffer
104 *
105 * userland controls and configures swap with the swapctl(2) system call.
106  *  sys_swapctl() performs the following operations:
107 * [1] SWAP_NSWAP: returns the number of swap devices currently configured
108 * [2] SWAP_STATS: given a pointer to an array of swapent structures
109 * (passed in via "arg") of a size passed in via "misc" ... we load
110 * the current swap config into the array. The actual work is done
111 * in the uvm_swap_stats(9) function.
112 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a
113 * priority in "misc", start swapping on it.
114 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
115 * [5] SWAP_CTL: changes the priority of a swap device (new priority in
116 * "misc")
117 */
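/*
 * illustrative userland sketch (not part of this file): querying the
 * current swap configuration through swapctl(2).  the header list and
 * error handling here are assumptions made for the example, not taken
 * from any existing program.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *	#include <sys/swap.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct swapent *sep;
 *		int i, n;
 *
 *		n = swapctl(SWAP_NSWAP, NULL, 0);
 *		if (n <= 0 || (sep = malloc(n * sizeof(*sep))) == NULL)
 *			return 1;
 *		n = swapctl(SWAP_STATS, sep, n);
 *		for (i = 0; i < n; i++)
 *			printf("%s: pri %d, %d blks, %d in use\n",
 *			    sep[i].se_path, sep[i].se_priority,
 *			    sep[i].se_nblks, sep[i].se_inuse);
 *		free(sep);
 *		return 0;
 *	}
 */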
118
119 /*
120 * swapdev: describes a single swap partition/file
121 *
122 * note the following should be true:
123 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
124 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
125 */
126 struct swapdev {
127 struct oswapent swd_ose;
128 #define swd_dev swd_ose.ose_dev /* device id */
129 #define swd_flags swd_ose.ose_flags /* flags:inuse/enable/fake */
130 #define swd_priority swd_ose.ose_priority /* our priority */
131 /* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
132 char *swd_path; /* saved pathname of device */
133 int swd_pathlen; /* length of pathname */
134 int swd_npages; /* #pages we can use */
135 int swd_npginuse; /* #pages in use */
136 int swd_npgbad; /* #pages bad */
137 int swd_drumoffset; /* page0 offset in drum */
138 int swd_drumsize; /* #pages in drum */
139 blist_t swd_blist; /* blist for this swapdev */
140 struct vnode *swd_vp; /* backing vnode */
141 CIRCLEQ_ENTRY(swapdev) swd_next; /* priority circleq */
142
143 int swd_bsize; /* blocksize (bytes) */
144 int swd_maxactive; /* max active i/o reqs */
145 struct bufq_state *swd_tab; /* buffer list */
146 int swd_active; /* number of active buffers */
147 };
148
149 /*
150 * swap device priority entry; the list is kept sorted on `spi_priority'.
151 */
152 struct swappri {
153 int spi_priority; /* priority */
154 CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
155 /* circleq of swapdevs at this priority */
156 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
157 };
158
159 /*
160 * The following two structures are used to keep track of data transfers
161 * on swap devices associated with regular files.
162 * NOTE: this code is more or less a copy of vnd.c; we use the same
163  * structure names here to ease porting.
164 */
165 struct vndxfer {
166 struct buf *vx_bp; /* Pointer to parent buffer */
167 struct swapdev *vx_sdp;
168 int vx_error;
169 int vx_pending; /* # of pending aux buffers */
170 int vx_flags;
171 #define VX_BUSY 1
172 #define VX_DEAD 2
173 };
174
175 struct vndbuf {
176 struct buf vb_buf;
177 struct vndxfer *vb_xfer;
178 };
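/*
 * (added note) the relationship between the two: sw_reg_strategy()
 * allocates one vndxfer per original /dev/drum buffer ("vx_bp") and
 * one vndbuf per filesystem-block-sized piece of that transfer, with
 * vb_xfer pointing back at the vndxfer.  vx_pending counts the
 * vndbufs still in flight; once it reaches zero (and VX_BUSY has been
 * cleared) the parent buffer is completed with biodone().
 */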
179
180
181 /*
182  * We keep a pool of vndbuf's and vndxfer structures.
183 */
184 POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
185 IPL_BIO);
186 POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
187 IPL_BIO);
188
189 /*
190 * local variables
191 */
192 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
193 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
194
195 /* list of all active swap devices [by priority] */
196 LIST_HEAD(swap_priority, swappri);
197 static struct swap_priority swap_priority;
198
199 /* locks */
200 static krwlock_t swap_syscall_lock;
201
202 /* workqueue and use counter for swap to regular files */
203 static int sw_reg_count = 0;
204 static struct workqueue *sw_reg_workqueue;
205
206 /*
207 * prototypes
208 */
209 static struct swapdev *swapdrum_getsdp(int);
210
211 static struct swapdev *swaplist_find(struct vnode *, bool);
212 static void swaplist_insert(struct swapdev *,
213 struct swappri *, int);
214 static void swaplist_trim(void);
215
216 static int swap_on(struct lwp *, struct swapdev *);
217 static int swap_off(struct lwp *, struct swapdev *);
218
219 static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);
220
221 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
222 static void sw_reg_biodone(struct buf *);
223 static void sw_reg_iodone(struct work *wk, void *dummy);
224 static void sw_reg_start(struct swapdev *);
225
226 static int uvm_swap_io(struct vm_page **, int, int, int);
227
228 /*
229 * uvm_swap_init: init the swap system data structures and locks
230 *
231 * => called at boot time from init_main.c after the filesystems
232 * are brought up (which happens after uvm_init())
233 */
234 void
235 uvm_swap_init(void)
236 {
237 UVMHIST_FUNC("uvm_swap_init");
238
239 UVMHIST_CALLED(pdhist);
240 /*
241 * first, init the swap list, its counter, and its lock.
242 * then get a handle on the vnode for /dev/drum by using
243 	 * its dev_t number ("swapdev", from MD conf.c).
244 */
245
246 LIST_INIT(&swap_priority);
247 uvmexp.nswapdev = 0;
248 rw_init(&swap_syscall_lock);
249 cv_init(&uvm.scheduler_cv, "schedule");
250 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
251
252 /* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
253 mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);
254
255 if (bdevvp(swapdev, &swapdev_vp))
256 panic("uvm_swap_init: can't get vnode for swap device");
257 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
258 panic("uvm_swap_init: can't lock swap device");
259 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
260 panic("uvm_swap_init: can't open swap device");
261 VOP_UNLOCK(swapdev_vp, 0);
262
263 /*
264 * create swap block resource map to map /dev/drum. the range
265 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
266 * that block 0 is reserved (used to indicate an allocation
267 * failure, or no allocation).
268 */
269 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
270 VM_NOSLEEP, IPL_NONE);
271 if (swapmap == 0)
272 		panic("uvm_swap_init: vmem_create failed");
273
274 /*
275 * done!
276 */
277 uvm.swap_running = true;
278 #ifdef __SWAP_BROKEN
279 uvm.swapout_enabled = 0;
280 #else
281 uvm.swapout_enabled = 1;
282 #endif
283 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
284
285 sysctl_createv(NULL, 0, NULL, NULL,
286 CTLFLAG_READWRITE,
287 CTLTYPE_INT, "swapout",
288 SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
289 NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
290 }
291
292 /*
293 * swaplist functions: functions that operate on the list of swap
294 * devices on the system.
295 */
296
297 /*
298 * swaplist_insert: insert swap device "sdp" into the global list
299 *
300 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
301 * => caller must provide a newly malloc'd swappri structure (we will
302  *	FREE it if we don't need it... this is to prevent malloc blocking
303 * here while adding swap)
304 */
305 static void
306 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
307 {
308 struct swappri *spp, *pspp;
309 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
310
311 /*
312 * find entry at or after which to insert the new device.
313 */
314 pspp = NULL;
315 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
316 if (priority <= spp->spi_priority)
317 break;
318 pspp = spp;
319 }
320
321 /*
322 * new priority?
323 */
324 if (spp == NULL || spp->spi_priority != priority) {
325 spp = newspp; /* use newspp! */
326 UVMHIST_LOG(pdhist, "created new swappri = %d",
327 priority, 0, 0, 0);
328
329 spp->spi_priority = priority;
330 CIRCLEQ_INIT(&spp->spi_swapdev);
331
332 if (pspp)
333 LIST_INSERT_AFTER(pspp, spp, spi_swappri);
334 else
335 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
336 } else {
337 /* we don't need a new priority structure, free it */
338 FREE(newspp, M_VMSWAP);
339 }
340
341 /*
342 * priority found (or created). now insert on the priority's
343 * circleq list and bump the total number of swapdevs.
344 */
345 sdp->swd_priority = priority;
346 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
347 uvmexp.nswapdev++;
348 }
349
350 /*
351 * swaplist_find: find and optionally remove a swap device from the
352 * global list.
353 *
354 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
355 * => we return the swapdev we found (and removed)
356 */
357 static struct swapdev *
358 swaplist_find(struct vnode *vp, bool remove)
359 {
360 struct swapdev *sdp;
361 struct swappri *spp;
362
363 /*
364 * search the lists for the requested vp
365 */
366
367 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
368 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
369 if (sdp->swd_vp == vp) {
370 if (remove) {
371 CIRCLEQ_REMOVE(&spp->spi_swapdev,
372 sdp, swd_next);
373 uvmexp.nswapdev--;
374 }
375 return(sdp);
376 }
377 }
378 }
379 return (NULL);
380 }
381
382 /*
383 * swaplist_trim: scan priority list for empty priority entries and kill
384 * them.
385 *
386 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
387 */
388 static void
389 swaplist_trim(void)
390 {
391 struct swappri *spp, *nextspp;
392
393 for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
394 nextspp = LIST_NEXT(spp, spi_swappri);
395 if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
396 (void *)&spp->spi_swapdev)
397 continue;
398 LIST_REMOVE(spp, spi_swappri);
399 free(spp, M_VMSWAP);
400 }
401 }
402
403 /*
404 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
405 * to the "swapdev" that maps that section of the drum.
406 *
407 * => each swapdev takes one big contig chunk of the drum
408 * => caller must hold uvm_swap_data_lock
409 */
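/*
 * e.g. (illustrative numbers only): a swapdev with swd_drumoffset
 * 1024 and swd_drumsize 2048 owns drum pages [1024, 3072), so a
 * lookup of page 1500 returns it, while a lookup of page 3072 falls
 * through to the next swapdev.
 */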
410 static struct swapdev *
411 swapdrum_getsdp(int pgno)
412 {
413 struct swapdev *sdp;
414 struct swappri *spp;
415
416 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
417 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
418 if (sdp->swd_flags & SWF_FAKE)
419 continue;
420 if (pgno >= sdp->swd_drumoffset &&
421 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
422 return sdp;
423 }
424 }
425 }
426 return NULL;
427 }
428
429
430 /*
431 * sys_swapctl: main entry point for swapctl(2) system call
432 * [with two helper functions: swap_on and swap_off]
433 */
434 int
435 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
436 {
437 /* {
438 syscallarg(int) cmd;
439 syscallarg(void *) arg;
440 syscallarg(int) misc;
441 } */
442 struct vnode *vp;
443 struct nameidata nd;
444 struct swappri *spp;
445 struct swapdev *sdp;
446 struct swapent *sep;
447 #define SWAP_PATH_MAX (PATH_MAX + 1)
448 char *userpath;
449 size_t len;
450 int error, misc;
451 int priority;
452 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
453
454 misc = SCARG(uap, misc);
455
456 /*
457 * ensure serialized syscall access by grabbing the swap_syscall_lock
458 */
459 rw_enter(&swap_syscall_lock, RW_WRITER);
460
461 userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
462 /*
463 	 * we handle the non-priv NSWAP and STATS requests first.
464 *
465 * SWAP_NSWAP: return number of config'd swap devices
466 * [can also be obtained with uvmexp sysctl]
467 */
468 if (SCARG(uap, cmd) == SWAP_NSWAP) {
469 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
470 0, 0, 0);
471 *retval = uvmexp.nswapdev;
472 error = 0;
473 goto out;
474 }
475
476 /*
477 * SWAP_STATS: get stats on current # of configured swap devs
478 *
479 * note that the swap_priority list can't change as long
480 * as we are holding the swap_syscall_lock. we don't want
481 * to grab the uvm_swap_data_lock because we may fault&sleep during
482 * copyout() and we don't want to be holding that lock then!
483 */
484 if (SCARG(uap, cmd) == SWAP_STATS
485 #if defined(COMPAT_13)
486 || SCARG(uap, cmd) == SWAP_OSTATS
487 #endif
488 ) {
489 if ((size_t)misc > (size_t)uvmexp.nswapdev)
490 misc = uvmexp.nswapdev;
491 #if defined(COMPAT_13)
492 if (SCARG(uap, cmd) == SWAP_OSTATS)
493 len = sizeof(struct oswapent) * misc;
494 else
495 #endif
496 len = sizeof(struct swapent) * misc;
497 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);
498
499 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
500 error = copyout(sep, SCARG(uap, arg), len);
501
502 free(sep, M_TEMP);
503 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
504 goto out;
505 }
506 if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
507 dev_t *devp = (dev_t *)SCARG(uap, arg);
508
509 error = copyout(&dumpdev, devp, sizeof(dumpdev));
510 goto out;
511 }
512
513 /*
514 * all other requests require superuser privs. verify.
515 */
516 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
517 0, NULL, NULL, NULL)))
518 goto out;
519
520 if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
521 /* drop the current dump device */
522 dumpdev = NODEV;
523 dumpcdev = NODEV;
524 cpu_dumpconf();
525 goto out;
526 }
527
528 /*
529 * at this point we expect a path name in arg. we will
530 * use namei() to gain a vnode reference (vref), and lock
531 * the vnode (VOP_LOCK).
532 *
533 * XXX: a NULL arg means use the root vnode pointer (e.g. for
534 * miniroot)
535 */
536 if (SCARG(uap, arg) == NULL) {
537 vp = rootvp; /* miniroot */
538 if (vget(vp, LK_EXCLUSIVE)) {
539 error = EBUSY;
540 goto out;
541 }
542 if (SCARG(uap, cmd) == SWAP_ON &&
543 copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
544 panic("swapctl: miniroot copy failed");
545 } else {
546 int space;
547 char *where;
548
549 if (SCARG(uap, cmd) == SWAP_ON) {
550 if ((error = copyinstr(SCARG(uap, arg), userpath,
551 SWAP_PATH_MAX, &len)))
552 goto out;
553 space = UIO_SYSSPACE;
554 where = userpath;
555 } else {
556 space = UIO_USERSPACE;
557 where = (char *)SCARG(uap, arg);
558 }
559 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
560 space, where);
561 if ((error = namei(&nd)))
562 goto out;
563 vp = nd.ni_vp;
564 }
565 /* note: "vp" is referenced and locked */
566
567 error = 0; /* assume no error */
568 switch(SCARG(uap, cmd)) {
569
570 case SWAP_DUMPDEV:
571 if (vp->v_type != VBLK) {
572 error = ENOTBLK;
573 break;
574 }
575 if (bdevsw_lookup(vp->v_rdev)) {
576 dumpdev = vp->v_rdev;
577 dumpcdev = devsw_blk2chr(dumpdev);
578 } else
579 dumpdev = NODEV;
580 cpu_dumpconf();
581 break;
582
583 case SWAP_CTL:
584 /*
585 * get new priority, remove old entry (if any) and then
586 * reinsert it in the correct place. finally, prune out
587 * any empty priority structures.
588 */
589 priority = SCARG(uap, misc);
590 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
591 mutex_enter(&uvm_swap_data_lock);
592 if ((sdp = swaplist_find(vp, true)) == NULL) {
593 error = ENOENT;
594 } else {
595 swaplist_insert(sdp, spp, priority);
596 swaplist_trim();
597 }
598 mutex_exit(&uvm_swap_data_lock);
599 if (error)
600 free(spp, M_VMSWAP);
601 break;
602
603 case SWAP_ON:
604
605 /*
606 * check for duplicates. if none found, then insert a
607 * dummy entry on the list to prevent someone else from
608 * trying to enable this device while we are working on
609 * it.
610 */
611
612 priority = SCARG(uap, misc);
613 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
614 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
615 memset(sdp, 0, sizeof(*sdp));
616 sdp->swd_flags = SWF_FAKE;
617 sdp->swd_vp = vp;
618 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
619 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
620 mutex_enter(&uvm_swap_data_lock);
621 if (swaplist_find(vp, false) != NULL) {
622 error = EBUSY;
623 mutex_exit(&uvm_swap_data_lock);
624 bufq_free(sdp->swd_tab);
625 free(sdp, M_VMSWAP);
626 free(spp, M_VMSWAP);
627 break;
628 }
629 swaplist_insert(sdp, spp, priority);
630 mutex_exit(&uvm_swap_data_lock);
631
632 sdp->swd_pathlen = len;
633 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
634 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
635 panic("swapctl: copystr");
636
637 /*
638 * we've now got a FAKE placeholder in the swap list.
639 * now attempt to enable swap on it. if we fail, undo
640 * what we've done and kill the fake entry we just inserted.
641 * if swap_on is a success, it will clear the SWF_FAKE flag
642 */
643
644 if ((error = swap_on(l, sdp)) != 0) {
645 mutex_enter(&uvm_swap_data_lock);
646 (void) swaplist_find(vp, true); /* kill fake entry */
647 swaplist_trim();
648 mutex_exit(&uvm_swap_data_lock);
649 bufq_free(sdp->swd_tab);
650 free(sdp->swd_path, M_VMSWAP);
651 free(sdp, M_VMSWAP);
652 break;
653 }
654 break;
655
656 case SWAP_OFF:
657 mutex_enter(&uvm_swap_data_lock);
658 if ((sdp = swaplist_find(vp, false)) == NULL) {
659 mutex_exit(&uvm_swap_data_lock);
660 error = ENXIO;
661 break;
662 }
663
664 /*
665 * If a device isn't in use or enabled, we
666 * can't stop swapping from it (again).
667 */
668 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
669 mutex_exit(&uvm_swap_data_lock);
670 error = EBUSY;
671 break;
672 }
673
674 /*
675 * do the real work.
676 */
677 error = swap_off(l, sdp);
678 break;
679
680 default:
681 error = EINVAL;
682 }
683
684 /*
685 * done! release the ref gained by namei() and unlock.
686 */
687 vput(vp);
688
689 out:
690 free(userpath, M_TEMP);
691 rw_exit(&swap_syscall_lock);
692
693 UVMHIST_LOG(pdhist, "<- done! error=%d", error, 0, 0, 0);
694 return (error);
695 }
696
697 /*
698 * swap_stats: implements swapctl(SWAP_STATS). The function is kept
699 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
700 * emulation to use it directly without going through sys_swapctl().
701 * The problem with using sys_swapctl() there is that it involves
702 * copying the swapent array to the stackgap, and this array's size
703 * is not known at build time. Hence it would not be possible to
704 * ensure it would fit in the stackgap in any case.
705 */
706 void
707 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
708 {
709
710 rw_enter(&swap_syscall_lock, RW_READER);
711 uvm_swap_stats_locked(cmd, sep, sec, retval);
712 rw_exit(&swap_syscall_lock);
713 }
714
715 static void
716 uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
717 {
718 struct swappri *spp;
719 struct swapdev *sdp;
720 int count = 0;
721
722 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
723 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
724 sdp != (void *)&spp->spi_swapdev && sec-- > 0;
725 sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
726 /*
727 * backwards compatibility for system call.
728 * note that we use 'struct oswapent' as an
729 * overlay into both 'struct swapdev' and
730 * the userland 'struct swapent', as we
731 * want to retain backwards compatibility
732 * with NetBSD 1.3.
733 */
734 sdp->swd_ose.ose_inuse =
735 btodb((uint64_t)sdp->swd_npginuse <<
736 PAGE_SHIFT);
737 (void)memcpy(sep, &sdp->swd_ose,
738 sizeof(struct oswapent));
739
740 /* now copy out the path if necessary */
741 #if !defined(COMPAT_13)
742 (void) cmd;
743 #endif
744 #if defined(COMPAT_13)
745 if (cmd == SWAP_STATS)
746 #endif
747 (void)memcpy(&sep->se_path, sdp->swd_path,
748 sdp->swd_pathlen);
749
750 count++;
751 #if defined(COMPAT_13)
752 if (cmd == SWAP_OSTATS)
753 sep = (struct swapent *)
754 ((struct oswapent *)sep + 1);
755 else
756 #endif
757 sep++;
758 }
759 }
760
761 *retval = count;
762 return;
763 }
764
765 /*
766 * swap_on: attempt to enable a swapdev for swapping. note that the
767 * swapdev is already on the global list, but disabled (marked
768 * SWF_FAKE).
769 *
770 * => we avoid the start of the disk (to protect disk labels)
771 * => we also avoid the miniroot, if we are swapping to root.
772 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
773 * if needed.
774 */
775 static int
776 swap_on(struct lwp *l, struct swapdev *sdp)
777 {
778 struct vnode *vp;
779 int error, npages, nblocks, size;
780 long addr;
781 u_long result;
782 struct vattr va;
783 #ifdef NFS
784 extern int (**nfsv2_vnodeop_p)(void *);
785 #endif /* NFS */
786 const struct bdevsw *bdev;
787 dev_t dev;
788 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
789
790 /*
791 * we want to enable swapping on sdp. the swd_vp contains
792 * the vnode we want (locked and ref'd), and the swd_dev
793 	 * contains the dev_t of the file, if it is a block device.
794 */
795
796 vp = sdp->swd_vp;
797 dev = sdp->swd_dev;
798
799 /*
800 * open the swap file (mostly useful for block device files to
801 * let device driver know what is up).
802 *
803 * we skip the open/close for root on swap because the root
804 * has already been opened when root was mounted (mountroot).
805 */
806 if (vp != rootvp) {
807 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
808 return (error);
809 }
810
811 /* XXX this only works for block devices */
812 UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0);
813
814 /*
815 * we now need to determine the size of the swap area. for
816 * block specials we can call the d_psize function.
817 * for normal files, we must stat [get attrs].
818 *
819 	 * we put the result in nblocks.
820 	 * for normal files, we also want the filesystem block size
821 	 * (which we get with statvfs).
822 */
823 switch (vp->v_type) {
824 case VBLK:
825 bdev = bdevsw_lookup(dev);
826 if (bdev == NULL || bdev->d_psize == NULL ||
827 (nblocks = (*bdev->d_psize)(dev)) == -1) {
828 error = ENXIO;
829 goto bad;
830 }
831 break;
832
833 case VREG:
834 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
835 goto bad;
836 nblocks = (int)btodb(va.va_size);
837 if ((error =
838 VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
839 goto bad;
840
841 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
842 /*
843 * limit the max # of outstanding I/O requests we issue
844 * at any one time. take it easy on NFS servers.
845 */
846 #ifdef NFS
847 if (vp->v_op == nfsv2_vnodeop_p)
848 sdp->swd_maxactive = 2; /* XXX */
849 else
850 #endif /* NFS */
851 sdp->swd_maxactive = 8; /* XXX */
852 break;
853
854 default:
855 error = ENXIO;
856 goto bad;
857 }
858
859 /*
860 * save nblocks in a safe place and convert to pages.
861 */
862
863 sdp->swd_ose.ose_nblks = nblocks;
864 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
865
866 /*
867 	 * for block special files, we want to make sure that we leave
868 * the disklabel and bootblocks alone, so we arrange to skip
869 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
870 * note that because of this the "size" can be less than the
871 * actual number of blocks on the device.
872 */
873 if (vp->v_type == VBLK) {
874 /* we use pages 1 to (size - 1) [inclusive] */
875 size = npages - 1;
876 addr = 1;
877 } else {
878 /* we use pages 0 to (size - 1) [inclusive] */
879 size = npages;
880 addr = 0;
881 }
882
883 /*
884 * make sure we have enough blocks for a reasonable sized swap
885 * area. we want at least one page.
886 */
887
888 if (size < 1) {
889 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
890 error = EINVAL;
891 goto bad;
892 }
893
894 UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
895
896 /*
897 	 * now we need to allocate a blist to manage this swap device
898 */
899
900 sdp->swd_blist = blist_create(npages);
901 	/* mark all except the `saved' region free. */
902 blist_free(sdp->swd_blist, addr, size);
903
904 /*
905 * if the vnode we are swapping to is the root vnode
906 * (i.e. we are swapping to the miniroot) then we want
907 * to make sure we don't overwrite it. do a statfs to
908 * find its size and skip over it.
909 */
910 if (vp == rootvp) {
911 struct mount *mp;
912 struct statvfs *sp;
913 int rootblocks, rootpages;
914
915 mp = rootvnode->v_mount;
916 sp = &mp->mnt_stat;
917 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
918 /*
919 * XXX: sp->f_blocks isn't the total number of
920 * blocks in the filesystem, it's the number of
921 * data blocks. so, our rootblocks almost
922 * definitely underestimates the total size
923 * of the filesystem - how badly depends on the
924 * details of the filesystem type. there isn't
925 * an obvious way to deal with this cleanly
926 * and perfectly, so for now we just pad our
927 * rootblocks estimate with an extra 5 percent.
928 */
929 rootblocks += (rootblocks >> 5) +
930 (rootblocks >> 6) +
931 (rootblocks >> 7);
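		/* i.e. 1/32 + 1/64 + 1/128 =~ 5.5% of rootblocks */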
932 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
933 if (rootpages > size)
934 panic("swap_on: miniroot larger than swap?");
935
936 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
937 panic("swap_on: unable to preserve miniroot");
938 }
939
940 size -= rootpages;
941 printf("Preserved %d pages of miniroot ", rootpages);
942 printf("leaving %d pages of swap\n", size);
943 }
944
945 /*
946 * add a ref to vp to reflect usage as a swap device.
947 */
948 vref(vp);
949
950 /*
951 * now add the new swapdev to the drum and enable.
952 */
953 result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
954 if (result == 0)
955 panic("swapdrum_add");
956 /*
957 * If this is the first regular swap create the workqueue.
958 * => Protected by swap_syscall_lock.
959 */
960 if (vp->v_type != VBLK) {
961 if (sw_reg_count++ == 0) {
962 KASSERT(sw_reg_workqueue == NULL);
963 if (workqueue_create(&sw_reg_workqueue, "swapiod",
964 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
965 panic("swap_add: workqueue_create failed");
966 }
967 }
968
969 sdp->swd_drumoffset = (int)result;
970 sdp->swd_drumsize = npages;
971 sdp->swd_npages = size;
972 mutex_enter(&uvm_swap_data_lock);
973 sdp->swd_flags &= ~SWF_FAKE; /* going live */
974 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
975 uvmexp.swpages += size;
976 uvmexp.swpgavail += size;
977 mutex_exit(&uvm_swap_data_lock);
978 return (0);
979
980 /*
981 * failure: clean up and return error.
982 */
983
984 bad:
985 if (sdp->swd_blist) {
986 blist_destroy(sdp->swd_blist);
987 }
988 if (vp != rootvp) {
989 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
990 }
991 return (error);
992 }
993
994 /*
995 * swap_off: stop swapping on swapdev
996 *
997 * => swap data should be locked, we will unlock.
998 */
999 static int
1000 swap_off(struct lwp *l, struct swapdev *sdp)
1001 {
1002 int npages = sdp->swd_npages;
1003 int error = 0;
1004
1005 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1006 UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,0,0);
1007
1008 /* disable the swap area being removed */
1009 sdp->swd_flags &= ~SWF_ENABLE;
1010 uvmexp.swpgavail -= npages;
1011 mutex_exit(&uvm_swap_data_lock);
1012
1013 /*
1014 * the idea is to find all the pages that are paged out to this
1015 * device, and page them all in. in uvm, swap-backed pageable
1016 * memory can take two forms: aobjs and anons. call the
1017 * swapoff hook for each subsystem to bring in pages.
1018 */
1019
1020 if (uao_swap_off(sdp->swd_drumoffset,
1021 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1022 amap_swap_off(sdp->swd_drumoffset,
1023 sdp->swd_drumoffset + sdp->swd_drumsize)) {
1024 error = ENOMEM;
1025 } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1026 error = EBUSY;
1027 }
1028
1029 if (error) {
1030 mutex_enter(&uvm_swap_data_lock);
1031 sdp->swd_flags |= SWF_ENABLE;
1032 uvmexp.swpgavail += npages;
1033 mutex_exit(&uvm_swap_data_lock);
1034
1035 return error;
1036 }
1037
1038 /*
1039 * If this is the last regular swap destroy the workqueue.
1040 * => Protected by swap_syscall_lock.
1041 */
1042 if (sdp->swd_vp->v_type != VBLK) {
1043 KASSERT(sw_reg_count > 0);
1044 KASSERT(sw_reg_workqueue != NULL);
1045 if (--sw_reg_count == 0) {
1046 workqueue_destroy(sw_reg_workqueue);
1047 sw_reg_workqueue = NULL;
1048 }
1049 }
1050
1051 /*
1052 * done with the vnode.
1053 * drop our ref on the vnode before calling VOP_CLOSE()
1054 * so that spec_close() can tell if this is the last close.
1055 */
1056 vrele(sdp->swd_vp);
1057 if (sdp->swd_vp != rootvp) {
1058 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1059 }
1060
1061 mutex_enter(&uvm_swap_data_lock);
1062 uvmexp.swpages -= npages;
1063 uvmexp.swpginuse -= sdp->swd_npgbad;
1064
1065 if (swaplist_find(sdp->swd_vp, true) == NULL)
1066 panic("swap_off: swapdev not in list");
1067 swaplist_trim();
1068 mutex_exit(&uvm_swap_data_lock);
1069
1070 /*
1071 * free all resources!
1072 */
1073 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1074 blist_destroy(sdp->swd_blist);
1075 bufq_free(sdp->swd_tab);
1076 free(sdp, M_VMSWAP);
1077 return (0);
1078 }
1079
1080 /*
1081 * /dev/drum interface and i/o functions
1082 */
1083
1084 /*
1085 * swstrategy: perform I/O on the drum
1086 *
1087 * => we must map the i/o request from the drum to the correct swapdev.
1088 */
1089 static void
1090 swstrategy(struct buf *bp)
1091 {
1092 struct swapdev *sdp;
1093 struct vnode *vp;
1094 int pageno, bn;
1095 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1096
1097 /*
1098 * convert block number to swapdev. note that swapdev can't
1099 * be yanked out from under us because we are holding resources
1100 * in it (i.e. the blocks we are doing I/O on).
1101 */
1102 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1103 mutex_enter(&uvm_swap_data_lock);
1104 sdp = swapdrum_getsdp(pageno);
1105 mutex_exit(&uvm_swap_data_lock);
1106 if (sdp == NULL) {
1107 bp->b_error = EINVAL;
1108 biodone(bp);
1109 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
1110 return;
1111 }
1112
1113 /*
1114 * convert drum page number to block number on this swapdev.
1115 */
1116
1117 pageno -= sdp->swd_drumoffset; /* page # on swapdev */
1118 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
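	/*
	 * e.g. (illustrative values, assuming 4k pages and 512-byte
	 * disk blocks): an i/o at drum block 8192 is drum page 1024;
	 * with swd_drumoffset == 1024 that is page 0 of this swapdev,
	 * i.e. device block 0.
	 */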
1119
1120 UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld",
1121 ((bp->b_flags & B_READ) == 0) ? "write" : "read",
1122 sdp->swd_drumoffset, bn, bp->b_bcount);
1123
1124 /*
1125 * for block devices we finish up here.
1126 * for regular files we have to do more work which we delegate
1127 * to sw_reg_strategy().
1128 */
1129
1130 vp = sdp->swd_vp; /* swapdev vnode pointer */
1131 switch (vp->v_type) {
1132 default:
1133 panic("swstrategy: vnode type 0x%x", vp->v_type);
1134
1135 case VBLK:
1136
1137 /*
1138 * must convert "bp" from an I/O on /dev/drum to an I/O
1139 * on the swapdev (sdp).
1140 */
1141 bp->b_blkno = bn; /* swapdev block number */
1142 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
1143
1144 /*
1145 * if we are doing a write, we have to redirect the i/o on
1146 * drum's v_numoutput counter to the swapdevs.
1147 */
1148 if ((bp->b_flags & B_READ) == 0) {
1149 mutex_enter(bp->b_objlock);
1150 vwakeup(bp); /* kills one 'v_numoutput' on drum */
1151 mutex_exit(bp->b_objlock);
1152 mutex_enter(&vp->v_interlock);
1153 vp->v_numoutput++; /* put it on swapdev */
1154 mutex_exit(&vp->v_interlock);
1155 }
1156
1157 /*
1158 * finally plug in swapdev vnode and start I/O
1159 */
1160 bp->b_vp = vp;
1161 bp->b_objlock = &vp->v_interlock;
1162 VOP_STRATEGY(vp, bp);
1163 return;
1164
1165 case VREG:
1166 /*
1167 * delegate to sw_reg_strategy function.
1168 */
1169 sw_reg_strategy(sdp, bp, bn);
1170 return;
1171 }
1172 /* NOTREACHED */
1173 }
1174
1175 /*
1176 * swread: the read function for the drum (just a call to physio)
1177 */
1178 /*ARGSUSED*/
1179 static int
1180 swread(dev_t dev, struct uio *uio, int ioflag)
1181 {
1182 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1183
1184 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1185 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1186 }
1187
1188 /*
1189 * swwrite: the write function for the drum (just a call to physio)
1190 */
1191 /*ARGSUSED*/
1192 static int
1193 swwrite(dev_t dev, struct uio *uio, int ioflag)
1194 {
1195 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1196
1197 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1198 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1199 }
1200
1201 const struct bdevsw swap_bdevsw = {
1202 nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
1203 };
1204
1205 const struct cdevsw swap_cdevsw = {
1206 nullopen, nullclose, swread, swwrite, noioctl,
1207 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
1208 };
1209
1210 /*
1211 * sw_reg_strategy: handle swap i/o to regular files
1212 */
1213 static void
1214 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1215 {
1216 struct vnode *vp;
1217 struct vndxfer *vnx;
1218 daddr_t nbn;
1219 char *addr;
1220 off_t byteoff;
1221 int s, off, nra, error, sz, resid;
1222 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1223
1224 /*
1225 * allocate a vndxfer head for this transfer and point it to
1226 * our buffer.
1227 */
1228 vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1229 vnx->vx_flags = VX_BUSY;
1230 vnx->vx_error = 0;
1231 vnx->vx_pending = 0;
1232 vnx->vx_bp = bp;
1233 vnx->vx_sdp = sdp;
1234
1235 /*
1236 * setup for main loop where we read filesystem blocks into
1237 * our buffer.
1238 */
1239 error = 0;
1240 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1241 addr = bp->b_data; /* current position in buffer */
1242 byteoff = dbtob((uint64_t)bn);
1243
1244 for (resid = bp->b_resid; resid; resid -= sz) {
1245 struct vndbuf *nbp;
1246
1247 /*
1248 * translate byteoffset into block number. return values:
1249 * vp = vnode of underlying device
1250 * nbn = new block number (on underlying vnode dev)
1251 * nra = num blocks we can read-ahead (excludes requested
1252 * block)
1253 */
1254 nra = 0;
1255 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1256 &vp, &nbn, &nra);
1257
1258 if (error == 0 && nbn == (daddr_t)-1) {
1259 /*
1260 * this used to just set error, but that doesn't
1261 * do the right thing. Instead, it causes random
1262 * memory errors. The panic() should remain until
1263 * this condition doesn't destabilize the system.
1264 */
1265 #if 1
1266 panic("sw_reg_strategy: swap to sparse file");
1267 #else
1268 error = EIO; /* failure */
1269 #endif
1270 }
1271
1272 /*
1273 * punt if there was an error or a hole in the file.
1274 * we must wait for any i/o ops we have already started
1275 * to finish before returning.
1276 *
1277 * XXX we could deal with holes here but it would be
1278 * a hassle (in the write case).
1279 */
1280 if (error) {
1281 s = splbio();
1282 vnx->vx_error = error; /* pass error up */
1283 goto out;
1284 }
1285
1286 /*
1287 * compute the size ("sz") of this transfer (in bytes).
1288 */
1289 off = byteoff % sdp->swd_bsize;
1290 sz = (1 + nra) * sdp->swd_bsize - off;
1291 if (sz > resid)
1292 sz = resid;
1293
1294 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1295 "vp %p/%p offset 0x%x/0x%x",
1296 sdp->swd_vp, vp, byteoff, nbn);
1297
1298 /*
1299 * now get a buf structure. note that the vb_buf is
1300 * at the front of the nbp structure so that you can
1301 * cast pointers between the two structure easily.
1302 		 * cast pointers between the two structures easily.
1303 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1304 buf_init(&nbp->vb_buf);
1305 nbp->vb_buf.b_flags = bp->b_flags;
1306 nbp->vb_buf.b_cflags = bp->b_cflags;
1307 nbp->vb_buf.b_oflags = bp->b_oflags;
1308 nbp->vb_buf.b_bcount = sz;
1309 nbp->vb_buf.b_bufsize = sz;
1310 nbp->vb_buf.b_error = 0;
1311 nbp->vb_buf.b_data = addr;
1312 nbp->vb_buf.b_lblkno = 0;
1313 nbp->vb_buf.b_blkno = nbn + btodb(off);
1314 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1315 nbp->vb_buf.b_iodone = sw_reg_biodone;
1316 nbp->vb_buf.b_vp = vp;
1317 nbp->vb_buf.b_objlock = &vp->v_interlock;
1318 if (vp->v_type == VBLK) {
1319 nbp->vb_buf.b_dev = vp->v_rdev;
1320 }
1321
1322 nbp->vb_xfer = vnx; /* patch it back in to vnx */
1323
1324 /*
1325 * Just sort by block number
1326 */
1327 s = splbio();
1328 if (vnx->vx_error != 0) {
1329 buf_destroy(&nbp->vb_buf);
1330 pool_put(&vndbuf_pool, nbp);
1331 goto out;
1332 }
1333 vnx->vx_pending++;
1334
1335 /* sort it in and start I/O if we are not over our limit */
1336 /* XXXAD locking */
1337 BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
1338 sw_reg_start(sdp);
1339 splx(s);
1340
1341 /*
1342 * advance to the next I/O
1343 */
1344 byteoff += sz;
1345 addr += sz;
1346 }
1347
1348 s = splbio();
1349
1350 out: /* Arrive here at splbio */
1351 vnx->vx_flags &= ~VX_BUSY;
1352 if (vnx->vx_pending == 0) {
1353 error = vnx->vx_error;
1354 pool_put(&vndxfer_pool, vnx);
1355 bp->b_error = error;
1356 biodone(bp);
1357 }
1358 splx(s);
1359 }
1360
1361 /*
1362 * sw_reg_start: start an I/O request on the requested swapdev
1363 *
1364 * => reqs are sorted by b_rawblkno (above)
1365 */
1366 static void
1367 sw_reg_start(struct swapdev *sdp)
1368 {
1369 struct buf *bp;
1370 struct vnode *vp;
1371 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1372
1373 /* recursion control */
1374 if ((sdp->swd_flags & SWF_BUSY) != 0)
1375 return;
1376
1377 sdp->swd_flags |= SWF_BUSY;
1378
1379 while (sdp->swd_active < sdp->swd_maxactive) {
1380 bp = BUFQ_GET(sdp->swd_tab);
1381 if (bp == NULL)
1382 break;
1383 sdp->swd_active++;
1384
1385 UVMHIST_LOG(pdhist,
1386 "sw_reg_start: bp %p vp %p blkno %p cnt %lx",
1387 bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1388 vp = bp->b_vp;
1389 KASSERT(bp->b_objlock == &vp->v_interlock);
1390 if ((bp->b_flags & B_READ) == 0) {
1391 mutex_enter(&vp->v_interlock);
1392 vp->v_numoutput++;
1393 mutex_exit(&vp->v_interlock);
1394 }
1395 VOP_STRATEGY(vp, bp);
1396 }
1397 sdp->swd_flags &= ~SWF_BUSY;
1398 }
1399
1400 /*
1401 * sw_reg_biodone: one of our i/o's has completed
1402 */
1403 static void
1404 sw_reg_biodone(struct buf *bp)
1405 {
1406 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1407 }
1408
1409 /*
1410 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1411 *
1412 * => note that we can recover the vndbuf struct by casting the buf ptr
1413 */
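/*
 * (added note) recovering the vndbuf works because vb_buf is the
 * first member of struct vndbuf and the b_work member used for the
 * workqueue is expected to sit at the start of struct buf, so the
 * struct work * passed in and the struct vndbuf * are the same
 * address; the KASSERT below verifies exactly that.
 */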
1414 static void
1415 sw_reg_iodone(struct work *wk, void *dummy)
1416 {
1417 struct vndbuf *vbp = (void *)wk;
1418 struct vndxfer *vnx = vbp->vb_xfer;
1419 struct buf *pbp = vnx->vx_bp; /* parent buffer */
1420 struct swapdev *sdp = vnx->vx_sdp;
1421 int s, resid, error;
1422 KASSERT(&vbp->vb_buf.b_work == wk);
1423 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1424
1425 UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p",
1426 vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1427 UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx",
1428 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1429
1430 /*
1431 * protect vbp at splbio and update.
1432 */
1433
1434 s = splbio();
1435 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1436 pbp->b_resid -= resid;
1437 vnx->vx_pending--;
1438
1439 if (vbp->vb_buf.b_error != 0) {
1440 /* pass error upward */
1441 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1442 UVMHIST_LOG(pdhist, " got error=%d !", error, 0, 0, 0);
1443 vnx->vx_error = error;
1444 }
1445
1446 /*
1447 * kill vbp structure
1448 */
1449 buf_destroy(&vbp->vb_buf);
1450 pool_put(&vndbuf_pool, vbp);
1451
1452 /*
1453 * wrap up this transaction if it has run to completion or, in
1454 * case of an error, when all auxiliary buffers have returned.
1455 */
1456 if (vnx->vx_error != 0) {
1457 /* pass error upward */
1458 error = vnx->vx_error;
1459 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1460 pbp->b_error = error;
1461 biodone(pbp);
1462 pool_put(&vndxfer_pool, vnx);
1463 }
1464 } else if (pbp->b_resid == 0) {
1465 KASSERT(vnx->vx_pending == 0);
1466 if ((vnx->vx_flags & VX_BUSY) == 0) {
1467 UVMHIST_LOG(pdhist, " iodone error=%d !",
1468 pbp, vnx->vx_error, 0, 0);
1469 biodone(pbp);
1470 pool_put(&vndxfer_pool, vnx);
1471 }
1472 }
1473
1474 /*
1475 * done! start next swapdev I/O if one is pending
1476 */
1477 sdp->swd_active--;
1478 sw_reg_start(sdp);
1479 splx(s);
1480 }
1481
1482
1483 /*
1484 * uvm_swap_alloc: allocate space on swap
1485 *
1486 * => allocation is done "round robin" down the priority list, as we
1487 * allocate in a priority we "rotate" the circle queue.
1488 * => space can be freed with uvm_swap_free
1489 * => we return the page slot number in /dev/drum (0 == invalid slot)
1490 * => we lock uvm_swap_data_lock
1491 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1492 */
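/*
 * hedged usage sketch (based only on the interfaces in this file, not
 * copied from any particular pager; "pps" and "npages" are placeholder
 * names):
 *
 *	int slot, nslots = npages;
 *
 *	slot = uvm_swap_alloc(&nslots, true);	(lessok: may shrink nslots)
 *	if (slot == 0)
 *		... no swap space was available ...
 *	else if (uvm_swap_put(slot, pps, nslots, PGO_SYNCIO) != 0)
 *		uvm_swap_free(slot, nslots);	(give the slots back on error)
 */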
1493 int
1494 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1495 {
1496 struct swapdev *sdp;
1497 struct swappri *spp;
1498 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1499
1500 /*
1501 * no swap devices configured yet? definite failure.
1502 */
1503 if (uvmexp.nswapdev < 1)
1504 return 0;
1505
1506 /*
1507 * lock data lock, convert slots into blocks, and enter loop
1508 */
1509 mutex_enter(&uvm_swap_data_lock);
1510
1511 ReTry: /* XXXMRG */
1512 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1513 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1514 uint64_t result;
1515
1516 /* if it's not enabled, then we can't swap from it */
1517 if ((sdp->swd_flags & SWF_ENABLE) == 0)
1518 continue;
1519 if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1520 continue;
1521 result = blist_alloc(sdp->swd_blist, *nslots);
1522 if (result == BLIST_NONE) {
1523 continue;
1524 }
1525 KASSERT(result < sdp->swd_drumsize);
1526
1527 /*
1528 * successful allocation! now rotate the circleq.
1529 */
1530 CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1531 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1532 sdp->swd_npginuse += *nslots;
1533 uvmexp.swpginuse += *nslots;
1534 mutex_exit(&uvm_swap_data_lock);
1535 /* done! return drum slot number */
1536 UVMHIST_LOG(pdhist,
1537 "success! returning %d slots starting at %d",
1538 *nslots, result + sdp->swd_drumoffset, 0, 0);
1539 return (result + sdp->swd_drumoffset);
1540 }
1541 }
1542
1543 /* XXXMRG: BEGIN HACK */
1544 if (*nslots > 1 && lessok) {
1545 *nslots = 1;
1546 /* XXXMRG: ugh! blist should support this for us */
1547 goto ReTry;
1548 }
1549 /* XXXMRG: END HACK */
1550
1551 mutex_exit(&uvm_swap_data_lock);
1552 return 0;
1553 }
1554
1555 bool
1556 uvm_swapisfull(void)
1557 {
1558 bool rv;
1559
1560 mutex_enter(&uvm_swap_data_lock);
1561 KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1562 rv = (uvmexp.swpgonly >= uvmexp.swpgavail);
1563 mutex_exit(&uvm_swap_data_lock);
1564
1565 return (rv);
1566 }
1567
1568 /*
1569 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1570 *
1571 * => we lock uvm_swap_data_lock
1572 */
1573 void
1574 uvm_swap_markbad(int startslot, int nslots)
1575 {
1576 struct swapdev *sdp;
1577 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1578
1579 mutex_enter(&uvm_swap_data_lock);
1580 sdp = swapdrum_getsdp(startslot);
1581 KASSERT(sdp != NULL);
1582
1583 /*
1584 * we just keep track of how many pages have been marked bad
1585 * in this device, to make everything add up in swap_off().
1586 * we assume here that the range of slots will all be within
1587 * one swap device.
1588 */
1589
1590 KASSERT(uvmexp.swpgonly >= nslots);
1591 uvmexp.swpgonly -= nslots;
1592 sdp->swd_npgbad += nslots;
1593 UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
1594 mutex_exit(&uvm_swap_data_lock);
1595 }
1596
1597 /*
1598 * uvm_swap_free: free swap slots
1599 *
1600 * => this can be all or part of an allocation made by uvm_swap_alloc
1601 * => we lock uvm_swap_data_lock
1602 */
1603 void
1604 uvm_swap_free(int startslot, int nslots)
1605 {
1606 struct swapdev *sdp;
1607 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1608
1609 UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1610 startslot, 0, 0);
1611
1612 /*
1613 * ignore attempts to free the "bad" slot.
1614 */
1615
1616 if (startslot == SWSLOT_BAD) {
1617 return;
1618 }
1619
1620 /*
1621 	 * convert drum slot offset back to sdp, free the blocks
1622 	 * in the blist, and return. must hold uvm_swap_data_lock to
1623 	 * do the lookup and access the blist.
1624 */
1625
1626 mutex_enter(&uvm_swap_data_lock);
1627 sdp = swapdrum_getsdp(startslot);
1628 KASSERT(uvmexp.nswapdev >= 1);
1629 KASSERT(sdp != NULL);
1630 KASSERT(sdp->swd_npginuse >= nslots);
1631 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1632 sdp->swd_npginuse -= nslots;
1633 uvmexp.swpginuse -= nslots;
1634 mutex_exit(&uvm_swap_data_lock);
1635 }
1636
1637 /*
1638 * uvm_swap_put: put any number of pages into a contig place on swap
1639 *
1640 * => can be sync or async
1641 */
1642
1643 int
1644 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1645 {
1646 int error;
1647
1648 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1649 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1650 return error;
1651 }
1652
1653 /*
1654 * uvm_swap_get: get a single page from swap
1655 *
1656 * => usually a sync op (from fault)
1657 */
1658
1659 int
1660 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1661 {
1662 int error;
1663
1664 uvmexp.nswget++;
1665 KASSERT(flags & PGO_SYNCIO);
1666 if (swslot == SWSLOT_BAD) {
1667 return EIO;
1668 }
1669
1670 error = uvm_swap_io(&page, swslot, 1, B_READ |
1671 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1672 if (error == 0) {
1673
1674 /*
1675 * this page is no longer only in swap.
1676 */
1677
1678 mutex_enter(&uvm_swap_data_lock);
1679 KASSERT(uvmexp.swpgonly > 0);
1680 uvmexp.swpgonly--;
1681 mutex_exit(&uvm_swap_data_lock);
1682 }
1683 return error;
1684 }
1685
1686 /*
1687 * uvm_swap_io: do an i/o operation to swap
1688 */
1689
1690 static int
1691 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1692 {
1693 daddr_t startblk;
1694 struct buf *bp;
1695 vaddr_t kva;
1696 int error, mapinflags;
1697 bool write, async;
1698 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1699
1700 UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1701 startslot, npages, flags, 0);
1702
1703 write = (flags & B_READ) == 0;
1704 async = (flags & B_ASYNC) != 0;
1705
1706 /*
1707 * allocate a buf for the i/o.
1708 */
1709
1710 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1711 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1712 if (bp == NULL) {
1713 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1714 return ENOMEM;
1715 }
1716
1717 /*
1718 * convert starting drum slot to block number
1719 */
1720
1721 startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1722
1723 /*
1724 * first, map the pages into the kernel.
1725 */
1726
1727 mapinflags = !write ?
1728 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1729 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1730 kva = uvm_pagermapin(pps, npages, mapinflags);
1731
1732 /*
1733 * fill in the bp/sbp. we currently route our i/o through
1734 * /dev/drum's vnode [swapdev_vp].
1735 */
1736
1737 bp->b_cflags = BC_BUSY | BC_NOCACHE;
1738 bp->b_flags = (flags & (B_READ|B_ASYNC));
1739 bp->b_proc = &proc0; /* XXX */
1740 bp->b_vnbufs.le_next = NOLIST;
1741 bp->b_data = (void *)kva;
1742 bp->b_blkno = startblk;
1743 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1744
1745 /*
1746 * bump v_numoutput (counter of number of active outputs).
1747 */
1748
1749 if (write) {
1750 mutex_enter(&swapdev_vp->v_interlock);
1751 swapdev_vp->v_numoutput++;
1752 mutex_exit(&swapdev_vp->v_interlock);
1753 }
1754
1755 /*
1756 * for async ops we must set up the iodone handler.
1757 */
1758
1759 if (async) {
1760 bp->b_iodone = uvm_aio_biodone;
1761 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1762 if (curlwp == uvm.pagedaemon_lwp)
1763 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1764 else
1765 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1766 } else {
1767 bp->b_iodone = NULL;
1768 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1769 }
1770 UVMHIST_LOG(pdhist,
1771 "about to start io: data = %p blkno = 0x%x, bcount = %ld",
1772 bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1773
1774 /*
1775 * now we start the I/O, and if async, return.
1776 */
1777
1778 VOP_STRATEGY(swapdev_vp, bp);
1779 if (async)
1780 return 0;
1781
1782 /*
1783 * must be sync i/o. wait for it to finish
1784 */
1785
1786 error = biowait(bp);
1787
1788 /*
1789 * kill the pager mapping
1790 */
1791
1792 uvm_pagermapout(kva, npages);
1793
1794 /*
1795 * now dispose of the buf and we're done.
1796 */
1797
1798 if (write) {
1799 mutex_enter(&swapdev_vp->v_interlock);
1800 vwakeup(bp);
1801 mutex_exit(&swapdev_vp->v_interlock);
1802 }
1803 putiobuf(bp);
1804 UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, 0, 0, 0);
1805
1806 return (error);
1807 }
1808