/*	$NetBSD: uvm_swap.c,v 1.27.4.1 1999/06/07 04:25:38 chs Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <vm/vm.h>
#include <vm/vm_conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
84 * "aiodone" [we want to avoid malloc()'ing anything at swapout time
85 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
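
/*
 * for reference, a hedged sketch of how userland might drive the
 * interface above (not part of this file; "n" and "sep" are
 * illustrative names):
 *
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = malloc(n * sizeof(*sep));
 *	if (swapctl(SWAP_STATS, (void *)sep, n) == -1)
 *		err(1, "swapctl");
 *	(void)swapctl(SWAP_ON, "/dev/sd0b", 0);	(priority 0 in "misc")
 */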

/*
 * SWAP_TO_FILES: allows swapping to plain files.
 */

#define SWAP_TO_FILES

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 *    swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 *    swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

#ifdef SWAP_TO_FILES
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf		swd_tab;	/* buffer list */
	struct ucred		*swd_cred;	/* cred for file access */
#endif
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
				/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf, swapbuffer plus async i/o info
 */
struct swapbuf {
	struct buf sw_buf;		/* a buffer structure */
	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool *vndxfer_pool;
struct pool *vndbuf_pool;

#define	getvndxfer(vnx)	do {					\
	int s = splbio();					\
	vnx = (struct vndxfer *)				\
		pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK);	\
	splx(s);						\
} while (0)

#define putvndxfer(vnx) {					\
	pool_put(vndxfer_pool, (void *)(vnx));			\
}

#define	getvndbuf(vbp)	do {					\
	int s = splbio();					\
	vbp = (struct vndbuf *)					\
		pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK);	\
	splx(s);						\
} while (0)

#define putvndbuf(vbp) {					\
	pool_put(vndbuf_pool, (void *)(vbp));			\
}
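
/*
 * usage sketch (hedged): these always come in get/put pairs, e.g. in
 * sw_reg_strategy() below:
 *
 *	struct vndbuf *nbp;
 *
 *	getvndbuf(nbp);		(PR_WAITOK: may sleep)
 *	... fill in nbp->vb_buf and queue it with disksort() ...
 *
 * and later, from sw_reg_iodone() at splbio:
 *
 *	putvndbuf(nbp);
 */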


/*
 * local variables
 */
static struct extent *swapmap;	/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool *swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int		 swap_on __P((struct proc *, struct swapdev *));
static int		 swap_off __P((struct proc *, struct swapdev *));

#ifdef SWAP_TO_FILES
static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));
#endif

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	swapbuf_pool =
		pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
			    NULL, NULL, 0);
	if (swapbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/* XXX - set a maximum on swapbuf_pool? */

	vndxfer_pool =
		pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
			    NULL, NULL, 0);
	if (vndxfer_pool == NULL)
		panic("swapinit: pool_create failed");

	vndbuf_pool =
		pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
			    NULL, NULL, 0);
	if (vndbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d", priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;

	/*
	 * done!
	 */
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) {
		nextspp = spp->spi_swappri.le_next;
		if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free((caddr_t)spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int	npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next)
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
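
/*
 * drum layout example (hedged illustration, not code): since each
 * swapdev gets one contiguous chunk of the swapmap extent (which
 * starts at page 1; slot 0 means "no allocation"), two swapdevs of
 * 100 and 200 pages might end up as
 *
 *	sdp1: swd_drumoffset = 1,   swd_drumsize = 100  => pages 1..100
 *	sdp2: swd_drumoffset = 101, swd_drumsize = 200  => pages 101..300
 *
 * so swapdrum_getsdp(150) would return sdp2.
 */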


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = swap_priority.lh_first; spp != NULL;
		     spp = spp->spi_swappri.le_next) {
			for (sdp = spp->spi_swapdev.cqh_first;
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = sdp->swd_next.cqe_next) {
				/*
				 * backwards compatibility for system call.
				 * note that we use 'struct oswapent' as an
				 * overlay into both 'struct swapdev' and
				 * the userland 'struct swapent', as we
				 * want to retain backwards compatibility
				 * with NetBSD 1.3.
				 */
				sdp->swd_ose.ose_inuse =
				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
				error = copyout((caddr_t)&sdp->swd_ose,
				    (caddr_t)sep, sizeof(struct oswapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout((caddr_t)sdp->swd_path,
					    (caddr_t)&sep->se_path,
					    sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					((struct oswapent *)sep)++;
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			goto out;
		}
		dumpdev = vp->v_rdev;

		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:
		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = (struct swapdev *)
		    malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
#ifdef SWAP_TO_FILES
		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG)
			sdp->swd_cred = crdup(p->p_ucred);
#endif
		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");
		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */
		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
#ifdef SWAP_TO_FILES
			if (vp->v_type == VREG)
				crfree(sdp->swd_cred);
#endif
			free(sdp->swd_path, M_VMSWAP);
			free((caddr_t)sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!   now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
#if 1
		/*
		 * find the entry of interest and ensure it is enabled.
		 */
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}
		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		/* XXXCDC: should we call with list locked or unlocked? */
		if ((error = swap_off(p, sdp)) != 0)
			/* XXXCDC: might need relock here */
			goto out;

		/*
		 * now we can kill the entry.
		 */
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENXIO;
			break;
		}
		simple_unlock(&uvm.swap_data_lock);
		free((caddr_t)sdp, M_VMSWAP);
#else
		error = EINVAL;
#endif
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
#ifdef SWAP_TO_FILES
	struct vattr va;
#endif
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

#ifdef SWAP_TO_FILES
	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;
#endif

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (randomly choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
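
	/*
	 * worked example (hedged, assuming PAGE_SIZE of 4096 and
	 * DEV_BSIZE of 512): a 32MB block device reports nblocks =
	 * 65536, so npages = 8192; we then use pages 1..8191
	 * (size = 8191, addr = 1), sacrificing page 0 to the label.
	 */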

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
		sdp->swd_npginuse += addr;
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpginuse += addr;
		uvmexp.swpgonly += addr;
		simple_unlock(&uvm.swap_data_lock);
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > npages)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
		    rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_npginuse += (rootpages - addr);
		uvmexp.swpginuse += (rootpages - addr);
		uvmexp.swpgonly += (rootpages - addr);
		simple_unlock(&uvm.swap_data_lock);

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add anons to reflect the new swap space
	 */
	uvm_anon_add(size);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	simple_unlock(&uvm.swap_data_lock);
	uvmexp.swpages += npages;

#if 0
	/*
	 * At this point we could arrange to reserve memory for the
	 * swap buffer pools.
	 *
	 * I don't think this is necessary, since swapping starts well
	 * ahead of serious memory deprivation and the memory resource
	 * pools hold on to actively used memory. This should ensure
	 * we always have some resources to continue operation.
	 */

	int s = splbio();
	int n = 8 * sdp->swd_maxactive;

	(void)pool_prime(swapbuf_pool, n, 0);

	if (vp->v_type == VREG) {
		/* Allocate additional vnx and vnd buffers */
		/*
		 * Allocation Policy:
		 *	(8  * swd_maxactive) vnx headers per swap dev
		 *	(16 * swd_maxactive) vnd buffers per swap dev
		 */

		n = 8 * sdp->swd_maxactive;
		(void)pool_prime(vndxfer_pool, n, 0);

		n = 16 * sdp->swd_maxactive;
		(void)pool_prime(vndbuf_pool, n, 0);
	}
	splx(s);
#endif

	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	return 0;
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */
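
	/*
	 * e.g. (hedged, 4096-byte pages, 512-byte disk blocks): an i/o
	 * to drum page 150 on a swapdev with swd_drumoffset 101 becomes
	 * swapdev page 49, i.e. disk block 49 * 8 = 392.
	 */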

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;
#ifdef SWAP_TO_FILES
	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
#endif
	}
	/* NOTREACHED */
}

#ifdef SWAP_TO_FILES
/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn, byteoff;
	caddr_t		addr;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && (long)nbn == -1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;
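
		/*
		 * e.g. (hedged): with swd_bsize 8192 and byteoff 12288,
		 * off = 4096 so sz = 4096 (finish the partial block);
		 * at a block boundary with nra = 1, sz = 16384, clamped
		 * to resid if the request is smaller.
		 */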

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
#if 0
		nbp->vb_buf.b_bufsize  = bp->b_bufsize; /* XXXCDC: really? */
#endif
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * drop "hold" reference to vnode (if one)
	 * XXXCDC: always set to NULLVP, this is useless, right?
	 */
	if (vbp->vb_buf.b_vp != NULLVP)
		brelvp(&vbp->vb_buf);

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d", vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);

	splx(s);
}
#endif /* SWAP_TO_FILES */


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}
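
/*
 * usage sketch (hedged, illustrative only): a pager-side caller
 * pairs alloc/free roughly like this:
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);
 *	if (slot == 0)
 *		...no swap space available...
 *	...i/o via uvm_swap_put(slot, pps, nslots, flags)...
 *	uvm_swap_free(slot, nslots);
 *
 * note nslots is IN/OUT: with lessok == TRUE it may come back as 1.
 */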

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);
	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address\n");
	}
#endif
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT) != 0)
		printf("warning: resource shortage: %d slots of swap lost\n",
		    nslots);

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

#if 0
	flags |= PGO_SYNCIO;	/* XXXMRG: tmp, force sync */
#endif

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int	startslot, npages, flags;
{
	daddr_t startblk;
	struct	swapbuf *sbp;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, waitf, pflag;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);
	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot << PAGE_SHIFT);
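
	/*
	 * e.g. (hedged, 4096-byte pages, 512-byte disk blocks):
	 * startslot 5 gives startblk = btodb(5 << 12) = 40.
	 */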

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).   note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, waitf);
	if (kva == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
	    ? 0
	    : PR_WAITOK;
	sbp = pool_get(swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	s = splbio();
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	splx(s);
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		/* XXX pagedaemon */
		sbp->sw_aio.flags = (curproc == uvm.pagedaemon_proc) ?
		    UVM_AIO_PAGEDAEMON : 0;
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.   wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	pool_put(swapbuf_pool, sbp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int	s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode.
	 */
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.aiodoned_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.aiodoned_lock);

	wakeup(&uvm.aiodoned);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
	int lcv, s;
	vaddr_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST, 0);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	pool_put(swapbuf_pool, sbp);
	splx(s);
}