/*	$NetBSD: uvm_swap.c,v 1.39 2000/11/13 14:50:55 chs Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.   each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodesc" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
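
/*
 * illustrative sketch (not part of the original code): from userland
 * the operations above are reached through swapctl(2).  enabling a
 * swap file at priority 0 might look like this (the path "/swapfile"
 * is hypothetical):
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *	#include <sys/swap.h>
 *
 *	if (swapctl(SWAP_ON, "/swapfile", 0) == -1)
 *		err(1, "swapctl(SWAP_ON)");
 *
 * "arg" carries the pathname and "misc" carries the priority, exactly
 * as described in [3] above.
 */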

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf_queue	swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
	struct ucred		*swd_cred;	/* cred for file access */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf, swapbuffer plus async i/o info
 */
struct swapbuf {
	struct buf sw_buf;		/* a buffer structure */
	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool *vndxfer_pool;
struct pool *vndbuf_pool;

#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(vndbuf_pool, (void *)(vbp));				\
}
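
/*
 * note: the get* macros above raise to splbio() around pool_get()
 * because these pools are also touched from i/o completion context
 * (sw_reg_iodone runs at splbio and calls putvndbuf/putvndxfer).
 */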

/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;	/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool *swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	swapbuf_pool =
	    pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
			NULL, NULL, 0);
	if (swapbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/* XXX - set a maximum on swapbuf_pool? */

	vndxfer_pool =
	    pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
			NULL, NULL, 0);
	if (vndxfer_pool == NULL)
		panic("swapinit: pool_create failed");

	vndbuf_pool =
	    pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
			NULL, NULL, 0);
	if (vndbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
						       sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int	npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
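
/*
 * example of the drum layout implied above (sizes hypothetical):
 * if device A is added first with 1024 pages and device B second with
 * 2048 pages, A covers drum pages [1, 1024] and B covers [1025, 3072]
 * (slot 0 is reserved), so swapdrum_getsdp(2000) returns B.
 */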


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		     spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				/*
				 * backwards compatibility for system call.
				 * note that we use 'struct oswapent' as an
				 * overlay into both 'struct swapdev' and
				 * the userland 'struct swapent', as we
				 * want to retain backwards compatibility
				 * with NetBSD 1.3.
				 */
				sdp->swd_ose.ose_inuse =
				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
				error = copyout(&sdp->swd_ose, sep,
				    sizeof(struct oswapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					((struct oswapent *)sep)++;
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch (SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			goto out;
		}
		dumpdev = vp->v_rdev;

		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		BUFQ_INIT(&sdp->swd_tab);

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		if ((error = swap_off(p, sdp)) != 0)
			goto out;

		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
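
	/*
	 * worked example (hypothetical sizes): a 32MB block device with
	 * 4KB pages gives npages = 8192; we skip page 0 to protect the
	 * disklabel, so addr = 1 and size = 8191 usable pages.
	 */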

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * add anons to reflect the new swap space
	 */
	uvm_anon_add(size);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	void *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev, 0, 0, 0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}

#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse != sdp->swd_npgbad) {
		panic("swap_off: sdp %p - %d pages still in use (%d bad)\n",
		      sdp, sdp->swd_npginuse, sdp->swd_npgbad);
	}
#endif

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list\n");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	name = (void *)sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free(sdp, M_VMSWAP);
	simple_unlock(&uvm.swap_data_lock);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */
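
	/*
	 * e.g. with 4KB pages and 512-byte disk blocks (values here are
	 * illustrative): drum page 3 on a swapdev with swd_drumoffset 1
	 * becomes swapdev page 2, i.e. disk block btodb(2 << 12) = 16.
	 */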

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn, byteoff;
	caddr_t		addr;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;

		UVMHIST_LOG(pdhist,
		    "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_FIRST(&sdp->swd_tab);
		if (bp == NULL)
			break;
		BUFQ_REMOVE(&sdp->swd_tab, bp);
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int		s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode (if any).
	 */
	if (vbp->vb_buf.b_vp != NULLVP) {
		brelvp(&vbp->vb_buf);
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d",
			    vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */
	sdp->swd_npgbad += nslots;

	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address\n");
	}
#endif
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
		    nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct swapbuf *sbp;
	struct buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).  note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WRITE;
	if ((flags & B_ASYNC) == 0)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
	    ? 0
	    : PR_WAITOK;
	sbp = pool_get(swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	s = splbio();
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	splx(s);
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages << PAGE_SHIFT;
	LIST_INIT(&bp->b_dep);

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.   wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	pool_put(swapbuf_pool, sbp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync) result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int	s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode.
	 */
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
	int	lcv, s;
	vaddr_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0; lcv < aio->npages;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	pool_put(swapbuf_pool, sbp);
	splx(s);
}