/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodesc" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
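
/*
 * usage sketch (userland, not part of this file; "/swapfile" is a
 * hypothetical path): enable swapping at priority 0, then fetch the
 * current configuration:
 *
 *	swapctl(SWAP_ON, "/swapfile", 0);
 *	struct swapent se[8];
 *	int n = swapctl(SWAP_STATS, se, 8);
 */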

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf_queue	swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
	struct ucred		*swd_cred;	/* cred for file access */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf, swapbuffer plus async i/o info
 */
struct swapbuf {
	struct buf sw_buf;		/* a buffer structure */
	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool *vndxfer_pool;
struct pool *vndbuf_pool;

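/*
 * the pools are also drained from i/o completion context (sw_reg_iodone
 * runs at splbio via biodone), which is why the "get" macros below raise
 * splbio around pool_get().
 */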
#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(vndbuf_pool, (void *)(vbp));				\
}

/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;		/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool *swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	swapbuf_pool =
		pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
			    NULL, NULL, 0);
	if (swapbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/* XXX - set a maximum on swapbuf_pool? */

	vndxfer_pool =
		pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
			    NULL, NULL, 0);
	if (vndxfer_pool == NULL)
		panic("swapinit: pool_create failed");

	vndbuf_pool =
		pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
			    NULL, NULL, 0);
	if (vndbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
						       sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int	npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
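
/*
 * worked example (illustrative numbers only): if a swapdev was added with
 * swd_drumoffset = 1 and swd_drumsize = 16383, then drum page 100 belongs
 * to it and maps to page 99 on the device itself (see the
 * "pageno -= sdp->swd_drumoffset" step in swstrategy below).
 */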


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		     spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				/*
				 * backwards compatibility for system call.
				 * note that we use 'struct oswapent' as an
				 * overlay into both 'struct swapdev' and
				 * the userland 'struct swapent', as we
				 * want to retain backwards compatibility
				 * with NetBSD 1.3.
				 */
				sdp->swd_ose.ose_inuse =
				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
				error = copyout(&sdp->swd_ose, sep,
				    sizeof(struct oswapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					((struct oswapent *)sep)++;
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			goto out;
		}
		dumpdev = vp->v_rdev;

		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		BUFQ_INIT(&sdp->swd_tab);

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		if ((error = swap_off(p, sdp)) != 0)
			goto out;

		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
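
	/*
	 * worked example (illustrative, assumes 512-byte disk blocks and
	 * 4k pages): a 64MB block-device swap has nblocks = 131072, so
	 * npages = 16384; we skip page 0 for the disklabel, leaving
	 * addr = 1 and size = 16383 usable pages.
	 */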

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * add anons to reflect the new swap space
	 */
	uvm_anon_add(size);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	void *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}

#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse != sdp->swd_npgbad) {
		panic("swap_off: sdp %p - %d pages still in use (%d bad)\n",
		      sdp, sdp->swd_npginuse, sdp->swd_npgbad);
	}
#endif

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list\n");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	name = (void *)sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free(sdp, M_VMSWAP);
	simple_unlock(&uvm.swap_data_lock);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn, byteoff;
	caddr_t		addr;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.  this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_FIRST(&sdp->swd_tab);
		if (bp == NULL)
			break;
		BUFQ_REMOVE(&sdp->swd_tab, bp);
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int		s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode (if any).
	 */
	if (vbp->vb_buf.b_vp != NULLVP) {
		brelvp(&vbp->vb_buf);
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d",vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}
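
/*
 * usage sketch (assumed caller, not part of this file): a pager that
 * wants a cluster of slots but will settle for fewer if the drum is
 * fragmented might do:
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);	(TRUE == lessok)
 *	if (slot == 0)
 *		(no swap space available)
 */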

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */
	sdp->swd_npgbad += nslots;

	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address\n");
	}
#endif
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct	swapbuf *sbp;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).  note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WRITE;
	if ((flags & B_ASYNC) == 0)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
		? 0
		: PR_WAITOK;
	sbp = pool_get(swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	s = splbio();
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	splx(s);
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages << PAGE_SHIFT;
	LIST_INIT(&bp->b_dep);

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.  wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	pool_put(swapbuf_pool, sbp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int	s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode.
	 */
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
	int lcv, s;
	vaddr_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	pool_put(swapbuf_pool, sbp);
	splx(s);
}