/*	$NetBSD: uvm_swap.c,v 1.7 1998/03/01 02:25:28 fvdl Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <vm/vm_swap.h>
#include <vm/vm_conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * the information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodesc" structure [we want to avoid malloc()'ing anything at
 * swapout time since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
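 *
 *  note on lock ordering (as derived from sys_swapctl below): when both
 *  locks are needed, swap_syscall_lock is taken before swap_data_lock,
 *  and swap_data_lock is dropped before anything that may sleep
 *  (e.g. copyout, swap_on).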
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
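
/*
 * an illustrative userland use of the interface above -- a sketch only,
 * not part of this file; it assumes the swapctl(2) prototype from the
 * system headers and a hypothetical /dev/sd0b swap partition:
 *
 *	struct swapent se[8];
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);	   -- number of devices
 *	n = swapctl(SWAP_STATS, (void *)se, 8);	   -- fill in up to 8 entries
 *	(void)swapctl(SWAP_ON, "/dev/sd0b", 0);	   -- enable at priority 0
 */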

/*
 * SWAP_TO_FILES: allows swapping to plain files.
 */

#define	SWAP_TO_FILES

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct swapent	swd_se;			/* swap entry struct */
#define	swd_dev		swd_se.se_dev		/* dev_t for this dev */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
	/* also: swd_se.se_nblks, swd_se.se_inuse */
	int		swd_npages;	/* #pages we can use */
	int		swd_npginuse;	/* #pages in use */
	int		swd_drumoffset;	/* page0 offset in drum */
	int		swd_drumsize;	/* #pages in drum */
	struct extent	*swd_ex;	/* extent for this swapdev */
	struct vnode	*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

#ifdef SWAP_TO_FILES
	int		swd_bsize;	/* blocksize (bytes) */
	int		swd_maxactive;	/* max active i/o reqs */
	struct buf	swd_tab;	/* buffer list */
	struct ucred	*swd_cred;	/* cred for file access */
#endif
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
				/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf: a swap buffer plus async i/o info
 */
struct swapbuf {
	struct buf		sw_buf;	/* a buffer structure */
	struct uvm_aiodesc	sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf)	sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define	VX_BUSY		1
#define	VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * XXX: Not a very good idea in a swap strategy module!
 */
#define	getvndxfer()	\
	((struct vndxfer *)malloc(sizeof(struct vndxfer), M_DEVBUF, M_WAITOK))

#define	putvndxfer(vnx)	\
	free((caddr_t)(vnx), M_DEVBUF)

#define	getvndbuf()	\
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))

#define	putvndbuf(vbp)	\
	free((caddr_t)(vbp), M_DEVBUF)

/*
 * local variables
 */
static struct extent *swapmap;		/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
static struct swapbufhead freesbufs;	/* list of free swapbufs */
static int sbufs_wanted = 0;		/* someone sleeping for swapbufs? */
static simple_lock_data_t swap_buf_lock; /* locks freesbufs and sbufs_wanted */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;
static simple_lock_data_t swap_data_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int		 swap_on __P((struct proc *, struct swapdev *));
#ifdef SWAP_OFF_WORKS
static int		 swap_off __P((struct proc *, struct swapdev *));
#endif

#ifdef SWAP_TO_FILES
static void		 sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void		 sw_reg_iodone __P((struct buf *));
static void		 sw_reg_start __P((struct swapdev *));
#endif

static void		 uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void		 uvm_swap_bufdone __P((struct buf *));
static int		 uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	struct swapbuf *sp;
	struct proc *p = &proc0;	/* XXX */
	int i;
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&swap_data_lock);
	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	sp = malloc(sizeof(*sp) * nswbuf, M_VMSWAP, M_NOWAIT);
	if (sp == NULL)
		panic("uvm_swap_init: unable to malloc swap bufs");
	bzero(sp, sizeof(*sp) * nswbuf);
	SIMPLEQ_INIT(&freesbufs);
	simple_lock_init(&swap_buf_lock);

	/* build free list */
	for (i = 0 ; i < nswbuf ; i++, sp++) {
		/* p == proc0 */
		sp->sw_buf.b_rcred = sp->sw_buf.b_wcred = p->p_ucred;
		sp->sw_buf.b_vnbufs.le_next = NOLIST;
		SIMPLEQ_INSERT_HEAD(&freesbufs, sp, sw_sq);
	}
	printf("uvm_swap: allocated %d swap buffer headers\n", nswbuf);

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d", priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;

	/*
	 * done!
	 */
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) {
		nextspp = spp->spi_swappri.le_next;
		if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free((caddr_t)spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}
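
/*
 * drum layout example (illustrative numbers only): if device A is added
 * first with npages == 100 it gets drum pages [1,100] (drum page 0 is
 * reserved, see uvm_swap_init), and a device B added next with
 * npages == 50 gets [101,150].  swapdrum_getsdp(120) would then return B,
 * since each swapdev owns one contiguous chunk of the drum.
 */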

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next)
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	int count, error, misc;
	int priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, 0, 0, 0);
		*retval = uvmexp.nswapdev;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0);
		return (0);
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = swap_priority.lh_first; spp != NULL;
		     spp = spp->spi_swappri.le_next) {
			for (sdp = spp->spi_swapdev.cqh_first;
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = sdp->swd_next.cqe_next) {
				/* backwards compatibility for system call */
				sdp->swd_se.se_inuse =
				    btodb(sdp->swd_npginuse * PAGE_SIZE);
				error = copyout((caddr_t)&sdp->swd_se,
				    (caddr_t)sep, sizeof(struct swapent));
				if (error) {
					lockmgr(&swap_syscall_lock,
					    LK_RELEASE, (void *)0);
					return (error);
				}
				count++;
				sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0);
		return (0);
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag))) {
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0);
		return (error);
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * the miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0);
			return (EBUSY);
		}
	} else {
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_USERSPACE,
		    SCARG(uap, arg), p);
		if ((error = namei(&nd))) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0);
			return (error);
		}
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:
		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&swap_data_lock);
			goto bad;
		}
		sdp = (struct swapdev *)
		    malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		bzero(sdp, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
#ifdef SWAP_TO_FILES
		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG)
			sdp->swd_cred = crdup(p->p_ucred);
#endif
		swaplist_insert(sdp, spp, priority);
		simple_unlock(&swap_data_lock);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */
		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&swap_data_lock);
#ifdef SWAP_TO_FILES
			if (vp->v_type == VREG)
				crfree(sdp->swd_cred);
#endif
			free((caddr_t)sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!  now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
		UVMHIST_LOG(pdhist, "someone is using SWAP_OFF...??", 0,0,0,0);
#ifdef SWAP_OFF_WORKS
		/*
		 * find the entry of interest and ensure it is enabled.
		 */
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&swap_data_lock);
			error = ENXIO;
			break;
		}
		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&swap_data_lock);
			error = EBUSY;
			goto bad;
		}
		/* XXXCDC: should we call with list locked or unlocked? */
		if ((error = swap_off(p, sdp)) != 0)
			goto bad;
		/* XXXCDC: might need relock here */

		/*
		 * now we can kill the entry.
		 */
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENXIO;
			break;
		}
		simple_unlock(&swap_data_lock);
		free((caddr_t)sdp, M_VMSWAP);
#else
		error = EINVAL;
#endif
		break;

	default:
		UVMHIST_LOG(pdhist, "unhandled command: %#x",
		    SCARG(uap, cmd), 0, 0, 0);
		error = EINVAL;
	}

bad:
	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
	lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	char *storage;
	int storagesize;
#ifdef SWAP_TO_FILES
	struct vattr va;
#endif
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

#ifdef SWAP_TO_FILES
	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;
#endif

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_se.se_nblks = nblocks;
	npages = dbtob(nblocks) / PAGE_SIZE;
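	/*
	 * conversion example (illustrative, assuming DEV_BSIZE is 512 and
	 * PAGE_SIZE is 4096): dbtob() turns DEV_BSIZE blocks into bytes, so
	 * nblocks == 65536 gives 32 MB, i.e. npages == 8192 (8 blocks/page).
	 */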

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
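	/*
	 * e.g. (continuing the illustrative numbers above): a block device
	 * with npages == 8192 yields size == 8191 usable pages, with page 0
	 * of the device held back to cover the disklabel/bootblocks.
	 */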

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/*
	 * XXXCDC: what should we make of this extent storage size stuff
	 *
	 * XXXMRG: well, i've come to realise that we need, at most,
	 * blocks2pages(npages)/2 extents (or so), to cover all possible
	 * allocations that may occur in the extent -- every other page
	 * being allocated.
	 */
#if 1
	storagesize = EXTENT_FIXED_STORAGE_SIZE(maxproc * 2);
#else
	/* XXXMRG: this uses lots of memory */
	storagesize = EXTENT_FIXED_STORAGE_SIZE(npages / 2);
#endif
	storage = malloc(storagesize, M_VMSWAP, M_WAITOK);
	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
	    storage, storagesize, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
		sdp->swd_npginuse += addr;
		uvmexp.swpginuse += addr;
	}


	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) / PAGE_SIZE;
		if (rootpages > npages)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
		    rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		sdp->swd_npginuse += (rootpages - addr);
		uvmexp.swpginuse += (rootpages - addr);

		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size - rootpages);
	}

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = npages;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	simple_unlock(&swap_data_lock);
	uvmexp.swpages += npages;

	/*
	 * add anon's to reflect the swap space we added
	 */
	uvm_anon_add(size);

	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

#ifdef SWAP_OFF_WORKS
/*
 * swap_off: stop swapping on swapdev
 *
 * XXXCDC: what conditions go here?
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	char *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);

	/* turn off the enable flag */
	sdp->swd_flags &= ~SWF_ENABLE;

	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev, 0, 0, 0);

	/*
	 * XXX write me
	 *
	 * the idea is to find out which processes are using this swap
	 * device, and page them all in.
	 *
	 * eventually, we should try to move them out to other swap areas
	 * if available.
	 *
	 * The alternative is to create a redirection map for this swap
	 * device.  This should work by moving all the pages of data from
	 * the ex-swap device to another one, and making an entry in the
	 * redirection map for it.  locking is going to be important for
	 * this!
	 *
	 * XXXCDC: also need to shrink anon pool
	 */

	/* until the above code is written, we must ENODEV */
	return ENODEV;

	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	name = sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free((caddr_t)sdp->swd_ex, M_VMSWAP);
	if (sdp->swd_vp != rootvp)
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	if (sdp->swd_vp)
		vrele(sdp->swd_vp);
	free((caddr_t)sdp, M_VMSWAP);
	return (0);
}
#endif

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno;
	int bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) / PAGE_SIZE;
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno = pageno - sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno * PAGE_SIZE);		/* convert to diskblock */
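	/*
	 * e.g. (illustrative, assuming DEV_BSIZE 512 / PAGE_SIZE 4096 and a
	 * swapdev with swd_drumoffset == 101): drum page 120 becomes page 19
	 * of the swapdev, i.e. device block 19 * 8 == 152.
	 */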

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);


	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			int s = splbio();
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
			splx(s);
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		VOP_STRATEGY(bp);
		return;
#ifdef SWAP_TO_FILES
	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
#endif
	}
	/* NOTREACHED */
}

#ifdef SWAP_TO_FILES
/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev *sdp;
	struct buf *bp;
	int bn;
{
	struct vnode *vp;
	struct vndxfer *vnx;
	daddr_t nbn;
	caddr_t addr;
	int byteoff, s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = getvndxfer();
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);		/* XXX: should it be an off_t? */

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf *nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && (long)nbn == -1)
			error = EIO;	/* failure */

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;
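		/*
		 * sz example (illustrative numbers): with swd_bsize == 8192,
		 * byteoff == 12288 gives off == 4096, so sz == 4096 (finish
		 * the partial fs block); at byteoff == 16384 with nra == 1,
		 * sz == 16384 (two whole fs blocks), clipped to resid.
		 */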

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p bn 0x%x/0x%x",
		    sdp->swd_vp, vp, bn, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = getvndbuf();
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = bp->b_bufsize;	/* XXXCDC: really? */
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;

		/*
		 * set b_dirtyoff/end and b_validoff/end.  this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O.  note that we must advance
		 * "byteoff" here (not "bn"), since the VOP_BMAP lookup
		 * at the top of the loop is keyed off of it.
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev *sdp;
{
	struct buf *bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev *sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * drop "hold" reference to vnode (if one)
	 * XXXCDC: always set to NULLVP, this is useless, right?
	 */
	if (vbp->vb_buf.b_vp != NULLVP)
		brelvp(&vbp->vb_buf);

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d", vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);

	splx(s);
}
#endif /* SWAP_TO_FILES */


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
#if 0
			{
				struct swapdev *sdp2;

				sdp2 = swapdrum_getsdp(result + sdp->swd_drumoffset);
				if (sdp2 == NULL) {
					printf("uvm_swap_alloc: nslots=%d, dev=%x, drumoff=%d, result=%ld",
					    *nslots, sdp->swd_dev, sdp->swd_drumoffset, result);
					panic("uvm_swap_alloc: allocating unmapped swap block!");
				}
			}
#endif
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&swap_data_lock);
	return 0;		/* failed */
}
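
/*
 * usage sketch for the alloc/free pair above (illustrative only; the
 * real callers live in the anon/pager layers):
 *
 *	int nslots = 4;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);   -- may shrink nslots
 *
 *	slot == 0 means no swap space was available; otherwise the caller
 *	does i/o on drum slots [slot, slot + nslots - 1] and eventually
 *	calls uvm_swap_free(slot, nslots).
 */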

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);
	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address");
	}
#endif
	extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int npages;
	int flags;
{
	int result;

#if 0
	flags |= PGO_SYNCIO;	/* XXXMRG: tmp, force sync */
#endif

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct swapbuf *sbp;
	struct buf *bp;
	vm_offset_t kva;
	int result, s, waitf;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);
	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot * PAGE_SIZE);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).  note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, waitf);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);

	/* never put the pagedaemon to sleep! */
	if ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc) {

		sbp = freesbufs.sqh_first;

	} else {

		/* we can sleep for a sbuf if needed */
		while (freesbufs.sqh_first == NULL) {

			sbufs_wanted = 1;
			UVM_UNLOCK_AND_WAIT(&freesbufs, &swap_buf_lock, 0,
			    "uvmswiobuf", 0);

			simple_lock(&swap_buf_lock);	/* relock */
		}
		sbp = freesbufs.sqh_first;
	}

	if (sbp)
		SIMPLEQ_REMOVE_HEAD(&freesbufs, sbp, sw_sq);
	simple_unlock(&swap_buf_lock);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages * PAGE_SIZE;

	/*
	 * for pageouts we must set "dirtyoff" [the NFS client code needs it]
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages * PAGE_SIZE;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback.
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone; /* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
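	/*
	 * async completion path (for reference): the device's biodone()
	 * runs uvm_swap_bufdone() via B_CALL, which queues sw_aio on
	 * uvm.aio_done and wakes the pagedaemon; the pagedaemon then calls
	 * uvm_swap_aiodone() to unmap the pages and recycle this swapbuf.
	 */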
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.  wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode and clear its flags.
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	thread_wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	/* XXXMRG: does this work if PAGE_SIZE is a variable, eg SUN4C&&SUN4 */
	/* XXX it does with GCC */
	struct vm_page *pps[MAXBSIZE/PAGE_SIZE];
	int lcv, s;
	vm_offset_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE/PAGE_SIZE))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST, 0);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * done!
	 */
}